def parse_cmdln():
    parser = get_args()
    args = parser.parse_args()
    if args.program == 'gpu_temp':
        assertion(nvmlInit, ImportError('nvidia-ml-py is required for this program.'))
        assertion(mpl, ImportError('matplotlib is required for this program.'))
        assertion(args.deviceID, AssertionError('GPU index must be declared.'))
        nvmlInit()
        args.handle = nvmlDeviceGetHandleByIndex(args.deviceID)
    if args.program == 'cpu_usage':
        assertion(psutil, ImportError('psutil is required for this program.'))
    if args.program == 'screen_glow':
        assertion(PIL, ImportError('PIL is required for this program.'))
    return args
def __check_gpu(self):
    """Check if the process list contains GPU processes and determine if GPUs
    exist. Add GPU processes to the processes list if required."""
    if not self.exp.meta_data.plugin_list._contains_gpu_processes():
        return

    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    try:
        pv.nvmlInit()
        count = int(pv.nvmlDeviceGetCount())
        logging.debug("%s GPUs have been found.", count)
    except Exception:
        logging.debug("No GPUs have been found.")
        raise Exception("The process list contains GPU plugins, but "
                        "no GPUs have been found.")

    processes = self.exp.meta_data.get_meta_data('processes')
    if not [i for i in processes if 'GPU' in i]:
        logging.debug("GPU processes missing. GPUs found so adding them.")
        cpus = ['CPU' + str(i) for i in range(count)]
        gpus = ['GPU' + str(i) for i in range(count)]
        for i in range(min(count, len(processes))):
            processes[processes.index(cpus[i])] = gpus[i]
        self.exp.meta_data.set_meta_data('processes', processes)
def __check_gpu(self):
    """Check if the process list contains GPU processes and determine if GPUs
    exist. Add GPU processes to the processes list if required."""
    if not self.exp.meta_data.plugin_list._contains_gpu_processes():
        return

    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    try:
        pv.nvmlInit()
        count = int(pv.nvmlDeviceGetCount())
        logging.debug("%s GPUs have been found.", count)
        if not self.exp.meta_data.get('test_state'):
            for i in range(count):
                handle = pv.nvmlDeviceGetHandleByIndex(i)
                if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                    raise Exception("Unfortunately, GPU %i is busy. Try "
                                    "resubmitting the job to the queue." % i)
    except Exception as e:
        raise Exception("Unable to run GPU plugins: %s" % e)
    self.__set_gpu_processes(count)
def initialize(self):
    """ Initialize pynvml """
    if not self.initialized:
        if IS_MACOS:
            if self.logger:
                self.logger.debug("macOS Detected. Using pynvx")
            try:
                pynvx.cudaInit()
            except RuntimeError:
                self.initialized = True
                return
        else:
            try:
                if self.logger:
                    self.logger.debug("OS is not macOS. Using pynvml")
                pynvml.nvmlInit()
            except (pynvml.NVMLError_LibraryNotFound,  # pylint: disable=no-member
                    pynvml.NVMLError_DriverNotLoaded,  # pylint: disable=no-member
                    pynvml.NVMLError_NoPermission):    # pylint: disable=no-member
                self.initialized = True
                return
        self.initialized = True
        self.get_device_count()
        self.get_active_devices()
        self.get_handles()
def auto_select_gpu():
    """Select the GPU which has the largest free memory."""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)

        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info('Using largest free memory GPU {} with free memory {}MB'.format(
            gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info('nvidia-ml-py is not installed, automatic GPU selection is disabled!')
        return '0'
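The returned id is a plain string, so one plausible way to consume it (a sketch, assuming the function above is importable and that selection happens before any CUDA context is created) is to export it through CUDA_VISIBLE_DEVICES:

import os

# Hypothetical caller (not part of the snippet above): pin this process to the
# GPU with the most free memory before any CUDA library initializes a context.
os.environ["CUDA_VISIBLE_DEVICES"] = auto_select_gpu()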
def get_gpu_mem_used():
    try:
        from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        totalMemory = nvmlDeviceGetMemoryInfo(handle)
        return totalMemory.used
    except Exception:
        return -1
def get_available_gpus():
    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    pv.nvmlInit()
    count = int(pv.nvmlDeviceGetCount())
    return pv, count
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        # Sensor 0 is NVML_TEMPERATURE_GPU (the GPU die temperature).
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))
    nvmlShutdown()
    return gpus
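For readers unfamiliar with the second argument, the literal 0 above is the NVML_TEMPERATURE_GPU sensor. A minimal equivalent sketch (the function name is hypothetical) that uses the named constant and guarantees shutdown even if a query raises:

from pynvml import (NVML_TEMPERATURE_GPU, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetTemperature,
                    nvmlInit, nvmlShutdown)

def get_gpu_temperatures_named():
    """Same behaviour as above, with the sensor constant spelled out."""
    nvmlInit()
    try:
        return {i: int(nvmlDeviceGetTemperature(nvmlDeviceGetHandleByIndex(i),
                                                NVML_TEMPERATURE_GPU))
                for i in range(nvmlDeviceGetCount())}
    finally:
        nvmlShutdown()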
def initialize(self):
    """ Initialize pynvml """
    if not self.initialized:
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError_LibraryNotFound:
            self.initialized = True
            return
        self.initialized = True
        self.get_device_count()
        self.get_handles()
def init_nvidia(self):
    """Init the NVIDIA API."""
    if import_error_tag:
        self.nvml_ready = False

    try:
        pynvml.nvmlInit()
        self.device_handles = get_device_handles()
        self.nvml_ready = True
    except Exception:
        logger.debug("pynvml could not be initialized.")
        self.nvml_ready = False

    return self.nvml_ready
def _init_nvml(self):
    if self._load_nvidia_lib() == -1:
        return -1
    try:
        global pynvml
        import pip
        pip.main(['install', '--quiet', 'nvidia-ml-py'])
        import pynvml as pynvml
        pynvml.nvmlInit()
        return 0
    except pynvml.NVMLError as err:
        logger.debug('Failed to initialize NVML: %s', err)
        return -1
def main():
    port = int(sys.argv[1]) if len(sys.argv) > 1 else 9200
    try:
        pynvml.nvmlInit()
        atexit.register(pynvml.nvmlShutdown)
        register_standard_metrics()
        print('Starting on port {}'.format(port))
        httpd = HTTPServer(('', port), MetricsHandler)
        httpd.serve_forever()
    except pynvml.NVMLError as err:
        print('NVML error: {}'.format(err))
def __init__(self, protocols, **kwargs):
    Monitor.__init__(self, **kwargs)
    self.protocols = protocols
    self.cpuAlert = kwargs['cpuAlert']
    self.memAlert = kwargs['memAlert']
    self.swapAlert = kwargs['swapAlert']
    self._dataBase = kwargs.get('dbName', SYSTEM_LOG_SQLITE)
    self._tableName = kwargs.get('tableName', 'log')
    self.doGpu = kwargs['doGpu']
    self.doNetwork = kwargs['doNetwork']
    self.doDiskIO = kwargs['doDiskIO']
    self.samplingTime = 1.  # seconds

    self.labelList = ["cpu", "mem", "swap"]
    if self.doGpu:
        self.gpuLabelList = []
        # get GPUs to monitor
        self.gpusToUse = [int(n) for n in (kwargs['gpusToUse']).split()]
        for i in self.gpusToUse:
            self.gpuLabelList.append("gpuMem_%d" % i)
            self.gpuLabelList.append("gpuUse_%d" % i)
            self.gpuLabelList.append("gpuTem_%d" % i)
        # init GPUs
        nvmlInit()
        self.labelList += self.gpuLabelList
    else:
        self.gpusToUse = None

    if self.doNetwork:
        self.nif = kwargs['nif']
        self.netLabelList = []  # in the future we may display
                                # all the network interfaces
        self.netLabelList.append("%s_send" % self.nif)
        self.netLabelList.append("%s_recv" % self.nif)
        self.labelList += self.netLabelList
    else:
        self.nif = None

    if self.doDiskIO:
        self.netLabelList = []  # in the future we may display
                                # all the network interfaces
        self.netLabelList.append("disk_read")
        self.netLabelList.append("disk_write")
        self.labelList += self.netLabelList
    else:
        pass

    self.conn = lite.connect(os.path.join(self.workingDir, self._dataBase),
                             isolation_level=None)
    self.cur = self.conn.cursor()
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem: 12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfil the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
def getFreeId():
    import pynvml

    pynvml.nvmlInit()

    def getFreeRatio(id):
        handle = pynvml.nvmlDeviceGetHandleByIndex(id)
        use = pynvml.nvmlDeviceGetUtilizationRates(handle)
        # Average of GPU and memory-controller utilization, in percent.
        ratio = 0.5 * (float(use.gpu) + float(use.memory))
        return ratio

    deviceCount = pynvml.nvmlDeviceGetCount()
    available = []
    for i in range(deviceCount):
        if getFreeRatio(i) < 70:
            available.append(i)
    gpus = ''
    for g in available:
        gpus = gpus + str(g) + ','
    gpus = gpus[:-1]
    return gpus
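The comma-separated id string is the format CUDA_VISIBLE_DEVICES expects, so a plausible caller (a sketch, not shown in the snippet above) looks like this:

import os

# Hypothetical usage: expose only GPUs whose combined utilization ratio is
# below the 70% threshold applied by getFreeId() above.
gpus = getFreeId()
if not gpus:
    raise RuntimeError("No sufficiently idle GPU found")
os.environ["CUDA_VISIBLE_DEVICES"] = gpus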
def collect_via_pynvml(self, stats_config):
    """
    Use pynvml python binding to collect metrics
    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for device_index in xrange(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu': pynvml.nvmlDeviceGetTemperature(
                    handle, NVML_TEMPERATURE_GPU),
            }
            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index),
                        stat_name=stat_name,
                    )
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def load_pynvml_env():
    import pynvml  # nvidia-ml-py3

    #
    # BEGIN: Temporary workaround for nvml.dll load issue in Win10 (continued)
    _LoadNvmlLibrary()
    pynvml.nvmlLib = nvmlLib
    #
    # END: Temporary workaround for nvml.dll load issue in Win10
    #

    if platform.system() == "Darwin":
        try:
            from pynvx import pynvml
        except ImportError:
            print("please install pynvx on OSX: pip install pynvx")
            sys.exit(1)

        pynvml.nvmlInit()
        return pynvml

    pynvml.nvmlInit()
    return pynvml
def __init__(self, index):
    try:
        nv.nvmlInit()
        self._handle = nv.nvmlDeviceGetHandleByIndex(index)
    except nv.NVMLError_LibraryNotFound:
        pass
def benchmark(input_arguments): args = process_arguments(input_arguments) if args.verbose: logging.basicConfig(level=logging.DEBUG) elif args.quiet: logging.basicConfig(level=logging.WARNING) else: logging.basicConfig(level=logging.INFO) source_db_user = args.user source_db_passwd = args.passwd source_db_server = args.server source_db_port = args.port source_db_name = args.name source_table = args.table label = args.label if args.queries_dir: queries_dir = args.queries_dir else: queries_dir = os.path.join(os.path.dirname(__file__), "queries") iterations = int(args.iterations) if (iterations > 1) is not True: # Need > 1 iteration as first iteration is dropped from calculations logging.error("Iterations must be greater than 1") exit(1) gpu_count = args.gpu_count gpu_name = args.gpu_name no_gather_conn_gpu_info = args.no_gather_conn_gpu_info gather_nvml_gpu_info = args.gather_nvml_gpu_info no_gather_nvml_gpu_info = args.no_gather_nvml_gpu_info machine_name = args.machine_name machine_uname = args.machine_uname destinations = args.destination.split(",") if "mapd_db" in destinations: valid_destination_set = True dest_db_user = args.dest_user dest_db_passwd = args.dest_passwd if args.dest_server is None: # If dest_server is not set for mapd_db, then exit logging.error( '"dest_server" is required when destination = "mapd_db"') exit(1) else: dest_db_server = args.dest_server dest_db_port = args.dest_port dest_db_name = args.dest_name dest_table = args.dest_table dest_table_schema_file = args.dest_table_schema_file if "file_json" in destinations: valid_destination_set = True if args.output_file_json is None: # If output_file_json is not set for file_json, then exit logging.error( '"output_file_json" is required when destination = "file_json"' ) exit(1) else: output_file_json = args.output_file_json if "output" in destinations: valid_destination_set = True if "jenkins_bench" in destinations: valid_destination_set = True if args.output_file_jenkins is None: # If output_file_jenkins is not set for jenkins_bench, then exit logging.error('"output_file_jenkins" is required ' + 'when destination = "jenkins_bench"') exit(1) else: output_file_jenkins = args.output_file_jenkins output_tag_jenkins = args.output_tag_jenkins if not valid_destination_set: logging.error("No valid destination(s) have been set. Exiting.") exit(1) # Establish connection to mapd db con = get_connection( db_user=source_db_user, db_passwd=source_db_passwd, db_server=source_db_server, db_port=source_db_port, db_name=source_db_name, ) if not con: exit(1) # Exit if cannot connect to db # Set run vars run_guid = str(uuid.uuid4()) logging.debug("Run guid: " + run_guid) run_timestamp = datetime.datetime.now() run_connection = str(con) logging.debug("Connection string: " + run_connection) run_driver = "" # TODO run_version = con._client.get_version() if "-" in run_version: run_version_short = run_version.split("-")[0] else: run_version_short = run_version conn_machine_name = re.search(r"@(.*?):", run_connection).group(1) # Set GPU info fields conn_gpu_count = None source_db_gpu_count = None source_db_gpu_mem = None source_db_gpu_driver_ver = "" source_db_gpu_name = "" if no_gather_conn_gpu_info: logging.debug( "--no-gather-conn-gpu-info passed, " + "using blank values for source database GPU info fields " + "[run_gpu_count, run_gpu_mem_mb] ") else: logging.debug("Gathering source database GPU info fields " + "[run_gpu_count, run_gpu_mem_mb] " + "using pymapd connection info. 
") conn_hardware_info = con._client.get_hardware_info(con._session) conn_gpu_count = conn_hardware_info.hardware_info[0].num_gpu_allocated if conn_gpu_count == 0 or conn_gpu_count is None: no_gather_nvml_gpu_info = True if conn_gpu_count == 0: logging.warning( "0 GPUs detected from connection info, " + "using blank values for source database GPU info fields " + "If running against cpu-only server, make sure to set " + "--no-gather-nvml-gpu-info and --no-gather-conn-gpu-info.") else: source_db_gpu_count = conn_gpu_count try: source_db_gpu_mem = int( conn_hardware_info.hardware_info[0].gpu_info[0].memory / 1000000) except IndexError: logging.error("GPU memory info not available from connection.") if no_gather_nvml_gpu_info: logging.debug( "--no-gather-nvml-gpu-info passed, " + "using blank values for source database GPU info fields " + "[gpu_driver_ver, run_gpu_name] ") elif conn_machine_name == "localhost" or gather_nvml_gpu_info: logging.debug("Gathering source database GPU info fields " + "[gpu_driver_ver, run_gpu_name] " + "from local GPU using pynvml. ") import pynvml pynvml.nvmlInit() source_db_gpu_driver_ver = pynvml.nvmlSystemGetDriverVersion().decode() for i in range(source_db_gpu_count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) # Assume all cards are the same, overwrite name value source_db_gpu_name = pynvml.nvmlDeviceGetName(handle).decode() pynvml.nvmlShutdown() # If gpu_count argument passed in, override gathered value if gpu_count: source_db_gpu_count = gpu_count # Set machine names, using local info if connected to localhost if conn_machine_name == "localhost": local_uname = os.uname() if machine_name: run_machine_name = machine_name else: if conn_machine_name == "localhost": run_machine_name = local_uname.nodename.split(".")[0] else: run_machine_name = conn_machine_name if machine_uname: run_machine_uname = machine_uname else: if conn_machine_name == "localhost": run_machine_uname = " ".join(local_uname) else: run_machine_uname = "" # Read query files contents and write to query_list query_list = [] logging.debug("Queries dir: " + queries_dir) try: for query_filename in os.listdir(queries_dir): logging.debug("Validating query filename: " + query_filename) if validate_query_file(query_filename=query_filename): with open(queries_dir + "/" + query_filename, "r") as query_filepath: logging.debug("Reading query with filename: " + query_filename) query_mapdql = query_filepath.read().replace("\n", " ") query_mapdql = query_mapdql.replace( "##TAB##", source_table) query_list.append({ "name": query_filename, "mapdql": query_mapdql }) logging.info("Read all query files") except FileNotFoundError: logging.exception("Could not find queries directory.") exit(1) # Exit if cannot get queries dir # Run queries for query in query_list: # Set additional query vars # Query ID = filename without extention query_id = query["name"].rsplit(".")[0] # Run iterations of query query_results = [] logging.info("Running query: " + query["name"] + " iterations: " + str(iterations)) query_total_start_time = timeit.default_timer() for iteration in range(iterations): # Gather memory before running query iteration logging.debug("Getting pre-query memory usage on CPU") pre_query_cpu_mem_usage = get_mem_usage(con=con, mem_type="cpu") logging.debug("Getting pre-query memory usage on GPU") pre_query_gpu_mem_usage = get_mem_usage(con=con, mem_type="gpu") # Run query iteration logging.debug("Running iteration " + str(iteration) + " of query " + query["name"]) query_result = execute_query( 
query_name=query["name"], query_mapdql=query["mapdql"], iteration=iteration, con=con, ) # Gather memory after running query iteration logging.debug("Getting post-query memory usage on CPU") post_query_cpu_mem_usage = get_mem_usage(con=con, mem_type="cpu") logging.debug("Getting post-query memory usage on GPU") post_query_gpu_mem_usage = get_mem_usage(con=con, mem_type="gpu") # Calculate total (post minus pre) memory usage after query iteration query_cpu_mem_usage = round( post_query_cpu_mem_usage["usedram"] - pre_query_cpu_mem_usage["usedram"], 1, ) query_gpu_mem_usage = round( post_query_gpu_mem_usage["usedram"] - pre_query_gpu_mem_usage["usedram"], 1, ) if query_result: query.update({"succeeded": True}) query_error_info = "" # TODO - interpret query error info # Assign first query iteration times if iteration == 0: first_execution_time = round( query_result["execution_time"], 1) first_connect_time = round(query_result["connect_time"], 1) first_results_iter_time = round( query_result["results_iter_time"], 1) first_total_time = (first_execution_time + first_connect_time + first_results_iter_time) first_cpu_mem_usage = query_cpu_mem_usage first_gpu_mem_usage = query_gpu_mem_usage else: # Put noninitial iterations into query_result list query_results.append(query_result) # Verify no change in memory for noninitial iterations if query_cpu_mem_usage != 0.0: logging.error( ("Noninitial iteration ({0}) of query ({1}) " + "shows non-zero CPU memory usage: {2}").format( iteration, query["name"], query_cpu_mem_usage)) if query_gpu_mem_usage != 0.0: logging.error( ("Noninitial iteration ({0}) of query ({1}) " + "shows non-zero GPU memory usage: {2}").format( iteration, query["name"], query_gpu_mem_usage)) else: query.update({"succeeded": False}) logging.warning("Error detected during execution of query: " + query["name"] + ". 
This query will be skipped and " + "times will not reported") if query["succeeded"] is False: # Do not run any more iterations of the failed query break if query["succeeded"] is False: # Do not calculate results for the failed query, move on to the next continue # Calculate time for all iterations to run query_total_elapsed_time = round( ((timeit.default_timer() - query_total_start_time) * 1000), 1) logging.info("Completed all iterations of query " + query["name"]) # Aggregate iteration values execution_times, connect_times, results_iter_times, total_times = ( [], [], [], [], ) for query_result in query_results: execution_times.append(query_result["execution_time"]) connect_times.append(query_result["connect_time"]) results_iter_times.append(query_result["results_iter_time"]) total_times.append(query_result["total_time"]) # Overwrite result count, since should be the same for each iteration result_count = query_result["result_count"] # Calculate query times logging.debug("Calculating times from query " + query["name"]) query_times = calculate_query_times( total_times=total_times, execution_times=execution_times, connect_times=connect_times, results_iter_times=results_iter_times, ) # Update query dict entry with all values query.update({ "results": { "run_guid": run_guid, "run_timestamp": run_timestamp, "run_connection": run_connection, "run_machine_name": run_machine_name, "run_machine_uname": run_machine_uname, "run_driver": run_driver, "run_version": run_version, "run_version_short": run_version_short, "run_label": label, "run_gpu_count": source_db_gpu_count, "run_gpu_driver_ver": source_db_gpu_driver_ver, "run_gpu_name": source_db_gpu_name, "run_gpu_mem_mb": source_db_gpu_mem, "run_table": source_table, "query_id": query_id, "query_result_set_count": result_count, "query_error_info": query_error_info, "query_conn_first": first_connect_time, "query_conn_avg": query_times["connect_time_avg"], "query_conn_min": query_times["connect_time_min"], "query_conn_max": query_times["connect_time_max"], "query_conn_85": query_times["connect_time_85"], "query_exec_first": first_execution_time, "query_exec_avg": query_times["execution_time_avg"], "query_exec_min": query_times["execution_time_min"], "query_exec_max": query_times["execution_time_max"], "query_exec_85": query_times["execution_time_85"], "query_exec_25": query_times["execution_time_25"], "query_exec_stdd": query_times["execution_time_std"], # Render queries not supported yet "query_render_first": None, "query_render_avg": None, "query_render_min": None, "query_render_max": None, "query_render_85": None, "query_render_25": None, "query_render_stdd": None, "query_total_first": first_total_time, "query_total_avg": query_times["total_time_avg"], "query_total_min": query_times["total_time_min"], "query_total_max": query_times["total_time_max"], "query_total_85": query_times["total_time_85"], "query_total_all": query_total_elapsed_time, "results_iter_count": iterations, "results_iter_first": first_results_iter_time, "results_iter_avg": query_times["results_iter_time_avg"], "results_iter_min": query_times["results_iter_time_min"], "results_iter_max": query_times["results_iter_time_max"], "results_iter_85": query_times["results_iter_time_85"], "cpu_mem_usage_mb": first_cpu_mem_usage, "gpu_mem_usage_mb": first_gpu_mem_usage, } }) logging.debug("All values set for query " + query["name"] + ": " + str(query)) logging.debug("Closing source db connection.") con.close() logging.info("Completed all queries.") # Create list of successful queries 
logging.debug( "Removing failed queries from results going to destination db(s)") succesful_query_list = query_list for index, query in enumerate(succesful_query_list): if query["succeeded"] is False: del succesful_query_list[index] # Create successful query results list for upload to destination(s) query_results = [] for query in succesful_query_list: query_results.append(query["results"]) # Convert query list to json for outputs query_list_json = json.dumps(query_list, default=json_format_handler, indent=2) # Send results if "mapd_db" in destinations: # Create dataframe from list of query results logging.debug("Converting results list to pandas dataframe") results_df = DataFrame(query_results) # Establish connection to destination mapd db logging.debug("Connecting to destination mapd db") dest_con = get_connection( db_user=dest_db_user, db_passwd=dest_db_passwd, db_server=dest_db_server, db_port=dest_db_port, db_name=dest_db_name, ) if not dest_con: exit(1) # Exit if cannot connect to destination db # Load results into db, creating table if it does not exist tables = dest_con.get_tables() if dest_table not in tables: logging.info("Destination table does not exist. Creating.") try: with open(dest_table_schema_file, "r") as table_schema: logging.debug("Reading table_schema_file: " + dest_table_schema_file) create_table_sql = table_schema.read().replace("\n", " ") create_table_sql = create_table_sql.replace( "##TAB##", dest_table) except FileNotFoundError: logging.exception("Could not find table_schema_file.") exit(1) try: logging.debug("Executing create destination table query") res = dest_con.execute(create_table_sql) logging.debug("Destination table created.") except ( pymapd.exceptions.ProgrammingError, pymapd.exceptions.Error, ): logging.exception("Error running table creation") exit(1) logging.info("Loading results into destination db") dest_con.load_table_columnar( dest_table, results_df, preserve_index=False, chunk_size_bytes=0, col_names_from_schema=True, ) dest_con.close() if "file_json" in destinations: # Write to json file logging.debug("Opening json output file for writing") file_json_open = open(output_file_json, "w") logging.info("Writing to output json file: " + output_file_json) file_json_open.write(query_list_json) if "jenkins_bench" in destinations: # Write output to file formatted for jenkins benchmark plugin # https://github.com/jenkinsci/benchmark-plugin jenkins_bench_results = [] for query_result in query_results: logging.debug("Constructing output for jenkins benchmark plugin") jenkins_bench_results.append({ "name": query_result["query_id"], "description": "", "parameters": [], "results": [{ "name": query_result["query_id"] + " average", "description": "", "unit": "ms", "dblValue": query_result["query_exec_avg"], }], }) jenkins_bench_json = json.dumps({ "groups": [{ "name": source_table + output_tag_jenkins, "description": "Source table: " + source_table, "tests": jenkins_bench_results, }] }) # Write to json file logging.debug("Opening jenkins_bench json output file for writing") file_jenkins_open = open(output_file_jenkins, "w") logging.info("Writing to jenkins_bench json file: " + output_file_jenkins) file_jenkins_open.write(jenkins_bench_json) if "output" in destinations: logging.info("Printing query results to output") print(query_list_json) logging.info("Succesfully loaded query results info into destination(s)")
def init(self): self.util_history = [] self.temp_history = [] pynvml.nvmlInit() self.gpu_handles = [] self.deviceCount = pynvml.nvmlDeviceGetCount() for i in range(self.deviceCount): self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i)) self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6) self.cpu_prog_bars = [] self.gpu_boxes = [] self.gpu_prog_bars = [] self.prev_idle = [] self.prev_total = [] self.idle = [] self.total = [] #---cpu_box--- try: stat = open("/proc/stat") statlines = stat.read().splitlines() stat.close() self.corecount = -1 for line in statlines: if (line[0:2] == "cp"): self.corecount+= 1 else: break except IOError: print("Problem opening /proc/stat, exiting..") pynvml.nvmlShutdown() quit() for i in range(self.corecount): self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True)) self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0) self.prev_idle.append(0) self.prev_total.append(0) self.idle.append(0) self.total.append(0) #---gpu_boxes--- for i in range(self.deviceCount): product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i]) product_name = product_name.decode('utf-8') gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8) label = Gtk.Label(product_name) self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True)) gpu_box.pack_start(label, True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0) self.gpu_boxes.append(gpu_box) #---proc--- proc_liststore = Gtk.ListStore(int, str, int) self.tree = Gtk.TreeView(model=proc_liststore) renderer_pid = Gtk.CellRendererText() column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0) column_pid.set_resizable(True) self.tree.append_column(column_pid) renderer_path = Gtk.CellRendererText() column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1) column_path.set_resizable(True) column_path.set_fixed_width(250) self.tree.append_column(column_path) renderer_mem = Gtk.CellRendererText() column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2) column_mem.set_resizable(True) self.tree.append_column(column_mem)
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except Exception:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()
    self.service_check('nvml.check', status, message=msg)
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode() # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} ps_process = psutil.Process(pid=nv_process.pid) process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' else: process['command'] = os.path.basename(_cmdline[0]) # Bytes to MBytes process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU ) except N.NVMLError: temperature = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in nv_comp_processes + nv_graphics_processes: # TODO: could be more information such as system memory # usage, CPU percentage, create time etc. try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': power // 1000 if power is not None else None, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else None, 'memory.total': memory.total // MB if memory else None, 'processes': processes, } return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
def gpu_profile(frame, event, arg):
    print_tensor_sizes = True
    last_tensor_sizes = set()
    gpu_profile_fn = f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_prof.txt'
    if 'GPU_DEBUG' in os.environ:
        print('profiling gpu usage to ', gpu_profile_fn)

    # it is _about to_ execute (!)
    # global last_tensor_sizes
    # global lineno, func_name, filename, module_name
    lineno = 1
    func_name = 'loss'
    filename = 'train'
    module_name = 'cityscape_pspnet'
    os.environ['GPU_DEBUG'] = '2'

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                pynvml.nvmlInit()
                handle = pynvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes:
                            f.write(f'+ {loc},{str(s)},{str(t)}\n')
                        # for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                        #     f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        # for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                        #     f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        # last_tensor_sizes = new_tensor_sizes
                pynvml.nvmlShutdown()

            # save details about line _to be_ executed
            # lineno = None
            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None  # skip current line evaluation
            if ('car_datasets' in filename
                    or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current line evaluation

            return gpu_profile
        except (KeyError, AttributeError):
            pass

    return gpu_profile
def __init__(self, index):
    nv.nvmlInit()
    self._handle = nv.nvmlDeviceGetHandleByIndex(index)
def new_query():
    """Query the information of all the GPUs on local machine"""
    N.nvmlInit()

    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(pid):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=pid)
            process['username'] = ps_process.username()
            # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
            process['command'] = os.path.basename(ps_process.cmdline()[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
            process['pid'] = nv_process.pid
            return process

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported
        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported
        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        processes = []
        try:
            nv_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
            # dict type is mutable
            for nv_process in nv_processes:
                # TODO: could be more information such as system memory usage,
                # CPU percentage, create time etc.
                process = get_process_info(nv_process.pid)
                processes.append(process)
        except N.NVMLError:
            processes = None  # Not supported

        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            # Convert bytes into MBytes
            'memory.used': int(memory.used / 1024 / 1024) if memory else None,
            'memory.total': int(memory.total / 1024 / 1024) if memory else None,
            'processes': processes,
        }
        return gpu_info

    # 1. get the list of gpu and status
    gpu_list = []
    device_count = N.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        gpu_info = get_gpu_info(handle)
        gpu_stat = GPUStat(gpu_info)
        gpu_list.append(gpu_stat)

    N.nvmlShutdown()
    return GPUStatCollection(gpu_list)
def get_gpu_count():
    pynvml.nvmlInit()
    return pynvml.nvmlDeviceGetCount()
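Many of these snippets pair nvmlInit() with a matching nvmlShutdown() by hand, and this one omits the shutdown entirely. A small context manager, sketched here as an illustration rather than as part of any snippet above, makes that pairing explicit:

from contextlib import contextmanager

import pynvml

@contextmanager
def nvml_session():
    # Sketch: guarantee nvmlShutdown() runs even if the body raises.
    pynvml.nvmlInit()
    try:
        yield
    finally:
        pynvml.nvmlShutdown()

# Usage: count devices inside a managed NVML session.
with nvml_session():
    device_count = pynvml.nvmlDeviceGetCount()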
def set_affinity( gpu_id, nproc_per_node, *, mode="socket_unique_contiguous", cores="all_logical", balanced=True, ): """ The process is assigned with a proper CPU affinity that matches CPU-GPU hardware architecture on a given platform. Usually, it improves and stabilizes the performance of deep learning training workloads. This function assumes that the workload runs in multi-process single-device mode (there are multiple training processes, and each process is running on a single GPU). This is typical for multi-GPU data-parallel training workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`). Available affinity modes: * 'socket' - the process is assigned with all available physical CPU cores from the CPU socket connected to the GPU with a given id. * 'socket_single' - the process is assigned with the first available physical CPU core from the list of all CPU cores from the CPU socket connected to the GPU with a given id (multiple GPUs could be assigned with the same CPU core). * 'socket_single_unique' - the process is assigned with a single unique available physical CPU core from the list of all CPU cores from the CPU socket connected to the GPU with a given id. * 'socket_unique_interleaved' - the process is assigned with a unique subset of available physical CPU cores from the CPU socket connected to a GPU with a given id, cores are assigned with interleaved indexing pattern * 'socket_unique_contiguous' - (the default) the process is assigned with a unique subset of available physical CPU cores from the CPU socket connected to a GPU with a given id, cores are assigned with contiguous indexing pattern Available "cores" modes: * 'all_logical' - assigns the process with all logical cores associated with a given corresponding physical core (i.e., automatically includes all available hyperthreading siblings) * 'single_logical' - assigns the process with only one logical core associated with a given corresponding physical core (i.e., excludes hyperthreading siblings) 'socket_unique_contiguous' is the recommended mode for deep learning training workloads on NVIDIA DGX machines. Args: gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1 nproc_per_node: number of processes per node mode: affinity mode balanced: assign an equal number of physical cores to each process, affects only 'socket_unique_interleaved' and 'socket_unique_contiguous' affinity modes cores: 'all_logical' or 'single_logical' Returns a set of logical CPU cores on which the process is eligible to run. Example: import argparse import os import gpu_affinity import torch def main(): parser = argparse.ArgumentParser() parser.add_argument( '--local_rank', type=int, default=os.getenv('LOCAL_RANK', 0), ) args = parser.parse_args() nproc_per_node = torch.cuda.device_count() affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node) print(f'{args.local_rank}: core affinity: {affinity}') if __name__ == "__main__": main() Launch the example with: python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs. This function restricts execution only to the CPU cores directly connected to GPUs, so on DGX A100, it will limit the code to half of the CPU cores and half of CPU memory bandwidth (which may be fine for many DL models). WARNING: Intel's OpenMP implementation resets affinity on the first call to an OpenMP function after a fork. 
It's recommended to run with env variable: `KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be preserved after a fork (e.g. in PyTorch DataLoader workers). """ pynvml.nvmlInit() if mode == "socket": set_socket_affinity(gpu_id, nproc_per_node, cores) elif mode == "socket_single": set_socket_single_affinity(gpu_id, nproc_per_node, cores) elif mode == "socket_single_unique": set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores) elif mode == "socket_unique_interleaved": set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "interleaved", balanced) elif mode == "socket_unique_contiguous": set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced) else: raise RuntimeError("Unknown affinity mode") affinity = os.sched_getaffinity(0) return affinity
def train(cudaid, args, model): pynvml.nvmlInit() dist.init_process_group(backend='nccl', init_method='env://', world_size=args.size, rank=cudaid) random.seed(1) np.random.seed(1) torch.manual_seed(1) torch.cuda.manual_seed(1) print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration, " lr: ", lr) #cuda_list=range(args.size) print('rank: ', cudaid) torch.cuda.set_device(cudaid) model.cuda(cudaid) accumulation_steps = int(args.batch_size / args.size / args.gpu_size) #optimizer = torch.optim.Adam(model.parameters(), lr=lr,betas=(0.9,0.98),eps=1e-6,weight_decay=0.0) optimizer = apex.optimizers.FusedLAMB(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.0, max_grad_norm=1.0) model, optimizer = amp.initialize(model, optimizer, opt_level='O2') model = DDP(model) accum_batch_loss = 0 history_file = os.path.join(args.data_dir, args.history_file) if 'last' in args.field: abs_file = os.path.join(args.data_dir, args.abs_file) else: abs_file = '' iterator = NewsIterator(batch_size=args.gpu_size, npratio=4, feature_file=os.path.join(args.data_dir, args.feature_file), history_file=history_file, abs_file=abs_file, field=args.field, fp16=True) train_file = os.path.join(args.data_dir, args.data_file) batch_t = 0 iteration = 0 print('train...', args.field) if cudaid == 0: writer = SummaryWriter(os.path.join(args.data_dir, args.log_file)) epoch = 0 model.train() batch_t = 0 iteration = 0 step = 0 best_score = -1 for epoch in range(0, 10): all_loss = 0 all_batch = 0 data_batch = iterator.load_data_from_file(train_file, cudaid, args.size) print('load ok...') for imp_index, user_index, his_id, candidate_id, label in data_batch: batch_t += 1 assert candidate_id.shape[1] == 2 # if cudaid==1: # torch.set_printoptions(profile="full") # print(his_id) his_id = his_id.cuda(cudaid) candidate_id = candidate_id.cuda(cudaid) label = label.cuda(cudaid) loss = model(his_id, candidate_id, label) sample_size = candidate_id.shape[0] loss = loss.sum() / sample_size / math.log(2) accum_batch_loss += float(loss) all_loss += float(loss) all_batch += 1 # if cudaid==1: # torch.set_printoptions(profile="full") # w=open('input.txt','w') # w.write(str(his_id.cpu())) # w.close() # assert 1==0 loss = loss / accumulation_steps #loss.backward() with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (batch_t) % accumulation_steps == 0: iteration += 1 adjust_learning_rate(optimizer, iteration) optimizer.step() optimizer.zero_grad() if cudaid == 0: # handle = pynvml.nvmlDeviceGetHandleByIndex(cudaid) # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) # #print(int(meminfo.used)/1024/1024) # print('loss: ',loss,int(meminfo.used)/1024/1024) print(' batch_t: ', batch_t, ' iteration: ', iteration, ' epoch: ', epoch, ' accum_batch_loss: ', accum_batch_loss / accumulation_steps, ' lr: ', optimizer.param_groups[0]['lr']) writer.add_scalar('Loss/train', accum_batch_loss / accumulation_steps, iteration) writer.add_scalar('Ltr/train', optimizer.param_groups[0]['lr'], iteration) accum_batch_loss = 0 if iteration % 500 == 0 and cudaid == 0: torch.cuda.empty_cache() model.eval() if cudaid == 0: auc = test(model, args) print(auc) writer.add_scalar('auc/valid', auc, step) step += 1 if auc > best_score: torch.save( model.state_dict(), os.path.join(args.save_dir, 'Plain_robert_dot_best.pkl')) best_score = auc print('best score: ', best_score) torch.cuda.empty_cache() model.train() if cudaid == 0: torch.save( model.state_dict(), os.path.join(args.save_dir, 'Plain_robert_dot' + str(epoch) + '.pkl'))
def setup(self): self.data["root"] = os.getcwd() try: import __main__ self.data["program"] = __main__.__file__ except (ImportError, AttributeError): self.data["program"] = '<python with no main file>' if wandb._get_python_type() != "python": if os.getenv(env.NOTEBOOK_NAME): self.data["program"] = os.getenv(env.NOTEBOOK_NAME) else: meta = wandb.jupyter.notebook_metadata() if meta.get("path"): if "fileId=" in meta["path"]: self.data[ "colab"] = "https://colab.research.google.com/drive/" + meta[ "path"].split("fileId=")[1] self.data["program"] = meta["name"] else: self.data["program"] = meta["path"] self.data["root"] = meta["root"] program = os.path.join(self.data["root"], self.data["program"]) if not os.getenv(env.DISABLE_CODE): if self._api.git.enabled: self.data["git"] = { "remote": self._api.git.remote_url, "commit": self._api.git.last_commit } self.data["email"] = self._api.git.email self.data["root"] = self._api.git.root or self.data["root"] if os.path.exists(program) and self._api.git.is_untracked( self.data["program"]): util.mkdir_exists_ok( os.path.join(self.out_dir, "code", os.path.dirname(self.data["program"]))) saved_program = os.path.join(self.out_dir, "code", self.data["program"]) if not os.path.exists(saved_program): self.data["codeSaved"] = True copyfile(program, saved_program) self.data["startedAt"] = datetime.utcfromtimestamp( wandb.START_TIME).isoformat() try: username = getpass.getuser() except KeyError: # getuser() could raise KeyError in restricted environments like # chroot jails or docker containers. Return user id in these cases. username = str(os.getuid()) # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users. if self._api.settings().get('anonymous') != 'true': self.data["host"] = os.environ.get(env.HOST, socket.gethostname()) self.data["username"] = os.getenv(env.USERNAME, username) self.data["executable"] = sys.executable else: self.data.pop("email", None) self.data.pop("root", None) self.data["os"] = platform.platform(aliased=True) self.data["python"] = platform.python_version() if env.get_docker(): self.data["docker"] = env.get_docker() try: pynvml.nvmlInit() self.data["gpu"] = pynvml.nvmlDeviceGetName( pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8") self.data["gpu_count"] = pynvml.nvmlDeviceGetCount() except pynvml.NVMLError: pass try: self.data["cpu_count"] = multiprocessing.cpu_count() except NotImplementedError: pass # TODO: we should use the cuda library to collect this if os.path.exists("/usr/local/cuda/version.txt"): self.data["cuda"] = open( "/usr/local/cuda/version.txt").read().split(" ")[-1].strip() self.data["args"] = sys.argv[1:] self.data["state"] = "running"
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} ps_process = psutil.Process(pid=nv_process.pid) process['username'] = ps_process.username() # cmdline returns full path; as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' else: process['command'] = os.path.basename(_cmdline[0]) # Bytes to MBytes process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024) process['pid'] = nv_process.pid return process def _decode(b): if isinstance(b, bytes): return b.decode() # for python3, to unicode return b name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU) except N.NVMLError: temperature = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except: power_limit = None processes = [] try: nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses( handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses( handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None # Not supported (in both cases) else: nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in (nv_comp_processes + nv_graphics_processes): # TODO: could be more information such as system memory usage, # CPU percentage, create time etc. try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': int(power / 1000) if power is not None else None, 'enforced.power.limit': int(power_limit / 1000) if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': int(memory.used / 1024 / 1024) if memory else None, 'memory.total': int(memory.total / 1024 / 1024) if memory else None, 'processes': processes, } return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) N.nvmlShutdown() return GPUStatCollection(gpu_list)
def get_gpu_used(index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    memoryinfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return memoryinfo.used
def check_perf(): "Suggest how to improve the setup to speed things up" from PIL import features, Image from packaging import version import pynvml print("Running performance checks.") # libjpeg_turbo check print("\n*** libjpeg-turbo status") if version.parse(Image.PILLOW_VERSION) >= version.parse("5.4.0"): if features.check_feature('libjpeg_turbo'): print("✔ libjpeg-turbo is on") else: print( "✘ libjpeg-turbo is not on. It's recommended you install libjpeg-turbo to speed up JPEG decoding. See https://docs.fast.ai/performance.html#libjpeg-turbo" ) else: print( f"❓ libjpeg-turbo's status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {Image.PILLOW_VERSION}" ) # XXX: remove this check/note once Pillow and Pillow-SIMD 5.4.0 is available pillow_ver_5_4_is_avail = pypi_module_version_is_available( "Pillow", "5.4.0") if pillow_ver_5_4_is_avail == False: print( "5.4.0 is not yet available, other than the dev version on github, which can be installed via pip from git+https://github.com/python-pillow/Pillow. See https://docs.fast.ai/performance.html#libjpeg-turbo" ) # Pillow-SIMD check print("\n*** Pillow-SIMD status") if re.search(r'\.post\d+', Image.PILLOW_VERSION): print(f"✔ Running Pillow-SIMD {Image.PILLOW_VERSION}") else: print( f"✘ Running Pillow {Image.PILLOW_VERSION}; It's recommended you install Pillow-SIMD to speed up image resizing and other operations. See https://docs.fast.ai/performance.html#pillow-simd" ) # CUDA version check # compatibility table: k: min nvidia ver is required for v: cuda ver # note: windows nvidia driver version is slightly higher, see: # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html # note: add new entries if pytorch starts supporting new cudaXX nvidia2cuda = { "410.00": "10.0", "384.81": "9.0", "367.48": "8.0", } print("\n*** CUDA status") if torch.cuda.is_available(): pynvml.nvmlInit() nvidia_ver = pynvml.nvmlSystemGetDriverVersion().decode('utf-8') cuda_ver = torch.version.cuda max_cuda = "8.0" for k in sorted(nvidia2cuda.keys()): if version.parse(nvidia_ver) > version.parse(k): max_cuda = nvidia2cuda[k] if version.parse(str(max_cuda)) <= version.parse(cuda_ver): print( f"✔ Running the latest CUDA {cuda_ver} with NVIDIA driver {nvidia_ver}" ) else: print( f"✘ You are running pytorch built against cuda {cuda_ver}, your NVIDIA driver {nvidia_ver} supports cuda10. See https://pytorch.org/get-started/locally/ to install pytorch built against the faster CUDA version." ) else: print(f"❓ Running cpu-only torch version, CUDA check is not relevant") print( "\nRefer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions." )
def train_classifier(config_file, data_dir, knee_type, feature_extract, gpu, model_name, pretraining, overwrite_flag, learning_rate, drop_rate, num_epochs, batch_size, num_classes, patience, prev_checkpoint_path, model_evaluate, progression, biomarker, sampling, norm): '''This script trains a classifier CNN on a specified dataset. It can also infer on a previous checkpoint. The model availables are: "densenet121", "resnet18", "squeezenet", "alexnet", "Vgg11", "inceptionv3".''' feature_extract = int(feature_extract) if pretraining == 'Diagnosis': if '50' not in model_name: return elif pretraining == 'Random': if feature_extract > 0: return print(f'PyTorch Version: {torch.__version__}') print(f'Torchvision Version: {torchvision.__version__}') if progression in ['DiagnosisAll', 'Pain']: data_dir = join(data_dir, sampling, norm, biomarker, knee_type, progression) else: data_dir = join(data_dir, biomarker, knee_type, progression) log_dir = data_dir.replace('on/data', 'on/logs') # GPU ID to use torch.cuda.set_device(int(gpu)) nvmlInit() h = nvmlDeviceGetHandleByIndex(0) # Flag for feature extracting. When False, we finetune the whole model, # when True we only update the reshaped layer params transfer_learning_type = [ 'Finetuned', 'FeatureExtract', 'FirstLayerExtract', 'FirstTwoLayersExtract' ] transfer_learning_dict = {} transfer_learning_dict[1] = 'Linear' transfer_learning_dict[2] = 'layer2' transfer_learning_dict[3] = 'layer3' # Learning rate string for naming the checkpoint directory lr_exp = 'e'.join('{:.0E}'.format(Decimal(learning_rate)).split('E-')) # Unique training name if pretraining == 'Diagnosis' or model_evaluate: if progression in ['DiagnosisAll', 'Pain']: if pretraining == 'Diagnosis': diagnosispdpath = join( log_dir.split(sampling)[0], 'diagnosisallperformance.csv') checkpointpd = pd.read_csv(diagnosispdpath, index_col=[ 'Sampling', 'Normalization', 'Bone', 'Incidence', 'Fusion' ]) best_checkpoint_path = checkpointpd.loc[(sampling, norm, knee_type, 'DiagnosisAll', biomarker), 'Checkpoint'] elif pretraining == 'ImageNet': pass else: pdpath = join( log_dir.split(sampling)[0], progression.lower() + 'performance.csv') checkpointpd = pd.read_csv(pdpath, index_col=[ 'Sampling', 'Normalization', 'Bone', 'Incidence', 'Fusion' ]) best_checkpoint_path = checkpointpd.loc[(sampling, norm, knee_type, progression, biomarker), 'Checkpoint'] else: pdpath = join( log_dir.split(biomarker)[0], progression.lower() + 'performance.csv') checkpointpd = pd.read_csv( pdpath, index_col=['Bone', 'Incidence', 'Fusion']) best_checkpoint_path = checkpointpd.loc[(knee_type, progression, biomarker), 'Checkpoint'] best_checkpoint_dir = dirname(best_checkpoint_path) prev_training_name = best_checkpoint_dir.split('/')[-1] print('Previous training name =', prev_training_name) print('Previous training directory =', best_checkpoint_dir) prev_feature_extract = transfer_learning_type.index([ item for item in transfer_learning_type if item in prev_training_name ][0]) if learning_rate == 1e-4: learning_rate = float(prev_training_name[-3:].replace('e', 'e-')) / 10 elif learning_rate == 1e-5: learning_rate = float(prev_training_name[-3:].replace('e', 'e-')) / 100 elif learning_rate == 1e-6: learning_rate = float(prev_training_name[-3:].replace('e', 'e-')) / 1000 elif learning_rate == 1e-7: learning_rate = float(prev_training_name[-3:].replace( 'e', 'e-')) / 10000 lr_exp = 'e'.join('{:.0E}'.format(Decimal(learning_rate)).split('E-')) if pretraining == 'Diagnosis': training_name = 'Diagnosis' + model_name.capitalize( ) + 
transfer_learning_type[feature_extract] + lr_exp elif 'prev' in prev_checkpoint_path.lower() and model_evaluate: feature_extract = prev_feature_extract training_name = prev_training_name else: training_name = 'Prev' + model_name.capitalize( ) + transfer_learning_type[feature_extract] + lr_exp elif pretraining == 'ImageNet': training_name = 'ImageNet' + model_name.capitalize( ) + transfer_learning_type[feature_extract] + lr_exp elif pretraining == 'Random': training_name = 'Random' + model_name.capitalize( ) + transfer_learning_type[feature_extract] + lr_exp # Current Checkpoint path current_checkpoint_path = join(log_dir, training_name) print(f'Checkpoint Path: {current_checkpoint_path}') print(f'Transfer Learning: {transfer_learning_type[feature_extract]}') if not isdir(current_checkpoint_path): os.makedirs(current_checkpoint_path) elif os.listdir(current_checkpoint_path) and not model_evaluate: if not overwrite_flag: raise Exception( 'Previous checkpoints found and no overwrite flag specified.') if not model_evaluate: checkpoint_log = join(current_checkpoint_path, 'log_file.txt') checkpoint_header = '*Model Name*: ' + model_name.capitalize( ) + '_' + knee_type.capitalize() + '_' + progression.capitalize( ) + '\t*Transfer Learning Type*: ' + transfer_learning_type[ feature_extract] + '\t*Learning Rate*: ' + lr_exp + '\n\n' write_type = 'w' # Write mode if file does not exist with open(checkpoint_log, write_type) as f_checkpoint: if pretraining == 'Diagnosis': f_checkpoint.write( checkpoint_header.replace( transfer_learning_type[feature_extract], transfer_learning_type[feature_extract] + '_Diagnosis')) elif pretraining == 'ImageNet': f_checkpoint.write( checkpoint_header.replace( transfer_learning_type[feature_extract], transfer_learning_type[feature_extract] + '_ImageNet')) elif pretraining == 'Random': f_checkpoint.write( checkpoint_header.replace( transfer_learning_type[feature_extract], transfer_learning_type[feature_extract] + '_Random')) write_type = 'a' # Append mode after file is created def train_model(model, dataloaders, criterion, optimizer, current_checkpoint_path, num_epochs=25, prev_model=0, is_inception=False): total_time = time.time() output_dict = {} output_dict['best_val_MCC'] = -1 output_dict['best_val_TPN'] = 0 output_dict['best_epoch'] = prev_model for phase in ['Train', 'Val']: output_dict[phase] = {} output_dict[phase]['auc'] = np.zeros(num_epochs) output_dict[phase]['acc'] = np.zeros(num_epochs) output_dict[phase]['mcc'] = np.zeros(num_epochs) output_dict[phase]['tpn'] = np.zeros(num_epochs) output_dict[phase]['loss'] = 100 * np.ones(num_epochs) output_dict[phase]['epoch'] = np.zeros(num_epochs, dtype=int) best_model_wts = copy.deepcopy(model.state_dict()) best_MCC = -1 best_TPN = 0 best_epoch = 0 # for epoch in tqdm(range(num_epochs)): for epoch in range(num_epochs): since = time.time() # print(f'\nEpoch {epoch}/{(num_epochs - 1)}') # print('-' * 10) # Each epoch has a training and validation phase for phase in ['Train', 'Val']: if phase == 'Train': model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode running_loss = 0.0 running_corrects = 0 running_corrects_positive = 0 running_corrects_negative = 0 running_positives = 0 running_negatives = 0 cnt = 0 num_hold = len(dataloaders[phase].dataset) output_dict[phase]['labels'] = np.zeros(num_hold) - 1 output_dict[phase]['softmax'] = np.zeros([num_hold, 2]) # Iterate over data. 
for inputs, labels, paths in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward # track history if only in train with torch.set_grad_enabled(phase == 'Train'): # Get model outputs and calculate loss # Special case for inception because in training it has an auxiliary output. In train # mode we calculate the loss by summing the final output and the auxiliary output # but in testing we only consider the final output. if is_inception and phase == 'Train': # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958 outputs, aux_outputs = model(inputs) loss1 = criterion(outputs, labels) loss2 = criterion(aux_outputs, labels) loss = loss1 + 0.4 * loss2 else: outputs = model(inputs) loss = criterion(outputs, labels) _, preds = torch.max(torch.softmax(outputs, dim=1), 1) # backward + optimize only if in training phase if phase == 'Train': loss.backward() optimizer.step() # statistics running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) running_corrects_positive += np.logical_and( preds.cpu(), labels.data.cpu()).sum() running_corrects_negative += len( labels.data) - np.logical_or(preds.cpu(), labels.data.cpu()).sum() running_positives += torch.sum(labels.data) running_negatives += len(labels.data) - torch.sum( labels.data).int() output_dict[phase]['labels'][(cnt * batch_size):( (cnt + 1) * batch_size)] = labels.cpu().numpy() output_dict[phase]['softmax'][(cnt * batch_size):( (cnt + 1) * batch_size), :] = torch.softmax( outputs, dim=1).detach().cpu().numpy() cnt += 1 # print('True Positives: {} All Positives: {}'.format(running_corrects_positive, running_positives)) # print('True Negatives: {} All Negatives: {}'.format(running_corrects_negative, running_negatives))torch.sigmoid(outputs) epoch_loss = running_loss / len(dataloaders[phase].dataset) epoch_acc = running_corrects.double() / len( dataloaders[phase].dataset) epoch_tpr = running_corrects_positive / running_positives.double( ) epoch_tnr = running_corrects_negative / running_negatives.double( ) output_dict[phase]['auc'][epoch] = roc_auc_score( output_dict[phase]['labels'], output_dict[phase]['softmax'][:, 1]) output_dict[phase]['mcc'][epoch] = matthews_corrcoef( output_dict[phase]['labels'], output_dict[phase]['softmax'][:, 1] >= 0.5) output_dict[phase]['tpn'][ epoch] = epoch_tpr + epoch_tnr + output_dict[phase]['auc'][ epoch] # print(f'TPR: {epoch_tpr.numpy():5.3f}') # print(f'TNR: {epoch_tnr.numpy():5.3f}') # print(f'AUC: {output_dict[phase]["auc"][epoch]:5.3f}') # print(f'MCC: {output_dict[phase]["mcc"][epoch]:6.3f}') # print(f'TPN: {output_dict[phase]["tpn"][epoch]:6.3f}') # if phase == 'Val': # time_elapsed = time.time() - since # print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} \nTime since last epoch: {(time_elapsed // 60):.0f}m {(time_elapsed % 60):.0f}s') # else: # print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}') # deep copy the model if phase == 'Val' and (output_dict[phase]['tpn'][epoch] > best_TPN) and (epoch_tpr >= 0.5) and ( epoch_tnr >= 0.5): best_MCC = output_dict[phase]['mcc'][epoch] best_TPN = output_dict[phase]['tpn'][epoch] output_dict['best_val_MCC'] = best_MCC output_dict['best_val_TPN'] = best_TPN output_dict['best_epoch'] = prev_model + epoch chkpath = join(current_checkpoint_path, 'best_epoch') torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'output_dict': 
output_dict }, chkpath) print('Saved model in ' + chkpath) # best_model_wts = copy.deepcopy(model.state_dict()) if phase == 'Val': output_dict['Val']['acc'][epoch] = epoch_acc output_dict['Val']['loss'][epoch] = epoch_loss output_dict['Val']['epoch'][epoch] = prev_model + epoch # scheduler.step(1 - output_dict['Val']['auc'][epoch]) if phase == 'Val' and (np.argmin(output_dict['Val']['loss']) + patience < epoch): print( f'Early stopping due to validation loss not improving for {patience} epochs' ) quit() if phase == 'Train': output_dict['Train']['acc'][epoch] = epoch_acc output_dict['Train']['loss'][epoch] = epoch_loss output_dict['Train']['epoch'][epoch] = prev_model + epoch if epoch % 1 == 0: current_train_acc = output_dict['Train']['acc'][epoch] current_val_acc = output_dict['Val']['acc'][epoch] current_train_loss = output_dict['Train']['loss'][epoch] current_val_loss = output_dict['Val']['loss'][epoch] best_val_MCC = output_dict['best_val_MCC'] best_val_TPN = output_dict['best_val_TPN'] with open(checkpoint_log, write_type) as f_checkpoint: f_checkpoint.write(f'*Epoch*: {output_dict["Train"]["epoch"][epoch]:3} *Train Loss*: {current_train_loss:5.3f}' + \ f' *Val Loss*: {current_val_loss:5.3f} *TPR*: {epoch_tpr.numpy():5.3f}' + \ f' *TNR*: {epoch_tnr.numpy():5.3f} *AUC*: {output_dict["Val"]["auc"][epoch]:5.3f} *MCC*: {output_dict["Val"]["mcc"][epoch]:6.3f}' + \ f' *TPN*: {output_dict["Val"]["tpn"][epoch]:5.3f} *Best Val TPN*: {best_val_TPN:6.3f} *Best Epoch*: {output_dict["best_epoch"]:3}\n') # print() total_time_elapsed = time.time() - total_time print( f'Training complete in {(total_time_elapsed // 3600):.0f}h {(total_time_elapsed // 60):.0f}m {(total_time_elapsed % 60):.0f}s' ) print(f'Best Val MCC: {best_MCC:6.3f}') # load best model weights model.load_state_dict(best_model_wts) return model # Set Model Parameters’ .requires_grad attribute # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # This helper function sets the .requires_grad attribute of the # parameters in the model to False when we are feature extracting. By # default, when we load a pretrained model all of the parameters have # .requires_grad=True, which is fine if we are training from scratch # or finetuning. However, if we are feature extracting and only want to # compute gradients for the newly initialized layer then we want all of # the other parameters to not require gradients. This will make more sense # later. def set_parameter_requires_grad(model, feature_extracting): if feature_extracting > 0: bypass_condition = 0 for param_name, param in model.named_parameters(): if transfer_learning_dict[ feature_extracting] in param_name or bypass_condition: param.requires_grad = True bypass_condition = 1 else: param.requires_grad = False def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True): # Initialize these variables which will be set in this if statement. Each of these # variables is model specific. 
model_ft = None input_size = 0 if 'resnet' in model_name.lower(): ''' Resnet50 ''' if '18' in model_name.lower(): model_ft = models.resnet18(pretrained=use_pretrained) elif '34' in model_name.lower(): model_ft = models.resnet34(pretrained=use_pretrained) elif '50' in model_name.lower(): model_ft = models.resnet50(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) num_ftrs = model_ft.fc.in_features print(f'Fully Connected Layer Features = {num_ftrs}') # model_ft.fc.register_forward_hook(lambda m, inp, out: nn.functional.dropout(out, p=drop_rate, training=m.training)) # if not feature_extract: if pretraining == 'Diagnosis': model_ft.layer1 = nn.Sequential(nn.Dropout(drop_rate), model_ft.layer1) model_ft.layer2 = nn.Sequential(nn.Dropout(drop_rate), model_ft.layer2) model_ft.layer3 = nn.Sequential(nn.Dropout(drop_rate), model_ft.layer3) model_ft.layer4 = nn.Sequential(nn.Dropout(drop_rate), model_ft.layer4) model_ft.fc = nn.Sequential(nn.Dropout(drop_rate), nn.Linear(num_ftrs, num_classes)) else: model_ft.fc = nn.Linear(num_ftrs, num_classes) # print(f'Layers = {model_ft.children}') input_size = 224 elif model_name.lower() == 'alexnet': ''' Alexnet ''' model_ft = models.alexnet(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) num_ftrs = model_ft.classifier[6].in_features model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes) input_size = 224 elif model_name.lower() == 'vgg': ''' VGG11_bn ''' model_ft = models.vgg11_bn(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) num_ftrs = model_ft.classifier[6].in_features model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes) input_size = 224 elif model_name.lower() == 'squeezenet': ''' Squeezenet ''' model_ft = models.squeezenet1_0(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1)) model_ft.num_classes = num_classes input_size = 224 elif model_name.lower() == 'densenet': ''' Densenet ''' model_ft = models.densenet121(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) num_ftrs = model_ft.classifier.in_features model_ft.classifier = nn.Linear(num_ftrs, num_classes) input_size = 224 elif model_name.lower() == 'inception': ''' Inception v3 Be careful, expects (299,299) sized images and has auxiliary output ''' model_ft = models.inception_v3(pretrained=use_pretrained) set_parameter_requires_grad(model_ft, feature_extract) # Handle the auxilary net num_ftrs = model_ft.AuxLogits.fc.in_features model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes) # Handle the primary net num_ftrs = model_ft.fc.in_features model_ft.fc = nn.Linear(num_ftrs, num_classes) input_size = 299 else: print('Invalid model name, exiting...') exit() return model_ft, input_size # Initialize the model for this run model_ft, input_size = initialize_model( model_name, num_classes, feature_extract, use_pretrained=(pretraining == 'ImageNet')) # Load Data # --------- # Now that we know what the input size must be, we can initialize the data # transforms, image datasets, and the dataloaders. 
Notice, the models were # pretrained with the hard-coded normalization values, as described # https://pytorch.org/docs/master/torchvision/models.html # Data augmentation and normalization for training # Just normalization for validation data_transforms = { 'Train': transforms.Compose([ # No augmentation necessary for spherical maps since that is done before the transformation transforms.RandomResizedCrop(input_size), transforms.RandomHorizontalFlip(p=0.25), transforms.RandomVerticalFlip(p=0.25), transforms.Resize(input_size), transforms.CenterCrop(input_size), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), # transforms.RandomErasing(p=0.25) ]), 'Val': transforms.Compose([ transforms.Resize(input_size), transforms.CenterCrop(input_size), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), 'Hold': transforms.Compose([ transforms.Resize(input_size), transforms.CenterCrop(input_size), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]), } print(f'Transforms: {data_transforms}') # print('Initializing Datasets and Dataloaders...') # Create training and validation datasets image_datasets = { x: datasets.ImageFolder(join(data_dir, x), data_transforms[x]) for x in ['Train', 'Val', 'Hold'] } for split in ['Train', 'Val', 'Hold']: if 'year' in progression: image_datasets[split].class_to_idx = {'Healthy': 0, 'Incidence': 1} elif 'pain' in progression.lower(): image_datasets[split].class_to_idx = {'Healthy': 0, 'Pain': 1} # Create training and validation dataloaders dataloaders_dict = { x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=8) for x in ['Train', 'Val', 'Hold'] } # Detect if we have a GPU available3 device = torch.device('cuda:' + gpu if torch.cuda.is_available() else 'cpu') # Send the model to GPU model_ft = model_ft.to(device) # Gather the parameters to be optimized/updated in this run. If we are # finetuning we will be updating all parameters. However, if we are # doing feature extract method, we will only update the parameters # that we have just initialized, i.e. the parameters with requires_grad # is True. params_to_update = model_ft.parameters() if feature_extract > 0: params_to_update = [] for name, param in model_ft.named_parameters(): if param.requires_grad == True: params_to_update.append(param) num_params = len(params_to_update) else: num_params = len(list(model_ft.parameters())) # Observe that all parameters are being optimized print('Number of training parameters:', num_params) optimizer_ft = optim.Adam(params_to_update, lr=learning_rate, weight_decay=0.1) # Run Training and Validation Step # -------------------------------- # Finally, the last step is to setup the loss for the model, then run the # training and validation function for the set number of epochs. Notice, # depending on the number of epochs this step may take a while on a CPU. # Also, the default learning rate is not optimal for all of the models, so # to achieve maximum accuracy it would be necessary to tune for each model # separately. 
pain_weights = [0.86589497, 1.18325617] # Setup the loss fxn class_weights = torch.FloatTensor(pain_weights).cuda() criterion = nn.CrossEntropyLoss(weight=class_weights) print(f'Loss Class Weights = {pain_weights}') info = nvmlDeviceGetMemoryInfo(h) if feature_extract == 1: if '18' in model_name: batch_size = 905 elif '34' in model_name: batch_size = 905 else: batch_size = 705 batch_step = 60 elif feature_extract == 3: if '18' in model_name: batch_size = 855 elif '34' in model_name: batch_size = 855 else: batch_size = 305 batch_step = 60 elif feature_extract == 2: if '18' in model_name: batch_size = 955 elif '34' in model_name: batch_size = 705 else: batch_size = 155 batch_step = 30 elif feature_extract == 0: if '18' in model_name: batch_size = 425 elif '34' in model_name: batch_size = 305 else: batch_size = 105 batch_step = 10 if info.total > 1.5e10: batch_size = round(2.8 * batch_size) batch_step = round(2 * batch_step) n_channels = 3 batch_adapt = 1 print('Batch Size:', batch_size) print('Batch Step:', batch_step) model_ft.train() while batch_adapt: input_shape = (batch_size, n_channels, input_size, input_size) try: inputs = torch.randn(*input_shape, dtype=torch.float32).cuda() labels = torch.ones(batch_size, dtype=torch.int64).cuda() # zero the parameter gradients optimizer_ft.zero_grad() outputs = model_ft(inputs) loss = criterion(outputs, labels) loss.backward() optimizer_ft.step() print('Allocated:', torch.cuda.max_memory_allocated()) print('Cached:', torch.cuda.max_memory_cached()) del inputs, labels, outputs, loss torch.cuda.empty_cache() if (info.total - torch.cuda.max_memory_cached()) < 2e9: batch_adapt = 0 print('Final Batch Size:', batch_size) else: batch_size += batch_step except RuntimeError as error: print(error) batch_size -= round(1.5 * batch_step) batch_adapt = 0 print('Final Batch Size:', batch_size) dataloaders_dict = { x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=8) for x in ['Train', 'Val', 'Hold'] } if pretraining == 'Diagnosis': best_checkpoint_load = torch.load(best_checkpoint_path, map_location='cuda:' + gpu) model_ft.load_state_dict(best_checkpoint_load['model_state_dict']) print(f'Loading previous model: {best_checkpoint_path}') if model_evaluate: best_checkpoint_load = torch.load(best_checkpoint_path, map_location='cuda:' + gpu) model_ft.load_state_dict(best_checkpoint_load['model_state_dict']) print(f'Loading best model: {best_checkpoint_path}') # We're evaluating the model_load here: def model_eval(model_load, dataloaders, phase_range=['Val']): model_load.eval() # Set model_load to evaluate mode results_dict = {} for phase in phase_range: cnt = 0 num_hold = len(dataloaders[phase].dataset) results_dict[phase] = {} results_dict[phase]['file_names'] = [] results_dict[phase]['labels'] = np.zeros(num_hold) - 1 results_dict[phase]['class_predict'] = np.zeros(num_hold) - 1 results_dict[phase]['logits'] = np.zeros([num_hold, 2]) results_dict[phase]['softmax'] = np.zeros([num_hold, 2]) # results_dict[phase]['features'] = np.zeros([num_hold,2048]) # Iterate over data. 
for inputs, labels, paths in tqdm(dataloaders[phase]): inputs = inputs.to(device) labels = labels.to(device) results_dict[phase]['file_names'].extend(list(paths)) results_dict[phase]['labels'][(cnt * batch_size):( (cnt + 1) * batch_size)] = labels.cpu().numpy() with torch.set_grad_enabled(False): outputs = model_load(inputs) results_dict[phase]['softmax'][(cnt * batch_size):( (cnt + 1) * batch_size), :] = torch.softmax( outputs, dim=1).cpu().numpy() results_dict[phase]['logits'][(cnt * batch_size):( (cnt + 1) * batch_size), :] = outputs.cpu().numpy() # results_dict[phase]['features'][(cnt*batch_size):((cnt + 1)*batch_size),:] = outputs.cpu().numpy() # forward _, preds = torch.max(torch.softmax(outputs, dim=1), 1) results_dict[phase]['class_predict'][( cnt * batch_size):((cnt + 1) * batch_size)] = preds.cpu().numpy() # statistics pred_comp = ( labels.cpu().numpy() == preds.cpu().numpy()) cnt += 1 return results_dict def perf_measure(y_actual, y_hat): TP = 0 FP = 0 TN = 0 FN = 0 for i in range(len(y_hat)): if y_actual[i] == y_hat[i] == 1: TP += 1 if y_hat[i] == 1 and y_actual[i] != y_hat[i]: FP += 1 if y_actual[i] == y_hat[i] == 0: TN += 1 if y_hat[i] == 0 and y_actual[i] != y_hat[i]: FN += 1 return TP, FP, TN, FN if model_evaluate.lower() == 'all': phase_range = ['Train', 'Val', 'Hold'] eval_dict = model_eval(model_ft, dataloaders_dict, phase_range) elif model_evaluate.lower() == 'trainval': phase_range = ['Train', 'Val'] eval_dict = model_eval(model_ft, dataloaders_dict, phase_range) elif model_evaluate.lower() == 'valhold': phase_range = ['Val', 'Hold'] eval_dict = model_eval(model_ft, dataloaders_dict, phase_range) else: phase_range = [model_evaluate.lower().capitalize()] eval_dict = model_eval(model_ft, dataloaders_dict, phase_range) for i in phase_range: TP, FP, TN, FN = perf_measure(eval_dict[i]['labels'], eval_dict[i]['class_predict']) # print(TP, FP, TN, FN) sensitivity = TP / (TP + FN) specificity = TN / (TN + FP) eval_dict[i]['sensitivity'] = sensitivity eval_dict[i]['specificity'] = specificity eval_dict[i]['auc'] = roc_auc_score(eval_dict[i]['labels'], eval_dict[i]['softmax'][:, 1]) if i == 'Hold': continue else: print("*{}* \nSensitivity = {} \nSpecificity = {} \nAUC = {}". format(i.capitalize(), sensitivity, specificity, eval_dict[i]['auc'])) with open(join(best_checkpoint_dir, 'model_perf.pickle'), 'wb') as f: eval_dict['checkpoint'] = [best_checkpoint_path] pickle.dump(eval_dict, f) quit() else: if pretraining == 'Diagnosis': model_ft.layer1 = model_ft.layer1[1] model_ft.layer2 = model_ft.layer2[1] model_ft.layer3 = model_ft.layer3[1] model_ft.layer4 = model_ft.layer4[1] model_ft.fc = model_ft.fc[1] model_ft = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, current_checkpoint_path, num_epochs=num_epochs, is_inception=(model_name == "inception"))
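# A self-contained sketch of the NVML batch-size heuristic used above: total
# device memory is queried and the starting batch size is scaled up on large
# cards. The helper name is illustrative; the 1.5e10-byte cutoff and 2.8x
# factor mirror the constants in the training script.
import pynvml


def suggest_batch_size(base_batch_size, device_index=0,
                       large_card_bytes=1.5e10, scale=2.8):
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # Larger memory pool -> larger starting batch size.
        if info.total > large_card_bytes:
            return round(scale * base_batch_size)
        return base_batch_size
    finally:
        pynvml.nvmlShutdown()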
def __init__(self, name, init_config, instances):
    super(NvmlCheck, self).__init__(name, init_config, instances)
    # Initialise NVML and immediately shut it down again: any missing
    # library or driver error surfaces here, at check construction time.
    pynvml.nvmlInit()
    pynvml.nvmlShutdown()
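# A minimal sketch of the same "probe NVML at start-up" idea as a standalone
# helper; the function name is an illustrative assumption, not part of the
# check above.
import pynvml


def nvml_available():
    """Return True if NVML can be initialised on this host."""
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        return False
    pynvml.nvmlShutdown()
    return True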
def identify_cards(): devices = {} try: import pynvml from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex deviceCount = None try: nvmlInit() deviceCount = nvmlDeviceGetCount() for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) props = {} def meminfo(memory): return { "total" : int(memory.total), "free" : int(memory.free), "used" : int(memory.used), } def pciinfo(pci): i = {} for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"): try: i[x] = int(getattr(pci, x)) except: pass try: i["busId"] = str(pci.busId) except: pass return i for prop, fn_name, args, conv in ( ("name", "nvmlDeviceGetName", (), str), ("serial", "nvmlDeviceGetSerial", (), str), ("uuid", "nvmlDeviceGetUUID", (), str), ("pci", "nvmlDeviceGetPciInfo", (), pciinfo), ("memory", "nvmlDeviceGetMemoryInfo", (), meminfo), ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration", (), int), ("pcie-link-width-max", "nvmlDeviceGetMaxPcieLinkWidth", (), int), ("pcie-link-generation", "nvmlDeviceGetCurrPcieLinkGeneration", (), int), ("pcie-link-width", "nvmlDeviceGetCurrPcieLinkWidth", (), int), ("clock-info-graphics", "nvmlDeviceGetClockInfo", (0,), int), ("clock-info-sm", "nvmlDeviceGetClockInfo", (1,), int), ("clock-info-mem", "nvmlDeviceGetClockInfo", (2,), int), ("clock-info-graphics-max", "nvmlDeviceGetMaxClockInfo", (0,), int), ("clock-info-sm-max", "nvmlDeviceGetMaxClockInfo", (1,), int), ("clock-info-mem-max", "nvmlDeviceGetMaxClockInfo", (2,), int), ("fan-speed", "nvmlDeviceGetFanSpeed", (), int), ("temperature", "nvmlDeviceGetTemperature", (0,), int), ("power-state", "nvmlDeviceGetPowerState", (), int), ("vbios-version", "nvmlDeviceGetVbiosVersion", (), str), ): try: fn = getattr(pynvml, fn_name) v = fn(handle, *args) if conv: v = conv(v) props[prop] = v except Exception as e: log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e) continue devices[i] = props #unitCount = nvmlUnitGetCount() #log.info("unitCount=%s", unitCount) except Exception as e: log("identify_cards() pynvml error", exc_info=True) log.warn("Warning: failed to query the NVidia cards via NVML:") log.warn(" %s", e) finally: if deviceCount is not None: nvmlShutdown() except ImportError as e: log("cannot use nvml to query the kernel module version:") log(" %s", e) return devices
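# A reduced sketch of the per-property query pattern in identify_cards(): each
# NVML call is attempted independently so one unsupported query does not abort
# the whole probe. The property subset is illustrative, and the caller is
# expected to have run pynvml.nvmlInit() first, as identify_cards() does.
import pynvml


def query_card(index):
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    props = {}
    for key, fn, args in (
        ("name", pynvml.nvmlDeviceGetName, ()),
        ("memory", pynvml.nvmlDeviceGetMemoryInfo, ()),
        ("temperature", pynvml.nvmlDeviceGetTemperature,
         (pynvml.NVML_TEMPERATURE_GPU,)),
    ):
        try:
            props[key] = fn(handle, *args)
        except pynvml.NVMLError:
            pass  # not supported on this device/driver
    return props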
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode('utf-8') # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} if nv_process.pid not in GPUStatCollection.global_processes: GPUStatCollection.global_processes[nv_process.pid] = \ psutil.Process(pid=nv_process.pid) ps_process = GPUStatCollection.global_processes[nv_process.pid] # TODO: ps_process is being cached, but the dict below is not. process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' process['full_command'] = ['?'] else: process['command'] = os.path.basename(_cmdline[0]) process['full_command'] = _cmdline # Bytes to MBytes # if drivers are not TTC this will be None. usedmem = nv_process.usedGpuMemory // MB if \ nv_process.usedGpuMemory else None process['gpu_memory_usage'] = usedmem process['cpu_percent'] = ps_process.cpu_percent() process['cpu_memory_usage'] = \ round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU) except N.NVMLError: temperature = None # Not supported try: fan_speed = N.nvmlDeviceGetFanSpeed(handle) except N.NVMLError: fan_speed = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle) except N.NVMLError: utilization_enc = None # Not supported try: utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle) except N.NVMLError: utilization_dec = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] # A single process might run in both of graphics and compute mode, # However we will display the process only once seen_pids = set() for nv_process in nv_comp_processes + nv_graphics_processes: if nv_process.pid in seen_pids: continue seen_pids.add(nv_process.pid) try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass except FileNotFoundError: # Ignore the exception which probably has occured # from psutil, due to a non-existent PID (see #95). # The exception should have been translated, but # there appears to be a bug of psutil. 
It is unlikely # FileNotFoundError is thrown in different situations. pass # TODO: Do not block if full process info is not requested time.sleep(0.1) for process in processes: pid = process['pid'] cache_process = GPUStatCollection.global_processes[pid] process['cpu_percent'] = cache_process.cpu_percent() index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'fan.speed': fan_speed, 'utilization.gpu': utilization.gpu if utilization else None, 'utilization.enc': utilization_enc[0] if utilization_enc else None, 'utilization.dec': utilization_dec[0] if utilization_dec else None, 'power.draw': power // 1000 if power is not None else None, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else None, 'memory.total': memory.total // MB if memory else None, 'processes': processes, } GPUStatCollection.clean_processes() return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
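# A stripped-down sketch of the per-GPU summary assembled in new_query(): one
# dict per device with memory converted from bytes to MiB. The MB constant and
# field names mirror the code above; unsupported queries fall back to None.
import pynvml

MB = 1024 * 1024


def quick_gpu_summary():
    pynvml.nvmlInit()
    try:
        summary = []
        for index in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            try:
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
            except pynvml.NVMLError:
                utilization = None  # not supported
            summary.append({
                'index': index,
                'memory.used': memory.used // MB,
                'memory.total': memory.total // MB,
                'utilization.gpu': utilization,
            })
        return summary
    finally:
        pynvml.nvmlShutdown()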
def __init__(self, ground_policy, F_s, F_sa, env, device, log, hyperparameters): self.env = env self.device = device self.log = log self.hyperparameters = hyperparameters self.ground_policy = ground_policy self.name = "" self.verbose = hyperparameters["verbose"] # Check env: self.discrete_env = True if 'Discrete' in str( env.action_space) else False if self.discrete_env: self.num_actions = self.env.action_space.n self.action_low = torch.zeros(self.num_actions, device=self.device) self.action_high = torch.ones(self.num_actions, device=self.device) if self.verbose: print("Num actions: ", self.num_actions) else: self.num_actions = len(self.env.action_space.high) self.action_low = torch.tensor(env.action_space.low, device=self.device) self.action_high = torch.tensor(env.action_space.high, device=self.device) if self.verbose: print("Env action low: ", self.action_low) print("Env action high: ", self.action_high) # Set up params: # Actor-Critic: self.use_actor_critic = hyperparameters["use_actor_critic"] self.use_CACLA_V = hyperparameters["use_CACLA_V"] self.use_CACLA_Q = hyperparameters["use_CACLA_Q"] self.use_DDPG = hyperparameters["use_DDPG"] self.use_SPG = hyperparameters["use_SPG"] self.use_GISPG = hyperparameters["use_GISPG"] # QV: self.use_QV = hyperparameters["use_QV"] self.use_QVMAX = hyperparameters["use_QVMAX"] # Exploration: self.gaussian_action_noise = hyperparameters["action_sigma"] self.boltzmann_exploration_temp = hyperparameters["boltzmann_temp"] self.epsilon = hyperparameters["epsilon"] self.epsilon_mid = hyperparameters["epsilon_mid"] if self.epsilon_mid: self.eps_factor = self.epsilon_mid**(1 / hyperparameters["steps"]) self.epsilon = 1 # General: self.use_half = hyperparameters["use_half"] self.batch_size = hyperparameters["batch_size"] self.use_world_model = hyperparameters["use_world_model"] # TODO: -Include PER with prioritization based on Upper Bound of Gradient Norm. 
# TODO: -include different sampling schemes from the papers investigatin PER in SL (small and big buffer for gradient norm too) # TODO: -add goal to replay buffer and Transition (For HRL) # Eligibility traces: if torch.cuda.is_available(): nvmlInit() self.nvml_handle = nvmlDeviceGetHandleByIndex(0) self.max_gpu_bytes = torch.cuda.get_device_properties( self.device).total_memory self.mem_usage = None self.current_episode = [] self.use_efficient_traces = hyperparameters["use_efficient_traces"] self.elig_traces_update_steps = hyperparameters[ "elig_traces_update_steps"] self.elig_traces_anneal_lambda = hyperparameters[ "elig_traces_anneal_lambda"] self.lambda_val = hyperparameters["elig_traces_lambda"] # Set up replay buffer: self.stack_dim = hyperparameters["stack_dim"] self.stack_count = hyperparameters["frame_stack"] self.buffer_size = hyperparameters[ "replay_buffer_size"] + hyperparameters["num_expert_samples"] self.use_PER = hyperparameters["use_PER"] self.use_CER = hyperparameters["use_CER"] self.PER_alpha = hyperparameters["PER_alpha"] self.PER_start_beta = hyperparameters["PER_beta"] self.PER_beta = self.PER_start_beta self.PER_anneal_beta = hyperparameters["PER_anneal_beta"] self.PER_max_priority = hyperparameters["PER_max_priority"] self.PER_running_avg = hyperparameters["PER_running_avg"] self.importance_weights = None # Create replay buffer: self.memory = self.create_replay_buffer() # Feature extractors: self.F_s = F_s self.F_sa = F_sa self.state_feature_len = F_s.layers_merge[-1].out_features if F_sa is not None: self.state_action_feature_len = F_sa.layers_merge[-1].out_features # Set up Networks: self.use_half = hyperparameters[ "use_half"] and torch.cuda.is_available() self.nets = [] self.actor, self.Q, self.V = self.init_actor_critic( self.F_s, self.F_sa)
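# A hedged sketch of how the NVML handle and total-memory figure stored above
# (self.nvml_handle, self.max_gpu_bytes) might be used to watch GPU memory
# pressure; the helper name and the 0.9 threshold are illustrative assumptions.
from pynvml import nvmlDeviceGetMemoryInfo


def gpu_memory_nearly_full(nvml_handle, max_gpu_bytes, threshold=0.9):
    info = nvmlDeviceGetMemoryInfo(nvml_handle)
    return info.used > threshold * max_gpu_bytes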
def nccl_GPU(input, layer_id, construct_log, name, struct=None, splits=[], **kwargs): with tf.device("/cpu:0"): if "number_of_GPUs" not in construct_log: pynvml.nvmlInit() nb_GPU = pynvml.nvmlDeviceGetCount() construct_log["number_of_GPUs"] = nb_GPU else: nb_GPU = construct_log["number_of_GPUs"] gpu_input = [None]*nb_GPU towers_args = [] towers_dict = [] for g in range(nb_GPU): towers_args.append(dict(kwargs)) towers_dict.append(dict()) original_data = {} for key in splits: if key == "input": gpu_input = tf.split(input, nb_GPU) elif key in list(kwargs.keys()): if type(kwargs[key]) == str: value_to_split = construct_log[kwargs[key]] else: value_to_split = kwargs[key] value_splits = tf.split(value_to_split, nb_GPU) for i, targs in enumerate(towers_args): targs[key]=value_splits[i] else: value_to_split = construct_log[key] value_splits = tf.split(value_to_split, nb_GPU) for i, tdic in enumerate(towers_dict): tdic[key] = value_splits[i] original_data[key]=value_to_split variables = [] outs = [] destinations = [] for i in range(nb_GPU): with tf.device("/gpu:"+str(i)): destinations.append("/gpu:"+str(i)) for key in towers_dict[i]: construct_log[key[3:]] = towers_dict[i][key] replica_name = name if i == 0 else name+"_"+str(i) net_output = network(gpu_input[i], layer_id, construct_log, replica_name, struct=struct, var_scope=True, **towers_args[i]) replica_variables = tf.global_variables(scope=construct_log["network_scope"][replica_name].name) replica_variables = sorted(replica_variables, key = lambda x : x.name) variables.append(replica_variables) outs.append(net_output) construct_log["tower_devices"] = destinations master = variables[0] variables = list(zip(*variables)) for var in variables: for replic in var[1:]: construct_log["initialization_opps:[]"]= tf.assign(replic, var[0]) for key in original_data: construct_log[key[3:]] = original_data[key] return input
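# A small sketch of the "query the GPU count once and cache it" pattern used in
# nccl_GPU(); the plain dict here stands in for construct_log.
import pynvml


def get_gpu_count(cache):
    if "number_of_GPUs" not in cache:
        pynvml.nvmlInit()
        cache["number_of_GPUs"] = pynvml.nvmlDeviceGetCount()
    return cache["number_of_GPUs"]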
def _initialize(self, log=False): """ Initialize the library that will be returning stats for the system's GPU(s). For Nvidia (on Linux and Windows) the library is `pynvml`. For Nvidia (on macOS) the library is `pynvx`. For AMD `plaidML` is used. Parameters ---------- log: bool, optional Whether the class should output information to the logger. There may be occasions where the logger has not yet been set up when this class is queried. Attempting to log in these instances will raise an error. If GPU stats are being queried prior to the logger being available then this parameter should be set to ``False``. Otherwise set to ``True``. Default: ``False`` """ if not self._initialized: if get_backend() == "cpu": pass elif get_backend() == "amd": self._log("debug", "AMD Detected. Using plaidMLStats") loglevel = "INFO" if self._logger is None else self._logger.getEffectiveLevel() if plaidlib: self._plaid = plaidlib(log_level=loglevel, log=log) elif IS_MACOS: self._log("debug", "macOS Detected. Using pynvx") try: pynvx.cudaInit() except RuntimeError: self._initialized = True return else: try: self._log("debug", "OS is not macOS. Trying pynvml") pynvml.nvmlInit() except (pynvml.NVMLError_LibraryNotFound, # pylint: disable=no-member pynvml.NVMLError_DriverNotLoaded, # pylint: disable=no-member pynvml.NVMLError_NoPermission) as err: # pylint: disable=no-member if plaidlib is not None: self._log("debug", "pynvml errored. Trying plaidML") self._plaid = plaidlib(log=log) else: msg = ("There was an error reading from the Nvidia Machine Learning " "Library. Either you do not have an Nvidia GPU (in which case " "this warning can be ignored) or the most likely cause is " "incorrectly installed drivers. If this is the case, Please remove " "and reinstall your Nvidia drivers before reporting." "Original Error: {}".format(str(err))) self._log("warning", msg) self._initialized = True return except Exception as err: # pylint: disable=broad-except msg = ("An unhandled exception occured loading pynvml. " "Original error: {}".format(str(err))) if self._logger: self._logger.error(msg) else: print(msg) self._initialized = True return self._initialized = True self._get_device_count() self._get_active_devices() self._get_handles()
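# A condensed sketch of the backend fallback in _initialize(): try pynvml and
# treat the specific "no usable driver" errors as a signal to fall back. The
# string return values are illustrative labels, not the classes used above.
import pynvml


def pick_gpu_backend():
    try:
        pynvml.nvmlInit()
        return "pynvml"
    except (pynvml.NVMLError_LibraryNotFound,
            pynvml.NVMLError_DriverNotLoaded,
            pynvml.NVMLError_NoPermission):
        return None  # caller would fall back to pynvx / plaidML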
def main(): """Start the simulation server.""" # the following config variables read from the config.json file # are described here: # # port: local port on which the server is listening (launching webots instances). # sslKey: private key for a SSL enabled server. # sslCertificate: certificate for a SSL enabled server. # projectsDir: directory in which projects are located. # keyDir: directory where the host keys needed for validation are stored. # logDir: directory where the log files are written. # monitorLogEnabled: specify if the monitor data have to be stored in a file. # maxConnections: maximum number of simultaneous Webots instances. # global config global snapshots global nvidia global network_sent global network_received global monitorFile n = psutil.net_io_counters() network_sent = n.bytes_sent network_received = n.bytes_recv snapshots = [] config['WEBOTS_HOME'] = os.getenv('WEBOTS_HOME', '../../..').replace('\\', '/') config['webots'] = config['WEBOTS_HOME'] if sys.platform == 'darwin': config['webots'] += '/Contents/MacOS/webots' elif sys.platform == 'win32': config['webots'] += '/msys64/mingw64/bin/webots.exe' else: # linux config['webots'] += '/webots' if 'projectsDir' not in config: config['projectsDir'] = config[ 'WEBOTS_HOME'] + '/projects/samples/robotbenchmark' else: config['projectsDir'] = expand_path(config['projectsDir']) if 'keyDir' not in config: config['keyDir'] = 'key' else: config['keyDir'] = expand_path(config['keyDir']) if 'port' not in config: config['port'] = 2000 if 'maxConnections' not in config: config['maxConnections'] = 100 os.environ['WEBOTS_FIREJAIL_CONTROLLERS'] = '1' config['instancesPath'] = tempfile.gettempdir().replace( '\\', '/') + '/webots/instances/' # create the instances path if os.path.exists(config['instancesPath']): shutil.rmtree(config['instancesPath']) mkdir_p(config['instancesPath']) # logging system log_formatter = logging.Formatter( '%(asctime)-15s [%(levelname)-7s] %(message)s') root_logger = logging.getLogger() root_logger.setLevel(logging.DEBUG) if 'logDir' not in config: config['logDir'] = 'log' else: config['logDir'] = expand_path(config['logDir']) simulationLogDir = os.path.join(config['logDir'], 'simulation') logFile = os.path.join(simulationLogDir, 'output.log') try: if not os.path.exists(simulationLogDir): os.makedirs(simulationLogDir) file_handler = logging.FileHandler(logFile) file_handler.setFormatter(log_formatter) file_handler.setLevel(logging.INFO) root_logger.addHandler(file_handler) except (OSError, IOError) as e: sys.exit("Log file '" + logFile + "' cannot be created: " + str(e)) # create monitor.csv used by Snapshot if needed if 'monitorLogEnabled' not in config: config['monitorLogEnabled'] = True if config['monitorLogEnabled']: monitorFile = os.path.join(simulationLogDir, 'monitor.csv') try: if not os.path.exists(simulationLogDir): os.makedirs(simulationLogDir) file = open(monitorFile, 'w') file.write( "Timestamp, Webots running, Webots idle, CPU load, CPU memory, " "GPU load compute, GPU load memory, GPU memory, Swap, Disk, Network sent, Network received\n" ) file.close() except (OSError, IOError) as e: logging.error("Log file '" + monitorFile + "' cannot be created: " + str(e)) # startup janus server if needed if 'multimediaServer' in config: subprocess.Popen(["/opt/janus/bin/janus"]) # startup the server logging.info("Running simulation server on port %d" % config['port']) handlers = [] handlers.append((r'/monitor', MonitorHandler)) handlers.append((r'/client', ClientWebSocketHandler)) handlers.append((r'/load', 
LoadHandler)) handlers.append((r'/(.*)', tornado.web.StaticFileHandler, { 'path': config['WEBOTS_HOME'] + '/resources/web/server/www', 'default_filename': 'index.html' })) application = tornado.web.Application(handlers) if 'sslCertificate' in config and 'sslKey' in config: config['ssl'] = True ssl_certificate = os.path.abspath(expand_path( config['sslCertificate'])) ssl_key = os.path.abspath(expand_path(config['sslKey'])) ssl_options = {"certfile": ssl_certificate, "keyfile": ssl_key} http_server = tornado.httpserver.HTTPServer(application, ssl_options=ssl_options) else: config['ssl'] = False http_server = tornado.httpserver.HTTPServer(application) http_server.listen(config['port']) message = "Simulation server running on port %d (" % config['port'] if not config['ssl']: message += 'no ' message += 'SSL)' print(message) sys.stdout.flush() try: nvmlInit() nvidia = True except NVMLError: nvidia = False update_snapshot() try: tornado.ioloop.IOLoop.current().start() except Exception: logging.info(traceback.format_exc()) for client in ClientWebSocketHandler.clients: del client if nvidia: nvmlShutdown()
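# A hedged sketch of how the GPU columns written to monitor.csv ("GPU load
# compute, GPU load memory, GPU memory") might be gathered once nvmlInit() has
# succeeded in main(); the device index 0 and the helper name are assumptions.
from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                    nvmlDeviceGetUtilizationRates)


def gpu_snapshot(device_index=0):
    handle = nvmlDeviceGetHandleByIndex(device_index)
    utilization = nvmlDeviceGetUtilizationRates(handle)
    memory = nvmlDeviceGetMemoryInfo(handle)
    return {
        'gpu_load_compute': utilization.gpu,       # percent
        'gpu_load_memory': utilization.memory,     # percent
        'gpu_memory': memory.used / memory.total,  # fraction of memory in use
    }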
def initGPU(): nvmlInit()
# if Settings.execution_mode == ExecutionMode.GENERATOR: # print("==============================") # break executionTest(queryType) end_mem = gpuMemory.capture_gpu_memory_usage() gpuMemory.log_memory_usage(queryType, start_mem, end_mem) if __name__ == "__main__": Execution.getArgs() nvmlInit() drill = "drill" # None spark = "spark" compareResults = True if "compare_results" in Settings.data["RunSettings"]: compareResults = Settings.data["RunSettings"]["compare_results"] if ((Settings.execution_mode == ExecutionMode.FULL and compareResults == "true") or Settings.execution_mode == ExecutionMode.GENERATOR): # Create Table Drill ------------------------------------------------ from pydrill.client import PyDrill drill = PyDrill(host="localhost", port=8047) cs.init_drill_schema(drill,
def check(self, instance): pynvml.nvmlInit() msg_list = [] try: deviceCount = pynvml.nvmlDeviceGetCount() except: deviceCount = 0 for device_id in xrange(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) name = pynvml.nvmlDeviceGetName(handle) tags = dict(name="{}-{}".format(name, device_id)) d_tags = self._dict2list(tags) # temperature info try: temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) self.gauge('nvml.temp.', temp, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err)) # memory info try: mem = pynvml.nvmlDeviceGetMemoryInfo(handle) self.gauge('nvml.mem.total', mem.total, tags=d_tags) self.gauge('nvml.mem.used', mem.used, tags=d_tags) self.gauge('nvml.mem.free', mem.free, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err)) # utilization GPU/Memory info try: util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle) self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags) self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err)) # utilization Encoder info try: util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle) self.log.info('nvml.util.encoder %s' % long(util_encoder[0])) self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err)) # utilization Decoder info try: util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle) self.log.info('nvml.util.decoder %s' % long(util_decoder[0])) self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err)) # Compute running processes try: cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) for ps in cps: p_tags = tags.copy() p_tags['pid'] = ps.pid p_tags['name'] = psutil.Process(ps.pid).name() p_tags = self._dict2list(p_tags) self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err)) if msg_list: status = AgentCheck.CRITICAL msg = u','.join(msg_list) else: status = AgentCheck.OK msg = u'Ok' pynvml.nvmlShutdown() self.service_check('nvml.check', status, message=msg)
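# A reduced sketch of the accumulate-errors-then-report pattern used in
# check(): every NVML query is attempted independently and failures are
# collected instead of raised. print() stands in for the agent's gauge()
# calls, and pynvml.nvmlInit() is assumed to have been called already.
import pynvml


def collect_basic_metrics(device_id, msg_list):
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    try:
        temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        print('nvml.temp.', temp)
    except pynvml.NVMLError as err:
        msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
    try:
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print('nvml.mem.used', mem.used)
    except pynvml.NVMLError as err:
        msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))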
# extension: .py
# format_name: hydrogen
# format_version: '1.2'
# jupytext_version: 1.2.1
# kernelspec:
#   display_name: Python 3
#   language: python
#   name: python3
# ---

# %%
import torch
import torch.nn as nn
import torchvision.models as models
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)


class Model(nn.Module):
    def __init__(self, pretrained_weights_path, device):
        super(Model, self).__init__()
        self.model = models.inception_v3()
        self.model.AuxLogits.fc = nn.Linear(
            self.model.AuxLogits.fc.in_features, 6)
        self.model.fc = nn.Linear(self.model.fc.in_features, 6)
        self.model.load_state_dict(
            torch.load(pretrained_weights_path, map_location=device))
        del self.model._modules['AuxLogits']  # remove the AuxLogits module
        #del self.model._modules['fc']
        #self.model.AuxLogits.fc = nn.Linear(self.model.AuxLogits.fc.in_features, self.args.n_classes)  # change the AuxLogits fc output channels to the required number of classes
def get_gpus_info() -> Dict[str, Any]: """Get information about GPU devices: driver version, memory, utilization etc. The example below shows what kind of information is returned as the result. All figures about memory are given in bytes. Returns: Information about GPU devices. Raises: RuntimeError: if necessary cuda-related libraries are not found. Usually, it means that the function is run on a machine without GPU. Warning: The 'devices' value contains information about *all* gpus regardless of the value of :code:`CUDA_VISIBLE_DEVICES`. Examples: .. code-block:: print(get_gpu_info()) Output example (formatted for convenience): .. code-block:: none { 'driver': '440.33.01', 'devices': [ { 'name': 'GeForce RTX 2080 Ti', 'memory_total': 11554717696, 'memory_free': 11554652160, 'memory_used': 65536, 'utilization': 0, }, { 'name': 'GeForce RTX 2080 Ti', 'memory_total': 11552096256, 'memory_free': 11552030720, 'memory_used': 65536, 'utilization': 0, }, ], } """ try: pynvml.nvmlInit() except NVMLError_LibraryNotFound as err: raise RuntimeError( 'Failed to get information about GPU memory. ' 'Make sure that you actually have GPU and all relevant software installed.' ) from err n_devices = pynvml.nvmlDeviceGetCount() devices = [] for device_id in range(n_devices): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) devices.append({ 'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'), 'memory_total': memory_info.total, 'memory_free': memory_info.free, 'memory_used': memory_info.used, 'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu, }) return { 'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'), 'devices': devices, }
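# Example use of get_gpus_info(), assuming the host has at least one visible
# Nvidia GPU and working drivers; the printed fields follow the structure
# documented in the docstring above.
info = get_gpus_info()
print('driver:', info['driver'])
for device in info['devices']:
    print(device['name'], device['memory_free'] // 2**20, 'MiB free')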
def __calculate_GPU_index(self, nNodes):
    # Map this MPI rank onto one of the GPUs reported by NVML.
    pv.nvmlInit()
    nGPUs = int(pv.nvmlDeviceGetCount())
    rank = self.new_comm.Get_rank()
    return int(rank / nNodes) % nGPUs
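# A standalone restatement of the rank-to-GPU arithmetic above, with the NVML
# call replaced by an explicit GPU count so the mapping can be checked offline.
def gpu_index_for_rank(rank, nNodes, nGPUs):
    return int(rank / nNodes) % nGPUs

# With nNodes=2 and nGPUs=4, ranks 0..7 map to GPUs 0, 0, 1, 1, 2, 2, 3, 3.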
import os.path import sys import socket import random import numbers from PIL import ImageOps import requests from urlparse import urlparse import cStringIO import PIL.Image from PIL import Image from visdom import Visdom import numpy as np import config import pynvml pynvml.nvmlInit() vis = Visdom() all_wins = {} def plot(title, name, i, v): win = all_wins.get(title, None) if win is None: win = vis.line(env=config.experiment_name, X=np.array([i]), Y=np.array([v]), opts={'legend':[name], 'title':title}) all_wins[title] = win else: vis.updateTrace(env=config.experiment_name, win=win, X=np.array([i]), Y=np.array([v]), name=name) #viz.image( np.random.rand(3,64, 64), win="abxxx", opts=dict(title='sr', caption='sr images'))
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode() # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} if nv_process.pid not in GPUStatCollection.global_processes: GPUStatCollection.global_processes[nv_process.pid] = \ psutil.Process(pid=nv_process.pid) ps_process = GPUStatCollection.global_processes[nv_process.pid] process['username'] = ps_process.username() # _cmdline = ps_process.cmdline() # if not _cmdline: # process['command'] = '?' # process['full_command'] = ['?'] # else: # process['command'] = os.path.basename(_cmdline[0]) # process['full_command'] = _cmdline # process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB # process['cpu_percent'] = ps_process.cpu_percent() # process['cpu_memory_usage'] = \ # round((ps_process.memory_percent() / 100.0) * # psutil.virtual_memory().total) process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) # try: # temperature = N.nvmlDeviceGetTemperature( # handle, N.NVML_TEMPERATURE_GPU # ) # except N.NVMLError: # temperature = None # Not supported # try: # fan_speed = N.nvmlDeviceGetFanSpeed(handle) # except N.NVMLError: # fan_speed = None # Not supported # try: # memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes # except N.NVMLError: # memory = None # Not supported # try: # utilization = N.nvmlDeviceGetUtilizationRates(handle) # except N.NVMLError: # utilization = None # Not supported # try: # power = N.nvmlDeviceGetPowerUsage(handle) # except N.NVMLError: # power = None # try: # power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) # except N.NVMLError: # power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in nv_comp_processes + nv_graphics_processes: try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass # TODO: Do not block if full process info is not requested time.sleep(0.1) for process in processes: pid = process['pid'] cache_process = GPUStatCollection.global_processes[pid] # process['cpu_percent'] = cache_process.cpu_percent() index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, # 'temperature.gpu': temperature, # 'fan.speed': fan_speed, # 'utilization.gpu': utilization.gpu if utilization else None, # 'power.draw': power // 1000 if power is not None else None, # 'enforced.power.limit': power_limit // 1000 # if power_limit is not None else None, # Convert bytes into MBytes # 'memory.used': memory.used // MB if memory else None, # 'memory.total': memory.total // MB if memory else None, 'processes': processes, } GPUStatCollection.clean_processes() return gpu_info # 1. 
get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
def count_gpus():
    nvmlInit()
    count = nvmlDeviceGetCount()
    nvmlShutdown()
    return count
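# Example use of count_gpus(); the NVMLError guard is an addition so the call
# degrades gracefully on hosts without a working Nvidia driver.
from pynvml import NVMLError

try:
    n_gpus = count_gpus()
except NVMLError:
    n_gpus = 0
print('visible GPUs:', n_gpus)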
def training_step(self, batch, batch_idx) -> Dict: global isEmUpdateBusy # use to check whether the entire embedding update process is finished or not global isAddIndexBusy # use to check whether the entire indexing process is finished or not global processes # use to keep threads embedding update processes global threadHandle_index # use to keep thread in embedding indexing processes if (self.trainer.global_rank == 0) and (self.custom_config.end2end): if (not batch_idx == 0) and ( batch_idx % self.custom_config.indexing_freq == 0): free_gpu_list = [] nvmlInit() deviceCount = nvmlDeviceGetCount() my_list = json.loads(self.custom_config.gpu_order) for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) info = nvmlDeviceGetMemoryInfo(handle) if info.used / 1e6 < 15: position = my_list.index(i) free_gpu_list.append("cuda:" + str(position)) if len(free_gpu_list) >= self.custom_config.index_gpus: has_free_gpus = True else: has_free_gpus = False if (not isEmUpdateBusy) and has_free_gpus: model_copy = type(self.model.rag.ctx_encoder)( self.config_dpr ) # get a new instance #this will be load in the CPU model_copy.load_state_dict(self.model.rag.ctx_encoder. state_dict()) # copy weights processes = [] if len(free_gpu_list) > self.custom_config.index_gpus: cuda_devices = random.sample( free_gpu_list, self.custom_config.index_gpus) else: cuda_devices = free_gpu_list num_processes = len(cuda_devices) for rank in range(num_processes): logger.info( "Iniitializing embedding calculation process rank{}" .format(rank)) device = cuda_devices[rank] p = multiprocessing.Process( target=embed_update, args=( copy.deepcopy(model_copy), num_processes, device, rank, self.custom_config.shard_dir, self.custom_config.csv_path, ), ) processes.append(p) for p in processes: p.start() isEmUpdateBusy = True if isEmUpdateBusy and (not isAddIndexBusy): index_process_list = [ processes[k].is_alive() for k in range(self.custom_config.index_gpus) ] if ( sum(index_process_list) == 0 ): # If entire list is false, we can say all embedding calculation process has finished logger.info("Start adding the index") threadHandle_index = multiprocessing.Process( target=add_index, args=( self.custom_config.shard_dir, self.config.index_path, ), ) threadHandle_index.start() isAddIndexBusy = True # check when index building has started if isAddIndexBusy: # check still the index_building process is happening if not threadHandle_index.is_alive(): logger.info("Merging the dataset shards") saved_dataset_shards = [] for address in glob( str(self.custom_config.shard_dir) + "/*/"): saved_dataset_shards.append(load_from_disk(address)) concat = concatenate_datasets(saved_dataset_shards) concat.save_to_disk( self.config.passages_path ) # here we update the main passage file on the disk logger.info("done updating the dataset") # To Do (@Aaron) : Useful in the future dynamic memory implementation. # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker. 
# logger.info("then updating the index") # shutil.copy(self.custom_config.temp_index, self.config.idex_path) logger.info( "Loading new passages and iniitalzing new index") self.trainer.model.module.module.model.rag.retriever.re_load( ) self.trainer.model.module.module.model.rag.retriever.init_retrieval( ) isEmUpdateBusy = False isAddIndexBusy = False self.trainer.strategy.barrier("barrier") loss_tensors = self._step(batch) logs = { name: loss for name, loss in zip(self.loss_names, loss_tensors) } # tokens per batch tgt_pad_token_id = (self.tokenizer.generator.pad_token_id if isinstance(self.tokenizer, RagTokenizer) else self.tokenizer.pad_token_id) src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id if isinstance(self.tokenizer, RagTokenizer) else self.tokenizer.pad_token_id) logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() + batch["decoder_input_ids"].ne(tgt_pad_token_id).sum()) self.log("loss", loss_tensors[0]) return loss_tensors[0]
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            log("identify_cards() will probe %i cards", deviceCount)
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                log("identify_cards() handle(%i)=%s", i, handle)
                props = {}

                def meminfo(memory):
                    return {
                        "total": int(memory.total),
                        "free": int(memory.free),
                        "used": int(memory.used),
                    }

                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i

                for prop, fn_name, args, conv in (
                    ("name", "nvmlDeviceGetName", (), str),
                    ("serial", "nvmlDeviceGetSerial", (), str),
                    ("uuid", "nvmlDeviceGetUUID", (), str),
                    ("pci", "nvmlDeviceGetPciInfo", (), pciinfo),
                    ("memory", "nvmlDeviceGetMemoryInfo", (), meminfo),
                    ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration", (), int),
                    ("pcie-link-width-max", "nvmlDeviceGetMaxPcieLinkWidth", (), int),
                    ("pcie-link-generation", "nvmlDeviceGetCurrPcieLinkGeneration", (), int),
                    ("pcie-link-width", "nvmlDeviceGetCurrPcieLinkWidth", (), int),
                    ("clock-info-graphics", "nvmlDeviceGetClockInfo", (0,), int),
                    ("clock-info-sm", "nvmlDeviceGetClockInfo", (1,), int),
                    ("clock-info-mem", "nvmlDeviceGetClockInfo", (2,), int),
                    ("clock-info-graphics-max", "nvmlDeviceGetMaxClockInfo", (0,), int),
                    ("clock-info-sm-max", "nvmlDeviceGetMaxClockInfo", (1,), int),
                    ("clock-info-mem-max", "nvmlDeviceGetMaxClockInfo", (2,), int),
                    ("fan-speed", "nvmlDeviceGetFanSpeed", (), int),
                    ("temperature", "nvmlDeviceGetTemperature", (0,), int),
                    ("power-state", "nvmlDeviceGetPowerState", (), int),
                    ("vbios-version", "nvmlDeviceGetVbiosVersion", (), str),
                ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s",
                            prop, fn, i, handle, e)
                        continue
                log("identify_cards() [%i]=%s", i, props)
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
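# Usage sketch for the probe above (assumes the module-level `log` used by identify_cards is configured):
# the function returns {device_index: props_dict}, so the result can be inspected directly.
from pprint import pprint
pprint(identify_cards())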
def do_GET(self):
    # checks if the server is alive
    if self.path == '/test':
        send_header(self)
        self.wfile.write(bytes('passed<br>', 'utf-8'))
        self.wfile.write(bytes('server is responding', 'utf-8'))
    # returns the running processes
    if self.path == '/runningProcesses':
        send_header(self)
        # send response:
        if modules['psutil']:
            for proc in psutil.process_iter():
                try:
                    pinfo = proc.as_dict(attrs=['pid', 'name'])
                except psutil.NoSuchProcess:
                    continue
                print(pinfo)
                self.wfile.write(bytes(str(pinfo), 'utf-8'))
        else:
            self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
    # returns the CPU utilization and number of cores
    elif self.path == '/cpuInfo':
        send_header(self)
        # get CPU info
        cpuInfo = {}
        if modules['psutil']:
            cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
            cpuInfo['CPU Cores'] = int(psutil.cpu_count())
        else:
            cpuInfo['Missing Python module'] = ('I am sorry but the Python module psutil is not installed. '
                                                'Therefore the number of CPU cores cannot be shown.')
        json_dump = json.dumps(cpuInfo)
        self.wfile.write(bytes(json_dump, 'utf-8'))
        # get GPU info
        if modules['pynvml']:
            try:
                pynvml.nvmlInit()
                gpus = pynvml.nvmlDeviceGetCount()
            except:
                gpus = 0
                self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
        else:
            gpus = 0
            self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
        for i in range(gpus):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            self.wfile.write(bytes('<br>GPU ' + str(i + 1) + ': ' + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
            try:
                self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
            try:
                gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
                self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free / gpu_mem.total * 100)) + '%', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
        if gpus > 0:
            try:
                pynvml.nvmlShutdown()
            except:
                pass
    elif self.path == '/availableComputers':
        send_header(self)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('google.com', 0))
        global myownsocket
        myownsocket = s.getsockname()[0]
        port = 8003
        available_computers = []
        for i in range(1, 256):
            host = '192.168.178.' + str(i)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(0.2)
            try:
                alive = sock.connect_ex((host, port))
            except:
                alive = -1
            if alive == 0:
                print('available')
                available_computers.append(host)
            else:
                print('not available')
            print(host)
        self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
        # raw string so the Windows paths keep their backslashes literally
        cmd_txt = r"""@echo off
call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat"
echo ##### start_rendering
xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on
echo ##### rendering_done
"""
        self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
        self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
        self.wfile.write(bytes('<tr>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))
        available_cpus = {}
        for host in available_computers:
            available_cpus[host] = abs(get_cpu_cores(host))
        total_cpus = sum(available_cpus.values())
        # split the frame range 1-100 across hosts in proportion to their CPU cores
        frame_list = {}
        start_frame = 0
        for host in available_computers:
            start_frame += 1
            frame_list[host] = [start_frame]
            start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus))
            if start_frame > 100:
                start_frame = 100
            frame_list[host].append(start_frame)
        index = 0
        for host in available_computers:
            index += 1
            self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
            self.wfile.write(bytes(host, 'utf-8'))
            self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('</tr>', 'utf-8'))
        index = 2
        self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
        self.wfile.write(bytes(host, 'utf-8'))
        self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('</tr>', 'utf-8'))
        self.wfile.write(bytes('</table>\n', 'utf-8'))
        self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
        self.wfile.write(bytes('</form>\n', 'utf-8'))
        self.wfile.write(bytes('</body>\n', 'utf-8'))
        self.wfile.write(bytes('</html>\n', 'utf-8'))
    elif self.path == '/execute_job':
        send_header(self)
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
    elif '/submit_job' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
        # print(parsed)
        print(parameters)
        self.wfile.write(bytes('<body>', 'utf-8'))
        for index in range(1, 100):
            # parse_qs returns lists, so take the first value (default to '' when the field is absent)
            if not parameters.get('host' + str(index), [''])[0].strip():
                pass
            elif not parameters.get('start' + str(index), [''])[0].strip():
                pass
            elif not parameters.get('end' + str(index), [''])[0].strip():
                pass
            elif parameters.get('command'):
                cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
                cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
                self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
                self.wfile.write(bytes('<br>', 'utf-8'))
                print(cmd_txt)
        self.wfile.write(bytes('</body></html>', 'utf-8'))
    elif '/shutdown' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes('Server will be shut down now......', 'utf-8'))
        server.shutdown()
        sys.exit()
    else:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes('<br>', 'utf-8'))
        self.wfile.write(bytes(self.path, 'utf-8'))
        print(self.path)
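# Hedged sketch (hypothetical standalone helper, not part of the handler above): the
# '/availableComputers' branch splits the frame range 1-100 across hosts in proportion to their
# CPU cores. The same arithmetic, extracted for clarity:
def split_frames(available_cpus, total_frames=100):
    """available_cpus: {host: core_count}; returns {host: [start_frame, end_frame]}."""
    total_cpus = sum(available_cpus.values())
    frame_list = {}
    frame = 0
    for host, cores in available_cpus.items():
        frame += 1
        start = frame
        frame = min(frame + int(total_frames * (cores / total_cpus)), total_frames)
        frame_list[host] = [start, frame]
    return frame_list

# e.g. split_frames({'192.168.178.20': 8, '192.168.178.21': 4})
# -> {'192.168.178.20': [1, 67], '192.168.178.21': [68, 100]}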
def __init__(self):
    nv.nvmlInit()
    self._device_count = nv.nvmlDeviceGetCount()
    self._specs = [DeviceSpec(i) for i in range(self.device_count)]
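# Hedged sketch: DeviceSpec is defined elsewhere in the original module. A minimal stand-in that
# captures the kind of per-device data the wrapper above implies (index, name, total memory) could
# look like this; all attribute names here are assumptions, not the original API.
import pynvml as nv

class DeviceSpec:
    def __init__(self, index):
        handle = nv.nvmlDeviceGetHandleByIndex(index)
        self.index = index
        self.name = nv.nvmlDeviceGetName(handle)
        self.total_memory = nv.nvmlDeviceGetMemoryInfo(handle).total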
def __calculate_GPU_index(self, nNodes):
    # map the MPI rank to a GPU index, wrapping around the number of GPUs reported by NVML
    pv.nvmlInit()
    nGPUs = int(pv.nvmlDeviceGetCount())
    rank = self.new_comm.Get_rank()
    return int(rank / nNodes) % nGPUs
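# Worked example of the mapping above (illustrative values, not from the original code):
# with nNodes = 2 and nGPUs = 4, the expression int(rank / nNodes) % nGPUs gives
#   rank 0, 1 -> GPU 0    rank 2, 3 -> GPU 1    rank 4, 5 -> GPU 2
#   rank 6, 7 -> GPU 3    rank 8, 9 -> GPU 0 (wraps around)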
def get_trainer(config, base_model, diff_attention_model, loss, device, logger,
                query_loader, gallery_loader, diff_optimizer, diff_scheduler):
    num_batch_images = config['dataset'].getint('num_batch_images')
    val_per_epochs = config['trainer'].getint('val_per_epochs')
    log_iteration = config['trainer'].getint('log_iteration')
    save = config['trainer'].getboolean('save')
    save_per_epochs = config['trainer'].getint('save_per_epochs')
    save_path = config['trainer']['save_path']
    save_path = os.path.join(save_path, time.strftime("%Y%m%d", time.localtime()))

    torch.cuda.empty_cache()
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)

    trainer = create_trainer(diff_attention_model, diff_optimizer, loss, num_batch_images, device=device)

    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'avg_loss')
    if loss.do_loss('triplet'):
        RunningAverage(output_transform=lambda x: x[1]).attach(trainer, 'avg_triplet_loss')
    if loss.do_loss('reg'):
        RunningAverage(output_transform=lambda x: x[2]).attach(trainer, 'avg_reg_loss')

    if save:
        checkpointer = ModelCheckpoint(save_path, 'supervised_offline', n_saved=10, require_empty=False)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED(every=save_per_epochs), checkpointer, {
                'base_model': base_model,
                'diff_attention_model': diff_attention_model
            })

    @trainer.on(Events.ITERATION_COMPLETED)
    def summary_iteration(engine):
        iteration = trainer.state.iteration
        if iteration % log_iteration == 0 and iteration != 0:
            logger.info('Epoch[{}/{}] Iteration[{}] Loss: {:.3f}'.format(
                trainer.state.epoch, trainer.state.max_epochs, iteration,
                trainer.state.metrics['avg_loss']))

    @trainer.on(Events.EPOCH_COMPLETED)
    def summary_epoch(engine):
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        logger.info('GPU Memory Used(GB): {:.3f} GB'.format(meminfo.used / 1024**3))
        logger.info('Epoch[{}] Loss: {:.3f} Base Lr: {:.2e}'.format(
            trainer.state.epoch, trainer.state.metrics['avg_loss'],
            diff_scheduler.get_last_lr()[0]))
        if loss.do_loss('triplet'):
            logger.info('Epoch[{}] Triplet_Loss: {:.3f}'.format(
                trainer.state.epoch, trainer.state.metrics['avg_triplet_loss']))
        if loss.do_loss('reg'):
            logger.info('Epoch[{}] Regularization_Loss: {:.3f}'.format(
                trainer.state.epoch, trainer.state.metrics['avg_reg_loss']))
        torch.cuda.empty_cache()

    @trainer.on(Events.EPOCH_COMPLETED)
    def change_lr(engine):
        diff_scheduler.step()

    @trainer.on(Events.EPOCH_COMPLETED(every=val_per_epochs))
    def val_per_val_epochs(engine):
        logger.info('Start validation every {} epochs at epoch: {}'.format(
            val_per_epochs, trainer.state.epoch))
        val.val(base_model, diff_attention_model, True, query_loader, gallery_loader, logger, device)

    return trainer
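# Minimal sketch of the per-epoch memory report used above (assumption: device 0, as hard-coded there):
# nvmlDeviceGetMemoryInfo returns bytes, so dividing by 1024**3 yields the figure logged as "GB".
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
print('GPU Memory Used(GB): {:.3f}'.format(meminfo.used / 1024**3))
pynvml.nvmlShutdown()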
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    # Number of active GPUs
    self.gauge('nvml.gpus.number', deviceCount)
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        # Clocks throttling info
        # Divide by the mask so that the value is either 0 or 1 per GPU
        try:
            throttle_reasons = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
            self.gauge('nvml.throttle.appsettings',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                       pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                       tags=d_tags)
            self.gauge('nvml.throttle.display',
                       (throttle_reasons & GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                       GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                       tags=d_tags)
            self.gauge('nvml.throttle.hardware',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                       pynvml.nvmlClocksThrottleReasonHwSlowdown,
                       tags=d_tags)
            self.gauge('nvml.throttle.idle',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonGpuIdle) /
                       pynvml.nvmlClocksThrottleReasonGpuIdle,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.hardware',
                       (throttle_reasons & GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.software',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                       pynvml.nvmlClocksThrottleReasonSwPowerCap,
                       tags=d_tags)
            self.gauge('nvml.throttle.syncboost',
                       (throttle_reasons & GPU_THROTTLE_SYNCBOOST) / GPU_THROTTLE_SYNCBOOST,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.hardware',
                       (throttle_reasons & GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.software',
                       (throttle_reasons & GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.unknown',
                       (throttle_reasons & pynvml.nvmlClocksThrottleReasonUnknown) /
                       pynvml.nvmlClocksThrottleReasonUnknown,
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = ','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = 'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
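# Hedged sketch of the bitmask arithmetic used above: each throttle reason is a single-bit mask, so
# (reasons & mask) / mask collapses to 0 or 1 per GPU. A generic helper (hypothetical name) showing
# the same idea with constants that appear in the check above:
import pynvml

def throttle_flags(handle, masks):
    """masks: {metric_suffix: bitmask}; returns {metric_suffix: 0 or 1}."""
    reasons = pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle)
    return {name: int(bool(reasons & mask)) for name, mask in masks.items()}

# e.g. throttle_flags(handle, {'idle': pynvml.nvmlClocksThrottleReasonGpuIdle,
#                              'power.software': pynvml.nvmlClocksThrottleReasonSwPowerCap})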