def auto_select_gpu():
    """Select the GPU with the largest amount of free memory."""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # convert bytes to MB

        gpu_id = '{}'.format(largest_free_idx)
        logging.info('Using GPU {} with the largest free memory ({:.0f} MB)'.format(
            gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info('nvidia-ml-py is not installed; automatic GPU selection is disabled!')
        return '0'
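# Usage sketch (not from the original source): auto_select_gpu() returns the
# device index as a string, so it can be exported before any CUDA framework
# initializes its context; CUDA_VISIBLE_DEVICES is only honoured if it is set
# before that initialization.
import os

os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()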
def parse_cmdln():
    parser = get_args()
    args = parser.parse_args()
    if args.program == 'gpu_temp':
        assertion(nvmlInit, ImportError('nvidia-ml-py is required for this program.'))
        assertion(mpl, ImportError('matplotlib is required for this program.'))
        assertion(args.deviceID, AssertionError('GPU index must be declared.'))
        nvmlInit()
        args.handle = nvmlDeviceGetHandleByIndex(args.deviceID)
    if args.program == 'cpu_usage':
        assertion(psutil, ImportError('psutil is required for this program.'))
    if args.program == 'screen_glow':
        assertion(PIL, ImportError('PIL is required for this program.'))
    return args
def __check_gpu(self):
    """Check if the process list contains GPU processes and determine whether
    GPUs exist. Add GPU processes to the processes list if required."""
    if not self.exp.meta_data.plugin_list._contains_gpu_processes():
        return

    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    try:
        pv.nvmlInit()
        count = int(pv.nvmlDeviceGetCount())
        logging.debug("%s GPUs have been found.", count)
        if not self.exp.meta_data.get('test_state'):
            for i in range(count):
                handle = pv.nvmlDeviceGetHandleByIndex(i)
                if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                    raise Exception("Unfortunately, GPU %i is busy. Try "
                                    "resubmitting the job to the queue." % i)
    except Exception as e:
        raise Exception("Unable to run GPU plugins: %s" % str(e))
    self.__set_gpu_processes(count)
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        # 0 is NVML_TEMPERATURE_GPU, the only temperature sensor NVML exposes.
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))
    nvmlShutdown()
    return gpus
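# A minimal polling sketch built on get_gpu_temperatures() above; the 70 degC
# threshold and 30 s interval are illustrative assumptions, not values taken
# from the original code.
import time
import logging

def watch_gpu_temperatures(threshold_c=70, interval_s=30):
    while True:
        for idx, temp in get_gpu_temperatures().items():
            if temp >= threshold_c:
                logging.warning("GPU %d is at %d C", idx, temp)
        time.sleep(interval_s)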
def get_gpu_mem_used():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
        nvmlInit()
        try:
            handle = nvmlDeviceGetHandleByIndex(0)
            mem_info = nvmlDeviceGetMemoryInfo(handle)
            return mem_info.used  # bytes
        finally:
            # Shut NVML down even if the query fails.
            nvmlShutdown()
    except Exception:
        return -1
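# Illustrative use of get_gpu_mem_used() above (not from the original source);
# the byte count is converted to MiB for display and the -1 sentinel means
# pynvml or a GPU was unavailable.
used = get_gpu_mem_used()
if used >= 0:
    print("GPU 0 memory used: %.1f MiB" % (used / 1024 / 1024))
else:
    print("GPU memory usage unavailable (pynvml not installed or no GPU)")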
def get_handles(self):
    """Return all listed Nvidia handles."""
    if IS_MACOS:
        self.handles = pynvx.cudaDeviceGetHandles(ignore=True)
    else:
        self.handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
                        for i in range(self.device_count)]
    if self.logger:
        self.logger.debug("GPU Handles found: %s", len(self.handles))
def _crawl_in_system(self):
    '''
    nvidia-smi returns the following groups: MEMORY, UTILIZATION, ECC,
    TEMPERATURE, POWER, CLOCK, COMPUTE, PIDS, PERFORMANCE, SUPPORTED_CLOCKS,
    PAGE_RETIREMENT, ACCOUNTING.

    Currently, the following are requested based on dlaas requirements:
        utilization.gpu, utilization.memory,
        memory.total, memory.free, memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
        memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    if self._init_nvml() == -1:
        return
    self.inspect_arr = exec_dockerps()
    num_gpus = pynvml.nvmlDeviceGetCount()
    for gpuid in range(0, num_gpus):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
        temperature = pynvml.nvmlDeviceGetTemperature(
            gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
        memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
        mem_total = memory.total / 1024 / 1024
        mem_used = memory.used / 1024 / 1024
        mem_free = memory.free / 1024 / 1024
        power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000
        power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(gpuhandle) / 1000
        util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
        util_gpu = util.gpu
        util_mem = util.memory
        entry = {
            'utilization': {'gpu': util_gpu, 'memory': util_mem},
            'memory': {'total': mem_total, 'free': mem_free, 'used': mem_used},
            'temperature': temperature,
            'power': {'draw': power_draw, 'limit': power_limit}
        }
        key = self._get_feature_key(gpuhandle, gpuid)
        # Shut NVML down after the last device has been queried.
        if gpuid == num_gpus - 1:
            self._shutdown_nvml()
        yield (key, entry, 'gpu')
    return
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem: 12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio  # nice_ratio is expected to be defined at module level
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfil the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
def collect_via_pynvml(self, stats_config):
    """
    Use the pynvml python bindings to collect metrics.

    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for device_index in xrange(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu': pynvml.nvmlDeviceGetTemperature(
                    handle, NVML_TEMPERATURE_GPU)
            }
            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index),
                        stat_name=stat_name
                    )
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def _main_func(): try: # first get name import torch as th import os except: self.P("ERROR: PyTorch not installed! Please install Pytorch.") return None nvsmires = None try: from pynvml.smi import nvidia_smi import pynvml nvsmi = nvidia_smi.getInstance() nvsmires = nvsmi.DeviceQuery('memory.free, memory.total, memory.used, utilization.gpu, temperature.gpu') pynvml_avail = True except: pynvml_avail = False lst_inf = [] # now we iterate all devices n_gpus = th.cuda.device_count() if n_gpus > 0: th.cuda.empty_cache() current_pid_has_usage = False current_pid_gpus = [] try: for device_id in range(n_gpus): dct_device = {} device_props = th.cuda.get_device_properties(device_id) dct_device['NAME'] = device_props.name dct_device['TOTAL_MEM'] = round( device_props.total_memory / 1024 ** (2 if mb else 3), 2 ) mem_total = None mem_allocated = None gpu_used = None gpu_temp = None gpu_temp_max = None if pynvml_avail and nvsmires is not None and 'gpu' in nvsmires: dct_gpu = nvsmires['gpu'][device_id] mem_total = round( dct_gpu['fb_memory_usage']['total'] / (1 if mb else 1024), 2 ) # already from th mem_allocated = round( dct_gpu['fb_memory_usage']['used'] / (1 if mb else 1024), 2 ) gpu_used = dct_gpu['utilization']['gpu_util'] if isinstance(gpu_used, str): gpu_used = -1 gpu_temp = dct_gpu['temperature']['gpu_temp'] gpu_temp_max = dct_gpu['temperature']['gpu_temp_max_threshold'] handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) processes = [] for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle): dct_proc_info = {k.upper(): v for k,v in proc.__dict__.items()} used_mem = dct_proc_info.pop('USEDGPUMEMORY', None) dct_proc_info['ALLOCATED_MEM'] = round( used_mem / 1024 ** (2 if mb else 3) if used_mem is not None else 0.0, 2 ) processes.append(dct_proc_info) if dct_proc_info['PID'] == os.getpid(): current_pid_has_usage = True current_pid_gpus.append(device_id) #endfor dct_device['PROCESSES'] = processes dct_device['USED_BY_PROCESS'] = device_id in current_pid_gpus else: str_os = platform.platform() ## check if platform is Tegra and record if 'tegra' in str_os.lower(): # we just record the overall fre memory mem_total = self.get_machine_memory() mem_allocated = mem_total - self.get_avail_memory() gpu_used = 1 gpu_temp = 1 gpu_temp_max = 100 if not self._done_first_smi_error and nvsmires is not None: self.P("Running `gpu_info` on Tegra platform: {}".format(nvsmires), color='r') self._done_first_smi_error = True elif not self._done_first_smi_error: str_log = "ERROR: Please make sure you have both pytorch and pynvml in order to monitor the GPU" str_log += "\nError info: pynvml_avail={}, nvsmires={}".format(pynvml_avail, nvsmires) self.P(str_log) self._done_first_smi_error = True #endif dct_device['ALLOCATED_MEM'] = mem_allocated dct_device['FREE_MEM'] = -1 if all(x is not None for x in [mem_total, mem_allocated]): dct_device['FREE_MEM'] = round(mem_total - mem_allocated,2) dct_device['MEM_UNIT'] = 'MB' if mb else 'GB' dct_device['GPU_USED'] = gpu_used dct_device['GPU_TEMP'] = gpu_temp dct_device['GPU_TEMP_MAX'] = gpu_temp_max lst_inf.append(dct_device) #end for all devices except Exception as e: self.P("gpu_info exception for device_id {}:\n{}".format(device_id, e), color='r') if show: self.P("GPU information for {} device(s):".format(len(lst_inf)), color='y') for dct_gpu in lst_inf: for k, v in dct_gpu.items(): self.P(" {:<14} {}".format(k + ':', v), color='y') if current_pid and current_pid_has_usage: return [lst_inf[x] for x in current_pid_gpus] else: return lst_inf
def myfuncHyper(): import pynvml pynvml.nvmlInit() deviceCount = pynvml.nvmlDeviceGetCount() handle = pynvml.nvmlDeviceGetHandleByIndex( int(os.environ['CUDA_VISIBLE_DEVICES'])) gpuMem = pynvml.nvmlDeviceGetMemoryInfo(handle) print("Init") print(gpuMem.used) dictionary = dict(hyperParams.iloc[paramNr]) if not str(dictionary) in sizeDict: basicArchitecture = hyperParams.iloc[paramNr].basicArchitecture exec( open(basePath + 'model' + basicArchitecture[0] + '.py').read(), globals()) continueComputations = False saveComputations = False #exec(open(basePath+'runEpochs'+basicArchitecture[0]+'.py').read(), globals()) my_yDenseData = np.zeros((batchSize, nrOutputTargets)) my_inputDropout = 0.2 my_hiddenDropout = 0.5 my_lrGeneral = 0.1 my_lrWeight = 0.0 my_lrBias = 0.0 my_l2PenaltyWeight = 0.1 my_l2PenaltyBias = 0.1 my_l1PenaltyWeight = 0.1 my_l1PenaltyBias = 0.1 my_mom = 0.0 my_biasInit = np.zeros(nrOutputTargets) my_is_training = True if nrDenseFeatures > 0: my_xDenseData = np.zeros((batchSize, nrDenseFeatures)) if nrSparseFeatures > 0: indices = np.random.random_integers(0, nrSparseFeatures - 1, size=batchSize * estNonZFeatures) indptr = np.random.random_integers(0, len(indices) - 1, size=batchSize) indptr.sort() indptr[0] = 0 indptr = np.append(indptr, len(indices)) data = np.random.random_integers( 0, 430000, size=batchSize * estNonZFeatures).astype( np.float32) / 430000 mycsr = scipy.sparse.csr_matrix((data, indices, indptr), (batchSize, nrSparseFeatures)) mycsr.sort_indices() nonzx = mycsr.nonzero() valnonzx = (mycsr)[nonzx[0], nonzx[1]] my_xIndices = np.int64(np.vstack(nonzx).T) my_xValues = valnonzx.A.flatten() my_xDim = [mycsr.shape[0], mycsr.shape[1]] my_sparseMeanInit = np.zeros((1, nrSparseFeatures)) myfeed = { yDenseData: my_yDenseData, inputDropout: my_inputDropout, hiddenDropout: my_hiddenDropout, lrGeneral: my_lrGeneral, lrWeight: my_lrWeight, lrBias: my_lrBias, l2PenaltyWeight: my_l2PenaltyWeight, l2PenaltyBias: my_l2PenaltyBias, l1PenaltyWeight: my_l1PenaltyWeight, l1PenaltyBias: my_l1PenaltyBias, mom: my_mom, biasInit: my_biasInit, is_training: my_is_training } if nrDenseFeatures > 0: myfeed.update({xDenseData: my_xDenseData}) if nrSparseFeatures > 0: myfeed.update({ xIndices: my_xIndices, xValues: my_xValues, xDim: my_xDim, sparseMeanInit: my_sparseMeanInit, }) _ = session.run([init]) if nrSparseFeatures > 0: _ = session.run([sparseMeanInitOp], feed_dict=myfeed) _ = session.run([sparseMeanWSparseOp]) _ = session.run([optimizerDense], feed_dict=myfeed) _ = session.run([predNetwork], feed_dict=myfeed) print("GPU") gpuMem = pynvml.nvmlDeviceGetMemoryInfo(handle) sizeDict[str(dictionary)] = gpuMem.used sizeArray[paramNr] = gpuMem.used print(gpuMem.used) else: sizeArray[paramNr] = sizeDict[str(dictionary)]
if not os.path.exists(Cfg.modelDir):
    os.mkdir(Cfg.modelDir)
if not os.path.exists(Cfg.logDir):
    os.mkdir(Cfg.logDir)
if not os.path.exists(Cfg.dataDir):
    os.mkdir(Cfg.dataDir)

# Fix the training devices and random seed.
if torch.cuda.is_available():
    np.random.seed(Cfg.seed)
    torch.cuda.manual_seed(Cfg.seed)
    if Cfg.GPUID > -1:
        torch.cuda.set_device(Cfg.GPUID)
        # Get the GPU logger.
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(Cfg.GPUID)
    device = 'cuda'
else:
    np.random.seed(Cfg.seed)
    torch.manual_seed(Cfg.seed)
    device = 'cpu'

# Set the parameters of the Lee Oscillator for tanh.
if Cfg.LeeTanhType == 'A' or Cfg.LeeTanhType == 'a':
    a = [0.6, 0.6, -0.5, 0.5, -0.6, -0.6, -0.5, 0.5]
elif Cfg.LeeTanhType == 'B' or Cfg.LeeTanhType == 'b':
    a = [1, 1, 1, 1, -1, -1, -1, -1]
elif Cfg.LeeTanhType == 'C' or Cfg.LeeTanhType == 'c':
    a = [0.55, 0.55, -0.5, 0.5, -0.55, -0.55, 0.5, -0.5]
elif Cfg.LeeTanhType == 'D' or Cfg.LeeTanhType == 'd':
    a = [1, 1, 1, 1, -1, -1, -1, -1]
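# The NVML handle obtained above can be polled later, for example once per
# epoch, to log device memory. A minimal sketch under that assumption; the
# helper name and print formatting are illustrative, not part of the original.
def log_gpu_memory():
    if device == 'cuda' and Cfg.GPUID > -1:
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print('GPU %d memory used: %.1f / %.1f MB'
              % (Cfg.GPUID, info.used / 1024 / 1024, info.total / 1024 / 1024))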
def setup(self): class TimeOutException(Exception): pass def alarm_handler(signum, frame): raise TimeOutException() self.data["root"] = os.getcwd() program = os.getenv(env.PROGRAM) or util.get_program() if program: self.data["program"] = program else: self.data["program"] = '<python with no main file>' if wandb._get_python_type() != "python": if os.getenv(env.NOTEBOOK_NAME): self.data["program"] = os.getenv(env.NOTEBOOK_NAME) else: meta = wandb.jupyter.notebook_metadata() if meta.get("path"): if "fileId=" in meta["path"]: self.data[ "colab"] = "https://colab.research.google.com/drive/" + meta[ "path"].split("fileId=")[1] self.data["program"] = meta["name"] else: self.data["program"] = meta["path"] self.data["root"] = meta["root"] if not os.getenv(env.DISABLE_CODE): logger.debug("code probe starting") in_jupyter = wandb._get_python_type() != "python" # windows doesn't support alarm() and jupyter could call this in a thread context if platform.system() == "Windows" or not hasattr( signal, 'SIGALRM') or in_jupyter: logger.debug("non time limited probe of code") self._setup_code_git() self._setup_code_program() else: old_alarm = None try: try: old_alarm = signal.signal(signal.SIGALRM, alarm_handler) signal.alarm(25) self._setup_code_git() self._setup_code_program() finally: signal.alarm(0) except TimeOutException: logger.debug("timeout waiting for setup_code") finally: if old_alarm: signal.signal(signal.SIGALRM, old_alarm) logger.debug("code probe done") self.data["startedAt"] = datetime.utcfromtimestamp( wandb.START_TIME).isoformat() try: username = getpass.getuser() except KeyError: # getuser() could raise KeyError in restricted environments like # chroot jails or docker containers. Return user id in these cases. username = str(os.getuid()) # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users. if self._api.settings().get('anonymous') != 'true': self.data["host"] = os.environ.get(env.HOST, socket.gethostname()) self.data["username"] = os.getenv(env.USERNAME, username) self.data["executable"] = sys.executable else: self.data.pop("email", None) self.data.pop("root", None) self.data["os"] = platform.platform(aliased=True) self.data["python"] = platform.python_version() if env.get_docker(): self.data["docker"] = env.get_docker() try: pynvml.nvmlInit() self.data["gpu"] = pynvml.nvmlDeviceGetName( pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8") self.data["gpu_count"] = pynvml.nvmlDeviceGetCount() except pynvml.NVMLError: pass try: self.data["cpu_count"] = multiprocessing.cpu_count() except NotImplementedError: pass # TODO: we should use the cuda library to collect this if os.path.exists("/usr/local/cuda/version.txt"): with open("/usr/local/cuda/version.txt") as f: self.data["cuda"] = f.read().split(" ")[-1].strip() self.data["args"] = sys.argv[1:] self.data["state"] = "running"
def step(self):
    valuesDict = {}
    valuesDict['table'] = self._tableName
    cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
    mem = valuesDict['mem'] = psutil.virtual_memory().percent
    swap = valuesDict['swap'] = psutil.swap_memory().percent
    # some code examples:
    # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
    if self.doGpu:
        for i in self.gpusToUse:
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo(handle)
                valuesDict["gpuMem_%d" % i] = \
                    float(memInfo.used) * 100. / float(memInfo.total)
                util = nvmlDeviceGetUtilizationRates(handle)
                valuesDict["gpuUse_%d" % i] = util.gpu
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                valuesDict["gpuTem_%d" % i] = temp
            except NVMLError as err:
                handle = nvmlDeviceGetHandleByIndex(i)
                msg = "Device %d -> %s not supported\n" \
                      "Remove device %d from FORM" % \
                      (i, nvmlDeviceGetName(handle), i)
                errorWindow(None, msg)
    if self.doNetwork:
        try:
            # measure over a short interval
            pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
            time.sleep(self.samplingTime)  # sec
            pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
            bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
            bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
            valuesDict["%s_send" % self.nif] = \
                bytes_sent * self.samplingTime / 1048576
            valuesDict["%s_recv" % self.nif] = \
                bytes_recv * self.samplingTime / 1048576
        except Exception:
            msg = "cannot get information of network interface %s" % \
                  self.nif
    if self.doDiskIO:
        try:
            # measure over a short interval
            disk_before = psutil.disk_io_counters(perdisk=False)
            time.sleep(self.samplingTime)  # sec
            disk_after = psutil.disk_io_counters(perdisk=False)
            bytes_read = disk_after.read_bytes - disk_before.read_bytes
            bytes_write = disk_after.write_bytes - disk_before.write_bytes
            valuesDict["disk_read"] = \
                self.samplingTime * bytes_read / self.mega
            valuesDict["disk_write"] = \
                self.samplingTime * bytes_write / self.mega
        except Exception:
            msg = "cannot get information of disk usage "

    if self.cpuAlert < 100 and cpu > self.cpuAlert:
        self.warning("CPU allocation =%f." % cpu)
        self.cpuAlert = cpu

    # mem and swap already hold percentages (floats), so compare them directly.
    if self.memAlert < 100 and mem > self.memAlert:
        self.warning("Memory allocation =%f." % mem)
        self.memAlert = mem

    if self.swapAlert < 100 and swap > self.swapAlert:
        self.warning("SWAP allocation =%f." % swap)
        self.swapAlert = swap

    sqlCommand = "INSERT INTO %(table)s ("
    for label in self.labelList:
        sqlCommand += "%s, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ") VALUES("
    for label in self.labelList:
        sqlCommand += "%" + "(%s)f, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ");"

    sql = sqlCommand % valuesDict
    try:
        self.cur.execute(sql)
    except Exception as e:
        print("ERROR: saving one data point (monitor). I continue")

    # Return finished = True if all protocols have finished
    finished = []
    for prot in self.protocols:
        updatedProt = getUpdatedProtocol(prot)
        finished.append(updatedProt.getStatus() != STATUS_RUNNING)
    return all(finished)
def device_name():
    with pynvml_context():
        device_name = device_name_for(pynvml.nvmlDeviceGetHandleByIndex(0))
    return device_name
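# The helper above relies on a `pynvml_context` context manager that is not
# shown here. A plausible minimal sketch, given as an assumption rather than
# the original implementation, pairs nvmlInit with nvmlShutdown:
from contextlib import contextmanager
import pynvml

@contextmanager
def pynvml_context():
    pynvml.nvmlInit()
    try:
        yield
    finally:
        pynvml.nvmlShutdown()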
def get(self): """Write the web page content.""" global cpu_load global gpu_load_compute global gpu_load_memory memory = psutil.virtual_memory() swap = psutil.swap_memory() if nvidia: nvmlHandle = nvmlDeviceGetHandleByIndex(0) gpu = nvmlDeviceGetName(nvmlHandle).decode('utf-8') gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle) gpu_ram = round(gpu_memory.total / (1024 * 1048576), 2) gpu += " - " + str(gpu_ram) + "GB" else: gpu = "Not recognized" ram = str(int(round(float(memory.total) / (1024 * 1048576)))) + "GB" ram += " (swap: " + str(int(round(float(swap.total) / (1024 * 1048576)))) + "GB)" real_cores = psutil.cpu_count(False) cores_ratio = int(psutil.cpu_count(True) / real_cores) cores = " (" + str(cores_ratio) + "x " + str(real_cores) + " cores)" if sys.platform.startswith('linux'): distribution = distro.linux_distribution() os_name = 'Linux ' + distribution[0] + " " + distribution[1] + " " + distribution[2] command = "cat /proc/cpuinfo" all_info = subprocess.check_output(command, shell=True).decode('utf-8').strip() for line in all_info.split("\n"): if "model name" in line: cpu = re.sub(".*model name.*:", "", line, 1) break elif sys.platform == 'win32': computer = wmi.WMI() os_info = computer.Win32_OperatingSystem()[0] cpu = computer.Win32_Processor()[0].Name os_name = os_info.Name.split('|')[0] + ", version " + os_info.Version elif sys.platform == 'darwin': os_name = 'macOS ' + platform.mac_ver()[0] os.environ['PATH'] = os.environ['PATH'] + os.pathsep + '/usr/sbin' command = 'sysctl -n machdep.cpu.brand_string' cpu = subprocess.check_output(command).strip() else: # unknown platform os_name = 'Unknown' cpu = 'Unknown' self.write("<!DOCTYPE html>\n") self.write("<html><head><meta charset='utf-8'/><title>Webots simulation server</title>") self.write("<link rel='stylesheet' type='text/css' href='css/monitor.css'></head>\n") self.write("<body><h1>Webots simulation server: " + socket.getfqdn() + "</h1>") self.write("<h2>Host: " + os_name + "</h2>\n") self.write("<p><b>CPU load: %g%%</b><br>\n" % cpu_load) self.write(cpu + cores + "</p>\n") self.write("<p><b>GPU load compute: %g%% — load memory: %g%%</b><br>\n" % (gpu_load_compute, gpu_load_memory)) self.write(gpu + "</p>\n") self.write("<p><b>RAM:</b><br>" + ram + "</p>\n") self.write("<canvas id='graph' height='400' width='1024'></canvas>\n") self.write("<script src='https://www.cyberbotics.com/harry-plotter/0.9f/harry.min.js'></script>\n") self.write("<script>\n") self.write("window.onload = function() {\n") def appendData(label): global snapshots d = "{title:'" + label + "',values:[" for s in snapshots: d += str(s.data[label]) + ',' return d[:-1] + "]}," datas = '' datas += appendData('Webots running') datas += appendData('Webots idle') datas += appendData('CPU load') datas += appendData('CPU memory') datas += appendData('GPU load compute') datas += appendData('GPU load memory') datas += appendData('GPU memory') datas += appendData('Swap') datas += appendData('Disk') datas += appendData('Network sent') datas += appendData('Network received') datas = datas[:-1] # remove the last coma self.write(" plotter({\n") self.write(" canvas: 'graph',\n") self.write(" datas:[ " + datas + "],\n") self.write(""" labels:{ ypos:"left", x:100, y:[50,100], marks:2 }, fill:"none", opacity:0.5, linewidth:3, background:"#fff", autoscale:"top", grid:{ x:[0,100] }, mouseover:{ radius:4, linewidth:2, bullet:"#444", shadowbox:"1,1,0,#000", axis:"x" } });""") self.write("}\n") self.write("</script>\n") self.write("</body></html>")
def update_snapshot(): """Compute a monitoring snapshot.""" global current_load global network_sent global network_received global cpu_load global gpu_load_compute global gpu_load_memory memory = psutil.virtual_memory() swap = psutil.swap_memory() disk = psutil.disk_usage('/') n = psutil.net_io_counters() new_network_sent = n.bytes_sent new_network_received = n.bytes_recv network_sent_rate = float(new_network_sent - network_sent) / (SNAPSHOT_REFRESH * 1000000) # expressed in MB/s network_received_rate = float(new_network_received - network_received) / (SNAPSHOT_REFRESH * 1000000) # MB/s network_sent = new_network_sent network_received = new_network_received global nvidia if nvidia: nvmlHandle = nvmlDeviceGetHandleByIndex(0) gpu_memory = nvmlDeviceGetMemoryInfo(nvmlHandle) gpu_ram_usage = round(100 * float(gpu_memory.used) / float(gpu_memory.total), 1) else: # not supported nvmlHandle = 0 gpu_ram_usage = 0 cpu_load = psutil.cpu_percent() try: gpu_load = nvmlDeviceGetUtilizationRates(nvmlHandle) gpu_load_compute = gpu_load.gpu gpu_load_memory = gpu_load.memory except NVMLError: # not supported on some hardware gpu_load_compute = 0 gpu_load_memory = 0 webots_idle = 0 webots_running = 0 for client in ClientWebSocketHandler.clients: if client.idle: webots_idle = webots_idle + 1 else: webots_running = webots_running + 1 snapshot = Snapshot() snapshot.data['Timestamp'] = int(time.time()) snapshot.data['Webots running'] = webots_running snapshot.data['Webots idle'] = webots_idle snapshot.data['CPU load'] = cpu_load snapshot.data['CPU memory'] = memory.percent snapshot.data['GPU load compute'] = gpu_load_compute snapshot.data['GPU load memory'] = gpu_load_memory snapshot.data['GPU memory'] = gpu_ram_usage snapshot.data['Swap'] = swap.percent snapshot.data['Disk'] = disk.percent snapshot.data['Network sent'] = network_sent_rate snapshot.data['Network received'] = network_received_rate snapshot.write() current_load = 0 for key, value in snapshot.data.items(): if key == 'Timestamp': continue if value > current_load: current_load = value snapshots.append(snapshot) if len(snapshots) > 600: # display data for the last 10 minutes del snapshots[0] tornado.ioloop.IOLoop.current().add_timeout(int(time.time()) + SNAPSHOT_REFRESH, update_snapshot)
def printGPUINFO():
    gpu_id = config.GPU_ID
    gpu_obj = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    print("gpu mem used:",
          pynvml.nvmlDeviceGetMemoryInfo(gpu_obj).used / 1024 / 1024, "MB")
def getFreeRatio(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    use = pynvml.nvmlDeviceGetUtilizationRates(handle)
    # Average of GPU and memory utilization, in percent.
    ratio = 0.5 * (float(use.gpu) + float(use.memory))
    return ratio
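# Usage sketch (assumes pynvml.nvmlInit() has already been called): pick the
# device whose combined utilization, as computed by getFreeRatio above, is
# lowest. The helper name is illustrative, not from the original source.
def least_loaded_gpu():
    count = pynvml.nvmlDeviceGetCount()
    return min(range(count), key=getFreeRatio)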
def lane_attn(): # data read x_train = np.load(r'./data_split/x_train.npy') x_test = np.load(r'./data_split/x_test.npy') x_validation = np.load(r'./data_split/x_validation.npy') y_train = np.load(r'./data_split/y_train.npy') y_test = np.load(r'./data_split/y_test.npy') y_validation = np.load(r'./data_split/y_validation.npy') #data standard normalization a, b, c = x_train.shape x_train = x_train.reshape(a * b, c) scaler = preprocessing.StandardScaler().fit(x_train) x_train = scaler.transform(x_train) x_train = x_train.reshape(a, b, c) a, b, c = x_validation.shape x_validation = x_validation.reshape(a * b, c) x_validation = scaler.transform(x_validation) x_validation = x_validation.reshape(a, b, c) a, b, c = x_test.shape x_test = x_test.reshape(a * b, c) x_test = scaler.transform(x_test) x_test = x_test.reshape(a, b, c) x1 = torch.from_numpy(x_train).float() y1 = torch.from_numpy(y_train).float() x2 = torch.from_numpy(x_validation).float() y2 = torch.from_numpy(y_validation).float() x3 = torch.from_numpy(x_test).float() y3 = torch.from_numpy(y_test).float() #data from.npy to pytorch data global BATCH_SIZE BATCH_SIZE = 512 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') train_dataset = Data.TensorDataset(x1, y1) trainloader = Data.DataLoader( dataset=train_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # 要不要打乱数据 (打乱比较好) num_workers=2, # 多线程来读数据 drop_last=True, ) vali_dataset = Data.TensorDataset(x2, y2) valiloader = Data.DataLoader( dataset=vali_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # 要不要打乱数据 (打乱比较好) num_workers=2, # 多线程来读数据 drop_last=True, ) test_dataset = Data.TensorDataset(x3, y3) testloader = Data.DataLoader( dataset=test_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # 要不要打乱数据 (打乱比较好) num_workers=2, # 多线程来读数据 drop_last=True, ) #Encoder structure class Encoder(nn.Module): def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout=0): #n_layers represents the layer of LSTM super().__init__() self.input_dim = input_dim self.emb_dim = emb_dim self.hid_dim = hid_dim self.n_layers = n_layers self.dropout = dropout self.embedding = nn.Linear(input_dim, emb_dim) self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout) self.dropout = nn.Dropout(dropout) def forward(self, x): #x = [src sent len, batch size,2] embedded = self.dropout(self.embedding(x)) #embedded = [src sent len, batch size, emb dim] outputs, (hidden, cell) = self.rnn(embedded) #outputs = [len, batch size, hid dim * n directions] #direction==1 hier #hidden = [n layers * n directions, batch size, hid dim] #cell = [n layers * n directions, batch size, hid dim] #outputs are always from the top hidden layer return outputs, hidden, cell #Decoder structure class Decoder(nn.Module): def __init__(self, decoder_input_dim, emb_dim, hid_dim, n_layers, dropout=0.2): super().__init__() self.emb_dim = emb_dim self.hid_dim = hid_dim self.decoder_input_dim = decoder_input_dim self.n_layers = n_layers self.dropout = dropout self.embedding = nn.Linear(decoder_input_dim, emb_dim) self.attn = nn.Linear(40 + self.emb_dim, 1) # self.attn_combine = nn.Linear(self.hidden_size+self.de_emb_dim , self.hidden_size) self.rnn = nn.LSTM(emb_dim + hid_dim, hid_dim, n_layers, dropout=dropout) self.out = nn.Linear(hid_dim, decoder_input_dim) self.dropout = nn.Dropout(dropout) def forward(self, input, context1, context2, context3, hidden, cell): #hidden = [n layers * n directions, 
batch size, hid dim] #cell = [n layers * n directions, batch size, hid dim] #n directions in the decoder will both always be 1, therefore: #hidden = [n layers, batch size, hid dim] #context = [n layers, batch size, hid dim] input = input.unsqueeze(0) embedded = self.dropout(self.embedding(input)) context1 = context1.permute(1, 0, 2) context2 = context2.permute(1, 0, 2) context3 = context3.permute(1, 0, 2) # context=[batch,1,hidden_dim] embedded = embedded.permute(1, 0, 2) # embedded[batch,1,embedded_dim] attn1 = self.attn(torch.cat((embedded, context1), dim=2)) attn2 = self.attn(torch.cat((embedded, context2), dim=2)) attn3 = self.attn(torch.cat((embedded, context3), dim=2)) # attn1=np.array(attn1) attn1 = attn1.squeeze(1) attn2 = attn2.squeeze(1) attn3 = attn3.squeeze(1) #print(attn1.size()) attn = torch.cat((attn1, attn2, attn3), 1) #print(attn.size()) attn_weights = F.softmax(attn, dim=1) #print(attn_weights.size()) #print(attn_weights[200,:]) attn_weights = attn_weights.unsqueeze(1) context1 = context1.permute(0, 2, 1) context2 = context2.permute(0, 2, 1) context3 = context3.permute(0, 2, 1) #print(context1.shape) #print(attn_weights[:,:,0].shape) context1 = torch.bmm(context1, attn_weights[:, :, 0].unsqueeze(2)) context2 = torch.bmm(context2, attn_weights[:, :, 1].unsqueeze(2)) context3 = torch.bmm(context3, attn_weights[:, :, 2].unsqueeze(2)) context1 = context1.permute(0, 2, 1) context2 = context2.permute(0, 2, 1) #permute change the dimesion between the last and the middel positiom context3 = context3.permute(0, 2, 1) context = torch.cat((context1, context2, context3), dim=2) #print(context.shape) embedded = embedded.permute(1, 0, 2) context = context.permute(1, 0, 2) emb_con = torch.cat((embedded, context), 2) output, (hidden, cell) = self.rnn(emb_con, (hidden, cell)) #output = [len, batch size, hid dim * n directions] #hidden = [n layers * n directions, batch size, hid dim] #cell = [n layers * n directions, batch size, hid dim] #sent len and n directions will always be 1 in the decoder, therefore: #output = [1, batch size, hid dim] #hidden = [n layers, batch size, hid dim] #cell = [n layers, batch size, hid dim] prediction = self.out(output.squeeze(0)) #prediction = [batch size, output dim] return prediction, hidden, cell # seq-seq combine encoder-decoder and #the decoder process is done step by step class Seq2Seq(nn.Module): global firstinput def __init__(self, encoder, decoder, device): super().__init__() self.encoder = encoder self.decoder = decoder self.device = device #assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!" assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!" def forward(self, x, y, teacher_forcing_ratio=0.5): #teacher_forcing_ratio is probability to use teacher forcing #e.g. 
if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time batch_size = BATCH_SIZE max_len = 25 trg_vocab_size = 2 #tensor to store decoder outputs outputs = torch.zeros(max_len, batch_size, trg_vocab_size) #last hidden state of the encoder is used as the initial hidden state of the decoder #choose the cell state of second layer as context encoder_outputs1, hidden1, cell1 = self.encoder( x[:, :, [0, 1, 2, 3, 4, 5, 6, 7, 28, 29, 30, 31]]) encoder_outputs2, hidden2, cell2 = self.encoder( x[:, :, [8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35]]) encoder_outputs3, hidden3, cell3 = self.encoder( x[:, :, [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]]) #take the cell of last layer as context context1 = cell1[1, :, :] context1 = context1.unsqueeze(0) context2 = cell2[1, :, :] context2 = context2.unsqueeze(0) context3 = cell3[1, :, :] context3 = context3.unsqueeze(0) hidden = torch.cat((hidden1, hidden2, hidden3), dim=2) cell = torch.cat((cell1, cell2, cell3), dim=2) #cell=cell1+cell2+cell3 # print('c-shape:',context.shape) input = firstinput #the firstinput of decoder process, we should give the true vaule # input = input.unsqueeze(0) #print(input.size()) for t in range(max_len): output, hidden, cell = self.decoder(input, context1, context2, context3, hidden, cell) outputs[t] = output #print(output) #input = output.unsqueeze(0) #print(input.size()) #context=cell[1,:,:] #context=context.unsqueeze(0) teacher_force = random.random() < teacher_forcing_ratio top1 = output if t == 24: break input = ((y[t, :, :]) if teacher_force else top1) #outputs[t] = output #print('output',output.size()) #input = output.unsqueeze(0) return outputs INPUT_DIM = 12 DECODER_INPUT_DIM = 2 HID_DIM = 40 HID_DIM1 = 120 N_LAYERS = 2 ENC_EMB_DIM = 32 DEC_EMB_DIM = 16 enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS) dec = Decoder(DECODER_INPUT_DIM, DEC_EMB_DIM, HID_DIM1, N_LAYERS) model = Seq2Seq(enc, dec, device).to(device) def init_weights(m): for name, param in m.named_parameters(): nn.init.uniform_(param.data, -0.15, 0.15) # nn.init.orthogonal_(param.data) model.apply(init_weights) def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'The model has {count_parameters(model):,} trainable parameters') optimizer = optim.Adam(model.parameters(), weight_decay=0.00001, lr=0.01) criterion = nn.MSELoss() def train(model, dataloader, optimizer, criterion, clip): global firstinput model.train() epoch_loss = 0 for x, y in dataloader: x = x.transpose(1, 0) y = y.transpose(1, 0) x = x.to('cuda') y = y.to('cuda') firstinput = y[0, :, :] y = y[1:, :, :] optimizer.zero_grad() output = model(x, y) output = output.to('cuda') # loss = criterion(output, y) #print(output.size()) # for lateral position,we give more attention,so his penalization is *3 loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion( output[:, :, 0], y[:, :, 0]) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), clip) optimizer.step() epoch_loss += loss.item() #print(epoch_loss) return epoch_loss / len(dataloader) # In[ ]: def evaluate(model, validataloader, criterion): model.eval() epoch_loss = 0 # no loss backward, with torch.no_grad(): for x, y in validataloader: x = x.transpose(1, 0) y = y.transpose(1, 0) x = x.to('cuda') y = y.to('cuda') firstinput = y[0, :, :] y = y[1:, :, :] optimizer.zero_grad() output = model(x, y, 0) #turn off teacher forcing output = output.to('cuda') loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion( output[:, :, 0], y[:, :, 0]) 
epoch_loss += loss.item() return epoch_loss / len(validataloader) # In[ ]: def test(model, testdataloader, criterion): global j global firstinput global test_result model.eval() epoch_loss = 0 with torch.no_grad(): for x, y in testdataloader: x = x.transpose(1, 0) y = y.transpose(1, 0) x = x.to('cuda') y = y.to('cuda') firstinput = y[0, :, :] y = y[1:, :, :] optimizer.zero_grad() output = model(x, y, 0) #turn off teacher forcing test_result[:, j:j + BATCH_SIZE, :] = output j = j + BATCH_SIZE output = output.to('cuda') # loss = criterion(output, y) loss = 3 * criterion(output[:, :, 1], y[:, :, 1]) + criterion( output[:, :, 0], y[:, :, 0]) epoch_loss += loss.item() # print(len(testdataloader)) return epoch_loss / len(testdataloader) # In[ ]: N_EPOCHS = 40 CLIP = 1 #CLIP clip the gradients to prevent them from exploding global test_result test_result = np.zeros([25, 80000, 2]) pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(0) meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) print('this is lane-attention\n') for epoch in range(N_EPOCHS): global j j = 0 start_time = time.process_time() train_loss = train(model, trainloader, optimizer, criterion, CLIP) valid_loss = evaluate(model, valiloader, criterion) end_time = time.process_time() print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time}s') print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}') #writer.add_scalars('loss',{'train_loss': train_loss, #'valid_loss': valid_loss},epoch ) test_loss = test(model, testloader, criterion) if test_loss < 1.7: print('testloss:', test_loss) test_result = test_result[:, :j, :] np.save(r'./result/lane_attn_predict_tra.npy', test_result) np.save(r'./result/true_tra.npy', y_test[:, 1:, :]) break if epoch == 39: print('testloss:', test_loss) test_result = test_result[:, :j, :] np.save(r'./result/lane_attn_predict_tra.npy', test_result) np.save(r'./result/true_tra.npy', y_test[:, 1:, :]) break print('meminfo.used:', meminfo.used / (1024 * 1024)) print('meminfo.total:', meminfo.total / (1024 * 1024)) return 0
def __init__(self, ground_policy, F_s, F_sa, env, device, log, hyperparameters): self.env = env self.device = device self.log = log self.hyperparameters = hyperparameters self.ground_policy = ground_policy self.name = "" self.verbose = hyperparameters["verbose"] # Check env: self.discrete_env = True if 'Discrete' in str( env.action_space) else False if self.discrete_env: self.num_actions = self.env.action_space.n self.action_low = torch.zeros(self.num_actions, device=self.device) self.action_high = torch.ones(self.num_actions, device=self.device) if self.verbose: print("Num actions: ", self.num_actions) else: self.num_actions = len(self.env.action_space.high) self.action_low = torch.tensor(env.action_space.low, device=self.device) self.action_high = torch.tensor(env.action_space.high, device=self.device) if self.verbose: print("Env action low: ", self.action_low) print("Env action high: ", self.action_high) # Set up parameters: # Actor-Critic: self.use_actor_critic = hyperparameters["use_actor_critic"] self.use_CACLA_V = hyperparameters["use_CACLA_V"] self.use_CACLA_Q = hyperparameters["use_CACLA_Q"] self.use_DDPG = hyperparameters["use_DDPG"] self.use_SPG = hyperparameters["use_SPG"] self.use_GISPG = hyperparameters["use_GISPG"] # QV: self.use_QV = hyperparameters["use_QV"] self.use_QVMAX = hyperparameters["use_QVMAX"] # Exploration: self.gaussian_action_noise = hyperparameters["action_sigma"] self.boltzmann_exploration_temp = hyperparameters["boltzmann_temp"] self.epsilon = hyperparameters["epsilon"] self.epsilon_mid = hyperparameters["epsilon_mid"] if self.epsilon_mid: self.eps_factor = self.epsilon_mid**(1 / hyperparameters["steps"]) self.epsilon = 1 # General: self.use_half = hyperparameters["use_half"] self.batch_size = hyperparameters["batch_size"] self.use_world_model = hyperparameters["use_world_model"] # TODO: -Include PER with prioritization based on Upper Bound of Gradient Norm. 
# TODO: -include different sampling schemes from the papers investigatin PER in SL (small and big buffer for gradient norm too) # TODO: -add goal to replay buffer and Transition (For HRL) # Eligibility traces: if torch.cuda.is_available(): nvmlInit() self.nvml_handle = nvmlDeviceGetHandleByIndex(0) self.max_gpu_bytes = torch.cuda.get_device_properties( self.device).total_memory self.mem_usage = None self.current_episode = [] self.use_efficient_traces = hyperparameters["use_efficient_traces"] self.elig_traces_update_steps = hyperparameters[ "elig_traces_update_steps"] self.elig_traces_anneal_lambda = hyperparameters[ "elig_traces_anneal_lambda"] self.lambda_val = hyperparameters["elig_traces_lambda"] # Set up replay buffer: self.stack_dim = hyperparameters["stack_dim"] self.stack_count = hyperparameters["frame_stack"] self.buffer_size = hyperparameters[ "replay_buffer_size"] + hyperparameters["num_expert_samples"] self.use_PER = hyperparameters["use_PER"] self.use_CER = hyperparameters["use_CER"] self.PER_alpha = hyperparameters["PER_alpha"] self.PER_start_beta = hyperparameters["PER_beta"] self.PER_beta = self.PER_start_beta self.PER_anneal_beta = hyperparameters["PER_anneal_beta"] self.PER_max_priority = hyperparameters["PER_max_priority"] self.PER_running_avg = hyperparameters["PER_running_avg"] self.importance_weights = None # Create replay buffer: self.memory = self.create_replay_buffer() # Feature extractors: self.F_s = F_s self.F_sa = F_sa self.state_feature_len = F_s.layers_merge[-1].out_features if F_sa is not None: self.state_action_feature_len = F_sa.layers_merge[-1].out_features # Set up Networks: self.use_half = hyperparameters[ "use_half"] and torch.cuda.is_available() self.nets = [] self.actor, self.Q, self.V = self.init_actor_critic( self.F_s, self.F_sa)
def training_step(self, batch, batch_idx) -> Dict: global isEmUpdateBusy # use to check whether the entire embedding update process is finished or not global isAddIndexBusy # use to check whether the entire indexing process is finished or not global processes # use to keep threads embedding update processes global threadHandle_index # use to keep thread in embedding indexing processes if (self.trainer.global_rank == 0) and (self.custom_config.end2end): if (not batch_idx == 0) and ( batch_idx % self.custom_config.indexing_freq == 0): free_gpu_list = [] nvmlInit() deviceCount = nvmlDeviceGetCount() my_list = json.loads(self.custom_config.gpu_order) for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) info = nvmlDeviceGetMemoryInfo(handle) if info.used / 1e6 < 15: position = my_list.index(i) free_gpu_list.append("cuda:" + str(position)) if len(free_gpu_list) >= self.custom_config.index_gpus: has_free_gpus = True else: has_free_gpus = False if (not isEmUpdateBusy) and has_free_gpus: model_copy = type(self.model.rag.ctx_encoder)( self.config_dpr ) # get a new instance #this will be load in the CPU model_copy.load_state_dict(self.model.rag.ctx_encoder. state_dict()) # copy weights processes = [] if len(free_gpu_list) > self.custom_config.index_gpus: cuda_devices = random.sample( free_gpu_list, self.custom_config.index_gpus) else: cuda_devices = free_gpu_list num_processes = len(cuda_devices) for rank in range(num_processes): logger.info( "Iniitializing embedding calculation process rank{}" .format(rank)) device = cuda_devices[rank] p = multiprocessing.Process( target=embed_update, args=( copy.deepcopy(model_copy), num_processes, device, rank, self.custom_config.shard_dir, self.custom_config.csv_path, ), ) processes.append(p) for p in processes: p.start() isEmUpdateBusy = True if isEmUpdateBusy and (not isAddIndexBusy): index_process_list = [ processes[k].is_alive() for k in range(self.custom_config.index_gpus) ] if ( sum(index_process_list) == 0 ): # If entire list is false, we can say all embedding calculation process has finished logger.info("Start adding the index") threadHandle_index = multiprocessing.Process( target=add_index, args=( self.custom_config.shard_dir, self.config.index_path, ), ) threadHandle_index.start() isAddIndexBusy = True # check when index building has started if isAddIndexBusy: # check still the index_building process is happening if not threadHandle_index.is_alive(): logger.info("Merging the dataset shards") saved_dataset_shards = [] for address in glob( str(self.custom_config.shard_dir) + "/*/"): saved_dataset_shards.append(load_from_disk(address)) concat = concatenate_datasets(saved_dataset_shards) concat.save_to_disk( self.config.passages_path ) # here we update the main passage file on the disk logger.info("done updating the dataset") # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker. # logger.info("then updating the index") # shutil.copy(self.custom_config.temp_index, self.config.idex_path) logger.info( "Loading new passages and iniitalzing new index") self.trainer.model.module.module.model.rag.retriever.re_load( ) self.trainer.model.module.module.model.rag.retriever.init_retrieval( ) isEmUpdateBusy = False isAddIndexBusy = False self.trainer.accelerator_connector.accelerator.barrier( "barrier") # waint untill the index and kb get re-initialized. 
loss_tensors = self._step(batch) logs = { name: loss for name, loss in zip(self.loss_names, loss_tensors) } # tokens per batch tgt_pad_token_id = (self.tokenizer.generator.pad_token_id if isinstance(self.tokenizer, RagTokenizer) else self.tokenizer.pad_token_id) src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id if isinstance(self.tokenizer, RagTokenizer) else self.tokenizer.pad_token_id) logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() + batch["decoder_input_ids"].ne(tgt_pad_token_id).sum()) self.log("loss", loss_tensors[0]) return loss_tensors[0]
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import DataRange1d, NumeralTickFormatter, BasicTicker
from bokeh.layouts import column
from bokeh.models.mappers import LinearColorMapper
from bokeh.palettes import all_palettes
import math
import time
import pynvml
from jupyterlab_nvdashboard.utils import format_bytes

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
gpu_handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]


def gpu(doc):
    fig = figure(title="GPU Utilization",
                 sizing_mode="stretch_both",
                 x_range=[0, 100])

    def get_utilization():
        return [
            pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
            for i in range(ngpus)
        ]

    gpu = get_utilization()
    y = list(range(len(gpu)))
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode() # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} ps_process = psutil.Process(pid=nv_process.pid) process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' else: process['command'] = os.path.basename(_cmdline[0]) # Bytes to MBytes process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU ) except N.NVMLError: temperature = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in nv_comp_processes + nv_graphics_processes: # TODO: could be more information such as system memory # usage, CPU percentage, create time etc. try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': power // 1000 if power is not None else None, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else None, 'memory.total': memory.total // MB if memory else None, 'processes': processes, } return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
import utils.blob as blob_utils
import utils.net as net_utils
import utils.Lossfuction as Lossfuction
import utils.resnet_weights_helper as resnet_utils
from lib.nn import SynchronizedBatchNorm2d
import pynvml
import cv2
import modeling.CRL as CRL
from torchvision.utils import make_grid
from tensorboardX import SummaryWriter
from torch.utils.checkpoint import checkpoint

logger = logging.getLogger(__name__)

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)


def get_func(func_name):
    """Helper to return a function object by name. func_name must identify a
    function in this module or the path to a function relative to the base
    'modeling' module.
    """
    if func_name == '':
        return None
    try:
        parts = func_name.split('.')
        # Refers to a function in this module
        if len(parts) == 1:
            return globals()[parts[0]]
        # Otherwise, assume we're referencing a module under modeling
        module_name = 'modeling.' + '.'.join(parts[:-1])
def identify_cards(): devices = {} try: import pynvml from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex deviceCount = None try: nvmlInit() deviceCount = nvmlDeviceGetCount() for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) props = {} def meminfo(memory): return { "total" : int(memory.total), "free" : int(memory.free), "used" : int(memory.used), } def pciinfo(pci): i = {} for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"): try: i[x] = int(getattr(pci, x)) except: pass try: i["busId"] = str(pci.busId) except: pass return i for prop, fn_name, args, conv in ( ("name", "nvmlDeviceGetName", (), str), ("serial", "nvmlDeviceGetSerial", (), str), ("uuid", "nvmlDeviceGetUUID", (), str), ("pci", "nvmlDeviceGetPciInfo", (), pciinfo), ("memory", "nvmlDeviceGetMemoryInfo", (), meminfo), ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration", (), int), ("pcie-link-width-max", "nvmlDeviceGetMaxPcieLinkWidth", (), int), ("pcie-link-generation", "nvmlDeviceGetCurrPcieLinkGeneration", (), int), ("pcie-link-width", "nvmlDeviceGetCurrPcieLinkWidth", (), int), ("clock-info-graphics", "nvmlDeviceGetClockInfo", (0,), int), ("clock-info-sm", "nvmlDeviceGetClockInfo", (1,), int), ("clock-info-mem", "nvmlDeviceGetClockInfo", (2,), int), ("clock-info-graphics-max", "nvmlDeviceGetMaxClockInfo", (0,), int), ("clock-info-sm-max", "nvmlDeviceGetMaxClockInfo", (1,), int), ("clock-info-mem-max", "nvmlDeviceGetMaxClockInfo", (2,), int), ("fan-speed", "nvmlDeviceGetFanSpeed", (), int), ("temperature", "nvmlDeviceGetTemperature", (0,), int), ("power-state", "nvmlDeviceGetPowerState", (), int), ("vbios-version", "nvmlDeviceGetVbiosVersion", (), str), ): try: fn = getattr(pynvml, fn_name) v = fn(handle, *args) if conv: v = conv(v) props[prop] = v except Exception as e: log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e) continue devices[i] = props #unitCount = nvmlUnitGetCount() #log.info("unitCount=%s", unitCount) except Exception as e: log("identify_cards() pynvml error", exc_info=True) log.warn("Warning: failed to query the NVidia cards via NVML:") log.warn(" %s", e) finally: if deviceCount is not None: nvmlShutdown() except ImportError as e: log("cannot use nvml to query the kernel module version:") log(" %s", e) return devices
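# Usage sketch (not from the original source): print whatever identify_cards()
# could discover. The keys present in each entry depend on which NVML queries
# succeeded on this machine, so pprint is used only for display.
from pprint import pprint

for index, props in identify_cards().items():
    print("GPU %i:" % index)
    pprint(props)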
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(pid): """Get the process information of specific pid""" process = {} ps_process = psutil.Process(pid=pid) process['username'] = ps_process.username() # cmdline returns full path; as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' else: process['command'] = os.path.basename(_cmdline[0]) # Bytes to MBytes process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024) process['pid'] = nv_process.pid return process def _decode(b): if isinstance(b, bytes): return b.decode() # for python3, to unicode return b name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU) except N.NVMLError: temperature = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except: power_limit = None processes = [] try: nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses( handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses( handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None # Not supported (in both cases) else: nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in (nv_comp_processes + nv_graphics_processes): # TODO: could be more information such as system memory usage, # CPU percentage, create time etc. try: process = get_process_info(nv_process.pid) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. nvidia-smi reset or reboot the system pass gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': int(power / 1000) if power is not None else None, 'enforced.power.limit': int(power_limit / 1000) if power is not None else None, # Convert bytes into MBytes 'memory.used': int(memory.used / 1024 / 1024) if memory else None, 'memory.total': int(memory.total / 1024 / 1024) if memory else None, 'processes': processes, } return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) N.nvmlShutdown() return GPUStatCollection(gpu_list)
def base_structure(): # data read x_train=np.load(r'./data_split/x_train.npy') x_test=np.load(r'./data_split/x_test.npy') x_validation=np.load(r'./data_split/x_validation.npy') y_train=np.load(r'./data_split/y_train.npy') y_test=np.load(r'./data_split/y_test.npy') y_validation=np.load(r'./data_split/y_validation.npy') #data standard normalization a,b,c=x_train.shape x_train=x_train.reshape(a*b,c) scaler = preprocessing.StandardScaler().fit(x_train) x_train=scaler.transform(x_train) x_train=x_train.reshape(a,b,c) a,b,c=x_validation.shape x_validation=x_validation.reshape(a*b,c) x_validation=scaler.transform(x_validation) x_validation=x_validation.reshape(a,b,c) a,b,c=x_test.shape x_test=x_test.reshape(a*b,c) x_test=scaler.transform(x_test) x_test=x_test.reshape(a,b,c) x1=torch.from_numpy(x_train).float() y1=torch.from_numpy(y_train).float() x2=torch.from_numpy(x_validation).float() y2=torch.from_numpy(y_validation).float() x3=torch.from_numpy(x_test).float() y3=torch.from_numpy(y_test).float() # convert data from .npy arrays to pytorch tensors global BATCH_SIZE BATCH_SIZE=512 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') train_dataset = Data.TensorDataset(x1,y1) trainloader = Data.DataLoader( dataset=train_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # whether to shuffle the data (shuffling is usually better) num_workers=2, # read data with multiple worker processes drop_last=True, ) vali_dataset = Data.TensorDataset(x2,y2) valiloader = Data.DataLoader( dataset=vali_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # whether to shuffle the data (shuffling is usually better) num_workers=2, # read data with multiple worker processes drop_last=True, ) test_dataset = Data.TensorDataset(x3,y3) testloader = Data.DataLoader( dataset=test_dataset, # torch TensorDataset format batch_size=BATCH_SIZE, # mini batch size shuffle=False, # whether to shuffle the data (shuffling is usually better) num_workers=2, # read data with multiple worker processes drop_last=True, ) # encoder structure class Encoder(nn.Module): def __init__(self, input_dim,emb_dim, hid_dim, n_layers, dropout=0.1): super().__init__() self.input_dim = input_dim self.emb_dim = emb_dim self.hid_dim = hid_dim self.n_layers = n_layers self.dropout = dropout self.embedding = nn.Linear(input_dim, emb_dim) self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout) self.dropout = nn.Dropout(dropout) def forward(self, x): #x = [len, batch size, input_size] embedded = self.dropout(self.embedding(x)) #embedded = [len, batch size, emb dim] outputs, (hidden, cell) = self.rnn(embedded) #outputs = [src sent len, batch size, hid dim * n directions] #hidden = [n layers * n directions, batch size, hid dim] #cell = [n layers * n directions, batch size, hid dim] #outputs are always from the top hidden layer return hidden, cell # decoder structure class Decoder(nn.Module): def __init__(self, decoder_input_dim,emb_dim, hid_dim, n_layers, dropout=0.2): super().__init__() self.emb_dim = emb_dim self.hid_dim = hid_dim self.decoder_input_dim = decoder_input_dim self.n_layers = n_layers self.dropout = dropout self.embedding = nn.Linear(decoder_input_dim, emb_dim) self.rnn = nn.LSTM(emb_dim+hid_dim, hid_dim, n_layers, dropout = dropout) self.out = nn.Linear(hid_dim, decoder_input_dim) self.dropout = nn.Dropout(dropout) def forward(self, input,context, hidden, cell): input = input.unsqueeze(0) embedded = self.dropout(self.embedding(input)) #input = [1, batch size] # print('inputshape:',input.shape) emb_con = torch.cat((embedded, context), dim = 2) ##embedded = self.dropout(self.embedding(input)) #embedded = [1, batch size, emb dim] output, (hidden, cell) = self.rnn(emb_con, (hidden, cell))
#output = [len, batch size, hid dim * n directions] #hidden = [n layers * n directions, batch size, hid dim] #cell = [n layers * n directions, batch size, hid dim] #sent len and n directions will always be 1 in the decoder, therefore: #output = [1, batch size, hid dim] #hidden = [n layers, batch size, hid dim] #cell = [n layers, batch size, hid dim] prediction = self.out(output.squeeze(0)) #prediction = [batch size, output dim] return prediction, hidden, cell class Seq2Seq(nn.Module): global firstinput def __init__(self, encoder, decoder, device): super().__init__() self.encoder = encoder self.decoder = decoder self.device = device assert encoder.hid_dim == decoder.hid_dim, "Hidden dimensions of encoder and decoder must be equal!" assert encoder.n_layers == decoder.n_layers, "Encoder and decoder must have equal number of layers!" def forward(self, x, y, teacher_forcing_ratio = 0.5): #src = [src sent len, batch size] #trg = [trg sent len, batch size] #teacher_forcing_ratio is probability to use teacher forcing #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time batch_size = BATCH_SIZE max_len = 25 trg_vocab_size = 2 #tensor to store decoder outputs outputs = torch.zeros(max_len, batch_size, trg_vocab_size) #last hidden state of the encoder is used as the initial hidden state of the decoder hidden, cell = self.encoder(x) context=cell[1,:,:] context=context.unsqueeze(0) # print('c-shape:',context.shape) #first input to the decoder is the <sos> tokens input=firstinput #print(input.size()) # input = input.unsqueeze(0) #print(input.size()) for t in range(max_len): output, hidden, cell = self.decoder(input, context,hidden, cell) outputs[t] = output #print(output) #input = output.unsqueeze(0) #print(input.size()) teacher_force = random.random() < teacher_forcing_ratio top1 = output if t==24: break input = ((y[t,:,:]) if teacher_force else top1) #outputs[t] = output #print('output',output.size()) #input = output.unsqueeze(0) return outputs INPUT_DIM =36 ENCODER_INPUT_DIM = 2 HID_DIM = 128 N_LAYERS = 2 ENC_EMB_DIM = 64 DEC_EMB_DIM = 16 enc = Encoder(INPUT_DIM, ENC_EMB_DIM , HID_DIM, N_LAYERS) dec = Decoder(ENCODER_INPUT_DIM,DEC_EMB_DIM , HID_DIM, N_LAYERS) model = Seq2Seq(enc, dec, device).to(device) # In[ ]: def init_weights(m): for name, param in m.named_parameters(): nn.init.uniform_(param.data, -0.15, 0.15) # nn.init.orthogonal_(param.data) model.apply(init_weights) # In[ ]: def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'The model has {count_parameters(model):,} trainable parameters') # In[ ]: optimizer = optim.Adam(model.parameters(),weight_decay=0.00001,lr=0.01) criterion = nn.MSELoss() def train(model, dataloader,optimizer, criterion, clip): global firstinput model.train() epoch_loss = 0 for x,y in dataloader: x=x.transpose(1,0) y=y.transpose(1,0) x=x.to('cuda') y=y.to('cuda') firstinput=y[0,:,:] y=y[1:,:,:] optimizer.zero_grad() output = model(x, y) output = output.to('cuda') # loss = criterion(output, y) #print(output.size()) loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0]) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), clip) optimizer.step() epoch_loss += loss.item() #print(epoch_loss) return epoch_loss/len(dataloader) # In[ ]: def evaluate(model, validataloader, criterion): model.eval() epoch_loss = 0 with torch.no_grad(): for x,y in validataloader: x=x.transpose(1,0) y=y.transpose(1,0) x=x.to('cuda') y=y.to('cuda') firstinput=y[0,:,:] y=y[1:,:,:] 
optimizer.zero_grad() output = model(x, y, 0) #turn off teacher forcing output = output.to('cuda') loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0]) epoch_loss += loss.item() return epoch_loss / len(validataloader) # In[ ]: def test(model, testdataloader, criterion): global j global firstinput global test_result model.eval() epoch_loss = 0 with torch.no_grad(): for x,y in testdataloader: x=x.transpose(1,0) y=y.transpose(1,0) x=x.to('cuda') y=y.to('cuda') firstinput=y[0,:,:] y=y[1:,:,:] optimizer.zero_grad() output = model(x, y, 0) #turn off teacher forcing test_result[:,j:j+BATCH_SIZE,:]=output j=j+BATCH_SIZE output = output.to('cuda') # loss = criterion(output, y) loss = 3*criterion(output[:,:,1],y[:,:,1])+criterion(output[:,:,0],y[:,:,0]) epoch_loss += loss.item() # print(len(testdataloader)) return epoch_loss / len(testdataloader) # In[ ]: N_EPOCHS = 40 CLIP = 1 global test_result test_result=np.zeros([25,80000,2]) pynvml.nvmlInit() handle=pynvml.nvmlDeviceGetHandleByIndex(0) meminfo=pynvml.nvmlDeviceGetMemoryInfo(handle) print('this is base-line\n') for epoch in range(N_EPOCHS): global j j=0 start_time = time.process_time() train_loss = train(model, trainloader, optimizer, criterion, CLIP) valid_loss = evaluate(model, valiloader, criterion) end_time = time.process_time() print(f'Epoch: {epoch+1:02} | Time: {end_time-start_time}s') print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}') #writer.add_scalars('loss',{'train_loss': train_loss, #'valid_loss': valid_loss},epoch ) test_loss = test(model, testloader, criterion) if test_loss<3.9: print('testloss:',test_loss) test_result=test_result[:,:j,:] np.save(r'./result/base_predict_tra.npy',test_result) np.save(r'./result/true_tra.npy',y_test[:,1:,:]) break if epoch == 39: print('testloss:',test_loss) test_result=test_result[:,:j,:] np.save(r'./result/base_predict_tra.npy',test_result) np.save(r'./result/true_tra.npy',y_test[:,1:,:]) break print('meminfo.used:',meminfo.used/(1024*1024)) print('meminfo.total:',meminfo.total/(1024*1024)) return 0
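# The training loop above queries nvmlDeviceGetMemoryInfo() once, before any
# epoch has run, so the values printed at the end reflect start-up state.
# A small helper like this (a sketch; the MiB conversion and device index 0 are
# assumptions) can instead be called inside the epoch loop to log current usage:
def log_gpu_memory(tag, device_index=0):
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print('{}: used {:.0f} MiB / total {:.0f} MiB'.format(
        tag, info.used / (1024 * 1024), info.total / (1024 * 1024)))

# e.g. inside the epoch loop: log_gpu_memory('epoch %d' % (epoch + 1))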
def _get_data(self): data = {} if self.deviceCount: for i in range(self.deviceCount): gpuIdx = str(i) handle = pynvml.nvmlDeviceGetHandleByIndex(i) name = pynvml.nvmlDeviceGetName(handle) brand = pynvml.nvmlDeviceGetBrand(handle) brands = ['Unknown', 'Quadro', 'Tesla', 'NVS', 'Grid', 'GeForce', 'Titan'] ### Get data ### ## Memory usage try: mem = pynvml.nvmlDeviceGetMemoryInfo(handle) except Exception as e: self.debug(str(e)) mem = None ## ECC errors try: _memError = {} _eccCounter = {} eccErrors = {} eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC'] memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED'] memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY'] for memoryLocation in range(5): for eccCounter in range(2): for memError in range(2): _memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(handle,memError,eccCounter,memoryLocation) _eccCounter[eccCounterType[eccCounter]] = _memError eccErrors[memoryLocationType[memoryLocation]] = _eccCounter except Exception as e: self.debug(str(e)) eccErrors = None ## Temperature try: temp = pynvml.nvmlDeviceGetTemperature(handle,pynvml.NVML_TEMPERATURE_GPU) except Exception as e: self.debug(str(e)) temp = None ## Fan try: fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle) except Exception as e: self.debug(str(e)) fanspeed = None ## GPU and Memory Utilization try: util = pynvml.nvmlDeviceGetUtilizationRates(handle) gpu_util = util.gpu mem_util = util.memory except Exception as e: self.debug(str(e)) gpu_util = None mem_util = None ## Encoder Utilization try: encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle) enc_util = encoder[0] except Exception as e: self.debug(str(e)) enc_util = None ## Decoder Utilization try: decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle) dec_util = decoder[0] except Exception as e: self.debug(str(e)) dec_util = None ## Clock frequencies try: clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS) clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM) clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor except Exception as e: self.debug(str(e)) clock_core = None clock_sm = None clock_mem = None ### Packing data ### self.debug("Device", gpuIdx, ":", str(name)) data["device_name_" + gpuIdx] = name self.debug("Brand:", str(brands[brand])) self.debug(str(name), "Temp :", str(temp)) data["device_temp_" + gpuIdx] = temp self.debug(str(name), "Mem total :", str(mem.total), 'bytes') data["device_mem_total_" + gpuIdx] = mem.total self.debug(str(name), "Mem used :", str(mem.used), 'bytes') data["device_mem_used_" + gpuIdx] = mem.used self.debug(str(name), "Mem free :", str(mem.free), 'bytes') data["device_mem_free_" + gpuIdx] = mem.free self.debug(str(name), "Load GPU :", str(gpu_util), '%') data["device_load_gpu_" + gpuIdx] = gpu_util self.debug(str(name), "Load MEM :", str(mem_util), '%') data["device_load_mem_" + gpuIdx] = mem_util self.debug(str(name), "Load ENC :", str(enc_util), '%') data["device_load_enc_" + gpuIdx] = enc_util self.debug(str(name), "Load DEC :", str(dec_util), '%') data["device_load_dec_" + gpuIdx] = dec_util self.debug(str(name), "Core clock:", str(clock_core), 'MHz') data["device_core_clock_" + gpuIdx] = clock_core self.debug(str(name), "SM clock :", str(clock_sm), 'MHz') data["device_sm_clock_" + gpuIdx] = clock_sm self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz') data["device_mem_clock_" + gpuIdx] = clock_mem self.debug(str(name), "Fan 
speed :", str(fanspeed), '%') data["device_fanspeed_" + gpuIdx] = fanspeed self.debug(str(name), "ECC errors:", str(eccErrors)) if eccErrors is not None: data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"] data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"] data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"] else: data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None ## Get unit (S-class Nvidia cards) data if self.unitCount: for i in range(self.unitCount): gpuIdx = str(i) handle = pynvml.nvmlUnitGetHandleByIndex(i) try: fan = pynvml.nvmlUnitGetFanSpeedInfo(handle) fan_speed = fan.speed # Fan speed (RPM) fan_state = fan.state # Flag that indicates whether fan is working properly except Exception as e: self.debug(str(e)) fan_speed = None fan_state = None try: psu = pynvml.nvmlUnitGetPsuInfo(handle) psu_current = psu.current # PSU current (A) psu_power = psu.power # PSU power draw (W) psu_state = psu.state # The power supply state 
psu_voltage = psu.voltage # PSU voltage (V) except Exception as e: self.debug(str(e)) psu_current = None psu_power = None psu_state = None psu_voltage = None try: temp_intake = pynvml.nvmlUnitGetTemperature(handle,0) # Temperature at intake in C temp_exhaust = pynvml.nvmlUnitGetTemperature(handle,1) # Temperature at exhaust in C temp_board = pynvml.nvmlUnitGetTemperature(handle,2) # Temperature on board in C except Exception as e: self.debug(str(e)) temp_intake = None temp_exhaust = None temp_board = None self.debug('Unit fan speed:',str(fan_speed)) data["unit_fan_speed_" + gpuIdx] = fan_speed self.debug('Unit fan state:',str(fan_state)) data["unit_fan_state_" + gpuIdx] = fan_state self.debug('Unit PSU current:',str(psu_current)) data["unit_psu_current_" + gpuIdx] = psu_current self.debug('Unit PSU power:', str(psu_power)) data["unit_psu_power_" + gpuIdx] = psu_power self.debug('Unit PSU state:', str(psu_state)) data["unit_psu_state_" + gpuIdx] = psu_state self.debug('Unit PSU voltage:', str(psu_voltage)) data["unit_psu_voltage_" + gpuIdx] = psu_voltage self.debug('Unit temp intake:', str(temp_intake)) data["unit_temp_intake_" + gpuIdx] = temp_intake self.debug('Unit temp exhaust:', str(temp_exhaust)) data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust self.debug('Unit temp board:', str(temp_board)) data["unit_temp_board_" + gpuIdx] = temp_board ## Get data via legacy mode if self.legacy: try: output, error = Popen( [ "nvidia-settings", "-c", ":0", "-q", "GPUUtilization", "-q", "GPUCurrentClockFreqs", "-q", "GPUCoreTemp", "-q", "TotalDedicatedGPUMemory", "-q", "UsedDedicatedGPUMemory" ], shell=False, stdout=PIPE,stderr=PIPE).communicate() output = repr(str(output)) if len(output) < 800: raise Exception('Error in fetching data from nvidia-settings ' + output) self.debug(str(error), output) except Exception as e: self.error(str(e)) self.error('Setting legacy mode to False') self.legacy = False return data for i in range(self.deviceCount): gpuIdx = str(i) if data["device_temp_" + gpuIdx] is None: coreTemp = findall('GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1] try: data["device_temp_" + gpuIdx] = int(coreTemp) self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp)) except Exception as e: self.debug(str(e), "skipping device_temp_" + gpuIdx) if data["device_mem_used_" + gpuIdx] is None: memUsed = findall('UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1] try: data["device_mem_used_" + gpuIdx] = int(memUsed) self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed)) except Exception as e: self.debug(str(e), "skipping device_mem_used_" + gpuIdx) if data["device_load_gpu_" + gpuIdx] is None: gpu_util = findall('(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1] try: data["device_load_gpu_" + gpuIdx] = int(gpu_util) self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util)) except Exception as e: self.debug(str(e), "skipping device_load_gpu_" + gpuIdx) if data["device_load_mem_" + gpuIdx] is None: mem_util = findall('(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2] try: data["device_load_mem_" + gpuIdx] = int(mem_util) self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util)) except Exception as e: self.debug(str(e), "skipping device_load_mem_" + gpuIdx) if data["device_core_clock_" + gpuIdx] is None: clock_core = findall('GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1] try: data["device_core_clock_" + gpuIdx] = int(clock_core) self.debug('Using legacy core_clock for GPU {0}: 
{1}'.format(gpuIdx, clock_core)) except Exception as e: self.debug(str(e), "skipping device_core_clock_" + gpuIdx) if data["device_mem_clock_" + gpuIdx] is None: clock_mem = findall('GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2] try: data["device_mem_clock_" + gpuIdx] = int(clock_mem) self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem)) except Exception as e: self.debug(str(e), "skipping device_mem_clock_" + gpuIdx) return data
import pynvml
import numpy as np
import os
import time

pynvml.nvmlInit()
# the index passed here is the GPU id
handle0 = pynvml.nvmlDeviceGetHandleByIndex(0)
handle1 = pynvml.nvmlDeviceGetHandleByIndex(1)
handle2 = pynvml.nvmlDeviceGetHandleByIndex(2)
handle3 = pynvml.nvmlDeviceGetHandleByIndex(3)
memInfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
memInfo1 = pynvml.nvmlDeviceGetMemoryInfo(handle1)
memInfo2 = pynvml.nvmlDeviceGetMemoryInfo(handle2)
memInfo3 = pynvml.nvmlDeviceGetMemoryInfo(handle3)
commandList = ['', '', '',]
commandFalg = np.ones(len(commandList))

def getUsedRate(memInfo):
    return memInfo.used / memInfo.total

def sendCommand(deviceID):
    print(os.system('python train.py --epochs 1002 --devices 0'))
    print(str(deviceID) + ': command')
    exit()
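# A minimal sketch of how the fragment above could be driven: poll the memory
# usage of each handle and dispatch a queued command once a card looks idle.
# The 10% threshold, the 30 s poll interval, and substituting the device id into
# the command string are assumptions, not part of the original fragment.
def dispatch_when_idle(handles, commands, threshold=0.1, interval=30):
    pending = list(commands)
    while pending:
        for device_id, handle in enumerate(handles):
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if mem.used / mem.total < threshold:
                cmd = pending.pop(0).format(device_id=device_id)
                print(os.system(cmd))
                if not pending:
                    return
        time.sleep(interval)

# e.g. dispatch_when_idle([handle0, handle1, handle2, handle3],
#                         ['python train.py --epochs 1002 --devices {device_id}'])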
def _get_device_info(device_id): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) return mem_info
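# Minimal usage sketch for _get_device_info() above. It assumes the caller is
# responsible for NVML initialisation, so the call is wrapped in
# nvmlInit()/nvmlShutdown() here; printing free memory in MiB is illustrative.
pynvml.nvmlInit()
try:
    info = _get_device_info(0)
    print('GPU 0 free memory: %d MiB' % (info.free // (1024 * 1024)))
finally:
    pynvml.nvmlShutdown()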
def init(self): self.util_history = [] self.temp_history = [] pynvml.nvmlInit() self.gpu_handles = [] self.deviceCount = pynvml.nvmlDeviceGetCount() for i in range(self.deviceCount): self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i)) self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6) self.cpu_prog_bars = [] self.gpu_boxes = [] self.gpu_prog_bars = [] self.prev_idle = [] self.prev_total = [] self.idle = [] self.total = [] #---cpu_box--- try: stat = open("/proc/stat") statlines = stat.read().splitlines() stat.close() self.corecount = -1 for line in statlines: if (line[0:2] == "cp"): self.corecount+= 1 else: break except IOError: print("Problem opening /proc/stat, exiting..") pynvml.nvmlShutdown() quit() for i in range(self.corecount): self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True)) self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0) self.prev_idle.append(0) self.prev_total.append(0) self.idle.append(0) self.total.append(0) #---gpu_boxes--- for i in range(self.deviceCount): product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i]) product_name = product_name.decode('utf-8') gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8) label = Gtk.Label(product_name) self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True)) self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True)) gpu_box.pack_start(label, True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0) gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0) self.gpu_boxes.append(gpu_box) #---proc--- proc_liststore = Gtk.ListStore(int, str, int) self.tree = Gtk.TreeView(model=proc_liststore) renderer_pid = Gtk.CellRendererText() column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0) column_pid.set_resizable(True) self.tree.append_column(column_pid) renderer_path = Gtk.CellRendererText() column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1) column_path.set_resizable(True) column_path.set_fixed_width(250) self.tree.append_column(column_path) renderer_mem = Gtk.CellRendererText() column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2) column_mem.set_resizable(True) self.tree.append_column(column_mem)
def check(self, instance): pynvml.nvmlInit() msg_list = [] try: deviceCount = pynvml.nvmlDeviceGetCount() except: deviceCount = 0 for device_id in xrange(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) name = pynvml.nvmlDeviceGetName(handle) tags = dict(name="{}-{}".format(name, device_id)) d_tags = self._dict2list(tags) # temperature info try: temp = pynvml.nvmlDeviceGetTemperature( handle, pynvml.NVML_TEMPERATURE_GPU) self.gauge('nvml.temp.', temp, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err)) # memory info try: mem = pynvml.nvmlDeviceGetMemoryInfo(handle) self.gauge('nvml.mem.total', mem.total, tags=d_tags) self.gauge('nvml.mem.used', mem.used, tags=d_tags) self.gauge('nvml.mem.free', mem.free, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err)) # utilization GPU/Memory info try: util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle) self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags) self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags) except pynvml.NVMLError as err: msg_list.append( u'nvmlDeviceGetUtilizationRates:{}'.format(err)) # utilization Encoder info try: util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle) self.log.debug('nvml.util.encoder %s' % long(util_encoder[0])) self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append( u'nvmlDeviceGetEncoderUtilization:{}'.format(err)) # utilization Decoder info try: util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle) self.log.debug('nvml.util.decoder %s' % long(util_decoder[0])) self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append( u'nvmlDeviceGetDecoderUtilization:{}'.format(err)) # Compute running processes try: cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) for ps in cps: p_tags = tags.copy() p_tags['pid'] = ps.pid p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid) p_tags = self._dict2list(p_tags) self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags) except pynvml.NVMLError as err: msg_list.append( u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err)) if msg_list: status = AgentCheck.CRITICAL msg = u','.join(msg_list) else: status = AgentCheck.OK msg = u'Ok' pynvml.nvmlShutdown() self.service_check('nvml.check', status, message=msg)
def identify_cards(): devices = {} try: import pynvml from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex deviceCount = None try: if not wrap_nvml_init(nvmlInit): return devices deviceCount = nvmlDeviceGetCount() log("identify_cards() will probe %i cards", deviceCount) for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) log("identify_cards() handle(%i)=%s", i, handle) props = {} def meminfo(memory): return { "total": int(memory.total), "free": int(memory.free), "used": int(memory.used), } def pciinfo(pci): i = {} for nvname, pubname in { "domain": "domain", "bus": "bus", "device": "device", "pciDeviceId": "pci-device-id", "pciSubSystemId": "pci-subsystem-id", }.items(): try: i[pubname] = int(getattr(pci, nvname)) except (ValueError, AttributeError): pass try: i["bus-id"] = bytestostr(pci.busId) except AttributeError: pass return i for prefix, prop, fn_name, args, conv in ( ("", "name", "nvmlDeviceGetName", (), strtobytes), ("", "serial", "nvmlDeviceGetSerial", (), strtobytes), ("", "uuid", "nvmlDeviceGetUUID", (), strtobytes), ("", "pci", "nvmlDeviceGetPciInfo", (), pciinfo), ("", "memory", "nvmlDeviceGetMemoryInfo", (), meminfo), ("pcie-link", "generation-max", "nvmlDeviceGetMaxPcieLinkGeneration", (), int), ("pcie-link", "width-max", "nvmlDeviceGetMaxPcieLinkWidth", (), int), ("pcie-link", "generation", "nvmlDeviceGetCurrPcieLinkGeneration", (), int), ("pcie-link", "width", "nvmlDeviceGetCurrPcieLinkWidth", (), int), ("clock-info", "graphics", "nvmlDeviceGetClockInfo", (0, ), int), ("clock-info", "sm", "nvmlDeviceGetClockInfo", (1, ), int), ("clock-info", "mem", "nvmlDeviceGetClockInfo", (2, ), int), ("clock-info", "graphics-max", "nvmlDeviceGetMaxClockInfo", (0, ), int), ("clock-info", "sm-max", "nvmlDeviceGetMaxClockInfo", (1, ), int), ("clock-info", "mem-max", "nvmlDeviceGetMaxClockInfo", (2, ), int), ("", "fan-speed", "nvmlDeviceGetFanSpeed", (), int), ("", "temperature", "nvmlDeviceGetTemperature", (0, ), int), ("", "power-state", "nvmlDeviceGetPowerState", (), int), ("", "vbios-version", "nvmlDeviceGetVbiosVersion", (), strtobytes), ): try: fn = getattr(pynvml, fn_name) v = fn(handle, *args) if conv: v = conv(v) if prefix: d = props.setdefault(prefix, {}) else: d = props d[prop] = v except Exception as e: log( "identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e) continue log("identify_cards() [%i]=%s", i, props) devices[i] = props #unitCount = nvmlUnitGetCount() #log.info("unitCount=%s", unitCount) except Exception as e: log("identify_cards() pynvml error", exc_info=True) log.warn("Warning: failed to query the NVidia cards using NVML:") log.warn(" %s", e) finally: if deviceCount is not None: nvmlShutdown() except ImportError as e: log("cannot use nvml to query the kernel module version:") log(" %s", e) return devices
def get_handles(self): """ Return all listed Nvidia handles """ self.handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(self.device_count)]
def __init__(self, device_idx): super().__init__() self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
def get_infos(): """Get all information about all your graphics cards. Returns: dict: The returned result is a dict with 3 keys: count, driver_version and devices: count: Number of gpus found driver_version: The version of the system’s graphics driver devices: It's a list and every item is a namedtuple Device which has 10 fields, for exzample id, name and fan_speed etc. It should be noted that the Process field is also a namedtuple which has 11 fields. """ infos = {} Device = namedtuple( "Device", [ "id", "name", "free", "used", "total", "temperature", "fan_speed", "power_usage", "power_state", "process", ], ) Process = namedtuple( "Process", [ "pid", "memory_percent", "status", "username", "num_threads", "cpu_num", "cpu_percent", "name", "cmdline", "used_gpu_mem", "create_time", ], ) driver_version = pynvml.nvmlSystemGetDriverVersion().decode() device_count = pynvml.nvmlDeviceGetCount() devices = [] for i in range(device_count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) name = pynvml.nvmlDeviceGetName(handle).decode() mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) power_usage = pynvml.nvmlDeviceGetPowerUsage( handle) # Power usage in milliwatts mW processes = pynvml.nvmlDeviceGetComputeRunningProcesses( handle) # Which processes are using the GPU # process_info = [(item.pid, item.usedGpuMemory) for item in process_info] process_info = [] for p in processes: # append Process object to process_info pid = p.pid used_gpu_mem = p.usedGpuMemory p = psutil.Process(pid=pid) _ = p.cpu_percent() time.sleep(0.05) process_info.append( Process( pid=pid, memory_percent=p.memory_percent(), status=p.status(), username=p.username(), num_threads=p.num_threads(), cpu_num=p.cpu_num(), cpu_percent=p.cpu_percent(), name=p.name(), cmdline=" ".join(p.cmdline()), used_gpu_mem=used_gpu_mem, create_time=p.create_time(), )) try: fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle) except pynvml.NVMLError_NotSupported as e: fan_speed = None power_usage = pynvml.nvmlDeviceGetPowerUsage(handle) power_state = pynvml.nvmlDeviceGetPowerState(handle) temperature = pynvml.nvmlDeviceGetTemperature( handle, pynvml.NVML_TEMPERATURE_GPU) devices.append( Device( id=i, name=name, free=mem_info.free, used=mem_info.used, total=mem_info.total, temperature=temperature, fan_speed=fan_speed, power_usage=power_usage, power_state=power_state, process=process_info, )) infos["count"] = device_count infos["driver_version"] = driver_version infos["devices"] = devices return infos
def do_GET(self): #checks if the server is alive if self.path == '/test': send_header(self) self.wfile.write(bytes('passed<br>', 'utf-8')) self.wfile.write(bytes('server is responding', 'utf-8')) #returns the running processes if self.path == '/runningProcesses': send_header(self) #send response: if modules['psutil']: for proc in psutil.process_iter(): try: pinfo = proc.as_dict(attrs=['pid', 'name']) except psutil.NoSuchProcess: pass print(pinfo) self.wfile.write(bytes(str(pinfo), 'utf-8')) else: self.wfile.write('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8') #returns the CPU utilization and number of cores elif self.path == '/cpuInfo': send_header(self) #get CPU info cpuInfo = {} if modules['psutil']: cpuInfo['CPU Utilization'] = int(psutil.cpu_percent()) cpuInfo['CPU Cores'] = int(psutil.cpu_count()) else: cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.' json_dump = json.dumps(cpuInfo) self.wfile.write(bytes(json_dump, 'utf-8')) #get GPU info if modules['pynvml']: try: pynvml.nvmlInit() gpus = pynvml.nvmlDeviceGetCount() except: gpus = 0 self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8')) else: gpus = 0 self.wfile.write(bytes('I am sorry but the the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8')) for i in range(gpus): handle = pynvml.nvmlDeviceGetHandleByIndex(i) self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8')) try: self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8')) except: self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8')) try: gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle) self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8')) self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8')) except: self.wfile.write(bytes('<br>nCould not retrieve memory information', 'utf-8')) if gpus > 0: try: pynvml.nvmlShutdown() except: pass elif self.path == '/availableComputers': send_header(self) s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.connect(('google.com', 0)) global myownsocket myownsocket = s.getsockname()[0] port = 8003 available_computers = [] for i in range(1, 256): host = '192.168.178.' 
+ str(i) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.settimeout(0.2) try: alive = sock.connect_ex((host, port)) except: alive = -1 if alive == 0: print('available') available_computers.append(host) else: print('not available') print(host) self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8')) cmd_txt = """@echo off call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat" echo ##### start_rendering xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on echo ##### rendering_done """ self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8')) self.wfile.write(bytes('<table border="1">\n', 'utf-8')) self.wfile.write(bytes('<tr>\n', 'utf-8')) self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8')) self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8')) self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8')) self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8')) available_cpus = {} for host in available_computers: available_cpus[host] = abs(get_cpu_cores(host)) total_cpus = sum(available_cpus.values()) frame_list = {} start_frame = 0 for host in available_computers: start_frame += 1 frame_list[host] = [start_frame] start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus)) if start_frame > 100: start_frame = 100 frame_list[host].append(start_frame) index = 0 for host in available_computers: index += 1 self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8')) self.wfile.write(bytes(host, 'utf-8')) self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8')) self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8')) self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8')) self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8')) self.wfile.write(bytes('</tr>', 'utf-8')) index = 2 self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8')) self.wfile.write(bytes(host, 'utf-8')) self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8')) self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8')) self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8')) self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8')) self.wfile.write(bytes('</tr>', 'utf-8')) self.wfile.write(bytes('</table>\n', 'utf-8')) self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8')) self.wfile.write(bytes('</form>\n', 'utf-8')) self.wfile.write(bytes('</body>\n', 'utf-8')) self.wfile.write(bytes('</html>\n', 'utf-8')) elif self.path == '/execute_job': send_header(self) parsed = urlparse(self.path) parameters = parse_qs(parsed.query) elif '/submit_job' in self.path: send_header(self) self.wfile.write(bytes(str(self.client_address), 'utf-8')) parsed = urlparse(self.path) parameters = parse_qs(parsed.query) #print(parsed) print(parameters) self.wfile.write(bytes('<body>', 'utf-8')) for index in range(1, 100): if not parameters.get('host' + str(index)).strip(): pass elif not parameters.get('start' + str(index)).strip(): pass elif not 
parameters.get('end' + str(index)).strip(): pass elif parameters.get('command'): cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip()) cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip()) self.wfile.write(bytes(escape(cmd_txt), 'utf-8')) self.wfile.write(bytes('<br>', 'utf-8')) print(cmd_txt) self.wfile.write(bytes('</body></html>', 'utf-8')) elif '/shutdown' in self.path: send_header(self) self.wfile.write(bytes(str(self.client_address), 'utf-8')) self.wfile.write(bytes("Server will be shut down now......", 'utf-8')) server.shutdown() sys.exit() else: send_header(self) self.wfile.write(bytes(str(self.client_address), 'utf-8')) self.wfile.write(bytes("<br>", 'utf-8')) self.wfile.write(bytes(self.path, 'utf-8')) print(self.path)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import os
from datetime import datetime
import torch
import numpy as np
import config as cfg
import util.PointNetVlad as PNV
import pynvml
from dateutil import tz

pynvml.nvmlInit()
handle0 = pynvml.nvmlDeviceGetHandleByIndex(0)
if torch.cuda.device_count() > 1:
    handle1 = pynvml.nvmlDeviceGetHandleByIndex(1)
ratio = 1024 ** 2

def print_gpu(s=""):
    if torch.cuda.device_count() > 1:
        meminfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
        meminfo1 = pynvml.nvmlDeviceGetMemoryInfo(handle1)
        used = (meminfo0.used + meminfo1.used) / ratio
    else:
        meminfo0 = pynvml.nvmlDeviceGetMemoryInfo(handle0)
        used = meminfo0.used / ratio
    print(s + " used: ", used)

parser = argparse.ArgumentParser()
parser.add_argument('--results_dir', default='results/', help='results dir [default: results]')
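# Example use of print_gpu() above: bracket an allocation with before/after calls
# to see the device-memory delta in MiB. The tensor size is arbitrary and only
# serves as an illustration; it assumes a CUDA device is available.
if torch.cuda.is_available():
    print_gpu("before allocation")
    dummy = torch.zeros(1024, 1024, device='cuda')  # ~4 MiB of float32
    torch.cuda.synchronize()
    print_gpu("after allocation")
    del dummy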
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode('utf-8') # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} if nv_process.pid not in GPUStatCollection.global_processes: GPUStatCollection.global_processes[nv_process.pid] = \ psutil.Process(pid=nv_process.pid) ps_process = GPUStatCollection.global_processes[nv_process.pid] # TODO: ps_process is being cached, but the dict below is not. process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' process['full_command'] = ['?'] else: process['command'] = os.path.basename(_cmdline[0]) process['full_command'] = _cmdline # Bytes to MBytes # if drivers are not TTC this will be None. usedmem = (nv_process.usedGpuMemory // MB if nv_process.usedGpuMemory else None) process['gpu_memory_usage'] = usedmem # process['gpu_memory_usage'] = ("%d MiB" % usedmem if usedmem is not None else usedmem) process['cpu_percent'] = ps_process.cpu_percent() # process['cpu_memory_usage'] = "%d MiB" % ( # round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB) process['cpu_memory_usage'] = ( round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) // MB) process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU ) except N.NVMLError: temperature = None # Not supported try: fan_speed = N.nvmlDeviceGetFanSpeed(handle) except N.NVMLError: fan_speed = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle) except N.NVMLError: utilization_enc = None # Not supported try: utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle) except N.NVMLError: utilization_dec = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] # A single process might run in both of graphics and compute mode, # However we will display the process only once seen_pids = set() for nv_process in nv_comp_processes + nv_graphics_processes: if nv_process.pid in seen_pids: continue seen_pids.add(nv_process.pid) try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. 
nvidia-smi reset or reboot the system pass except FileNotFoundError: # Ignore the exception which probably has occured # from psutil, due to a non-existent PID (see #95). # The exception should have been translated, but # there appears to be a bug of psutil. It is unlikely # FileNotFoundError is thrown in different situations. pass # TODO: Do not block if full process info is not requested time.sleep(0.1) for process in processes: pid = process['pid'] cache_process = GPUStatCollection.global_processes[pid] try: process['cpu_percent'] = cache_process.cpu_percent() except psutil.NoSuchProcess: process['cpu_percent'] = 0.0 except FileNotFoundError: # Ignore the exception which probably has occured # from psutil, due to a non-existent PID (see #95). # The exception should have been translated, but # there appears to be a bug of psutil. It is unlikely # FileNotFoundError is thrown in different situations. process['cpu_percent'] = 0.0 pass index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'fan.speed': fan_speed, 'utilization.gpu': utilization.gpu if utilization else 0, 'utilization.enc': utilization_enc[0] if utilization_enc else None, 'utilization.dec': utilization_dec[0] if utilization_dec else None, 'power.draw': power // 1000 if power is not None else 0, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else 0, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else 0, 'memory.total': memory.total // MB if memory else 0, 'processes': processes, } GPUStatCollection.clean_processes() return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
def new_query(): """Query the information of all the GPUs on local machine""" N.nvmlInit() def _decode(b): if isinstance(b, bytes): return b.decode('utf-8') # for python3, to unicode return b def get_gpu_info(handle): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): """Get the process information of specific pid""" process = {} if nv_process.pid not in GPUStatCollection.global_processes: GPUStatCollection.global_processes[nv_process.pid] = \ psutil.Process(pid=nv_process.pid) ps_process = GPUStatCollection.global_processes[nv_process.pid] process['username'] = ps_process.username() # cmdline returns full path; # as in `ps -o comm`, get short cmdnames. _cmdline = ps_process.cmdline() if not _cmdline: # sometimes, zombie or unknown (e.g. [kworker/8:2H]) process['command'] = '?' process['full_command'] = ['?'] else: process['command'] = os.path.basename(_cmdline[0]) process['full_command'] = _cmdline # Bytes to MBytes # if drivers are not TTC this will be None. usedmem = nv_process.usedGpuMemory // MB if \ nv_process.usedGpuMemory else None process['gpu_memory_usage'] = usedmem process['cpu_percent'] = ps_process.cpu_percent() process['cpu_memory_usage'] = \ round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total) process['pid'] = nv_process.pid return process name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) try: temperature = N.nvmlDeviceGetTemperature( handle, N.NVML_TEMPERATURE_GPU) except N.NVMLError: temperature = None # Not supported try: fan_speed = N.nvmlDeviceGetFanSpeed(handle) except N.NVMLError: fan_speed = None # Not supported try: memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes except N.NVMLError: memory = None # Not supported try: utilization = N.nvmlDeviceGetUtilizationRates(handle) except N.NVMLError: utilization = None # Not supported try: power = N.nvmlDeviceGetPowerUsage(handle) except N.NVMLError: power = None try: power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle) except N.NVMLError: power_limit = None try: nv_comp_processes = \ N.nvmlDeviceGetComputeRunningProcesses(handle) except N.NVMLError: nv_comp_processes = None # Not supported try: nv_graphics_processes = \ N.nvmlDeviceGetGraphicsRunningProcesses(handle) except N.NVMLError: nv_graphics_processes = None # Not supported if nv_comp_processes is None and nv_graphics_processes is None: processes = None else: processes = [] nv_comp_processes = nv_comp_processes or [] nv_graphics_processes = nv_graphics_processes or [] for nv_process in nv_comp_processes + nv_graphics_processes: try: process = get_process_info(nv_process) processes.append(process) except psutil.NoSuchProcess: # TODO: add some reminder for NVML broken context # e.g. 
nvidia-smi reset or reboot the system pass # TODO: Do not block if full process info is not requested time.sleep(0.1) for process in processes: pid = process['pid'] cache_process = GPUStatCollection.global_processes[pid] process['cpu_percent'] = cache_process.cpu_percent() index = N.nvmlDeviceGetIndex(handle) gpu_info = { 'index': index, 'uuid': uuid, 'name': name, 'temperature.gpu': temperature, 'fan.speed': fan_speed, 'utilization.gpu': utilization.gpu if utilization else None, 'power.draw': power // 1000 if power is not None else None, 'enforced.power.limit': power_limit // 1000 if power_limit is not None else None, # Convert bytes into MBytes 'memory.used': memory.used // MB if memory else None, 'memory.total': memory.total // MB if memory else None, 'processes': processes, } GPUStatCollection.clean_processes() return gpu_info # 1. get the list of gpu and status gpu_list = [] device_count = N.nvmlDeviceGetCount() for index in range(device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(handle) gpu_stat = GPUStat(gpu_info) gpu_list.append(gpu_stat) # 2. additional info (driver version, etc). try: driver_version = _decode(N.nvmlSystemGetDriverVersion()) except N.NVMLError: driver_version = None # N/A N.nvmlShutdown() return GPUStatCollection(gpu_list, driver_version=driver_version)
def run(args): if not os.path.exists(args.model_save_dir): os.makedirs(args.model_save_dir) args.model_save_path = os.path.join(args.model_save_dir,\ f'{args.modelName}-{args.datasetName}-{args.train_mode}.pth') # indicate used gpu if len(args.gpu_ids) == 0 and torch.cuda.is_available(): # load free-most gpu pynvml.nvmlInit() dst_gpu_id, min_mem_used = 0, 1e16 for g_id in [0, 1, 2, 3]: handle = pynvml.nvmlDeviceGetHandleByIndex(g_id) meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) mem_used = meminfo.used if mem_used < min_mem_used: min_mem_used = mem_used dst_gpu_id = g_id print(f'Find gpu: {dst_gpu_id}, use memory: {min_mem_used}!') logger.info(f'Find gpu: {dst_gpu_id}, with memory: {min_mem_used} left!') args.gpu_ids.append(dst_gpu_id) # device using_cuda = len(args.gpu_ids) > 0 and torch.cuda.is_available() logger.info("Let's use %d GPUs!" % len(args.gpu_ids)) device = torch.device('cuda:%d' % int(args.gpu_ids[0]) if using_cuda else 'cpu') args.device = device # add tmp tensor to increase the temporary consumption of GPU tmp_tensor = torch.zeros((100, 100)).to(args.device) # load data and models dataloader = MMDataLoader(args) model = AMIO(args).to(device) del tmp_tensor def count_parameters(model): answer = 0 for p in model.parameters(): if p.requires_grad: answer += p.numel() # print(p) return answer logger.info(f'The model has {count_parameters(model)} trainable parameters') # exit() # using multiple gpus # if using_cuda and len(args.gpu_ids) > 1: # model = torch.nn.DataParallel(model, # device_ids=args.gpu_ids, # output_device=args.gpu_ids[0]) atio = ATIO().getTrain(args) # do train atio.do_train(model, dataloader) # load pretrained model assert os.path.exists(args.model_save_path) model.load_state_dict(torch.load(args.model_save_path)) model.to(device) # do test if args.is_tune: # using valid dataset to tune hyper parameters results = atio.do_test(model, dataloader['valid'], mode="VALID") else: results = atio.do_test(model, dataloader['test'], mode="TEST") del model torch.cuda.empty_cache() gc.collect() time.sleep(5) return results
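# The GPU selection in run() above probes a hard-coded list of ids [0, 1, 2, 3].
# This is a sketch of the same idea that asks NVML how many devices exist instead;
# the function name and return convention are assumptions, not part of run():
def find_freest_gpu():
    pynvml.nvmlInit()
    try:
        best_id, best_used = 0, float('inf')
        for g_id in range(pynvml.nvmlDeviceGetCount()):
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(
                pynvml.nvmlDeviceGetHandleByIndex(g_id))
            if meminfo.used < best_used:
                best_id, best_used = g_id, meminfo.used
        return best_id, best_used
    finally:
        pynvml.nvmlShutdown()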
def get_device_handles(): """Get a list of NVML device handles, one per device. Can throw NVMLError. """ return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(pynvml.nvmlDeviceGetCount())]
def stats(self): stats = {} for i in range(0, self.gpu_count): handle = pynvml.nvmlDeviceGetHandleByIndex(i) try: util = pynvml.nvmlDeviceGetUtilizationRates(handle) memory = pynvml.nvmlDeviceGetMemoryInfo(handle) temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) in_use_by_us = gpu_in_use_by_this_process(handle) stats["gpu.{}.{}".format(i, "gpu")] = util.gpu stats["gpu.{}.{}".format(i, "memory")] = util.memory stats["gpu.{}.{}".format( i, "memoryAllocated")] = (memory.used / float(memory.total)) * 100 stats["gpu.{}.{}".format(i, "temp")] = temp if in_use_by_us: stats["gpu.process.{}.{}".format(i, "gpu")] = util.gpu stats["gpu.process.{}.{}".format(i, "memory")] = util.memory stats["gpu.process.{}.{}".format( i, "memoryAllocated")] = (memory.used / float(memory.total)) * 100 stats["gpu.process.{}.{}".format(i, "temp")] = temp # Some GPUs don't provide information about power usage try: power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0 power_capacity_watts = pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0 power_usage = (power_watts / power_capacity_watts) * 100 stats["gpu.{}.{}".format(i, "powerWatts")] = power_watts stats["gpu.{}.{}".format(i, "powerPercent")] = power_usage if in_use_by_us: stats["gpu.process.{}.{}".format(i, "powerWatts")] = power_watts stats["gpu.process.{}.{}".format(i, "powerPercent")] = power_usage except pynvml.NVMLError as err: pass except pynvml.NVMLError as err: pass if psutil: net = psutil.net_io_counters() sysmem = psutil.virtual_memory() stats["cpu"] = psutil.cpu_percent() stats["memory"] = sysmem.percent stats["network"] = { "sent": net.bytes_sent - self.network_init["sent"], "recv": net.bytes_recv - self.network_init["recv"] } # TODO: maybe show other partitions, will likely need user to configure stats["disk"] = psutil.disk_usage('/').percent stats["proc.memory.availableMB"] = sysmem.available / 1048576.0 try: stats["proc.memory.rssMB"] = self.proc.memory_info().rss / \ 1048576.0 stats["proc.memory.percent"] = self.proc.memory_percent() stats["proc.cpu.threads"] = self.proc.num_threads() except psutil.NoSuchProcess: pass return stats
def check(self, instance): pynvml.nvmlInit() msg_list = [] try: deviceCount = pynvml.nvmlDeviceGetCount() except: deviceCount = 0 for device_id in xrange(deviceCount): handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) name = pynvml.nvmlDeviceGetName(handle) tags = dict(name="{}-{}".format(name, device_id)) d_tags = self._dict2list(tags) # temperature info try: temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) self.gauge('nvml.temp.', temp, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err)) # memory info try: mem = pynvml.nvmlDeviceGetMemoryInfo(handle) self.gauge('nvml.mem.total', mem.total, tags=d_tags) self.gauge('nvml.mem.used', mem.used, tags=d_tags) self.gauge('nvml.mem.free', mem.free, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err)) # utilization GPU/Memory info try: util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle) self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags) self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err)) # utilization Encoder info try: util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle) self.log.info('nvml.util.encoder %s' % long(util_encoder[0])) self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err)) # utilization Decoder info try: util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle) self.log.info('nvml.util.decoder %s' % long(util_decoder[0])) self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err)) # Compute running processes try: cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) for ps in cps: p_tags = tags.copy() p_tags['pid'] = ps.pid p_tags['name'] = psutil.Process(ps.pid).name() p_tags = self._dict2list(p_tags) self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags) except pynvml.NVMLError as err: msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err)) if msg_list: status = AgentCheck.CRITICAL msg = u','.join(msg_list) else: status = AgentCheck.OK msg = u'Ok' pynvml.nvmlShutdown() self.service_check('nvml.check', status, message=msg)
def _summary(self): summary = [] summary.append("GPU running Processes:") initGPU() try: gpusToUse = [int(n) for n in (self.gpusToUse.get()).split()] for i in gpusToUse: handle = nvmlDeviceGetHandleByIndex(i) cps = nvmlDeviceGetComputeRunningProcesses(handle) for ps in cps: # p_tags['pid'] = ps.pid msg = " %d) " % i + psutil.Process(ps.pid).name() msg += " (mem =%.2f MB)" % (float(ps.usedGpuMemory) / 1048576.) summary.append(msg) except NVMLError as err: summary.append(str(err)) return summary