def _read_remote_float(node_name, script_name):
    """Run a script from CLUSTER_DIR/scripts on a node and parse its output.

    Args:
        node_name: host name appended to the SSH command.
        script_name: file name of the script inside ``CLUSTER_DIR/scripts``.

    Returns:
        The script's standard output converted with ``float()``.

    Raises:
        ValueError: if the output is not a number (e.g. empty because the
            SSH command failed).
        OSError: if the SSH executable cannot be started.
    """
    cmd = SSH_CMD.split(' ') + [
        node_name,
        PYTHON_CMD,
        '%s/scripts/%s' % (CLUSTER_DIR, script_name),
    ]
    # communicate() waits for the process to exit and returns (stdout, stderr);
    # only stdout is piped, so stderr still goes to the terminal.
    stdout = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
    return float(stdout)


# NOTE(review): this file contains more than one definition of is_node_free;
# the last one executed at import time is the one callers get.
def is_node_free(node_name):
    """Report whether a node's CPU and memory usage are below the free limits.

    Queries ``get_cpu_usage.py`` and ``get_mem_usage.py`` on the node over
    SSH. Any failure while querying (SSH error, unparsable output, ...) is
    printed and the node is reported as not free.

    Args:
        node_name: host name of the node to probe.

    Returns:
        True if the scaled CPU usage is below CPU_FREE_TOL and the memory
        usage is below RAM_FREE_TOL; False otherwise or on any query error.
    """
    print(node_name)
    try:
        # The CPU figure is scaled by 1/100 before comparison (presumably the
        # script prints a percentage -- TODO confirm); the memory figure is
        # compared as reported.
        cpu_usage = _read_remote_float(node_name, 'get_cpu_usage.py') / 100.0
        mem_usage = _read_remote_float(node_name, 'get_mem_usage.py')
    except Exception as e:
        # Deliberately broad: an unreachable or misbehaving node must never
        # abort the caller, it is simply treated as busy.
        print(e)
        return False
    # Single formatted string so the output is the same on Python 2 and 3.
    print('%s %s' % (cpu_usage, mem_usage))
    return cpu_usage < CPU_FREE_TOL and mem_usage < RAM_FREE_TOL
def _gpu_memory_usage(gpu):
    """Extract ``(used, total)`` memory figures from one ``gpu`` XML element.

    Looks for a ``memory_usage`` child first and falls back to
    ``fb_memory_usage``. The leading whitespace-separated token of the
    ``used`` and ``total`` texts is parsed as an integer.

    Returns:
        A ``(used, total)`` tuple of ints, or None when the element is
        missing, a value is absent or non-numeric, or total is not positive.
    """
    mem = gpu.find('memory_usage')
    if mem is None:
        mem = gpu.find('fb_memory_usage')
    if mem is None:
        return None
    try:
        total = int(mem.findall('total')[0].text.split()[0])
        used = int(mem.findall('used')[0].text.split()[0])
    except (AttributeError, IndexError, ValueError):
        # Missing child, empty text, or a non-numeric value.
        return None
    if total <= 0:
        # Guards the used/total division in the caller.
        return None
    return used, total


def get_free_gpus(node_name):
    """Return the indices of the GPUs on a node whose memory is mostly unused.

    Based off Awni's runAll.py.

    Runs ``nvidia-smi -q -x`` on the node through SSH_CMD, parses the XML
    report and keeps every ``gpu`` element whose used/total memory ratio is
    below GPU_FREE_TOL.

    Args:
        node_name: host name appended to the SSH command.

    Returns:
        List of zero-based positions (in report order) of the free GPUs.
        An empty list is returned when the command produced no output, the
        output is not valid XML, or the memory usage of any GPU cannot be
        read.
    """
    cmd = SSH_CMD.split(' ') + [node_name, 'nvidia-smi', '-q', '-x']
    output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
    if not output:
        print('No output for %s' % node_name)
        return []

    report = output.strip()
    try:
        tree = et.fromstring(report)
    except (xml.parsers.expat.ExpatError, SyntaxError):
        # ElementTree reports malformed XML as ParseError, which derives from
        # SyntaxError rather than ExpatError, so both are caught here.
        print('%s %s' % ('Invalid XML: ', report))
        return []

    gpus = tree.findall('gpu')
    print('Detected %d gpus on %s' % (len(gpus), node_name))

    free_gpus = []
    for i, gpu in enumerate(gpus):
        usage = _gpu_memory_usage(gpu)
        if usage is None:
            # One unreadable GPU makes the whole node count as having no
            # free GPUs.
            print('Couldn\'t get memory usage on %s' % node_name)
            return []
        used, tot = usage
        print('%s / %s' % (used, tot))
        if float(used) / tot < GPU_FREE_TOL:
            free_gpus.append(i)
    return free_gpus
def _read_remote_float(node_name, script_name):
    """Run a script from CLUSTER_DIR/scripts on a node and parse its output.

    Args:
        node_name: host name appended to the SSH command.
        script_name: file name of the script inside ``CLUSTER_DIR/scripts``.

    Returns:
        The script's standard output converted with ``float()``.

    Raises:
        ValueError: if the output is not a number (e.g. empty because the
            SSH command failed).
        OSError: if the SSH executable cannot be started.
    """
    cmd = SSH_CMD.split(' ') + [
        node_name,
        PYTHON_CMD,
        '%s/scripts/%s' % (CLUSTER_DIR, script_name),
    ]
    # communicate() waits for the process to exit and returns (stdout, stderr);
    # only stdout is piped, so stderr still goes to the terminal.
    stdout = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0]
    return float(stdout)


# NOTE(review): this file contains more than one definition of is_node_free;
# the last one executed at import time is the one callers get.
def is_node_free(node_name):
    """Report whether a node's CPU and memory usage are below the free limits.

    Queries ``get_cpu_usage.py`` and ``get_mem_usage.py`` on the node over
    SSH. Any failure while querying (SSH error, unparsable output, ...) is
    printed and the node is reported as not free.

    Args:
        node_name: host name of the node to probe.

    Returns:
        True if the scaled CPU usage is below CPU_FREE_TOL and the memory
        usage is below RAM_FREE_TOL; False otherwise or on any query error.
    """
    print(node_name)
    try:
        # The CPU figure is scaled by 1/100 before comparison (presumably the
        # script prints a percentage -- TODO confirm); the memory figure is
        # compared as reported.
        cpu_usage = _read_remote_float(node_name, 'get_cpu_usage.py') / 100.0
        mem_usage = _read_remote_float(node_name, 'get_mem_usage.py')
    except Exception as e:
        # Deliberately broad: an unreachable or misbehaving node must never
        # abort the caller, it is simply treated as busy.
        print(e)
        return False
    # Single formatted string so the output is the same on Python 2 and 3.
    print('%s %s' % (cpu_usage, mem_usage))
    return cpu_usage < CPU_FREE_TOL and mem_usage < RAM_FREE_TOL