def installProfTester(self):
    """Extract testing_dcgm.tar.gz and install dcgmproftester into /usr/bin.

    Returns the exit status of the last command executed (0 on success).
    """
    ret = util.executeBashCmd("tar xvf testing_dcgm.tar.gz", True)
    if ret[0] != 0:
        # Extraction failed: report the command result and bail out early.
        print(("ERROR: Something went wrong in extracting testing_dcgm.tar.gz??, command returned: \n", ret))
        return ret[0]
    # Copy the freshly extracted binary into the system path.
    ret = util.executeBashCmd("sudo cp _out/Linux_amd64_release/testing/apps/amd64/dcgmproftester /usr/bin/", True)
    return ret[0]
def removeBinaries(self, prnt):
    """Remove leftover artifacts and uninstall any existing dcgm packages.

    prnt: verbosity flag forwarded to util.executeBashCmd.
    Always returns 0; uninstall failures are only reported.
    """
    # Remove existing installation files and binaries from earlier runs.
    leftovers = (
        "testing_dcgm*",
        "datacenter-gpu-manager*.deb",
        "_out/",
        "/usr/bin/dcgmproftester",
        "*.txt",
        "*.csv",
        "*.pyc",
    )
    for pattern in leftovers:
        ret = util.executeBashCmd("sudo rm -rf " + pattern, prnt)
    # Uninstall dcgmi (NDA package first, then the public package).
    print("Removing existing installation of dcgmi")
    ret = util.executeBashCmd("sudo dpkg --purge datacenter-gpu-manager-dcp-nda-only", prnt)
    if ret[0] != 0:
        print(("Error: Couldnt purge existing installation of datacenter-gpu-manager-dcp-nda-on, ret: ", ret))
    else:
        print("\nSUCCESS: No error on uninstall")
    ret = util.executeBashCmd("sudo apt-get remove --purge datacenter-gpu-manager", prnt)
    if ret[0] != 0:
        print(("Error: Couldnt purge existing installation of datacenter-gpu-manager, ret: ", ret))
    else:
        print("\nSUCCESS: No error on uninstalling datacenter-gpu-manager")
    return 0
def startNvHostEngine(self):
    """Restart nv-hostengine from a clean state and list discovered GPUs.

    Returns the (status, output) of the final `dcgmi discovery -l` command.
    """
    steps = (
        ("\n&&&& INFO: Killing any existing nvhostengine instance", "sudo /usr/bin/nv-hostengine -t"),
        ("\n&&&& INFO: Stopping dcgm service ", "sudo service dcgm stop"),
        ("\n&&&& INFO: Starting nvhostengine", "sudo /usr/bin/nv-hostengine"),
        ("\n&&&& INFO: dcgmi discovery output", "sudo /usr/bin/dcgmi discovery -l"),
    )
    for banner, command in steps:
        print(banner)
        ret = util.executeBashCmd(command, True)
    return ret
def installDcgm(self):
    """Install the downloaded datacenter-gpu-manager .deb package.

    Returns the dpkg exit status (0 on success).
    """
    print("\n&&&& INFO: Installing latest version of datacenter-gpu-manager-dcp-nda-on")
    ret = util.executeBashCmd("sudo dpkg -i datacenter-gpu-manager-dcp-nda-only_1.6.4_amd64.deb", True)
    if ret[0] == 0:
        print("\nSUCCESS: Installed datacenter-gpu-manager-dcp-nda-on successfully")
    else:
        print(("ERROR: Couldnt install dcgmi, ret: ", ret))
    return ret[0]
def getLatestDcgm(self):
    """Return (testing-tarball URL, .deb URL) for the newest daily dcgm build.

    Scrapes the daily build index with wget/grep to find the most recent
    date-stamped folder, then builds the two download URLs from it.

    Returns:
        tuple(str, str): (dcgm_url, deb_url). If the index could not be
        read, a warning is printed and the URLs are built from an empty
        folder name (callers see the download fail downstream).
    """
    print("Getting the URL for latest dcgm package\n")
    baseurl = "http://cqa-fs01/dvsshare/dcgm/daily/r418_00/"
    cmd = 'wget -q -O - http://cqa-fs01/dvsshare/dcgm/daily/r418_00/ | grep -Eo \\2019[0-9]{8} | tail -1'
    ret, folder_name = util.executeBashCmd(cmd, True)
    # BUG FIX: the original tested `"$folder_name" == ""` — a shell-ism that
    # compares a non-empty literal to "" and is therefore always False, so
    # the warning below could never fire. Test the variable itself.
    if folder_name == "":
        print("Package index not found. Maybe the server is down?")
    dcgm_url = baseurl + folder_name + '/testing_dcgm/x86_64/testing_dcgm.tar.gz'
    deb_url = baseurl + folder_name + '/DEBS/datacenter-gpu-manager-dcp-nda-only_1.6.4_amd64.deb'
    return dcgm_url, deb_url
def main(cmdArgs):
    """Set up dependencies, run run_validate_dcgm.py, then remove them.

    cmdArgs: parsed argparse namespace with metrics, time, gpuid_list
    and download_bin attributes.
    """
    metrics = cmdArgs.metrics
    time = cmdArgs.time
    gpuid_list = cmdArgs.gpuid_list
    download_bin = cmdArgs.download_bin

    ret = util.removeDependencies(True)
    if ret == 0:
        ret = util.installDependencies(True)
    if ret == 0:
        # Pick the command template; -d tells the child script to
        # download fresh binaries first.
        if download_bin:
            template = '{executable} run_validate_dcgm.py -m {0} -t {1} -d -i {2}'
        else:
            template = '{executable} run_validate_dcgm.py -m {0} -t {1} -i {2}'
        cmd = template.format(metrics, time, gpuid_list, executable=sys.executable)
        ret = util.executeBashCmd(cmd, True)
    print("\nTests are done, removing dependencies")
    ret = util.removeDependencies(False)
    print("\n All Done")
def main(cmdArgs):
    """End-to-end metric validation driver.

    Optionally refreshes the dcgm binaries, restarts nv-hostengine, then
    runs dcgmproftester (one worker process per GPU) concurrently with a
    dcgmi dmon sampler, parses both logs to CSV, and compares them per GPU
    (&&&& PASSED/FAILED markers). Finally checks GPU memory is back to its
    starting level and tears everything down.

    cmdArgs: parsed argparse namespace with metrics, time, gpuid_list and
    download_bin attributes.
    """
    metrics = int(cmdArgs.metrics)
    gpuid_list = cmdArgs.gpuid_list
    time = int(cmdArgs.time)
    download_bin = cmdArgs.download_bin
    print(("Download_binaries: ", download_bin))
    # dcgmproftester needs a minimum runtime to produce usable samples.
    if time < int(10):
        print('Modifying the time to 10s which is minimum\n')
        time = 10
    print(cmdArgs)
    ro = RunValidateDcgm()
    if download_bin:
        # Remove existing installation of dcgmi and dcgmproftester.
        ret = ro.removeBinaries(True)
        # Download latest installers.
        if ret == 0:
            dcgm_url, deb_url = ro.getLatestDcgm()
            ret = ro.downloadInstallers(dcgm_url, deb_url)
        else:
            print("ERROR: Some problem with removing binaries\n")
            print(ret)
        # Install latest dcgm.
        if ret == 0:
            ret = ro.installDcgm()
        # Install latest dcgmproftester.
        if ret == 0:
            ret = ro.installProfTester()
        else:
            print("Something went wrong installing dcgmproftester\n")
    ret = ro.startNvHostEngine()
    print("\nSleeping for 2 seconds")
    tm.sleep(2)
    gpu_list = gpuid_list.split(",")
    ro.gpuCount = len(gpu_list)

    # Spawn a dcgmi process once so the profiling module gets loaded
    # before timing-sensitive measurements begin.
    print("Start : %s" % tm.ctime())
    tm.sleep(2)
    dcgm_time = int(time) + 4
    dcgm_thread_load_profiling_module = Process(target=ro._runDcgmLoadProfilingModule,
                                                name="dcgm_worker-%d" % metrics)
    dcgm_thread_load_profiling_module.start()
    # Wait for the warm-up process to finish.
    dcgm_thread_load_profiling_module.join()

    smi_in_beg = ro.getSmiOp()
    mem_in_beg = ro.getMemUsage(smi_in_beg, gpu_list)

    # Spawn dcgmproftester workers, one per GPU under test.
    for i in range(0, len(gpu_list)):
        threadName = 'dcgmproftester_worker-' + str(gpu_list[i])
        print("\n&&&& RUNNING GPU_" + str(gpu_list[i]) + "_metric_validation_test")
        ro.prot_thread_gpu.append(Process(target=ro._runProftester,
                                          args=[gpu_list[i], metrics, time],
                                          name=threadName))
        ro.prot_thread_gpu[i].start()

    # Spawn the dcgmi sampler; it runs slightly longer than the workload.
    print("Start : %s" % tm.ctime())
    tm.sleep(2)
    dcgm_time = int(time) + 4
    dcgm_thread = Process(target=ro._runDcgm, args=[metrics, gpuid_list, dcgm_time],
                          name="dcgm_worker-%s" % metrics)
    dcgm_thread.start()
    # Sample memory usage mid-run for the leak check below.
    tm.sleep(time / 2)
    smi_while_running = ro.getSmiOp()
    mem_in_between = ro.getMemUsage(smi_while_running, gpu_list)

    # Wait for the sampler and all workers to finish.
    dcgm_thread.join()
    for i in range(0, len(gpu_list)):
        ro.prot_thread_gpu[i].join()

    # Convert the dcgm log into a CSV file.
    # BUG FIX: the original command was '{executable} python2 parse_...',
    # which ran "<python> python2 parse_..." — the stray "python2" token
    # made the command invalid (and inconsistent with the proftester
    # parse command below). Removed it.
    cmd = '{executable} parse_dcgm_single_metric.py -f dcgmLogs_{0}.txt -m {1} -i {2}'.format(
        metrics, metrics, gpuid_list, executable=sys.executable)
    ret = util.executeBashCmd(cmd, True)

    # Convert the dcgmproftester logs into CSVs (utilization metrics only).
    if metrics in ro.metrics_util_list:
        for i in range(0, len(gpu_list)):
            cmd = '{executable} parse_dcgmproftester_single_metric.py -f dcgmLogsproftester_{0}_gpu{1}.txt -m {2} -i {3}'.format(
                metrics, gpu_list[i], metrics, gpu_list[i], executable=sys.executable)
            ret = util.executeBashCmd(cmd, True)

    # Compare dcgm vs dcgmproftester per GPU and record pass/fail.
    for i in range(0, len(gpu_list)):
        dcgm_file = 'dcgm_{0}.csv'.format(metrics)
        dcgmproftester_file = 'dcgmProfTester_{0}_gpu{1}.csv'.format(metrics, gpu_list[i])
        ret = ro.validateAccuracy(dcgm_file, dcgmproftester_file, int(gpu_list[i]), metrics)
        if ret == 0:
            print("\n&&&& PASSED GPU_" + str(gpu_list[i]) + "_metric_validation_test")
            ro.results[gpu_list[i]] = 'PASS'
        else:
            print("\n&&&& FAILED GPU_" + str(gpu_list[i]) + "_metric_validation_test")
            ro.results[gpu_list[i]] = 'FAIL'
    print("\n")

    # Memory-leak check: usage at the end must match the start and stay
    # under the 156 MiB threshold (value strips a 3-char unit suffix).
    smi_at_end = ro.getSmiOp()
    mem_in_end = ro.getMemUsage(smi_at_end, gpu_list)
    print("\nMemory in Beg of test run of all GPU's under test: " + str(mem_in_beg))
    print("Memory in Between of test run of all GPU's under test: " + str(mem_in_between))
    print("Memory in end of test run of all GPU's under test: " + str(mem_in_end) + "\n")
    for i in range(0, len(gpu_list)):
        print("\n&&&& RUNNING GPU_" + str(gpu_list[i]) + "_memory_validation_test")
        val = int(mem_in_end[i][0:len(mem_in_end[i]) - 3])
        if ((mem_in_beg[i] != mem_in_end[i]) or val > 156):
            print("\n&&&& FAILED GPU_" + str(gpu_list[i]) + "_memory_validation_test")
        else:
            print("\n&&&& PASSED GPU_" + str(gpu_list[i]) + "_memory_validation_test")

    if download_bin:
        ret = ro.removeBinaries(False)
    # BUG FIX: the original read `ret = ro.killNvHostEngine` — a bare
    # method reference with no parentheses, so the host engine was never
    # actually shut down. Call it.
    ret = ro.killNvHostEngine()
def _runProftester(self, gpuIndex, metric, time):
    """Worker: run dcgmproftester for one GPU, teeing output to a log file.

    gpuIndex: GPU to exercise (also pins CUDA_VISIBLE_DEVICES).
    metric:   field id passed to dcgmproftester -t.
    time:     workload duration in seconds (-d).
    """
    metric_id = str(metric)
    print("\n&&&& INFO: Running dcgmproftester to collect metrics on gpu {0}".format(gpuIndex))
    # Restrict the CUDA workload to the GPU under test.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpuIndex)
    command = ("echo {0} | /usr/bin/dcgmproftester -d {0} -t {1} 2>&1 | "
               "tee dcgmLogsproftester_{2}_gpu{3}.txt").format(time, metric_id, metric_id, gpuIndex)
    util.executeBashCmd(command, False)
def _runDcgm(self, metrics, gpuid_list, time):
    """Worker: sample field `metrics` via `dcgmi dmon` for `time` seconds.

    gpuid_list: comma-separated GPU ids passed to -i.
    Output is teed to dcgmLogs_<metrics>.txt for later parsing.
    """
    print("\n&&&& INFO: Running dcgm to collect metrics on {0}".format(metrics))
    command = ("echo {0} | timeout {0}s /usr/bin/dcgmi dmon -e {1} -i {2} 2>&1 | "
               "tee dcgmLogs_{3}.txt").format(time, metrics, gpuid_list, metrics)
    util.executeBashCmd(command, False)
def _runDcgmLoadProfilingModule(self):
    """Warm-up: run a short dcgmi dmon so the profiling module gets loaded."""
    print("\n&&&& INFO: Running dcgm to just to load profiling module once.")
    util.executeBashCmd("timeout 3s /usr/bin/dcgmi dmon -e 1001 -i 0", False)
def killNvHostEngine(self):
    """Terminate nv-hostengine and stop the dcgm service."""
    shutdown_steps = (
        ("\n&&&& INFO: Killing any existing nvhostengine instance", "sudo /usr/bin/nv-hostengine -t"),
        ("\n&&&& INFO: Stopping dcgm service ", "sudo service dcgm stop"),
    )
    for banner, command in shutdown_steps:
        print(banner)
        util.executeBashCmd(command, True)