示例#1
0
 def installProfTester(self):
     """Unpack the testing archive and install dcgmproftester into /usr/bin.

     Extracts ``testing_dcgm.tar.gz`` from the current working directory
     and, only if that succeeds, copies the dcgmproftester binary out of
     the extracted ``_out`` tree with sudo.

     Returns:
         Element 0 of the result of the last command that ran (callers
         compare it against 0 for success): the ``tar`` result when the
         extraction fails, otherwise the ``cp`` result.
     """
     ret = util.executeBashCmd("tar xvf testing_dcgm.tar.gz", True)
     if ret[0] != 0:
         # Adjacent literals are used instead of a backslash continuation
         # inside the string, which would embed the source indentation in
         # the printed message.
         print("ERROR: Something went wrong in extracting testing_dcgm.tar.gz, "
               "command returned: \n{0}".format(ret))
         return ret[0]

     ret = util.executeBashCmd(
         "sudo cp _out/Linux_amd64_release/testing/apps/amd64/dcgmproftester /usr/bin/",
         True)
     if ret[0] != 0:
         # Report a failed copy instead of returning the status silently.
         print("ERROR: Could not copy dcgmproftester to /usr/bin/, "
               "command returned: \n{0}".format(ret))
     return ret[0]
示例#2
0
    def removeBinaries(self, prnt):
        """Best-effort cleanup of downloaded artifacts and installed DCGM packages.

        Deletes the downloaded archives, the extracted ``_out`` tree, the
        installed dcgmproftester binary and generated ``*.txt`` / ``*.csv`` /
        ``*.pyc`` files in the current directory, then purges the
        ``datacenter-gpu-manager-dcp-nda-only`` and ``datacenter-gpu-manager``
        packages.

        Args:
            prnt: Forwarded unchanged as the second argument of every
                ``util.executeBashCmd`` call (presumably toggles echoing of
                the command output -- confirm against util).

        Returns:
            Always 0. Uninstall failures are reported on stdout but are not
            propagated to the caller.
        """
        # Remove existing installation files and binaries. The results are
        # deliberately ignored: "rm -rf" on a missing path is not an error.
        cleanup_targets = (
            "testing_dcgm*",
            "datacenter-gpu-manager*.deb",
            "_out/",
            "/usr/bin/dcgmproftester",
            "*.txt",
            "*.csv",
            "*.pyc",
        )
        for target in cleanup_targets:
            util.executeBashCmd("sudo rm -rf " + target, prnt)

        # Uninstall dcgmi
        print("Removing existing installation of dcgmi")
        uninstall_cmd = "sudo dpkg --purge datacenter-gpu-manager-dcp-nda-only"
        ret = util.executeBashCmd(uninstall_cmd, prnt)
        if ret[0] != 0:
            print("Error: Couldnt purge existing installation of "
                  "datacenter-gpu-manager-dcp-nda-only, ret: {0}".format(ret))
        else:
            print("\nSUCCESS: No error on uninstall")

        uninstall_cmd = "sudo apt-get remove --purge datacenter-gpu-manager"
        ret = util.executeBashCmd(uninstall_cmd, prnt)
        if ret[0] != 0:
            print("Error: Couldnt purge existing installation of "
                  "datacenter-gpu-manager, ret: {0}".format(ret))
        else:
            print("\nSUCCESS: No error on uninstalling datacenter-gpu-manager")

        return 0
示例#3
0
    def startNvHostEngine(self):
        """Restart nv-hostengine and print the dcgmi discovery listing.

        Runs four shell commands in order, each preceded by an INFO banner:
        ``nv-hostengine -t``, ``service dcgm stop``, ``nv-hostengine`` and
        ``dcgmi discovery -l``.

        Returns:
            Whatever ``util.executeBashCmd`` returned for the final
            ``dcgmi discovery -l`` command.
        """
        # (banner, shell command) pairs, executed strictly in this order.
        steps = (
            ("\n&&&& INFO: Killing any existing nvhostengine instance",
             "sudo /usr/bin/nv-hostengine -t"),
            ("\n&&&& INFO: Stopping dcgm service ",
             "sudo service dcgm stop"),
            ("\n&&&& INFO: Starting nvhostengine",
             "sudo /usr/bin/nv-hostengine"),
            ("\n&&&& INFO: dcgmi discovery output",
             "sudo /usr/bin/dcgmi discovery -l"),
        )

        outcome = None
        for banner, command in steps:
            print(banner)
            outcome = util.executeBashCmd(command, True)

        return outcome
示例#4
0
    def installDcgm(self):
        """Install the downloaded datacenter-gpu-manager-dcp-nda-only package.

        Runs ``sudo dpkg -i`` on the 1.6.4 amd64 ``.deb`` expected in the
        current working directory and reports the outcome on stdout.

        Returns:
            Element 0 of the ``util.executeBashCmd`` result; the caller
            treats 0 as success.
        """
        print("\n&&&& INFO: Installing latest version of datacenter-gpu-manager-dcp-nda-only")
        ret = util.executeBashCmd(
            "sudo dpkg -i datacenter-gpu-manager-dcp-nda-only_1.6.4_amd64.deb", True)
        if ret[0] != 0:
            # Format the result into the message rather than printing a
            # (message, result) tuple, which shows up as a tuple repr.
            print("ERROR: Couldnt install dcgmi, ret: {0}".format(ret))
        else:
            print("\nSUCCESS: Installed datacenter-gpu-manager-dcp-nda-only successfully")

        return ret[0]
示例#5
0
 def getLatestDcgm(self):
     """Build the download URLs for the most recent daily DCGM build.

     Fetches the directory index at the r418_00 daily-build URL, extracts
     every token of the form ``2019`` followed by eight digits, and takes
     the last one as the name of the newest build folder.

     Returns:
         A ``(dcgm_url, deb_url)`` tuple: the URL of ``testing_dcgm.tar.gz``
         and the URL of the 1.6.4 amd64 ``.deb`` inside that folder. If no
         folder name could be determined a warning is printed and the URLs
         are still returned, built with an empty folder component.
     """
     print("Getting the URL for latest dcgm package\n")
     baseurl = "http://cqa-fs01/dvsshare/dcgm/daily/r418_00/"
     # The pattern is single-quoted so the shell passes it to grep verbatim.
     # NOTE(review): the "2019" prefix only matches builds from that year.
     cmd = "wget -q -O - " + baseurl + " | grep -Eo '2019[0-9]{8}' | tail -1"
     _, folder_name = util.executeBashCmd(cmd, True)

     # Drop surrounding whitespace (e.g. a trailing newline from the
     # pipeline) so it cannot end up in the middle of the URLs.
     folder_name = (folder_name or "").strip()
     if not folder_name:
         # The original compared the literal string "$folder_name" with "",
         # which is never true, so this warning could never be printed.
         print("Package index not found. Maybe the server is down?")

     dcgm_url = baseurl + folder_name + '/testing_dcgm/x86_64/testing_dcgm.tar.gz'
     deb_url = baseurl + folder_name + '/DEBS/datacenter-gpu-manager-dcp-nda-only_1.6.4_amd64.deb'
     return dcgm_url, deb_url
示例#6
0
def main(cmdArgs):
    """Prepare dependencies, launch run_validate_dcgm.py, then clean up.

    Args:
        cmdArgs: Object exposing ``metrics``, ``time``, ``gpuid_list`` and
            ``download_bin`` attributes; the first three are substituted
            into the child command line, and a truthy ``download_bin``
            adds the ``-d`` flag.
    """
    # Read every option up front so a missing attribute fails before any
    # dependency is touched.
    metric_arg = cmdArgs.metrics
    duration_arg = cmdArgs.time
    gpu_ids_arg = cmdArgs.gpuid_list
    want_download = cmdArgs.download_bin

    status = util.removeDependencies(True)
    if status == 0:
        status = util.installDependencies(True)

    if status == 0:
        # "-d " is spliced in just before "-i" only when a download was
        # requested; otherwise the slot is empty.
        download_flag = '-d ' if want_download else ''
        template = '{executable} run_validate_dcgm.py -m {0} -t {1} {3}-i {2}'
        command = template.format(metric_arg, duration_arg, gpu_ids_arg,
                                  download_flag, executable=sys.executable)
        util.executeBashCmd(command, True)

    print("\nTests are done, removing dependencies")

    util.removeDependencies(False)

    print("\n All Done")
0
def main(cmdArgs):
    """Run the DCGM metric- and memory-validation test on the requested GPUs.

    Steps, in order:
      1. Optionally remove, download and install DCGM and dcgmproftester.
      2. Restart nv-hostengine and run a short dcgmi process once to load
         the profiling module.
      3. Run one dcgmproftester process per GPU alongside one dcgmi
         process, sampling memory usage before, during and after the run.
      4. Parse both logs to CSV, compare them per GPU and print
         PASSED/FAILED markers for metric and memory validation.
      5. Optionally remove the binaries again and stop nv-hostengine.

    Args:
        cmdArgs: Object exposing ``metrics`` and ``time`` (both converted
            with ``int``), ``gpuid_list`` (comma-separated GPU ids) and
            ``download_bin`` (truthy to reinstall DCGM first).
    """
    metrics = int(cmdArgs.metrics)
    gpuid_list = cmdArgs.gpuid_list
    time = int(cmdArgs.time)
    download_bin = cmdArgs.download_bin
    print("Download_binaries: {0}".format(download_bin))
    if time < 10:
        print('Modifying the time to 10s which is minimum\n')
        time = 10
    print(cmdArgs)
    ro = RunValidateDcgm()

    if download_bin:
        # Remove existing installation of dcgmi and dcgmproftester
        ret = ro.removeBinaries(True)

        # Download latest installers
        if ret == 0:
            dcgm_url, deb_url = ro.getLatestDcgm()
            ret = ro.downloadInstallers(dcgm_url, deb_url)
        else:
            print("ERROR: Some problem with removing binaries\n")
            print(ret)

        # Install latest dcgm
        if ret == 0:
            ret = ro.installDcgm()

        # Install latest dcgmproftester; the else branch is reached when
        # any earlier step in this section failed.
        if ret == 0:
            ret = ro.installProfTester()
        else:
            print("Something went wrong installing dcgmproftester\n")

    # The engine is started regardless of the install outcome.
    ro.startNvHostEngine()

    print("\nSleeping for 2 seconds")
    tm.sleep(2)

    gpu_list = gpuid_list.split(",")
    ro.gpuCount = len(gpu_list)

    # Spawn a dcgmi process to load the profiling module once and wait for
    # it to finish before taking the baseline memory reading.
    print("Start : %s" % tm.ctime())
    tm.sleep(2)
    profiling_loader = Process(target=ro._runDcgmLoadProfilingModule,
                               name="dcgm_worker-%d" % metrics)
    profiling_loader.start()
    profiling_loader.join()

    smi_in_beg = ro.getSmiOp()
    mem_in_beg = ro.getMemUsage(smi_in_beg, gpu_list)

    # Spawn dcgmproftester processes, one for every GPU
    for i, gpu in enumerate(gpu_list):
        threadName = 'dcgmproftester_worker-' + str(gpu)
        print("\n&&&& RUNNING GPU_" + str(gpu) + "_metric_validation_test")
        ro.prot_thread_gpu.append(Process(target=ro._runProftester,
                                          args=[gpu, metrics, time],
                                          name=threadName))
        ro.prot_thread_gpu[i].start()

    # Spawn the dcgmi process; it gets 4 extra seconds on top of the
    # dcgmproftester run time.
    print("Start : %s" % tm.ctime())
    tm.sleep(2)
    dcgm_time = int(time) + 4
    dcgm_thread = Process(target=ro._runDcgm, args=[metrics, gpuid_list, dcgm_time],
                          name="dcgm_worker-%s" % metrics)
    dcgm_thread.start()

    # Sample memory usage roughly half-way through the run.
    tm.sleep(time/2)

    smi_while_running = ro.getSmiOp()
    mem_in_between = ro.getMemUsage(smi_while_running, gpu_list)

    # Wait for the dcgmi process and every dcgmproftester process to finish
    dcgm_thread.join()

    for i in range(len(gpu_list)):
        ro.prot_thread_gpu[i].join()

    # Copy the dcgm data into a csv file. The parser is run directly with
    # the current interpreter; a stray "python2" token after {executable}
    # used to make the interpreter look for a script named "python2".
    cmd = '{executable} parse_dcgm_single_metric.py -f dcgmLogs_{0}.txt -m {1} -i {2}'.format(
        metrics, metrics, gpuid_list, executable=sys.executable)
    util.executeBashCmd(cmd, True)

    # Copy the dcgmproftester data into csv files, one per GPU
    if metrics in ro.metrics_util_list:
        for gpu in gpu_list:
            cmd = ('{executable} parse_dcgmproftester_single_metric.py -f '
                   'dcgmLogsproftester_{0}_gpu{1}.txt -m {2} -i {3}').format(
                       metrics, gpu, metrics, gpu, executable=sys.executable)
            util.executeBashCmd(cmd, True)

    # Compare the results and determine pass and fail
    for gpu in gpu_list:
        dcgm_file = 'dcgm_{0}.csv'.format(metrics)
        dcgmproftester_file = 'dcgmProfTester_{0}_gpu{1}.csv'.format(metrics, gpu)
        ret = ro.validateAccuracy(dcgm_file, dcgmproftester_file, int(gpu), metrics)
        if ret == 0:
            print("\n&&&& PASSED GPU_" + str(gpu) + "_metric_validation_test")
            ro.results[gpu] = 'PASS'
        else:
            print("\n&&&& FAILED GPU_" + str(gpu) + "_metric_validation_test")
            ro.results[gpu] = 'FAIL'

    print("\n")

    smi_at_end = ro.getSmiOp()
    mem_in_end = ro.getMemUsage(smi_at_end, gpu_list)

    print("\nMemory in Beg of test run of all GPU's under test: " + str(mem_in_beg))
    print("Memory in Between of test run of all GPU's under test: " + str(mem_in_between))
    print("Memory in end of test run of all GPU's under test: " + str(mem_in_end) + "\n")

    for i, gpu in enumerate(gpu_list):
        print("\n&&&& RUNNING GPU_" + str(gpu) + "_memory_validation_test")
        # Drop the last three characters before converting to int
        # (presumably a unit suffix such as "MiB" -- confirm against
        # getMemUsage).
        val = int(mem_in_end[i][0:len(mem_in_end[i])-3])
        # Fail when usage changed across the run or ends above 156
        # (NOTE(review): threshold unit/meaning not visible here).
        if mem_in_beg[i] != mem_in_end[i] or val > 156:
            print("\n&&&& FAILED GPU_" + str(gpu) + "_memory_validation_test")
        else:
            print("\n&&&& PASSED GPU_" + str(gpu) + "_memory_validation_test")

    if download_bin:
        ro.removeBinaries(False)

    # Actually call the method; the bare attribute reference used before
    # never stopped the host engine.
    ro.killNvHostEngine()
示例#8
0
 def _runProftester(self, gpuIndex, metric, time):
     """Run dcgmproftester for one GPU and tee its output to a log file.

     Sets ``CUDA_VISIBLE_DEVICES`` to the given GPU index in this process's
     environment before launching the command.

     Args:
         gpuIndex: GPU id; used for CUDA_VISIBLE_DEVICES and the log name.
         metric: Metric id; passed to ``-t`` and used in the log name.
         time: Value echoed into the pipeline and passed to ``-d``.
     """
     metric_id = str(metric)
     banner = "\n&&&& INFO: Running dcgmproftester to collect metrics on gpu {0}"
     print(banner.format(gpuIndex))

     os.environ['CUDA_VISIBLE_DEVICES'] = str(gpuIndex)

     template = "echo {0} | /usr/bin/dcgmproftester -d {0} -t {1} 2>&1 | tee dcgmLogsproftester_{2}_gpu{3}.txt"
     shell_cmd = template.format(time, metric_id, metric_id, gpuIndex)
     util.executeBashCmd(shell_cmd, False)
示例#9
0
 def _runDcgm(self, metrics, gpuid_list, time):
     """Run ``dcgmi dmon`` under ``timeout`` and tee its output to a log file.

     Args:
         metrics: Field id passed to ``-e`` and used in the log file name.
         gpuid_list: GPU id list passed to ``-i``.
         time: Timeout in seconds for the dcgmi command (also echoed into
             the pipeline).
     """
     banner = "\n&&&& INFO: Running dcgm to collect metrics on {0}"
     print(banner.format(metrics))

     template = "echo {0} | timeout {0}s /usr/bin/dcgmi dmon -e {1} -i {2} 2>&1 | tee dcgmLogs_{3}.txt"
     shell_cmd = template.format(time, metrics, gpuid_list, metrics)
     util.executeBashCmd(shell_cmd, False)
示例#10
0
 def _runDcgmLoadProfilingModule(self):
     """Run ``dcgmi dmon`` for field 1001 on GPU 0 under a 3 second timeout.

     Per the log message, the point of this short run is to get the
     profiling module loaded once; the command result is not used.
     """
     print("\n&&&& INFO: Running dcgm to just to load profiling module once.")
     warmup_cmd = "timeout 3s /usr/bin/dcgmi dmon -e 1001 -i 0"
     util.executeBashCmd(warmup_cmd, False)
示例#11
0
    def killNvHostEngine(self):
        """Terminate nv-hostengine and stop the dcgm service.

        Runs ``nv-hostengine -t`` followed by ``service dcgm stop``, each
        preceded by an INFO banner on stdout.
        """
        # (banner, shell command) pairs, executed in this order.
        shutdown_steps = (
            ("\n&&&& INFO: Killing any existing nvhostengine instance",
             "sudo /usr/bin/nv-hostengine -t"),
            ("\n&&&& INFO: Stopping dcgm service ",
             "sudo service dcgm stop"),
        )
        for banner, command in shutdown_steps:
            print(banner)
            ret = util.executeBashCmd(command, True)