Example No. 1
def parse_cmdln():
    parser = get_args()
    args = parser.parse_args()
    if args.program == 'gpu_temp':

        assertion(nvmlInit,
                  ImportError('nvidia-ml-py is required for this program.'))

        assertion(mpl,
                  ImportError('matplotlib is required for this program.'))

        assertion(args.deviceID,
                  AssertionError('GPU index must be declared.'))

        nvmlInit()
        args.handle = nvmlDeviceGetHandleByIndex(args.deviceID)

    if args.program == 'cpu_usage':

        assertion(psutil,
                  ImportError('psutil is required for this program.'))

    if args.program == 'screen_glow':

        assertion(PIL,
                  ImportError('PIL is required for this program.'))

    return args
Example No. 2
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return
        try:
            import pynvml as pv
        except ImportError:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")
        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)
        except pv.NVMLError:
            logging.debug("No GPUs have been found.")
            raise Exception("The process list contains GPU plugins, but "
                            " no GPUs have been found.")

        processes = self.exp.meta_data.get_meta_data('processes')
        if not [i for i in processes if 'GPU' in i]:
            logging.debug("GPU processes missing. GPUs found so adding them.")
            cpus = ['CPU'+str(i) for i in range(count)]
            gpus = ['GPU'+str(i) for i in range(count)]
            for i in range(min(count, len(processes))):
                processes[processes.index(cpus[i])] = gpus[i]
            self.exp.meta_data.set_meta_data('processes', processes)
Example No. 3
    def __check_gpu(self):
        """ Check if the process list contains GPU processes and determine if
        GPUs exists. Add GPU processes to the processes list if required."""
        if not self.exp.meta_data.plugin_list._contains_gpu_processes():
            return

        try:
            import pynvml as pv
        except ImportError:
            logging.debug("pyNVML module not found")
            raise Exception("pyNVML module not found")

        try:
            pv.nvmlInit()
            count = int(pv.nvmlDeviceGetCount())
            logging.debug("%s GPUs have been found.", count)

            if not self.exp.meta_data.get('test_state'):
                for i in range(count):
                    handle = pv.nvmlDeviceGetHandleByIndex(i)
                    if pv.nvmlDeviceGetComputeRunningProcesses(handle):
                        raise Exception("Unfortunately, GPU %i is busy. Try \
                            resubmitting the job to the queue." % i)
        except Exception as e:
            raise Exception("Unable to run GPU plugins: %s", e.message)
        self.__set_gpu_processes(count)
Example No. 4
 def initialize(self):
     """ Initialize pynvml """
     if not self.initialized:
         if IS_MACOS:
             if self.logger:
                 self.logger.debug("macOS Detected. Using pynvx")
             try:
                 pynvx.cudaInit()
             except RuntimeError:
                 self.initialized = True
                 return
         else:
             try:
                 if self.logger:
                     self.logger.debug("OS is not macOS. Using pynvml")
                 pynvml.nvmlInit()
             except (pynvml.NVMLError_LibraryNotFound,  # pylint: disable=no-member
                     pynvml.NVMLError_DriverNotLoaded,  # pylint: disable=no-member
                     pynvml.NVMLError_NoPermission):  # pylint: disable=no-member
                 self.initialized = True
                 return
         self.initialized = True
         self.get_device_count()
         self.get_active_devices()
         self.get_handles()
Example No. 5
def auto_select_gpu():
  """Select gpu which has largest free memory"""
  if HAS_NVML:
    pynvml.nvmlInit()
    deviceCount = pynvml.nvmlDeviceGetCount()
    largest_free_mem = 0
    largest_free_idx = 0
    for i in range(deviceCount):
      handle = pynvml.nvmlDeviceGetHandleByIndex(i)
      info = pynvml.nvmlDeviceGetMemoryInfo(handle)
      if info.free > largest_free_mem:
        largest_free_mem = info.free
        largest_free_idx = i
    pynvml.nvmlShutdown()
    largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB

    idx_to_gpu_id = {}
    for i in range(deviceCount):
      idx_to_gpu_id[i] = '{}'.format(i)

    gpu_id = idx_to_gpu_id[largest_free_idx]
    logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
    return gpu_id
  else:
    logging.info('nvidia-ml-py is not installed; automatic GPU selection is disabled!')
    return '0'
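A minimal usage sketch (an assumption, not part of the example above): pin the process to the selected device before any CUDA context is created, assuming auto_select_gpu() is importable in the current module.

import logging
import os

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Restrict this process to the GPU with the most free memory.
    os.environ['CUDA_VISIBLE_DEVICES'] = auto_select_gpu()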
Example No. 6
def get_gpu_mem_used():
    try:
        from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        mem_info = nvmlDeviceGetMemoryInfo(handle)
        return mem_info.used
    except Exception:
        return -1
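A small illustrative sketch of how this helper might be used to estimate the memory footprint of an allocation on GPU 0; the allocation step is a placeholder, and -1 means pynvml was unavailable.

before = get_gpu_mem_used()
# ... allocate or load something on GPU 0 here (placeholder) ...
after = get_gpu_mem_used()
if before >= 0 and after >= 0:
    print('GPU 0 memory delta: %.1f MiB' % ((after - before) / 1024.0 / 1024.0))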
Example No. 7
def get_available_gpus():
    try:
        import pynvml as pv
    except ImportError:
        logging.debug("pyNVML module not found")
        raise Exception("pyNVML module not found")
    pv.nvmlInit()
    count = int(pv.nvmlDeviceGetCount())
    return pv, count
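A brief usage sketch, assuming get_available_gpus() above is in scope: enumerate the detected devices and release NVML afterwards (the function itself leaves NVML initialized).

pv, count = get_available_gpus()
for i in range(count):
    handle = pv.nvmlDeviceGetHandleByIndex(i)
    # nvmlDeviceGetName may return bytes on older pynvml releases.
    print('GPU %d: %s' % (i, pv.nvmlDeviceGetName(handle)))
pv.nvmlShutdown()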
Example No. 8
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))  # sensor 0 == NVML_TEMPERATURE_GPU

    nvmlShutdown()
    return gpus
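A minimal caller sketch, assuming get_gpu_temperatures() above is importable; it simply prints the per-device readings returned by the function.

for gpu_index, temp_c in sorted(get_gpu_temperatures().items()):
    print('GPU %d temperature: %d C' % (gpu_index, temp_c))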
Example No. 9
 def initialize(self):
     """ Initialize pynvml """
     if not self.initialized:
         try:
             pynvml.nvmlInit()
         except pynvml.NVMLError_LibraryNotFound:
             self.initialized = True
             return
         self.initialized = True
         self.get_device_count()
         self.get_handles()
Example No. 10
    def init_nvidia(self):
        """Init the NVIDIA API."""
        if import_error_tag:
            self.nvml_ready = False
            return self.nvml_ready

        try:
            pynvml.nvmlInit()
            self.device_handles = get_device_handles()
            self.nvml_ready = True
        except Exception:
            logger.debug("pynvml could not be initialized.")
            self.nvml_ready = False

        return self.nvml_ready
Example No. 11
    def _init_nvml(self):
        if self._load_nvidia_lib() == -1:
            return -1

        try:
            global pynvml
            import pip
            pip.main(['install', '--quiet', 'nvidia-ml-py'])
            import pynvml
            pynvml.nvmlInit()
            return 0
        except pynvml.NVMLError as err:
            logger.debug('Failed to initialize NVML: %s', err)
            return -1
Example No. 12
def main():
    port = int(sys.argv[1]) if len(sys.argv) > 1 else 9200

    try:
        pynvml.nvmlInit()
        atexit.register(pynvml.nvmlShutdown)

        register_standard_metrics()

        print('Starting on port {}'.format(port))
        httpd = HTTPServer(('', port), MetricsHandler)
        httpd.serve_forever()

    except pynvml.NVMLError as err:
        print('NVML error: {}'.format(err))
Example No. 13
    def __init__(self, protocols, **kwargs):
        Monitor.__init__(self, **kwargs)
        self.protocols = protocols
        self.cpuAlert = kwargs['cpuAlert']
        self.memAlert = kwargs['memAlert']
        self.swapAlert = kwargs['swapAlert']
        self._dataBase = kwargs.get('dbName', SYSTEM_LOG_SQLITE)
        self._tableName = kwargs.get('tableName', 'log')
        self.doGpu = kwargs['doGpu']
        self.doNetwork = kwargs['doNetwork']
        self.doDiskIO = kwargs['doDiskIO']
        self.samplingTime = 1.  # seconds

        self.labelList = ["cpu", "mem", "swap"]
        if self.doGpu:
            self.gpuLabelList = []
            # get Gpus to monitor
            self.gpusToUse = [int(n) for n in (kwargs['gpusToUse']).split()]
            for i in self.gpusToUse:
                self.gpuLabelList.append("gpuMem_%d" % i)
                self.gpuLabelList.append("gpuUse_%d" % i)
                self.gpuLabelList.append("gpuTem_%d" % i)
            # init GPUs
            nvmlInit()
            self.labelList += self.gpuLabelList
        else:
            self.gpusToUse = None
        if self.doNetwork:
            self.nif = kwargs['nif']
            self.netLabelList = []  # in the future we may display
            # all the network interfaces
            self.netLabelList.append("%s_send" % self.nif)
            self.netLabelList.append("%s_recv" % self.nif)
            self.labelList += self.netLabelList
        else:
            self.nif = None
        if self.doDiskIO:
            self.netLabelList = []  # reused here for the disk I/O labels
            self.netLabelList.append("disk_read")
            self.netLabelList.append("disk_write")
            self.labelList += self.netLabelList

        self.conn = lite.connect(os.path.join(self.workingDir, self._dataBase),
                                 isolation_level=None)
        self.cur = self.conn.cursor()
Example No. 14
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
Example No. 15
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem:        12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info   = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fulfill the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
Example No. 16
def getFreeId():
    import pynvml 

    pynvml.nvmlInit()
    def getFreeRatio(id):
        handle = pynvml.nvmlDeviceGetHandleByIndex(id)
        use = pynvml.nvmlDeviceGetUtilizationRates(handle)
        ratio = 0.5 * (float(use.gpu) + float(use.memory))
        return ratio

    deviceCount = pynvml.nvmlDeviceGetCount()
    available = []
    for i in range(deviceCount):
        if getFreeRatio(i)<70:
            available.append(i)
    gpus = ','.join(str(g) for g in available)
    return gpus
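A hypothetical usage sketch: expose only the relatively idle GPUs (combined utilization below 70) to the current process, assuming getFreeId() above is in scope.

import os

free_gpus = getFreeId()
if free_gpus:
    os.environ['CUDA_VISIBLE_DEVICES'] = free_gpus
else:
    print('No sufficiently idle GPU found.')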
Example No. 17
    def collect_via_pynvml(self, stats_config):
        """
        Use pynvml python binding to collect metrics
        :param stats_config:
        :return:
        """
        try:
            NVML_TEMPERATURE_GPU = 0
            pynvml.nvmlInit()
            device_count = pynvml.nvmlDeviceGetCount()

            for device_index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

                metrics = {
                    'memory.total': memoryInfo.total / 1024 / 1024,
                    'memory.used': memoryInfo.used / 1024 / 1024,
                    'memory.free': memoryInfo.free / 1024 / 1024,
                    'utilization.gpu': utilizationRates.gpu,
                    'utilization.memory': utilizationRates.memory,
                    'temperature.gpu':
                        pynvml.nvmlDeviceGetTemperature(handle,
                                                        NVML_TEMPERATURE_GPU)
                }

                for stat_name in stats_config[1:]:
                    metric = metrics.get(stat_name)
                    if metric:
                        metric_name = 'gpu_{index}.{stat_name}'.format(
                            index=str(device_index),
                            stat_name=stat_name
                        )
                        self.publish(metric_name, metric)
        finally:
            pynvml.nvmlShutdown()
Example No. 18
def load_pynvml_env():
    import pynvml # nvidia-ml-py3

    #
    # BEGIN: Temporary workaround for nvml.dll load issue in Win10 (continued)
    _LoadNvmlLibrary()
    pynvml.nvmlLib = nvmlLib
    #
    # END: Temporary workaround for nvml.dll load issue in Win10
    #

    if platform.system() == "Darwin":
        try:
            from pynvx import pynvml
        except ImportError:
            print("please install pynvx on OSX: pip install pynvx")
            sys.exit(1)

        pynvml.nvmlInit()
        return pynvml

    pynvml.nvmlInit()

    return pynvml
Example No. 19
 def __init__(self, index):
     try:
         nv.nvmlInit()
         self._handle = nv.nvmlDeviceGetHandleByIndex(index)
     except nv.NVMLError_LibraryNotFound:
         pass
Example No. 20
def benchmark(input_arguments):
    args = process_arguments(input_arguments)
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.quiet:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.INFO)
    source_db_user = args.user
    source_db_passwd = args.passwd
    source_db_server = args.server
    source_db_port = args.port
    source_db_name = args.name
    source_table = args.table
    label = args.label
    if args.queries_dir:
        queries_dir = args.queries_dir
    else:
        queries_dir = os.path.join(os.path.dirname(__file__), "queries")
    iterations = int(args.iterations)
    if iterations <= 1:
        # Need > 1 iteration as first iteration is dropped from calculations
        logging.error("Iterations must be greater than 1")
        exit(1)
    gpu_count = args.gpu_count
    gpu_name = args.gpu_name
    no_gather_conn_gpu_info = args.no_gather_conn_gpu_info
    gather_nvml_gpu_info = args.gather_nvml_gpu_info
    no_gather_nvml_gpu_info = args.no_gather_nvml_gpu_info
    machine_name = args.machine_name
    machine_uname = args.machine_uname
    destinations = args.destination.split(",")
    valid_destination_set = False
    if "mapd_db" in destinations:
        valid_destination_set = True
        dest_db_user = args.dest_user
        dest_db_passwd = args.dest_passwd
        if args.dest_server is None:
            # If dest_server is not set for mapd_db, then exit
            logging.error(
                '"dest_server" is required when destination = "mapd_db"')
            exit(1)
        else:
            dest_db_server = args.dest_server
        dest_db_port = args.dest_port
        dest_db_name = args.dest_name
        dest_table = args.dest_table
        dest_table_schema_file = args.dest_table_schema_file
    if "file_json" in destinations:
        valid_destination_set = True
        if args.output_file_json is None:
            # If output_file_json is not set for file_json, then exit
            logging.error(
                '"output_file_json" is required when destination = "file_json"'
            )
            exit(1)
        else:
            output_file_json = args.output_file_json
    if "output" in destinations:
        valid_destination_set = True
    if "jenkins_bench" in destinations:
        valid_destination_set = True
        if args.output_file_jenkins is None:
            # If output_file_jenkins is not set for jenkins_bench, then exit
            logging.error('"output_file_jenkins" is required ' +
                          'when destination = "jenkins_bench"')
            exit(1)
        else:
            output_file_jenkins = args.output_file_jenkins
    output_tag_jenkins = args.output_tag_jenkins
    if not valid_destination_set:
        logging.error("No valid destination(s) have been set. Exiting.")
        exit(1)

    # Establish connection to mapd db
    con = get_connection(
        db_user=source_db_user,
        db_passwd=source_db_passwd,
        db_server=source_db_server,
        db_port=source_db_port,
        db_name=source_db_name,
    )
    if not con:
        exit(1)  # Exit if cannot connect to db

    # Set run vars
    run_guid = str(uuid.uuid4())
    logging.debug("Run guid: " + run_guid)
    run_timestamp = datetime.datetime.now()
    run_connection = str(con)
    logging.debug("Connection string: " + run_connection)
    run_driver = ""  # TODO
    run_version = con._client.get_version()
    if "-" in run_version:
        run_version_short = run_version.split("-")[0]
    else:
        run_version_short = run_version
    conn_machine_name = re.search(r"@(.*?):", run_connection).group(1)
    # Set GPU info fields
    conn_gpu_count = None
    source_db_gpu_count = None
    source_db_gpu_mem = None
    source_db_gpu_driver_ver = ""
    source_db_gpu_name = ""
    if no_gather_conn_gpu_info:
        logging.debug(
            "--no-gather-conn-gpu-info passed, " +
            "using blank values for source database GPU info fields " +
            "[run_gpu_count, run_gpu_mem_mb] ")
    else:
        logging.debug("Gathering source database GPU info fields " +
                      "[run_gpu_count, run_gpu_mem_mb] " +
                      "using pymapd connection info. ")
        conn_hardware_info = con._client.get_hardware_info(con._session)
        conn_gpu_count = conn_hardware_info.hardware_info[0].num_gpu_allocated
    if conn_gpu_count == 0 or conn_gpu_count is None:
        no_gather_nvml_gpu_info = True
        if conn_gpu_count == 0:
            logging.warning(
                "0 GPUs detected from connection info, " +
                "using blank values for source database GPU info fields " +
                "If running against cpu-only server, make sure to set " +
                "--no-gather-nvml-gpu-info and --no-gather-conn-gpu-info.")
    else:
        source_db_gpu_count = conn_gpu_count
        try:
            source_db_gpu_mem = int(
                conn_hardware_info.hardware_info[0].gpu_info[0].memory /
                1000000)
        except IndexError:
            logging.error("GPU memory info not available from connection.")
    if no_gather_nvml_gpu_info:
        logging.debug(
            "--no-gather-nvml-gpu-info passed, " +
            "using blank values for source database GPU info fields " +
            "[gpu_driver_ver, run_gpu_name] ")
    elif conn_machine_name == "localhost" or gather_nvml_gpu_info:
        logging.debug("Gathering source database GPU info fields " +
                      "[gpu_driver_ver, run_gpu_name] " +
                      "from local GPU using pynvml. ")
        import pynvml

        pynvml.nvmlInit()
        source_db_gpu_driver_ver = pynvml.nvmlSystemGetDriverVersion().decode()
        for i in range(source_db_gpu_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            # Assume all cards are the same, overwrite name value
            source_db_gpu_name = pynvml.nvmlDeviceGetName(handle).decode()
        pynvml.nvmlShutdown()
    # If gpu_count argument passed in, override gathered value
    if gpu_count:
        source_db_gpu_count = gpu_count
    # Set machine names, using local info if connected to localhost
    if conn_machine_name == "localhost":
        local_uname = os.uname()
    if machine_name:
        run_machine_name = machine_name
    else:
        if conn_machine_name == "localhost":
            run_machine_name = local_uname.nodename.split(".")[0]
        else:
            run_machine_name = conn_machine_name
    if machine_uname:
        run_machine_uname = machine_uname
    else:
        if conn_machine_name == "localhost":
            run_machine_uname = " ".join(local_uname)
        else:
            run_machine_uname = ""

    # Read query files contents and write to query_list
    query_list = []
    logging.debug("Queries dir: " + queries_dir)
    try:
        for query_filename in os.listdir(queries_dir):
            logging.debug("Validating query filename: " + query_filename)
            if validate_query_file(query_filename=query_filename):
                with open(queries_dir + "/" + query_filename,
                          "r") as query_filepath:
                    logging.debug("Reading query with filename: " +
                                  query_filename)
                    query_mapdql = query_filepath.read().replace("\n", " ")
                    query_mapdql = query_mapdql.replace(
                        "##TAB##", source_table)
                    query_list.append({
                        "name": query_filename,
                        "mapdql": query_mapdql
                    })
        logging.info("Read all query files")
    except FileNotFoundError:
        logging.exception("Could not find queries directory.")
        exit(1)  # Exit if cannot get queries dir

    # Run queries
    for query in query_list:
        # Set additional query vars
        # Query ID = filename without extension
        query_id = query["name"].rsplit(".")[0]

        # Run iterations of query
        query_results = []
        logging.info("Running query: " + query["name"] + " iterations: " +
                     str(iterations))
        query_total_start_time = timeit.default_timer()
        for iteration in range(iterations):
            # Gather memory before running query iteration
            logging.debug("Getting pre-query memory usage on CPU")
            pre_query_cpu_mem_usage = get_mem_usage(con=con, mem_type="cpu")
            logging.debug("Getting pre-query memory usage on GPU")
            pre_query_gpu_mem_usage = get_mem_usage(con=con, mem_type="gpu")
            # Run query iteration
            logging.debug("Running iteration " + str(iteration) +
                          " of query " + query["name"])
            query_result = execute_query(
                query_name=query["name"],
                query_mapdql=query["mapdql"],
                iteration=iteration,
                con=con,
            )
            # Gather memory after running query iteration
            logging.debug("Getting post-query memory usage on CPU")
            post_query_cpu_mem_usage = get_mem_usage(con=con, mem_type="cpu")
            logging.debug("Getting post-query memory usage on GPU")
            post_query_gpu_mem_usage = get_mem_usage(con=con, mem_type="gpu")
            # Calculate total (post minus pre) memory usage after query iteration
            query_cpu_mem_usage = round(
                post_query_cpu_mem_usage["usedram"] -
                pre_query_cpu_mem_usage["usedram"],
                1,
            )
            query_gpu_mem_usage = round(
                post_query_gpu_mem_usage["usedram"] -
                pre_query_gpu_mem_usage["usedram"],
                1,
            )
            if query_result:
                query.update({"succeeded": True})
                query_error_info = ""  # TODO - interpret query error info
                # Assign first query iteration times
                if iteration == 0:
                    first_execution_time = round(
                        query_result["execution_time"], 1)
                    first_connect_time = round(query_result["connect_time"], 1)
                    first_results_iter_time = round(
                        query_result["results_iter_time"], 1)
                    first_total_time = (first_execution_time +
                                        first_connect_time +
                                        first_results_iter_time)
                    first_cpu_mem_usage = query_cpu_mem_usage
                    first_gpu_mem_usage = query_gpu_mem_usage
                else:
                    # Put noninitial iterations into query_result list
                    query_results.append(query_result)
                    # Verify no change in memory for noninitial iterations
                    if query_cpu_mem_usage != 0.0:
                        logging.error(
                            ("Noninitial iteration ({0}) of query ({1}) " +
                             "shows non-zero CPU memory usage: {2}").format(
                                 iteration, query["name"],
                                 query_cpu_mem_usage))
                    if query_gpu_mem_usage != 0.0:
                        logging.error(
                            ("Noninitial iteration ({0}) of query ({1}) " +
                             "shows non-zero GPU memory usage: {2}").format(
                                 iteration, query["name"],
                                 query_gpu_mem_usage))
            else:
                query.update({"succeeded": False})
                logging.warning("Error detected during execution of query: " +
                                query["name"] +
                                ". This query will be skipped and " +
                                "times will not reported")
            if query["succeeded"] is False:
                # Do not run any more iterations of the failed query
                break
        if query["succeeded"] is False:
            # Do not calculate results for the failed query, move on to the next
            continue

        # Calculate time for all iterations to run
        query_total_elapsed_time = round(
            ((timeit.default_timer() - query_total_start_time) * 1000), 1)
        logging.info("Completed all iterations of query " + query["name"])

        # Aggregate iteration values
        execution_times, connect_times, results_iter_times, total_times = (
            [],
            [],
            [],
            [],
        )
        for query_result in query_results:
            execution_times.append(query_result["execution_time"])
            connect_times.append(query_result["connect_time"])
            results_iter_times.append(query_result["results_iter_time"])
            total_times.append(query_result["total_time"])
            # Overwrite result count, since should be the same for each iteration
            result_count = query_result["result_count"]

        # Calculate query times
        logging.debug("Calculating times from query " + query["name"])
        query_times = calculate_query_times(
            total_times=total_times,
            execution_times=execution_times,
            connect_times=connect_times,
            results_iter_times=results_iter_times,
        )

        # Update query dict entry with all values
        query.update({
            "results": {
                "run_guid": run_guid,
                "run_timestamp": run_timestamp,
                "run_connection": run_connection,
                "run_machine_name": run_machine_name,
                "run_machine_uname": run_machine_uname,
                "run_driver": run_driver,
                "run_version": run_version,
                "run_version_short": run_version_short,
                "run_label": label,
                "run_gpu_count": source_db_gpu_count,
                "run_gpu_driver_ver": source_db_gpu_driver_ver,
                "run_gpu_name": source_db_gpu_name,
                "run_gpu_mem_mb": source_db_gpu_mem,
                "run_table": source_table,
                "query_id": query_id,
                "query_result_set_count": result_count,
                "query_error_info": query_error_info,
                "query_conn_first": first_connect_time,
                "query_conn_avg": query_times["connect_time_avg"],
                "query_conn_min": query_times["connect_time_min"],
                "query_conn_max": query_times["connect_time_max"],
                "query_conn_85": query_times["connect_time_85"],
                "query_exec_first": first_execution_time,
                "query_exec_avg": query_times["execution_time_avg"],
                "query_exec_min": query_times["execution_time_min"],
                "query_exec_max": query_times["execution_time_max"],
                "query_exec_85": query_times["execution_time_85"],
                "query_exec_25": query_times["execution_time_25"],
                "query_exec_stdd": query_times["execution_time_std"],
                # Render queries not supported yet
                "query_render_first": None,
                "query_render_avg": None,
                "query_render_min": None,
                "query_render_max": None,
                "query_render_85": None,
                "query_render_25": None,
                "query_render_stdd": None,
                "query_total_first": first_total_time,
                "query_total_avg": query_times["total_time_avg"],
                "query_total_min": query_times["total_time_min"],
                "query_total_max": query_times["total_time_max"],
                "query_total_85": query_times["total_time_85"],
                "query_total_all": query_total_elapsed_time,
                "results_iter_count": iterations,
                "results_iter_first": first_results_iter_time,
                "results_iter_avg": query_times["results_iter_time_avg"],
                "results_iter_min": query_times["results_iter_time_min"],
                "results_iter_max": query_times["results_iter_time_max"],
                "results_iter_85": query_times["results_iter_time_85"],
                "cpu_mem_usage_mb": first_cpu_mem_usage,
                "gpu_mem_usage_mb": first_gpu_mem_usage,
            }
        })
        logging.debug("All values set for query " + query["name"] + ": " +
                      str(query))
    logging.debug("Closing source db connection.")
    con.close()
    logging.info("Completed all queries.")

    # Create list of successful queries
    logging.debug(
        "Removing failed queries from results going to destination db(s)")
    # Filter in place so that the aliased query_list (dumped to json below)
    # also drops the failed queries
    query_list[:] = [
        query for query in query_list if query["succeeded"] is not False
    ]
    successful_query_list = query_list
    # Create successful query results list for upload to destination(s)
    query_results = []
    for query in successful_query_list:
        query_results.append(query["results"])
    # Convert query list to json for outputs
    query_list_json = json.dumps(query_list,
                                 default=json_format_handler,
                                 indent=2)

    # Send results
    if "mapd_db" in destinations:
        # Create dataframe from list of query results
        logging.debug("Converting results list to pandas dataframe")
        results_df = DataFrame(query_results)
        # Establish connection to destination mapd db
        logging.debug("Connecting to destination mapd db")
        dest_con = get_connection(
            db_user=dest_db_user,
            db_passwd=dest_db_passwd,
            db_server=dest_db_server,
            db_port=dest_db_port,
            db_name=dest_db_name,
        )
        if not dest_con:
            exit(1)  # Exit if cannot connect to destination db
        # Load results into db, creating table if it does not exist
        tables = dest_con.get_tables()
        if dest_table not in tables:
            logging.info("Destination table does not exist. Creating.")
            try:
                with open(dest_table_schema_file, "r") as table_schema:
                    logging.debug("Reading table_schema_file: " +
                                  dest_table_schema_file)
                    create_table_sql = table_schema.read().replace("\n", " ")
                    create_table_sql = create_table_sql.replace(
                        "##TAB##", dest_table)
            except FileNotFoundError:
                logging.exception("Could not find table_schema_file.")
                exit(1)
            try:
                logging.debug("Executing create destination table query")
                res = dest_con.execute(create_table_sql)
                logging.debug("Destination table created.")
            except (
                    pymapd.exceptions.ProgrammingError,
                    pymapd.exceptions.Error,
            ):
                logging.exception("Error running table creation")
                exit(1)
        logging.info("Loading results into destination db")
        dest_con.load_table_columnar(
            dest_table,
            results_df,
            preserve_index=False,
            chunk_size_bytes=0,
            col_names_from_schema=True,
        )
        dest_con.close()
    if "file_json" in destinations:
        # Write to json file
        logging.debug("Opening json output file for writing")
        logging.info("Writing to output json file: " + output_file_json)
        with open(output_file_json, "w") as file_json_open:
            file_json_open.write(query_list_json)
    if "jenkins_bench" in destinations:
        # Write output to file formatted for jenkins benchmark plugin
        # https://github.com/jenkinsci/benchmark-plugin
        jenkins_bench_results = []
        for query_result in query_results:
            logging.debug("Constructing output for jenkins benchmark plugin")
            jenkins_bench_results.append({
                "name":
                query_result["query_id"],
                "description":
                "",
                "parameters": [],
                "results": [{
                    "name": query_result["query_id"] + " average",
                    "description": "",
                    "unit": "ms",
                    "dblValue": query_result["query_exec_avg"],
                }],
            })
        jenkins_bench_json = json.dumps({
            "groups": [{
                "name": source_table + output_tag_jenkins,
                "description": "Source table: " + source_table,
                "tests": jenkins_bench_results,
            }]
        })
        # Write to json file
        logging.debug("Opening jenkins_bench json output file for writing")
        logging.info("Writing to jenkins_bench json file: " +
                     output_file_jenkins)
        with open(output_file_jenkins, "w") as file_jenkins_open:
            file_jenkins_open.write(jenkins_bench_json)
    if "output" in destinations:
        logging.info("Printing query results to output")
        print(query_list_json)

    logging.info("Succesfully loaded query results info into destination(s)")
Example No. 21
 def init(self):
     
     self.util_history = []
     self.temp_history = []
     pynvml.nvmlInit()
     self.gpu_handles = []
     self.deviceCount = pynvml.nvmlDeviceGetCount()
     
     for i in range(self.deviceCount):
         self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
     
     self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
     self.cpu_prog_bars = []
     self.gpu_boxes = []
     self.gpu_prog_bars = []
     
     self.prev_idle = []
     self.prev_total = []
     self.idle = []
     self.total = []
     
     #---cpu_box---
     try:
         stat = open("/proc/stat")
         
         statlines = stat.read().splitlines()
         stat.close()
         
         self.corecount = -1
         
         for line in statlines:
             if (line[0:2] == "cp"):
                 self.corecount+= 1
             else:
                 break
         
     except IOError:
         print("Problem opening /proc/stat, exiting..")
         pynvml.nvmlShutdown()
         quit()
     
     for i in range(self.corecount):
         self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
         self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
         
         self.prev_idle.append(0)
         self.prev_total.append(0)
         self.idle.append(0)
         self.total.append(0)
     
     #---gpu_boxes---
     for i in range(self.deviceCount):
         product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
         product_name = product_name.decode('utf-8')
         
         gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
         
         label = Gtk.Label(product_name)
         
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
         self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
         
         gpu_box.pack_start(label, True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +1], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +2], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +3], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +4], True, True, 0)
         gpu_box.pack_start(self.gpu_prog_bars[i*6 +5], True, True, 0)
         
         self.gpu_boxes.append(gpu_box)
     
     #---proc---
     proc_liststore = Gtk.ListStore(int, str, int)
     
     self.tree = Gtk.TreeView(model=proc_liststore)
     
     renderer_pid = Gtk.CellRendererText()
     column_pid = Gtk.TreeViewColumn("Proccess ID", renderer_pid, text=0)
     column_pid.set_resizable(True)
     self.tree.append_column(column_pid)
     
     renderer_path = Gtk.CellRendererText()
     column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
     column_path.set_resizable(True)
     column_path.set_fixed_width(250)
     self.tree.append_column(column_path)
     
     renderer_mem = Gtk.CellRendererText()
     column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
     column_mem.set_resizable(True)
     self.tree.append_column(column_mem)
Example No. 22
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder',
                           int(util_encoder[0]),
                           tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder',
                           int(util_decoder[0]),
                           tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory,
                               tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
Example No. 23
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()    # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU
                )
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    # TODO: could be more information such as system memory
                    # usage, CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000
                if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None    # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Example No. 24
def gpu_profile(frame, event, arg):
    print_tensor_sizes = True
    last_tensor_sizes = set()
    gpu_profile_fn = f'{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_prof.txt'
    if 'GPU_DEBUG' in os.environ:
        print('profiling gpu usage to ', gpu_profile_fn)

    # it is _about to_ execute (!)
    # global last_tensor_sizes
    # global lineno, func_name, filename, module_name

    lineno = 1
    func_name = 'loss'
    filename = 'train'
    module_name = 'cityscape_pspnet'
    os.environ['GPU_DEBUG'] = '2'

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                pynvml.nvmlInit()
                handle = pynvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name+' '+func_name+':'+str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}

                        for t, s, loc in new_tensor_sizes:
                            f.write(f'+ {loc},{str(s)},{str(t)}\n')
                        # for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                        #     f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        # for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                        #     f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        # last_tensor_sizes = new_tensor_sizes
                pynvml.nvmlShutdown()

            # save details about line _to be_ executed
            # lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or
                    filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None  # skip current line evaluation

            if ('car_datasets' in filename
                    or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError):
            pass

    return gpu_profile
Example No. 25
 def __init__(self, index):
     nv.nvmlInit()
     self._handle = nv.nvmlDeviceGetHandleByIndex(index)
Example No. 26
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""

            def get_process_info(pid):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                process['command'] = os.path.basename(ps_process.cmdline()[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
                process['pid'] = nv_process.pid
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()    # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle) # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            processes = []
            try:
                nv_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
                # dict type is mutable
                for nv_process in nv_processes:
                    #TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    process = get_process_info(nv_process.pid)
                    processes.append(process)
            except N.NVMLError:
                processes = None  # Not supported

            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'utilization.gpu': utilization.gpu if utilization else None,
                # Convert bytes into MBytes
                'memory.used': int(memory.used / 1024 / 1024) if memory else None,
                'memory.total': int(memory.total / 1024 / 1024) if memory else None,
                'processes': processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list)
Example No. 27
def get_gpu_count():
    pynvml.nvmlInit()
    return pynvml.nvmlDeviceGetCount()
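The example above leaves NVML initialized and raises if the driver or library is missing. A minimal, more defensive sketch (an alternative, not the original code) that shuts NVML down again and falls back to 0:

import pynvml

def get_gpu_count_safe():
    """Return the number of NVML-visible GPUs, or 0 if NVML is unavailable."""
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        return 0
    try:
        return int(pynvml.nvmlDeviceGetCount())
    finally:
        pynvml.nvmlShutdown()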
Example No. 28
def set_affinity(
    gpu_id,
    nproc_per_node,
    *,
    mode="socket_unique_contiguous",
    cores="all_logical",
    balanced=True,
):
    """
    The process is assigned with a proper CPU affinity that matches CPU-GPU
    hardware architecture on a given platform. Usually, it improves and
    stabilizes the performance of deep learning training workloads.

    This function assumes that the workload runs in multi-process single-device
    mode (there are multiple training processes, and each process is running on
    a single GPU). This is typical for multi-GPU data-parallel training
    workloads (e.g., using `torch.nn.parallel.DistributedDataParallel`).

    Available affinity modes:
    * 'socket' - the process is assigned with all available physical CPU cores
    from the CPU socket connected to the GPU with a given id.
    * 'socket_single' - the process is assigned with the first available
    physical CPU core from the list of all CPU cores from the CPU socket
    connected to the GPU with a given id (multiple GPUs could be assigned with
    the same CPU core).
    * 'socket_single_unique' - the process is assigned with a single unique
    available physical CPU core from the list of all CPU cores from the CPU
    socket connected to the GPU with a given id.
    * 'socket_unique_interleaved' - the process is assigned with a unique
    subset of available physical CPU cores from the CPU socket connected to a
    GPU with a given id, cores are assigned with interleaved indexing pattern
    * 'socket_unique_contiguous' - (the default) the process is assigned with a
    unique subset of available physical CPU cores from the CPU socket connected
    to a GPU with a given id, cores are assigned with contiguous indexing
    pattern

    Available "cores" modes:
    * 'all_logical' - assigns the process with all logical cores associated with
    a given corresponding physical core (i.e., automatically includes all
    available hyperthreading siblings)
    * 'single_logical' - assigns the process with only one logical core
    associated with a given corresponding physical core (i.e., excludes
    hyperthreading siblings)

    'socket_unique_contiguous' is the recommended mode for deep learning
    training workloads on NVIDIA DGX machines.

    Args:
        gpu_id: integer index of a GPU, value from 0 to 'nproc_per_node' - 1
        nproc_per_node: number of processes per node
        mode: affinity mode
        balanced: assign an equal number of physical cores to each process,
            affects only 'socket_unique_interleaved' and
            'socket_unique_contiguous' affinity modes
        cores: 'all_logical' or 'single_logical'

    Returns a set of logical CPU cores on which the process is eligible to run.

    Example:

    import argparse
    import os

    import gpu_affinity
    import torch


    def main():
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '--local_rank',
            type=int,
            default=os.getenv('LOCAL_RANK', 0),
        )
        args = parser.parse_args()

        nproc_per_node = torch.cuda.device_count()

        affinity = gpu_affinity.set_affinity(args.local_rank, nproc_per_node)
        print(f'{args.local_rank}: core affinity: {affinity}')


    if __name__ == "__main__":
        main()

    Launch the example with:
    python -m torch.distributed.launch --nproc_per_node <#GPUs> example.py


    WARNING: On DGX A100, only half of the CPU cores have direct access to GPUs.
    This function restricts execution only to the CPU cores directly connected
    to GPUs, so on DGX A100, it will limit the code to half of the CPU cores and
    half of CPU memory bandwidth (which may be fine for many DL models).

    WARNING: Intel's OpenMP implementation resets affinity on the first call to
    an OpenMP function after a fork. It's recommended to run with env variable:
    `KMP_AFFINITY=disabled` if the affinity set by gpu_affinity should be
    preserved after a fork (e.g. in PyTorch DataLoader workers).
    """
    pynvml.nvmlInit()

    if mode == "socket":
        set_socket_affinity(gpu_id, nproc_per_node, cores)
    elif mode == "socket_single":
        set_socket_single_affinity(gpu_id, nproc_per_node, cores)
    elif mode == "socket_single_unique":
        set_socket_single_unique_affinity(gpu_id, nproc_per_node, cores)
    elif mode == "socket_unique_interleaved":
        set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "interleaved", balanced)
    elif mode == "socket_unique_contiguous":
        set_socket_unique_affinity(gpu_id, nproc_per_node, cores, "contiguous", balanced)
    else:
        raise RuntimeError("Unknown affinity mode")

    affinity = os.sched_getaffinity(0)
    return affinity
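
# The 'contiguous' and 'interleaved' patterns described in the docstring above
# differ only in how a socket's physical cores are split among the processes.
# A minimal illustrative sketch (partition_cores and the flat `cores` list are
# hypothetical helpers, not part of gpu_affinity):
def partition_cores(cores, nproc_per_node, gpu_id, pattern="contiguous"):
    if pattern == "contiguous":
        # e.g. 16 cores, 4 processes -> process 1 gets cores [4, 5, 6, 7]
        chunk = len(cores) // nproc_per_node
        return cores[gpu_id * chunk:(gpu_id + 1) * chunk]
    if pattern == "interleaved":
        # e.g. 16 cores, 4 processes -> process 1 gets cores [1, 5, 9, 13]
        return cores[gpu_id::nproc_per_node]
    raise ValueError(f"unknown pattern: {pattern}")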
Exemplo n.º 29
0
def train(cudaid, args, model):
    pynvml.nvmlInit()
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.size,
                            rank=cudaid)

    random.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    print('params: ', " T_warm: ", T_warm, " all_iteration: ", all_iteration,
          " lr: ", lr)
    #cuda_list=range(args.size)
    print('rank: ', cudaid)
    torch.cuda.set_device(cudaid)
    model.cuda(cudaid)

    accumulation_steps = int(args.batch_size / args.size / args.gpu_size)
    #optimizer = torch.optim.Adam(model.parameters(), lr=lr,betas=(0.9,0.98),eps=1e-6,weight_decay=0.0)
    optimizer = apex.optimizers.FusedLAMB(model.parameters(),
                                          lr=lr,
                                          betas=(0.9, 0.98),
                                          eps=1e-6,
                                          weight_decay=0.0,
                                          max_grad_norm=1.0)

    model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    model = DDP(model)

    accum_batch_loss = 0
    history_file = os.path.join(args.data_dir, args.history_file)
    if 'last' in args.field:
        abs_file = os.path.join(args.data_dir, args.abs_file)
    else:
        abs_file = ''
    iterator = NewsIterator(batch_size=args.gpu_size,
                            npratio=4,
                            feature_file=os.path.join(args.data_dir,
                                                      args.feature_file),
                            history_file=history_file,
                            abs_file=abs_file,
                            field=args.field,
                            fp16=True)
    train_file = os.path.join(args.data_dir, args.data_file)
    batch_t = 0
    iteration = 0
    print('train...', args.field)
    if cudaid == 0:
        writer = SummaryWriter(os.path.join(args.data_dir, args.log_file))
    epoch = 0
    model.train()
    batch_t = 0
    iteration = 0
    step = 0
    best_score = -1

    for epoch in range(0, 10):
        all_loss = 0
        all_batch = 0
        data_batch = iterator.load_data_from_file(train_file, cudaid,
                                                  args.size)
        print('load ok...')
        for imp_index, user_index, his_id, candidate_id, label in data_batch:
            batch_t += 1
            assert candidate_id.shape[1] == 2

            # if cudaid==1:
            #     torch.set_printoptions(profile="full")
            #     print(his_id)
            his_id = his_id.cuda(cudaid)
            candidate_id = candidate_id.cuda(cudaid)
            label = label.cuda(cudaid)
            loss = model(his_id, candidate_id, label)

            sample_size = candidate_id.shape[0]
            loss = loss.sum() / sample_size / math.log(2)

            accum_batch_loss += float(loss)

            all_loss += float(loss)
            all_batch += 1

            # if cudaid==1:

            # torch.set_printoptions(profile="full")
            # w=open('input.txt','w')
            # w.write(str(his_id.cpu()))
            # w.close()
            # assert 1==0

            loss = loss / accumulation_steps

            #loss.backward()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (batch_t) % accumulation_steps == 0:

                iteration += 1
                adjust_learning_rate(optimizer, iteration)
                optimizer.step()
                optimizer.zero_grad()
                if cudaid == 0:
                    # handle = pynvml.nvmlDeviceGetHandleByIndex(cudaid)
                    # meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    # #print(int(meminfo.used)/1024/1024)
                    # print('loss: ',loss,int(meminfo.used)/1024/1024)
                    print(' batch_t: ', batch_t, ' iteration: ', iteration,
                          ' epoch: ', epoch, ' accum_batch_loss: ',
                          accum_batch_loss / accumulation_steps, ' lr: ',
                          optimizer.param_groups[0]['lr'])
                    writer.add_scalar('Loss/train',
                                      accum_batch_loss / accumulation_steps,
                                      iteration)
                    writer.add_scalar('Ltr/train',
                                      optimizer.param_groups[0]['lr'],
                                      iteration)
                accum_batch_loss = 0
                if iteration % 500 == 0 and cudaid == 0:
                    torch.cuda.empty_cache()
                    model.eval()
                    if cudaid == 0:
                        auc = test(model, args)
                        print(auc)
                        writer.add_scalar('auc/valid', auc, step)
                        step += 1
                        if auc > best_score:
                            torch.save(
                                model.state_dict(),
                                os.path.join(args.save_dir,
                                             'Plain_robert_dot_best.pkl'))
                            best_score = auc
                            print('best score: ', best_score)
                    torch.cuda.empty_cache()
                    model.train()

        if cudaid == 0:
            torch.save(
                model.state_dict(),
                os.path.join(args.save_dir,
                             'Plain_robert_dot' + str(epoch) + '.pkl'))
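
# `adjust_learning_rate` is called in the loop above but is not included in
# this snippet.  A minimal warmup-then-decay sketch, assuming it is driven by
# the T_warm / lr globals printed at the top of train(); the author's actual
# schedule may differ:
def adjust_learning_rate_sketch(optimizer, iteration, T_warm=5000, base_lr=1e-4):
    if iteration <= T_warm:
        new_lr = base_lr * iteration / T_warm            # linear warmup
    else:
        new_lr = base_lr * (T_warm / iteration) ** 0.5   # inverse-sqrt decay
    for group in optimizer.param_groups:
        group['lr'] = new_lr
    return new_lr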
Exemplo n.º 30
0
    def setup(self):
        self.data["root"] = os.getcwd()
        try:
            import __main__
            self.data["program"] = __main__.__file__
        except (ImportError, AttributeError):
            self.data["program"] = '<python with no main file>'
            if wandb._get_python_type() != "python":
                if os.getenv(env.NOTEBOOK_NAME):
                    self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
                else:
                    meta = wandb.jupyter.notebook_metadata()
                    if meta.get("path"):
                        if "fileId=" in meta["path"]:
                            self.data[
                                "colab"] = "https://colab.research.google.com/drive/" + meta[
                                    "path"].split("fileId=")[1]
                            self.data["program"] = meta["name"]
                        else:
                            self.data["program"] = meta["path"]
                            self.data["root"] = meta["root"]

        program = os.path.join(self.data["root"], self.data["program"])
        if not os.getenv(env.DISABLE_CODE):
            if self._api.git.enabled:
                self.data["git"] = {
                    "remote": self._api.git.remote_url,
                    "commit": self._api.git.last_commit
                }

                self.data["email"] = self._api.git.email
                self.data["root"] = self._api.git.root or self.data["root"]

            if os.path.exists(program) and self._api.git.is_untracked(
                    self.data["program"]):
                util.mkdir_exists_ok(
                    os.path.join(self.out_dir, "code",
                                 os.path.dirname(self.data["program"])))
                saved_program = os.path.join(self.out_dir, "code",
                                             self.data["program"])
                if not os.path.exists(saved_program):
                    self.data["codeSaved"] = True
                    copyfile(program, saved_program)

        self.data["startedAt"] = datetime.utcfromtimestamp(
            wandb.START_TIME).isoformat()
        try:
            username = getpass.getuser()
        except KeyError:
            # getuser() could raise KeyError in restricted environments like
            # chroot jails or docker containers.  Return user id in these cases.
            username = str(os.getuid())

        # Host names, usernames, emails, the root directory, and executable paths are sensitive for anonymous users.
        if self._api.settings().get('anonymous') != 'true':
            self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
            self.data["username"] = os.getenv(env.USERNAME, username)
            self.data["executable"] = sys.executable
        else:
            self.data.pop("email", None)
            self.data.pop("root", None)

        self.data["os"] = platform.platform(aliased=True)
        self.data["python"] = platform.python_version()

        if env.get_docker():
            self.data["docker"] = env.get_docker()
        try:
            pynvml.nvmlInit()
            self.data["gpu"] = pynvml.nvmlDeviceGetName(
                pynvml.nvmlDeviceGetHandleByIndex(0)).decode("utf8")
            self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            pass
        try:
            self.data["cpu_count"] = multiprocessing.cpu_count()
        except NotImplementedError:
            pass
        # TODO: we should use the cuda library to collect this
        if os.path.exists("/usr/local/cuda/version.txt"):
            self.data["cuda"] = open(
                "/usr/local/cuda/version.txt").read().split(" ")[-1].strip()
        self.data["args"] = sys.argv[1:]
        self.data["state"] = "running"
Exemplo n.º 31
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                ps_process = psutil.Process(pid=nv_process.pid)
                process['username'] = ps_process.username()
                # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:  # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                # Bytes to MBytes
                process['gpu_memory_usage'] = int(nv_process.usedGpuMemory /
                                                  1024 / 1024)
                process['pid'] = nv_process.pid
                return process

            def _decode(b):
                if isinstance(b, bytes):
                    return b.decode()  # for python3, to unicode
                return b

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None  # Not supported

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None  # Not supported

            processes = []
            try:
                nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
                    handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None  # Not supported (in both cases)
            else:
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in (nv_comp_processes + nv_graphics_processes):
                    # TODO: could be more information such as system memory usage,
                    # CPU percentage, create time etc.
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index':
                index,
                'uuid':
                uuid,
                'name':
                name,
                'temperature.gpu':
                temperature,
                'utilization.gpu':
                utilization.gpu if utilization else None,
                'power.draw':
                int(power / 1000) if power is not None else None,
                'enforced.power.limit':
                int(power_limit / 1000) if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used':
                int(memory.used / 1024 / 1024) if memory else None,
                'memory.total':
                int(memory.total / 1024 / 1024) if memory else None,
                'processes':
                processes,
            }
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list)
Exemplo n.º 32
0
import pynvml  # NVML bindings; needed for the query below

def get_gpu_used(index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    memoryinfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return memoryinfo.used  # in bytes
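
# Usage sketch (device index 0 is an arbitrary example): NVML reports memory
# in bytes, so convert to MiB for readability.
print('GPU 0 memory in use: %.0f MiB' % (get_gpu_used(0) / 1024 / 1024))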
Exemplo n.º 33
0
def check_perf():
    "Suggest how to improve the setup to speed things up"

    from PIL import features, Image
    from packaging import version
    import pynvml

    print("Running performance checks.")

    # libjpeg_turbo check
    print("\n*** libjpeg-turbo status")
    if version.parse(Image.PILLOW_VERSION) >= version.parse("5.4.0"):
        if features.check_feature('libjpeg_turbo'):
            print("✔ libjpeg-turbo is on")
        else:
            print(
                "✘ libjpeg-turbo is not on. It's recommended you install libjpeg-turbo to speed up JPEG decoding. See https://docs.fast.ai/performance.html#libjpeg-turbo"
            )
    else:
        print(
            f"❓ libjpeg-turbo's status can't be derived - need Pillow(-SIMD)? >= 5.4.0 to tell, current version {Image.PILLOW_VERSION}"
        )
        # XXX: remove this check/note once Pillow and Pillow-SIMD 5.4.0 is available
        pillow_ver_5_4_is_avail = pypi_module_version_is_available(
            "Pillow", "5.4.0")
        if pillow_ver_5_4_is_avail == False:
            print(
                "5.4.0 is not yet available, other than the dev version on github, which can be installed via pip from git+https://github.com/python-pillow/Pillow. See https://docs.fast.ai/performance.html#libjpeg-turbo"
            )

    # Pillow-SIMD check
    print("\n*** Pillow-SIMD status")
    if re.search(r'\.post\d+', Image.PILLOW_VERSION):
        print(f"✔ Running Pillow-SIMD {Image.PILLOW_VERSION}")
    else:
        print(
            f"✘ Running Pillow {Image.PILLOW_VERSION}; It's recommended you install Pillow-SIMD to speed up image resizing and other operations. See https://docs.fast.ai/performance.html#pillow-simd"
        )

    # CUDA version check
    # compatibility table: key = minimum NVIDIA driver version required for the CUDA version given as value
    # note: windows nvidia driver version is slightly higher, see:
    # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
    # note: add new entries if pytorch starts supporting new cudaXX
    nvidia2cuda = {
        "410.00": "10.0",
        "384.81": "9.0",
        "367.48": "8.0",
    }
    print("\n*** CUDA status")
    if torch.cuda.is_available():
        pynvml.nvmlInit()
        nvidia_ver = pynvml.nvmlSystemGetDriverVersion().decode('utf-8')
        cuda_ver = torch.version.cuda
        max_cuda = "8.0"
        for k in sorted(nvidia2cuda.keys()):
            if version.parse(nvidia_ver) > version.parse(k):
                max_cuda = nvidia2cuda[k]
        if version.parse(str(max_cuda)) <= version.parse(cuda_ver):
            print(
                f"✔ Running the latest CUDA {cuda_ver} with NVIDIA driver {nvidia_ver}"
            )
        else:
            print(
                f"✘ You are running pytorch built against cuda {cuda_ver}, your NVIDIA driver {nvidia_ver} supports cuda {max_cuda}. See https://pytorch.org/get-started/locally/ to install pytorch built against the faster CUDA version."
            )
    else:
        print(f"❓ Running cpu-only torch version, CUDA check is not relevant")

    print(
        "\nRefer to https://docs.fast.ai/performance.html to make sense out of these checks and suggestions."
    )
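
# `pypi_module_version_is_available` is referenced in check_perf above but not
# included in this snippet.  A minimal sketch, assuming it asks PyPI's JSON API
# whether the requested release exists (the real fastai helper may differ):
import json
import urllib.request

def pypi_module_version_is_available(module, version_str):
    try:
        url = f"https://pypi.org/pypi/{module}/json"
        with urllib.request.urlopen(url, timeout=5) as resp:
            releases = json.load(resp)["releases"]
        return version_str in releases
    except Exception:
        return None  # could not determine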
Exemplo n.º 34
0
def train_classifier(config_file, data_dir, knee_type, feature_extract, gpu,
                     model_name, pretraining, overwrite_flag, learning_rate,
                     drop_rate, num_epochs, batch_size, num_classes, patience,
                     prev_checkpoint_path, model_evaluate, progression,
                     biomarker, sampling, norm):
    '''This script trains a classifier CNN on a specified dataset. It can also run inference from a previous checkpoint. The available models are: "densenet121", "resnet18", "squeezenet", "alexnet", "Vgg11", "inceptionv3".'''

    feature_extract = int(feature_extract)

    if pretraining == 'Diagnosis':
        if '50' not in model_name:
            return
    elif pretraining == 'Random':
        if feature_extract > 0:
            return

    print(f'PyTorch Version: {torch.__version__}')
    print(f'Torchvision Version: {torchvision.__version__}')

    if progression in ['DiagnosisAll', 'Pain']:
        data_dir = join(data_dir, sampling, norm, biomarker, knee_type,
                        progression)
    else:
        data_dir = join(data_dir, biomarker, knee_type, progression)
    log_dir = data_dir.replace('on/data', 'on/logs')

    # GPU ID to use
    torch.cuda.set_device(int(gpu))
    nvmlInit()
    h = nvmlDeviceGetHandleByIndex(0)

    # Flag for feature extracting. When False, we finetune the whole model,
    # when True we only update the reshaped layer params

    transfer_learning_type = [
        'Finetuned', 'FeatureExtract', 'FirstLayerExtract',
        'FirstTwoLayersExtract'
    ]
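    # transfer_learning_dict maps the feature_extract level to a substring of
    # the first parameter name that stays trainable: that parameter and every
    # parameter after it keep requires_grad=True in set_parameter_requires_grad
    # below, while everything earlier is frozen.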
    transfer_learning_dict = {}
    transfer_learning_dict[1] = 'Linear'
    transfer_learning_dict[2] = 'layer2'
    transfer_learning_dict[3] = 'layer3'

    # Learning rate string for naming the checkpoint directory
    lr_exp = 'e'.join('{:.0E}'.format(Decimal(learning_rate)).split('E-'))

    # Unique training name
    if pretraining == 'Diagnosis' or model_evaluate:
        if progression in ['DiagnosisAll', 'Pain']:
            if pretraining == 'Diagnosis':
                diagnosispdpath = join(
                    log_dir.split(sampling)[0], 'diagnosisallperformance.csv')
                checkpointpd = pd.read_csv(diagnosispdpath,
                                           index_col=[
                                               'Sampling', 'Normalization',
                                               'Bone', 'Incidence', 'Fusion'
                                           ])
                best_checkpoint_path = checkpointpd.loc[(sampling, norm,
                                                         knee_type,
                                                         'DiagnosisAll',
                                                         biomarker),
                                                        'Checkpoint']
            elif pretraining == 'ImageNet':
                pass
            else:
                pdpath = join(
                    log_dir.split(sampling)[0],
                    progression.lower() + 'performance.csv')
                checkpointpd = pd.read_csv(pdpath,
                                           index_col=[
                                               'Sampling', 'Normalization',
                                               'Bone', 'Incidence', 'Fusion'
                                           ])
                best_checkpoint_path = checkpointpd.loc[(sampling, norm,
                                                         knee_type,
                                                         progression,
                                                         biomarker),
                                                        'Checkpoint']
        else:
            pdpath = join(
                log_dir.split(biomarker)[0],
                progression.lower() + 'performance.csv')
            checkpointpd = pd.read_csv(
                pdpath, index_col=['Bone', 'Incidence', 'Fusion'])
            best_checkpoint_path = checkpointpd.loc[(knee_type, progression,
                                                     biomarker), 'Checkpoint']

        best_checkpoint_dir = dirname(best_checkpoint_path)
        prev_training_name = best_checkpoint_dir.split('/')[-1]

        print('Previous training name =', prev_training_name)
        print('Previous training directory =', best_checkpoint_dir)

        prev_feature_extract = transfer_learning_type.index([
            item for item in transfer_learning_type
            if item in prev_training_name
        ][0])
        if learning_rate == 1e-4:
            learning_rate = float(prev_training_name[-3:].replace('e',
                                                                  'e-')) / 10
        elif learning_rate == 1e-5:
            learning_rate = float(prev_training_name[-3:].replace('e',
                                                                  'e-')) / 100
        elif learning_rate == 1e-6:
            learning_rate = float(prev_training_name[-3:].replace('e',
                                                                  'e-')) / 1000
        elif learning_rate == 1e-7:
            learning_rate = float(prev_training_name[-3:].replace(
                'e', 'e-')) / 10000
        lr_exp = 'e'.join('{:.0E}'.format(Decimal(learning_rate)).split('E-'))
        if pretraining == 'Diagnosis':
            training_name = 'Diagnosis' + model_name.capitalize(
            ) + transfer_learning_type[feature_extract] + lr_exp
        elif 'prev' in prev_checkpoint_path.lower() and model_evaluate:
            feature_extract = prev_feature_extract
            training_name = prev_training_name
        else:
            training_name = 'Prev' + model_name.capitalize(
            ) + transfer_learning_type[feature_extract] + lr_exp
    elif pretraining == 'ImageNet':
        training_name = 'ImageNet' + model_name.capitalize(
        ) + transfer_learning_type[feature_extract] + lr_exp
    elif pretraining == 'Random':
        training_name = 'Random' + model_name.capitalize(
        ) + transfer_learning_type[feature_extract] + lr_exp

    # Current Checkpoint path
    current_checkpoint_path = join(log_dir, training_name)

    print(f'Checkpoint Path: {current_checkpoint_path}')
    print(f'Transfer Learning: {transfer_learning_type[feature_extract]}')

    if not isdir(current_checkpoint_path):
        os.makedirs(current_checkpoint_path)
    elif os.listdir(current_checkpoint_path) and not model_evaluate:
        if not overwrite_flag:
            raise Exception(
                'Previous checkpoints found and no overwrite flag specified.')

    if not model_evaluate:
        checkpoint_log = join(current_checkpoint_path, 'log_file.txt')
        checkpoint_header = '*Model Name*: ' + model_name.capitalize(
        ) + '_' + knee_type.capitalize() + '_' + progression.capitalize(
        ) + '\t*Transfer Learning Type*: ' + transfer_learning_type[
            feature_extract] + '\t*Learning Rate*: ' + lr_exp + '\n\n'
        write_type = 'w'  # Write mode if file does not exist
        with open(checkpoint_log, write_type) as f_checkpoint:
            if pretraining == 'Diagnosis':
                f_checkpoint.write(
                    checkpoint_header.replace(
                        transfer_learning_type[feature_extract],
                        transfer_learning_type[feature_extract] +
                        '_Diagnosis'))
            elif pretraining == 'ImageNet':
                f_checkpoint.write(
                    checkpoint_header.replace(
                        transfer_learning_type[feature_extract],
                        transfer_learning_type[feature_extract] + '_ImageNet'))
            elif pretraining == 'Random':
                f_checkpoint.write(
                    checkpoint_header.replace(
                        transfer_learning_type[feature_extract],
                        transfer_learning_type[feature_extract] + '_Random'))
        write_type = 'a'  # Append mode after file is created

    def train_model(model,
                    dataloaders,
                    criterion,
                    optimizer,
                    current_checkpoint_path,
                    num_epochs=25,
                    prev_model=0,
                    is_inception=False):
        total_time = time.time()
        output_dict = {}
        output_dict['best_val_MCC'] = -1
        output_dict['best_val_TPN'] = 0
        output_dict['best_epoch'] = prev_model
        for phase in ['Train', 'Val']:
            output_dict[phase] = {}
            output_dict[phase]['auc'] = np.zeros(num_epochs)
            output_dict[phase]['acc'] = np.zeros(num_epochs)
            output_dict[phase]['mcc'] = np.zeros(num_epochs)
            output_dict[phase]['tpn'] = np.zeros(num_epochs)
            output_dict[phase]['loss'] = 100 * np.ones(num_epochs)
            output_dict[phase]['epoch'] = np.zeros(num_epochs, dtype=int)

        best_model_wts = copy.deepcopy(model.state_dict())
        best_MCC = -1
        best_TPN = 0
        best_epoch = 0

        # for epoch in tqdm(range(num_epochs)):
        for epoch in range(num_epochs):
            since = time.time()
            # print(f'\nEpoch {epoch}/{(num_epochs - 1)}')
            # print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['Train', 'Val']:
                if phase == 'Train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0
                running_corrects_positive = 0
                running_corrects_negative = 0
                running_positives = 0
                running_negatives = 0
                cnt = 0
                num_hold = len(dataloaders[phase].dataset)
                output_dict[phase]['labels'] = np.zeros(num_hold) - 1
                output_dict[phase]['softmax'] = np.zeros([num_hold, 2])

                # Iterate over data.
                for inputs, labels, paths in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'Train'):
                        # Get model outputs and calculate loss
                        # Special case for inception because in training it has an auxiliary output. In train
                        #   mode we calculate the loss by summing the final output and the auxiliary output
                        #   but in testing we only consider the final output.
                        if is_inception and phase == 'Train':
                            # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                            outputs, aux_outputs = model(inputs)
                            loss1 = criterion(outputs, labels)
                            loss2 = criterion(aux_outputs, labels)
                            loss = loss1 + 0.4 * loss2
                        else:
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)

                        _, preds = torch.max(torch.softmax(outputs, dim=1), 1)

                        # backward + optimize only if in training phase
                        if phase == 'Train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    running_corrects_positive += np.logical_and(
                        preds.cpu(), labels.data.cpu()).sum()
                    running_corrects_negative += len(
                        labels.data) - np.logical_or(preds.cpu(),
                                                     labels.data.cpu()).sum()
                    running_positives += torch.sum(labels.data)
                    running_negatives += len(labels.data) - torch.sum(
                        labels.data).int()
                    output_dict[phase]['labels'][(cnt * batch_size):(
                        (cnt + 1) * batch_size)] = labels.cpu().numpy()
                    output_dict[phase]['softmax'][(cnt * batch_size):(
                        (cnt + 1) * batch_size), :] = torch.softmax(
                            outputs, dim=1).detach().cpu().numpy()
                    cnt += 1
#                     print('True Positives: {} All Positives: {}'.format(running_corrects_positive, running_positives))
#                     print('True Negatives: {} All Negatives: {}'.format(running_corrects_negative, running_negatives))

                epoch_loss = running_loss / len(dataloaders[phase].dataset)
                epoch_acc = running_corrects.double() / len(
                    dataloaders[phase].dataset)
                epoch_tpr = running_corrects_positive / running_positives.double(
                )
                epoch_tnr = running_corrects_negative / running_negatives.double(
                )
                output_dict[phase]['auc'][epoch] = roc_auc_score(
                    output_dict[phase]['labels'],
                    output_dict[phase]['softmax'][:, 1])
                output_dict[phase]['mcc'][epoch] = matthews_corrcoef(
                    output_dict[phase]['labels'],
                    output_dict[phase]['softmax'][:, 1] >= 0.5)
                output_dict[phase]['tpn'][
                    epoch] = epoch_tpr + epoch_tnr + output_dict[phase]['auc'][
                        epoch]
                # print(f'TPR: {epoch_tpr.numpy():5.3f}')
                # print(f'TNR: {epoch_tnr.numpy():5.3f}')
                # print(f'AUC: {output_dict[phase]["auc"][epoch]:5.3f}')
                # print(f'MCC: {output_dict[phase]["mcc"][epoch]:6.3f}')
                # print(f'TPN: {output_dict[phase]["tpn"][epoch]:6.3f}')

                # if phase == 'Val':
                #     time_elapsed = time.time() - since
                #     print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f} \nTime since last epoch: {(time_elapsed // 60):.0f}m {(time_elapsed % 60):.0f}s')
                # else:
                #     print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

                # deep copy the model
                if phase == 'Val' and (output_dict[phase]['tpn'][epoch] >
                                       best_TPN) and (epoch_tpr >= 0.5) and (
                                           epoch_tnr >= 0.5):
                    best_MCC = output_dict[phase]['mcc'][epoch]
                    best_TPN = output_dict[phase]['tpn'][epoch]
                    output_dict['best_val_MCC'] = best_MCC
                    output_dict['best_val_TPN'] = best_TPN
                    output_dict['best_epoch'] = prev_model + epoch
                    chkpath = join(current_checkpoint_path, 'best_epoch')
                    torch.save(
                        {
                            'epoch': epoch,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'output_dict': output_dict
                        }, chkpath)
                    print('Saved model in ' + chkpath)
                    # best_model_wts = copy.deepcopy(model.state_dict())
                if phase == 'Val':
                    output_dict['Val']['acc'][epoch] = epoch_acc
                    output_dict['Val']['loss'][epoch] = epoch_loss
                    output_dict['Val']['epoch'][epoch] = prev_model + epoch


#                     scheduler.step(1 - output_dict['Val']['auc'][epoch])
                if phase == 'Val' and (np.argmin(output_dict['Val']['loss']) +
                                       patience < epoch):
                    print(
                        f'Early stopping due to validation loss not improving for {patience} epochs'
                    )
                    quit()
                if phase == 'Train':
                    output_dict['Train']['acc'][epoch] = epoch_acc
                    output_dict['Train']['loss'][epoch] = epoch_loss
                    output_dict['Train']['epoch'][epoch] = prev_model + epoch

            if epoch % 1 == 0:
                current_train_acc = output_dict['Train']['acc'][epoch]
                current_val_acc = output_dict['Val']['acc'][epoch]
                current_train_loss = output_dict['Train']['loss'][epoch]
                current_val_loss = output_dict['Val']['loss'][epoch]
                best_val_MCC = output_dict['best_val_MCC']
                best_val_TPN = output_dict['best_val_TPN']
                with open(checkpoint_log, write_type) as f_checkpoint:
                    f_checkpoint.write(f'*Epoch*: {output_dict["Train"]["epoch"][epoch]:3}  *Train Loss*: {current_train_loss:5.3f}' + \
                    f'  *Val Loss*: {current_val_loss:5.3f}  *TPR*: {epoch_tpr.numpy():5.3f}' + \
                    f'  *TNR*: {epoch_tnr.numpy():5.3f}  *AUC*: {output_dict["Val"]["auc"][epoch]:5.3f}  *MCC*: {output_dict["Val"]["mcc"][epoch]:6.3f}' + \
                    f'  *TPN*: {output_dict["Val"]["tpn"][epoch]:5.3f}  *Best Val TPN*: {best_val_TPN:6.3f}  *Best Epoch*: {output_dict["best_epoch"]:3}\n')

            # print()

        total_time_elapsed = time.time() - total_time
        print(
            f'Training complete in {(total_time_elapsed // 3600):.0f}h {(total_time_elapsed // 60):.0f}m {(total_time_elapsed % 60):.0f}s'
        )
        print(f'Best Val MCC: {best_MCC:6.3f}')

        # load best model weights
        model.load_state_dict(best_model_wts)
        return model

    # Set Model Parameters’ .requires_grad attribute
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    # This helper function sets the .requires_grad attribute of the
    # parameters in the model to False when we are feature extracting. By
    # default, when we load a pretrained model all of the parameters have
    # .requires_grad=True, which is fine if we are training from scratch
    # or finetuning. However, if we are feature extracting and only want to
    # compute gradients for the newly initialized layer then we want all of
    # the other parameters to not require gradients. This will make more sense
    # later.

    def set_parameter_requires_grad(model, feature_extracting):
        if feature_extracting > 0:
            bypass_condition = 0
            for param_name, param in model.named_parameters():
                if transfer_learning_dict[
                        feature_extracting] in param_name or bypass_condition:
                    param.requires_grad = True
                    bypass_condition = 1
                else:
                    param.requires_grad = False

    def initialize_model(model_name,
                         num_classes,
                         feature_extract,
                         use_pretrained=True):
        # Initialize these variables which will be set in this if statement. Each of these
        #   variables is model specific.
        model_ft = None
        input_size = 0

        if 'resnet' in model_name.lower():
            ''' Resnet50
            '''
            if '18' in model_name.lower():
                model_ft = models.resnet18(pretrained=use_pretrained)
            elif '34' in model_name.lower():
                model_ft = models.resnet34(pretrained=use_pretrained)
            elif '50' in model_name.lower():
                model_ft = models.resnet50(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            num_ftrs = model_ft.fc.in_features
            print(f'Fully Connected Layer Features = {num_ftrs}')
            # model_ft.fc.register_forward_hook(lambda m, inp, out: nn.functional.dropout(out, p=drop_rate, training=m.training))
            # if not feature_extract:
            if pretraining == 'Diagnosis':
                model_ft.layer1 = nn.Sequential(nn.Dropout(drop_rate),
                                                model_ft.layer1)
                model_ft.layer2 = nn.Sequential(nn.Dropout(drop_rate),
                                                model_ft.layer2)
                model_ft.layer3 = nn.Sequential(nn.Dropout(drop_rate),
                                                model_ft.layer3)
                model_ft.layer4 = nn.Sequential(nn.Dropout(drop_rate),
                                                model_ft.layer4)

                model_ft.fc = nn.Sequential(nn.Dropout(drop_rate),
                                            nn.Linear(num_ftrs, num_classes))
            else:
                model_ft.fc = nn.Linear(num_ftrs, num_classes)
            # print(f'Layers = {model_ft.children}')
            input_size = 224

        elif model_name.lower() == 'alexnet':
            ''' Alexnet
            '''
            model_ft = models.alexnet(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            num_ftrs = model_ft.classifier[6].in_features
            model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
            input_size = 224

        elif model_name.lower() == 'vgg':
            ''' VGG11_bn
            '''
            model_ft = models.vgg11_bn(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            num_ftrs = model_ft.classifier[6].in_features
            model_ft.classifier[6] = nn.Linear(num_ftrs, num_classes)
            input_size = 224

        elif model_name.lower() == 'squeezenet':
            ''' Squeezenet
            '''
            model_ft = models.squeezenet1_0(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            model_ft.classifier[1] = nn.Conv2d(512,
                                               num_classes,
                                               kernel_size=(1, 1),
                                               stride=(1, 1))
            model_ft.num_classes = num_classes
            input_size = 224

        elif model_name.lower() == 'densenet':
            ''' Densenet
            '''
            model_ft = models.densenet121(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            num_ftrs = model_ft.classifier.in_features
            model_ft.classifier = nn.Linear(num_ftrs, num_classes)
            input_size = 224

        elif model_name.lower() == 'inception':
            ''' Inception v3 
            Be careful, expects (299,299) sized images and has auxiliary output
            '''
            model_ft = models.inception_v3(pretrained=use_pretrained)
            set_parameter_requires_grad(model_ft, feature_extract)
            # Handle the auxilary net
            num_ftrs = model_ft.AuxLogits.fc.in_features
            model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
            # Handle the primary net
            num_ftrs = model_ft.fc.in_features
            model_ft.fc = nn.Linear(num_ftrs, num_classes)
            input_size = 299

        else:
            print('Invalid model name, exiting...')
            exit()

        return model_ft, input_size

    # Initialize the model for this run
    model_ft, input_size = initialize_model(
        model_name,
        num_classes,
        feature_extract,
        use_pretrained=(pretraining == 'ImageNet'))

    # Load Data
    # ---------
    # Now that we know what the input size must be, we can initialize the data
    # transforms, image datasets, and the dataloaders. Notice, the models were
    # pretrained with the hard-coded normalization values, as described
    # https://pytorch.org/docs/master/torchvision/models.html

    # Data augmentation and normalization for training
    # Just normalization for validation

    data_transforms = {
        'Train':
        transforms.Compose([
            # No augmentation necessary for spherical maps since that is done before the transformation
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(p=0.25),
            transforms.RandomVerticalFlip(p=0.25),
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
            # transforms.RandomErasing(p=0.25)
        ]),
        'Val':
        transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        'Hold':
        transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    }
    print(f'Transforms: {data_transforms}')

    # print('Initializing Datasets and Dataloaders...')

    # Create training and validation datasets
    image_datasets = {
        x: datasets.ImageFolder(join(data_dir, x), data_transforms[x])
        for x in ['Train', 'Val', 'Hold']
    }

    for split in ['Train', 'Val', 'Hold']:
        if 'year' in progression:
            image_datasets[split].class_to_idx = {'Healthy': 0, 'Incidence': 1}
        elif 'pain' in progression.lower():
            image_datasets[split].class_to_idx = {'Healthy': 0, 'Pain': 1}

    # Create training and validation dataloaders
    dataloaders_dict = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=8)
        for x in ['Train', 'Val', 'Hold']
    }

    # Detect if we have a GPU available
    device = torch.device('cuda:' +
                          gpu if torch.cuda.is_available() else 'cpu')

    # Send the model to GPU
    model_ft = model_ft.to(device)

    # Gather the parameters to be optimized/updated in this run. If we are
    #  finetuning we will be updating all parameters. However, if we are
    #  doing feature extract method, we will only update the parameters
    #  that we have just initialized, i.e. the parameters with requires_grad
    #  is True.
    params_to_update = model_ft.parameters()
    if feature_extract > 0:
        params_to_update = []
        for name, param in model_ft.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
        num_params = len(params_to_update)
    else:
        num_params = len(list(model_ft.parameters()))

    # Observe that all parameters are being optimized
    print('Number of training parameters:', num_params)
    optimizer_ft = optim.Adam(params_to_update,
                              lr=learning_rate,
                              weight_decay=0.1)

    # Run Training and Validation Step
    # --------------------------------
    # Finally, the last step is to setup the loss for the model, then run the
    # training and validation function for the set number of epochs. Notice,
    # depending on the number of epochs this step may take a while on a CPU.
    # Also, the default learning rate is not optimal for all of the models, so
    # to achieve maximum accuracy it would be necessary to tune for each model
    # separately.

    pain_weights = [0.86589497, 1.18325617]
    # Setup the loss fxn
    class_weights = torch.FloatTensor(pain_weights).cuda()
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    print(f'Loss Class Weights = {pain_weights}')

    info = nvmlDeviceGetMemoryInfo(h)

    if feature_extract == 1:
        if '18' in model_name:
            batch_size = 905
        elif '34' in model_name:
            batch_size = 905
        else:
            batch_size = 705
        batch_step = 60
    elif feature_extract == 3:
        if '18' in model_name:
            batch_size = 855
        elif '34' in model_name:
            batch_size = 855
        else:
            batch_size = 305
        batch_step = 60
    elif feature_extract == 2:
        if '18' in model_name:
            batch_size = 955
        elif '34' in model_name:
            batch_size = 705
        else:
            batch_size = 155
        batch_step = 30
    elif feature_extract == 0:
        if '18' in model_name:
            batch_size = 425
        elif '34' in model_name:
            batch_size = 305
        else:
            batch_size = 105
        batch_step = 10

    if info.total > 1.5e10:
        batch_size = round(2.8 * batch_size)
        batch_step = round(2 * batch_step)

    n_channels = 3
    batch_adapt = 1
    print('Batch Size:', batch_size)
    print('Batch Step:', batch_step)
    model_ft.train()
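    # Probe for the largest workable batch size: run one forward/backward pass
    # on random tensors, keep growing batch_size by batch_step while more than
    # ~2 GB of GPU memory remains uncached, and back off by 1.5 * batch_step if
    # a CUDA out-of-memory RuntimeError is raised.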
    while batch_adapt:
        input_shape = (batch_size, n_channels, input_size, input_size)
        try:
            inputs = torch.randn(*input_shape, dtype=torch.float32).cuda()
            labels = torch.ones(batch_size, dtype=torch.int64).cuda()
            # zero the parameter gradients
            optimizer_ft.zero_grad()
            outputs = model_ft(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer_ft.step()
            print('Allocated:', torch.cuda.max_memory_allocated())
            print('Cached:', torch.cuda.max_memory_cached())
            del inputs, labels, outputs, loss
            torch.cuda.empty_cache()
            if (info.total - torch.cuda.max_memory_cached()) < 2e9:
                batch_adapt = 0
                print('Final Batch Size:', batch_size)
            else:
                batch_size += batch_step
        except RuntimeError as error:
            print(error)
            batch_size -= round(1.5 * batch_step)
            batch_adapt = 0
            print('Final Batch Size:', batch_size)

    dataloaders_dict = {
        x: torch.utils.data.DataLoader(image_datasets[x],
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=8)
        for x in ['Train', 'Val', 'Hold']
    }

    if pretraining == 'Diagnosis':
        best_checkpoint_load = torch.load(best_checkpoint_path,
                                          map_location='cuda:' + gpu)
        model_ft.load_state_dict(best_checkpoint_load['model_state_dict'])
        print(f'Loading previous model: {best_checkpoint_path}')

    if model_evaluate:
        best_checkpoint_load = torch.load(best_checkpoint_path,
                                          map_location='cuda:' + gpu)
        model_ft.load_state_dict(best_checkpoint_load['model_state_dict'])
        print(f'Loading best model: {best_checkpoint_path}')

        # We're evaluating the model_load here:
        def model_eval(model_load, dataloaders, phase_range=['Val']):

            model_load.eval()  # Set model_load to evaluate mode
            results_dict = {}

            for phase in phase_range:
                cnt = 0
                num_hold = len(dataloaders[phase].dataset)
                results_dict[phase] = {}
                results_dict[phase]['file_names'] = []
                results_dict[phase]['labels'] = np.zeros(num_hold) - 1
                results_dict[phase]['class_predict'] = np.zeros(num_hold) - 1
                results_dict[phase]['logits'] = np.zeros([num_hold, 2])
                results_dict[phase]['softmax'] = np.zeros([num_hold, 2])
                #                     results_dict[phase]['features'] = np.zeros([num_hold,2048])

                # Iterate over data.
                for inputs, labels, paths in tqdm(dataloaders[phase]):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    results_dict[phase]['file_names'].extend(list(paths))
                    results_dict[phase]['labels'][(cnt * batch_size):(
                        (cnt + 1) * batch_size)] = labels.cpu().numpy()

                    with torch.set_grad_enabled(False):
                        outputs = model_load(inputs)
                        results_dict[phase]['softmax'][(cnt * batch_size):(
                            (cnt + 1) * batch_size), :] = torch.softmax(
                                outputs, dim=1).cpu().numpy()
                        results_dict[phase]['logits'][(cnt * batch_size):(
                            (cnt + 1) *
                            batch_size), :] = outputs.cpu().numpy()
                        #                             results_dict[phase]['features'][(cnt*batch_size):((cnt + 1)*batch_size),:] = outputs.cpu().numpy()
                        # forward
                        _, preds = torch.max(torch.softmax(outputs, dim=1), 1)
                        results_dict[phase]['class_predict'][(
                            cnt *
                            batch_size):((cnt + 1) *
                                         batch_size)] = preds.cpu().numpy()
                        # statistics
                        pred_comp = (
                            labels.cpu().numpy() == preds.cpu().numpy())
                        cnt += 1

            return results_dict

        def perf_measure(y_actual, y_hat):
            TP = 0
            FP = 0
            TN = 0
            FN = 0

            for i in range(len(y_hat)):
                if y_actual[i] == y_hat[i] == 1:
                    TP += 1
                if y_hat[i] == 1 and y_actual[i] != y_hat[i]:
                    FP += 1
                if y_actual[i] == y_hat[i] == 0:
                    TN += 1
                if y_hat[i] == 0 and y_actual[i] != y_hat[i]:
                    FN += 1

            return TP, FP, TN, FN

        if model_evaluate.lower() == 'all':
            phase_range = ['Train', 'Val', 'Hold']
            eval_dict = model_eval(model_ft, dataloaders_dict, phase_range)
        elif model_evaluate.lower() == 'trainval':
            phase_range = ['Train', 'Val']
            eval_dict = model_eval(model_ft, dataloaders_dict, phase_range)
        elif model_evaluate.lower() == 'valhold':
            phase_range = ['Val', 'Hold']
            eval_dict = model_eval(model_ft, dataloaders_dict, phase_range)
        else:
            phase_range = [model_evaluate.lower().capitalize()]
            eval_dict = model_eval(model_ft, dataloaders_dict, phase_range)

        for i in phase_range:
            TP, FP, TN, FN = perf_measure(eval_dict[i]['labels'],
                                          eval_dict[i]['class_predict'])
            # print(TP, FP, TN, FN)
            sensitivity = TP / (TP + FN)
            specificity = TN / (TN + FP)
            eval_dict[i]['sensitivity'] = sensitivity
            eval_dict[i]['specificity'] = specificity
            eval_dict[i]['auc'] = roc_auc_score(eval_dict[i]['labels'],
                                                eval_dict[i]['softmax'][:, 1])
            if i == 'Hold':
                continue
            else:
                print("*{}* \nSensitivity = {} \nSpecificity = {} \nAUC = {}".
                      format(i.capitalize(), sensitivity, specificity,
                             eval_dict[i]['auc']))

        with open(join(best_checkpoint_dir, 'model_perf.pickle'), 'wb') as f:
            eval_dict['checkpoint'] = [best_checkpoint_path]
            pickle.dump(eval_dict, f)

        quit()

    else:
        if pretraining == 'Diagnosis':
            model_ft.layer1 = model_ft.layer1[1]
            model_ft.layer2 = model_ft.layer2[1]
            model_ft.layer3 = model_ft.layer3[1]
            model_ft.layer4 = model_ft.layer4[1]
            model_ft.fc = model_ft.fc[1]
        model_ft = train_model(model_ft,
                               dataloaders_dict,
                               criterion,
                               optimizer_ft,
                               current_checkpoint_path,
                               num_epochs=num_epochs,
                               is_inception=(model_name == "inception"))
Exemplo n.º 35
0
 def __init__(self, name, init_config, instances):
     super(NvmlCheck, self).__init__(name, init_config, instances)
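     # Initialising NVML and immediately shutting it down below effectively
     # just verifies at start-up that the NVML library is usable; any later
     # collection would need to call nvmlInit() again.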
     pynvml.nvmlInit()
     pynvml.nvmlShutdown()
Exemplo n.º 36
0
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}
                def meminfo(memory):
                    return {
                            "total"  : int(memory.total),
                            "free"   : int(memory.free),
                            "used"   : int(memory.used),
                            }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                       ("name",                     "nvmlDeviceGetName",                    (),     str),
                       ("serial",                   "nvmlDeviceGetSerial",                  (),     str),
                       ("uuid",                     "nvmlDeviceGetUUID",                    (),     str),
                       ("pci",                      "nvmlDeviceGetPciInfo",                 (),     pciinfo),
                       ("memory",                   "nvmlDeviceGetMemoryInfo",              (),     meminfo),
                       ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration",   (),     int),
                       ("pcie-link-width-max",      "nvmlDeviceGetMaxPcieLinkWidth",        (),     int),
                       ("pcie-link-generation",     "nvmlDeviceGetCurrPcieLinkGeneration",  (),     int),
                       ("pcie-link-width",          "nvmlDeviceGetCurrPcieLinkWidth",       (),     int),
                       ("clock-info-graphics",      "nvmlDeviceGetClockInfo",               (0,),   int),
                       ("clock-info-sm",            "nvmlDeviceGetClockInfo",               (1,),   int),
                       ("clock-info-mem",           "nvmlDeviceGetClockInfo",               (2,),   int),
                       ("clock-info-graphics-max",  "nvmlDeviceGetMaxClockInfo",            (0,),   int),
                       ("clock-info-sm-max",        "nvmlDeviceGetMaxClockInfo",            (1,),   int),
                       ("clock-info-mem-max",       "nvmlDeviceGetMaxClockInfo",            (2,),   int),
                       ("fan-speed",                "nvmlDeviceGetFanSpeed",                (),     int),
                       ("temperature",              "nvmlDeviceGetTemperature",             (0,),   int),
                       ("power-state",              "nvmlDeviceGetPowerState",              (),     int),
                       ("vbios-version",            "nvmlDeviceGetVbiosVersion",            (),     str),
                       ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
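A short usage sketch for identify_cards(); each key is only present when the corresponding NVML query succeeded, and the printing here is illustrative:

if __name__ == "__main__":
    for index, props in identify_cards().items():
        name = props.get("name", "unknown")
        memory = props.get("memory", {})
        print("GPU %i: %s, %s bytes total" % (index, name, memory.get("total", "?")))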
Exemplo n.º 37
0
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode('utf-8')  # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]

                # TODO: ps_process is being cached, but the dict below is not.
                process['username'] = ps_process.username()
                # cmdline returns full path;
                # as in `ps -o comm`, get short cmdnames.
                _cmdline = ps_process.cmdline()
                if not _cmdline:
                    # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                    process['command'] = '?'
                    process['full_command'] = ['?']
                else:
                    process['command'] = os.path.basename(_cmdline[0])
                    process['full_command'] = _cmdline
                # Bytes to MBytes
                # if drivers are not TTC this will be None.
                usedmem = nv_process.usedGpuMemory // MB if \
                          nv_process.usedGpuMemory else None
                process['gpu_memory_usage'] = usedmem
                process['cpu_percent'] = ps_process.cpu_percent()
                process['cpu_memory_usage'] = \
                    round((ps_process.memory_percent() / 100.0) *
                          psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            try:
                temperature = N.nvmlDeviceGetTemperature(
                    handle, N.NVML_TEMPERATURE_GPU)
            except N.NVMLError:
                temperature = None  # Not supported

            try:
                fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            except N.NVMLError:
                fan_speed = None  # Not supported

            try:
                memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            except N.NVMLError:
                memory = None  # Not supported

            try:
                utilization = N.nvmlDeviceGetUtilizationRates(handle)
            except N.NVMLError:
                utilization = None  # Not supported

            try:
                utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
            except N.NVMLError:
                utilization_enc = None  # Not supported

            try:
                utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
            except N.NVMLError:
                utilization_dec = None  # Not supported

            try:
                power = N.nvmlDeviceGetPowerUsage(handle)
            except N.NVMLError:
                power = None

            try:
                power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            except N.NVMLError:
                power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                # A single process might run in both graphics and compute
                # mode; however, we display the process only once.
                seen_pids = set()
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    if nv_process.pid in seen_pids:
                        continue
                    seen_pids.add(nv_process.pid)
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass
                    except FileNotFoundError:
                        # Ignore the exception, which has probably occurred
                        # in psutil due to a non-existent PID (see #95).
                        # The exception should have been translated, but
                        # there appears to be a bug in psutil. It is unlikely
                        # that FileNotFoundError is thrown in other situations.
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                'temperature.gpu': temperature,
                'fan.speed': fan_speed,
                'utilization.gpu': utilization.gpu if utilization else None,
                'utilization.enc': utilization_enc[0] if utilization_enc else None,
                'utilization.dec': utilization_dec[0] if utilization_dec else None,
                'power.draw': power // 1000 if power is not None else None,
                'enforced.power.limit': power_limit // 1000 if power_limit is not None else None,
                # Convert bytes into MBytes
                'memory.used': memory.used // MB if memory else None,
                'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None  # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
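The NVML pattern behind new_query(), reduced to a minimal sketch (init, iterate over handles, query, shutdown). Error handling is omitted for brevity; the memory and utilization queries can raise NVMLError on unsupported devices:

import pynvml

def quick_gpu_stats():
    pynvml.nvmlInit()
    stats = []
    try:
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            stats.append({
                "index": i,
                "memory.used.mb": mem.used // (1024 * 1024),
                "memory.total.mb": mem.total // (1024 * 1024),
                "utilization.gpu": util.gpu,
            })
    finally:
        pynvml.nvmlShutdown()
    return stats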
Exemplo n.º 38
0
    def __init__(self, ground_policy, F_s, F_sa, env, device, log,
                 hyperparameters):
        self.env = env
        self.device = device
        self.log = log
        self.hyperparameters = hyperparameters
        self.ground_policy = ground_policy
        self.name = ""
        self.verbose = hyperparameters["verbose"]

        # Check env:
        self.discrete_env = 'Discrete' in str(env.action_space)
        if self.discrete_env:
            self.num_actions = self.env.action_space.n
            self.action_low = torch.zeros(self.num_actions, device=self.device)
            self.action_high = torch.ones(self.num_actions, device=self.device)
            if self.verbose:
                print("Num actions: ", self.num_actions)
        else:
            self.num_actions = len(self.env.action_space.high)
            self.action_low = torch.tensor(env.action_space.low,
                                           device=self.device)
            self.action_high = torch.tensor(env.action_space.high,
                                            device=self.device)
            if self.verbose:
                print("Env action low: ", self.action_low)
                print("Env action high: ", self.action_high)

        # Set up params:
        # Actor-Critic:
        self.use_actor_critic = hyperparameters["use_actor_critic"]
        self.use_CACLA_V = hyperparameters["use_CACLA_V"]
        self.use_CACLA_Q = hyperparameters["use_CACLA_Q"]
        self.use_DDPG = hyperparameters["use_DDPG"]
        self.use_SPG = hyperparameters["use_SPG"]
        self.use_GISPG = hyperparameters["use_GISPG"]
        # QV:
        self.use_QV = hyperparameters["use_QV"]
        self.use_QVMAX = hyperparameters["use_QVMAX"]
        # Exploration:
        self.gaussian_action_noise = hyperparameters["action_sigma"]
        self.boltzmann_exploration_temp = hyperparameters["boltzmann_temp"]
        self.epsilon = hyperparameters["epsilon"]
        self.epsilon_mid = hyperparameters["epsilon_mid"]
        if self.epsilon_mid:
            self.eps_factor = self.epsilon_mid**(1 / hyperparameters["steps"])
            self.epsilon = 1
        # General:
        self.use_half = hyperparameters["use_half"]
        self.batch_size = hyperparameters["batch_size"]
        self.use_world_model = hyperparameters["use_world_model"]

        # TODO: -Include PER with prioritization based on Upper Bound of Gradient Norm.
        # TODO: -include different sampling schemes from the papers investigating PER in SL (small and big buffer for gradient norm too)

        # TODO: -add goal to replay buffer and Transition (For HRL)
        # GPU memory tracking via NVML:
        if torch.cuda.is_available():
            nvmlInit()
            self.nvml_handle = nvmlDeviceGetHandleByIndex(0)
            self.max_gpu_bytes = torch.cuda.get_device_properties(
                self.device).total_memory
        self.mem_usage = None
        self.current_episode = []
        # Eligibility traces:
        self.use_efficient_traces = hyperparameters["use_efficient_traces"]
        self.elig_traces_update_steps = hyperparameters[
            "elig_traces_update_steps"]
        self.elig_traces_anneal_lambda = hyperparameters[
            "elig_traces_anneal_lambda"]
        self.lambda_val = hyperparameters["elig_traces_lambda"]
        # Set up replay buffer:
        self.stack_dim = hyperparameters["stack_dim"]
        self.stack_count = hyperparameters["frame_stack"]
        self.buffer_size = hyperparameters[
            "replay_buffer_size"] + hyperparameters["num_expert_samples"]
        self.use_PER = hyperparameters["use_PER"]
        self.use_CER = hyperparameters["use_CER"]
        self.PER_alpha = hyperparameters["PER_alpha"]
        self.PER_start_beta = hyperparameters["PER_beta"]
        self.PER_beta = self.PER_start_beta
        self.PER_anneal_beta = hyperparameters["PER_anneal_beta"]
        self.PER_max_priority = hyperparameters["PER_max_priority"]
        self.PER_running_avg = hyperparameters["PER_running_avg"]
        self.importance_weights = None

        # Create replay buffer:
        self.memory = self.create_replay_buffer()

        # Feature extractors:
        self.F_s = F_s
        self.F_sa = F_sa
        self.state_feature_len = F_s.layers_merge[-1].out_features
        if F_sa is not None:
            self.state_action_feature_len = F_sa.layers_merge[-1].out_features

        # Set up Networks:
        self.use_half = hyperparameters[
            "use_half"] and torch.cuda.is_available()
        self.nets = []
        self.actor, self.Q, self.V = self.init_actor_critic(
            self.F_s, self.F_sa)
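With the nvml_handle and max_gpu_bytes stored in the constructor above, GPU memory pressure can be sampled during training. A hedged sketch; get_gpu_mem_fraction is a hypothetical helper, not part of the original class:

from pynvml import nvmlDeviceGetMemoryInfo

def get_gpu_mem_fraction(nvml_handle, max_gpu_bytes):
    # Fraction of the device's total memory currently in use, according to NVML.
    info = nvmlDeviceGetMemoryInfo(nvml_handle)
    return info.used / max_gpu_bytes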
Exemplo n.º 39
0
def nccl_GPU(input, layer_id, construct_log, name, struct=None, splits=[], **kwargs):
    with tf.device("/cpu:0"):
        if "number_of_GPUs" not in construct_log:
            pynvml.nvmlInit()
            nb_GPU = pynvml.nvmlDeviceGetCount()
            construct_log["number_of_GPUs"] = nb_GPU
        else:
            nb_GPU = construct_log["number_of_GPUs"]
        gpu_input = [None]*nb_GPU
        towers_args = []
        towers_dict = []
        for g in range(nb_GPU):
            towers_args.append(dict(kwargs))
            towers_dict.append(dict())

        original_data = {}
        for key in splits:
            if key == "input":
                gpu_input = tf.split(input, nb_GPU)
            elif key in list(kwargs.keys()):
                if type(kwargs[key]) == str:
                    value_to_split = construct_log[kwargs[key]]
                else:
                    value_to_split = kwargs[key]
                value_splits = tf.split(value_to_split, nb_GPU)
                for i, targs in enumerate(towers_args):
                    targs[key]=value_splits[i]
            else:
                value_to_split = construct_log[key]
                value_splits = tf.split(value_to_split, nb_GPU)
                for i, tdic in enumerate(towers_dict):
                    tdic[key] = value_splits[i]
                original_data[key] = value_to_split

    variables = []
    outs = []
    destinations = []
    for i in range(nb_GPU):
        with tf.device("/gpu:"+str(i)):
            destinations.append("/gpu:"+str(i))
            for key in towers_dict[i]:
                construct_log[key[3:]] = towers_dict[i][key]

            replica_name = name if i == 0 else name+"_"+str(i)
            net_output = network(gpu_input[i],
                                 layer_id,
                                 construct_log,
                                 replica_name,
                                 struct=struct,
                                 var_scope=True,
                                 **towers_args[i])
            replica_variables = tf.global_variables(scope=construct_log["network_scope"][replica_name].name)
            replica_variables = sorted(replica_variables, key = lambda x : x.name)
            variables.append(replica_variables)
            outs.append(net_output)

    construct_log["tower_devices"] = destinations
    master = variables[0]

    variables = list(zip(*variables))

    for var in variables:
        for replic in var[1:]:
            construct_log["initialization_opps:[]"] = tf.assign(replic, var[0])

    for key in original_data:
        construct_log[key[3:]] = original_data[key]
    return input
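The snippet caches the device count in construct_log so nvmlInit() is not called for every layer. The same idea in isolation, as a small hedged helper:

import pynvml

_GPU_COUNT = None

def cached_gpu_count():
    # Initialize NVML once and memoize the device count.
    global _GPU_COUNT
    if _GPU_COUNT is None:
        pynvml.nvmlInit()
        _GPU_COUNT = pynvml.nvmlDeviceGetCount()
    return _GPU_COUNT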
Exemplo n.º 40
0
    def _initialize(self, log=False):
        """ Initialize the library that will be returning stats for the system's GPU(s).
        For Nvidia (on Linux and Windows) the library is `pynvml`. For Nvidia (on macOS) the
        library is `pynvx`. For AMD `plaidML` is used.

        Parameters
        ----------
        log: bool, optional
            Whether the class should output information to the logger. There may be occasions where
            the logger has not yet been set up when this class is queried. Attempting to log in
            these instances will raise an error. If GPU stats are being queried prior to the
            logger being available then this parameter should be set to ``False``. Otherwise set
            to ``True``. Default: ``False``
        """
        if not self._initialized:
            if get_backend() == "cpu":
                pass
            elif get_backend() == "amd":
                self._log("debug", "AMD Detected. Using plaidMLStats")
                loglevel = "INFO" if self._logger is None else self._logger.getEffectiveLevel()
                if plaidlib:
                    self._plaid = plaidlib(log_level=loglevel, log=log)
            elif IS_MACOS:
                self._log("debug", "macOS Detected. Using pynvx")
                try:
                    pynvx.cudaInit()
                except RuntimeError:
                    self._initialized = True
                    return
            else:
                try:
                    self._log("debug", "OS is not macOS. Trying pynvml")
                    pynvml.nvmlInit()
                except (pynvml.NVMLError_LibraryNotFound,  # pylint: disable=no-member
                        pynvml.NVMLError_DriverNotLoaded,  # pylint: disable=no-member
                        pynvml.NVMLError_NoPermission) as err:  # pylint: disable=no-member
                    if plaidlib is not None:
                        self._log("debug", "pynvml errored. Trying plaidML")
                        self._plaid = plaidlib(log=log)
                    else:
                        msg = ("There was an error reading from the Nvidia Machine Learning "
                               "Library. Either you do not have an Nvidia GPU (in which case "
                               "this warning can be ignored) or the most likely cause is "
                               "incorrectly installed drivers. If this is the case, Please remove "
                               "and reinstall your Nvidia drivers before reporting."
                               "Original Error: {}".format(str(err)))
                        self._log("warning", msg)
                        self._initialized = True
                        return
                except Exception as err:  # pylint: disable=broad-except
                    msg = ("An unhandled exception occured loading pynvml. "
                           "Original error: {}".format(str(err)))
                    if self._logger:
                        self._logger.error(msg)
                    else:
                        print(msg)
                    self._initialized = True
                    return
            self._initialized = True
            self._get_device_count()
            self._get_active_devices()
            self._get_handles()
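The fallback chain above (pynvml, then plaidML, then give up) reduces to a simple probe. A hedged sketch covering only the pynvml branch; the real class also handles pynvx on macOS and plaidML for AMD:

import pynvml

def pick_gpu_backend():
    # Returns "nvml" when NVML initializes, otherwise "none".
    try:
        pynvml.nvmlInit()
        return "nvml"
    except pynvml.NVMLError:
        return "none"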
Exemplo n.º 41
0
def main():
    """Start the simulation server."""
    # the following config variables read from the config.json file
    # are described here:
    #
    # port:              local port on which the server is listening (launching webots instances).
    # sslKey:            private key for a SSL enabled server.
    # sslCertificate:    certificate for a SSL enabled server.
    # projectsDir:       directory in which projects are located.
    # keyDir:            directory where the host keys needed for validation are stored.
    # logDir:            directory where the log files are written.
    # monitorLogEnabled: specify if the monitor data have to be stored in a file.
    # maxConnections:    maximum number of simultaneous Webots instances.
    #
    global config
    global snapshots
    global nvidia
    global network_sent
    global network_received
    global monitorFile
    n = psutil.net_io_counters()
    network_sent = n.bytes_sent
    network_received = n.bytes_recv
    snapshots = []
    config['WEBOTS_HOME'] = os.getenv('WEBOTS_HOME',
                                      '../../..').replace('\\', '/')
    config['webots'] = config['WEBOTS_HOME']
    if sys.platform == 'darwin':
        config['webots'] += '/Contents/MacOS/webots'
    elif sys.platform == 'win32':
        config['webots'] += '/msys64/mingw64/bin/webots.exe'
    else:  # linux
        config['webots'] += '/webots'
    if 'projectsDir' not in config:
        config['projectsDir'] = config[
            'WEBOTS_HOME'] + '/projects/samples/robotbenchmark'
    else:
        config['projectsDir'] = expand_path(config['projectsDir'])
    if 'keyDir' not in config:
        config['keyDir'] = 'key'
    else:
        config['keyDir'] = expand_path(config['keyDir'])
    if 'port' not in config:
        config['port'] = 2000
    if 'maxConnections' not in config:
        config['maxConnections'] = 100
    os.environ['WEBOTS_FIREJAIL_CONTROLLERS'] = '1'
    config['instancesPath'] = tempfile.gettempdir().replace(
        '\\', '/') + '/webots/instances/'
    # create the instances path
    if os.path.exists(config['instancesPath']):
        shutil.rmtree(config['instancesPath'])
    mkdir_p(config['instancesPath'])

    # logging system
    log_formatter = logging.Formatter(
        '%(asctime)-15s [%(levelname)-7s]  %(message)s')
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    if 'logDir' not in config:
        config['logDir'] = 'log'
    else:
        config['logDir'] = expand_path(config['logDir'])
    simulationLogDir = os.path.join(config['logDir'], 'simulation')
    logFile = os.path.join(simulationLogDir, 'output.log')
    try:
        if not os.path.exists(simulationLogDir):
            os.makedirs(simulationLogDir)
        file_handler = logging.FileHandler(logFile)
        file_handler.setFormatter(log_formatter)
        file_handler.setLevel(logging.INFO)
        root_logger.addHandler(file_handler)
    except (OSError, IOError) as e:
        sys.exit("Log file '" + logFile + "' cannot be created: " + str(e))

    # create monitor.csv used by Snapshot if needed
    if 'monitorLogEnabled' not in config:
        config['monitorLogEnabled'] = True
    if config['monitorLogEnabled']:
        monitorFile = os.path.join(simulationLogDir, 'monitor.csv')
        try:
            if not os.path.exists(simulationLogDir):
                os.makedirs(simulationLogDir)
            file = open(monitorFile, 'w')
            file.write(
                "Timestamp, Webots running, Webots idle, CPU load, CPU memory, "
                "GPU load compute, GPU load memory, GPU memory, Swap, Disk, Network sent, Network received\n"
            )
            file.close()
        except (OSError, IOError) as e:
            logging.error("Log file '" + monitorFile +
                          "' cannot be created: " + str(e))

    # startup janus server if needed
    if 'multimediaServer' in config:
        subprocess.Popen(["/opt/janus/bin/janus"])

    # startup the server
    logging.info("Running simulation server on port %d" % config['port'])

    handlers = []
    handlers.append((r'/monitor', MonitorHandler))
    handlers.append((r'/client', ClientWebSocketHandler))
    handlers.append((r'/load', LoadHandler))
    handlers.append((r'/(.*)', tornado.web.StaticFileHandler, {
        'path': config['WEBOTS_HOME'] + '/resources/web/server/www',
        'default_filename': 'index.html'
    }))
    application = tornado.web.Application(handlers)
    if 'sslCertificate' in config and 'sslKey' in config:
        config['ssl'] = True
        ssl_certificate = os.path.abspath(expand_path(
            config['sslCertificate']))
        ssl_key = os.path.abspath(expand_path(config['sslKey']))
        ssl_options = {"certfile": ssl_certificate, "keyfile": ssl_key}
        http_server = tornado.httpserver.HTTPServer(application,
                                                    ssl_options=ssl_options)
    else:
        config['ssl'] = False
        http_server = tornado.httpserver.HTTPServer(application)
    http_server.listen(config['port'])
    message = "Simulation server running on port %d (" % config['port']
    if not config['ssl']:
        message += 'no '
    message += 'SSL)'
    print(message)
    sys.stdout.flush()
    try:
        nvmlInit()
        nvidia = True
    except NVMLError:
        nvidia = False
    update_snapshot()
    try:
        tornado.ioloop.IOLoop.current().start()
    except Exception:
        logging.info(traceback.format_exc())
        for client in ClientWebSocketHandler.clients:
            del client
    if nvidia:
        nvmlShutdown()
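The GPU columns written to monitor.csv (GPU load compute, GPU load memory, GPU memory) can be produced with a few NVML calls. A minimal sketch, assuming a single device and that nvmlInit() succeeded as in main():

from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                    nvmlDeviceGetUtilizationRates)

def gpu_snapshot():
    # Returns (GPU load %, memory-controller load %, memory used %) for GPU 0.
    handle = nvmlDeviceGetHandleByIndex(0)
    util = nvmlDeviceGetUtilizationRates(handle)
    mem = nvmlDeviceGetMemoryInfo(handle)
    return util.gpu, util.memory, 100 * mem.used // mem.total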
Exemplo n.º 42
0
def initGPU():
    nvmlInit()
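initGPU() never releases NVML. One hedged way to guarantee cleanup is to register nvmlShutdown with atexit at initialization time:

import atexit
from pynvml import nvmlInit, nvmlShutdown

def initGPU():
    nvmlInit()
    # Release NVML automatically when the interpreter exits.
    atexit.register(nvmlShutdown)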
Exemplo n.º 43
0
            # if Settings.execution_mode == ExecutionMode.GENERATOR:
            #     print("==============================")
            #     break

    executionTest(queryType)

    end_mem = gpuMemory.capture_gpu_memory_usage()

    gpuMemory.log_memory_usage(queryType, start_mem, end_mem)


if __name__ == "__main__":

    Execution.getArgs()

    nvmlInit()

    drill = "drill"  # None
    spark = "spark"

    compareResults = True
    if "compare_results" in Settings.data["RunSettings"]:
        compareResults = Settings.data["RunSettings"]["compare_results"]

    if ((Settings.execution_mode == ExecutionMode.FULL and
         compareResults == "true") or
            Settings.execution_mode == ExecutionMode.GENERATOR):
        # Create Table Drill ------------------------------------------------
        from pydrill.client import PyDrill
        drill = PyDrill(host="localhost", port=8047)
        cs.init_drill_schema(drill,
Exemplo n.º 44
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except:
            deviceCount = 0
        for device_id in xrange(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
                self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
                self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = psutil.Process(ps.pid).name()
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)
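To exercise the same per-process query outside the agent, a hedged Python 3 sketch of the compute-process part of the check (only pynvml and psutil calls; usedGpuMemory may be None when the driver does not report it):

import psutil
import pynvml

def gpu_process_memory(device_id=0):
    # Maps pid -> (process name, GPU memory in bytes) for compute processes on one device.
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        return {p.pid: (psutil.Process(p.pid).name(), p.usedGpuMemory) for p in procs}
    finally:
        pynvml.nvmlShutdown()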
Exemplo n.º 45
0

# %%
import torch
import torch.nn as nn
import torchvision.models as models
import pynvml
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)


class Model(nn.Module):
    def __init__(self, pretrained_weights_path, device):
        super(Model, self).__init__()
        self.model = models.inception_v3()
        self.model.AuxLogits.fc = nn.Linear(
            self.model.AuxLogits.fc.in_features, 6)
        self.model.fc = nn.Linear(self.model.fc.in_features, 6)
        self.model.load_state_dict(
            torch.load(pretrained_weights_path, map_location=device))
        del self.model._modules['AuxLogits']  # remove the AuxLogits module
        #del self.model._modules['fc']
        #self.model.AuxLogits.fc = nn.Linear(self.model.AuxLogits.fc.in_features, self.args.n_classes)  # change the fc output channels of the AuxLogits module to the number of classes we need
Exemplo n.º 46
0
def get_gpus_info() -> Dict[str, Any]:
    """Get information about GPU devices: driver version, memory, utilization etc.

    The example below shows what kind of information is returned as the result. All
    figures about memory are given in bytes.

    Returns:
        Information about GPU devices.
    Raises:
        RuntimeError: if necessary cuda-related libraries are not found. Usually, it
            means that the function is run on a machine without GPU.

    Warning:
        The 'devices' value contains information about *all* gpus regardless of the
        value of :code:`CUDA_VISIBLE_DEVICES`.

    Examples:
        .. code-block::

            print(get_gpus_info())

        Output example (formatted for convenience):

        .. code-block:: none

            {
                'driver': '440.33.01',
                'devices': [
                    {
                        'name': 'GeForce RTX 2080 Ti',
                        'memory_total': 11554717696,
                        'memory_free': 11554652160,
                        'memory_used': 65536,
                        'utilization': 0,
                    },
                    {
                        'name': 'GeForce RTX 2080 Ti',
                        'memory_total': 11552096256,
                        'memory_free': 11552030720,
                        'memory_used': 65536,
                        'utilization': 0,
                    },
                ],
            }
    """
    try:
        pynvml.nvmlInit()
    except NVMLError_LibraryNotFound as err:
        raise RuntimeError(
            'Failed to get information about GPU memory. '
            'Make sure that you actually have GPU and all relevant software installed.'
        ) from err
    n_devices = pynvml.nvmlDeviceGetCount()
    devices = []
    for device_id in range(n_devices):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        devices.append({
            'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'),
            'memory_total': memory_info.total,
            'memory_free': memory_info.free,
            'memory_used': memory_info.used,
            'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        })
    return {
        'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'),
        'devices': devices,
    }
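A short usage example for get_gpus_info(); the printed values are illustrative only:

info = get_gpus_info()
print("driver:", info["driver"])
for i, device in enumerate(info["devices"]):
    free_gib = device["memory_free"] / 1024 ** 3
    print("GPU %d: %s, %.1f GiB free, %d%% busy"
          % (i, device["name"], free_gib, device["utilization"]))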
Exemplo n.º 47
0
    def __calculate_GPU_index(self, nNodes):
        pv.nvmlInit()
        nGPUs = int(pv.nvmlDeviceGetCount())
        rank = self.new_comm.Get_rank()
        return int(rank / nNodes) % nGPUs
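The mapping assigns one GPU to each group of nNodes consecutive MPI ranks and wraps around once rank reaches nNodes * nGPUs. A worked example with plain arithmetic (no MPI required):

# With nNodes = 2 and nGPUs = 4:
#   ranks 0-1 -> GPU 0, ranks 2-3 -> GPU 1, ranks 4-5 -> GPU 2,
#   ranks 6-7 -> GPU 3, ranks 8-9 -> GPU 0 (wrap-around)
def gpu_index(rank, nNodes, nGPUs):
    return int(rank / nNodes) % nGPUs

assert gpu_index(5, 2, 4) == 2
assert gpu_index(9, 2, 4) == 0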
Exemplo n.º 48
0
import os.path
import sys
import socket
import random
import numbers
from PIL import ImageOps
import requests
from urlparse import urlparse
import cStringIO
import PIL.Image
from PIL import Image
from visdom import Visdom
import numpy as np
import config
import pynvml 
pynvml.nvmlInit()

vis = Visdom()

all_wins = {}


def plot(title, name, i, v):
    win = all_wins.get(title, None) 
    if win is None:
        win = vis.line(env=config.experiment_name, X=np.array([i]), Y=np.array([v]), opts={'legend':[name], 'title':title})
        all_wins[title] = win
    else:
        vis.updateTrace(env=config.experiment_name, win=win,  X=np.array([i]), Y=np.array([v]), name=name)
    #viz.image( np.random.rand(3,64, 64), win="abxxx", opts=dict(title='sr', caption='sr images'))
Exemplo n.º 49
0
Arquivo: cli1.py Projeto: tasksss/task
    def new_query():
        """Query the information of all the GPUs on local machine"""

        N.nvmlInit()

        def _decode(b):
            if isinstance(b, bytes):
                return b.decode()  # for python3, to unicode
            return b

        def get_gpu_info(handle):
            """Get one GPU information specified by nvml handle"""
            def get_process_info(nv_process):
                """Get the process information of specific pid"""
                process = {}
                if nv_process.pid not in GPUStatCollection.global_processes:
                    GPUStatCollection.global_processes[nv_process.pid] = \
                        psutil.Process(pid=nv_process.pid)
                ps_process = GPUStatCollection.global_processes[nv_process.pid]
                process['username'] = ps_process.username()

                # _cmdline = ps_process.cmdline()
                # if not _cmdline:

                #     process['command'] = '?'
                #     process['full_command'] = ['?']
                # else:
                #     process['command'] = os.path.basename(_cmdline[0])
                #     process['full_command'] = _cmdline

                # process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
                # process['cpu_percent'] = ps_process.cpu_percent()
                # process['cpu_memory_usage'] = \
                #     round((ps_process.memory_percent() / 100.0) *
                #           psutil.virtual_memory().total)
                process['pid'] = nv_process.pid
                return process

            name = _decode(N.nvmlDeviceGetName(handle))
            uuid = _decode(N.nvmlDeviceGetUUID(handle))

            # try:
            #     temperature = N.nvmlDeviceGetTemperature(
            #         handle, N.NVML_TEMPERATURE_GPU
            #     )
            # except N.NVMLError:
            #     temperature = None  # Not supported

            # try:
            #     fan_speed = N.nvmlDeviceGetFanSpeed(handle)
            # except N.NVMLError:
            #     fan_speed = None  # Not supported

            # try:
            #     memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
            # except N.NVMLError:
            #     memory = None  # Not supported

            # try:
            #     utilization = N.nvmlDeviceGetUtilizationRates(handle)
            # except N.NVMLError:
            #     utilization = None  # Not supported

            # try:
            #     power = N.nvmlDeviceGetPowerUsage(handle)
            # except N.NVMLError:
            #     power = None

            # try:
            #     power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
            # except N.NVMLError:
            #     power_limit = None

            try:
                nv_comp_processes = \
                    N.nvmlDeviceGetComputeRunningProcesses(handle)
            except N.NVMLError:
                nv_comp_processes = None  # Not supported
            try:
                nv_graphics_processes = \
                    N.nvmlDeviceGetGraphicsRunningProcesses(handle)
            except N.NVMLError:
                nv_graphics_processes = None  # Not supported

            if nv_comp_processes is None and nv_graphics_processes is None:
                processes = None
            else:
                processes = []
                nv_comp_processes = nv_comp_processes or []
                nv_graphics_processes = nv_graphics_processes or []
                for nv_process in nv_comp_processes + nv_graphics_processes:
                    try:
                        process = get_process_info(nv_process)
                        processes.append(process)
                    except psutil.NoSuchProcess:
                        # TODO: add some reminder for NVML broken context
                        # e.g. nvidia-smi reset  or  reboot the system
                        pass

                # TODO: Do not block if full process info is not requested
                time.sleep(0.1)
                for process in processes:
                    pid = process['pid']
                    cache_process = GPUStatCollection.global_processes[pid]
                    # process['cpu_percent'] = cache_process.cpu_percent()

            index = N.nvmlDeviceGetIndex(handle)
            gpu_info = {
                'index': index,
                'uuid': uuid,
                'name': name,
                # 'temperature.gpu': temperature,
                # 'fan.speed': fan_speed,
                # 'utilization.gpu': utilization.gpu if utilization else None,
                # 'power.draw': power // 1000 if power is not None else None,
                # 'enforced.power.limit': power_limit // 1000
                # if power_limit is not None else None,
                # Convert bytes into MBytes
                # 'memory.used': memory.used // MB if memory else None,
                # 'memory.total': memory.total // MB if memory else None,
                'processes': processes,
            }
            GPUStatCollection.clean_processes()
            return gpu_info

        # 1. get the list of gpu and status
        gpu_list = []
        device_count = N.nvmlDeviceGetCount()

        for index in range(device_count):
            handle = N.nvmlDeviceGetHandleByIndex(index)
            gpu_info = get_gpu_info(handle)
            gpu_stat = GPUStat(gpu_info)
            gpu_list.append(gpu_stat)

        # 2. additional info (driver version, etc).
        try:
            driver_version = _decode(N.nvmlSystemGetDriverVersion())
        except N.NVMLError:
            driver_version = None  # N/A

        N.nvmlShutdown()
        return GPUStatCollection(gpu_list, driver_version=driver_version)
Exemplo n.º 50
0
def count_gpus():
    nvmlInit()
    count = nvmlDeviceGetCount()
    nvmlShutdown()
    return count
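count_gpus() raises if NVML cannot be initialized; a hedged variant that degrades to zero instead:

from pynvml import NVMLError, nvmlDeviceGetCount, nvmlInit, nvmlShutdown

def count_gpus_or_zero():
    try:
        nvmlInit()
    except NVMLError:
        return 0
    try:
        return nvmlDeviceGetCount()
    finally:
        nvmlShutdown()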
Exemplo n.º 51
0
    def training_step(self, batch, batch_idx) -> Dict:

        global isEmUpdateBusy  # used to check whether the entire embedding update process is finished or not
        global isAddIndexBusy  # used to check whether the entire indexing process is finished or not
        global processes  # used to keep the embedding update processes
        global threadHandle_index  # used to keep the thread of the embedding indexing process

        if (self.trainer.global_rank == 0) and (self.custom_config.end2end):

            if (not batch_idx == 0) and (
                    batch_idx % self.custom_config.indexing_freq == 0):
                free_gpu_list = []
                nvmlInit()
                deviceCount = nvmlDeviceGetCount()

                my_list = json.loads(self.custom_config.gpu_order)

                for i in range(deviceCount):
                    handle = nvmlDeviceGetHandleByIndex(i)
                    info = nvmlDeviceGetMemoryInfo(handle)

                    if info.used / 1e6 < 15:
                        position = my_list.index(i)
                        free_gpu_list.append("cuda:" + str(position))

                if len(free_gpu_list) >= self.custom_config.index_gpus:
                    has_free_gpus = True

                else:
                    has_free_gpus = False

                if (not isEmUpdateBusy) and has_free_gpus:

                    model_copy = type(self.model.rag.ctx_encoder)(
                        self.config_dpr
                    )  # get a new instance; this will be loaded on the CPU
                    model_copy.load_state_dict(self.model.rag.ctx_encoder.
                                               state_dict())  # copy weights

                    processes = []

                    if len(free_gpu_list) > self.custom_config.index_gpus:
                        cuda_devices = random.sample(
                            free_gpu_list, self.custom_config.index_gpus)
                    else:
                        cuda_devices = free_gpu_list

                    num_processes = len(cuda_devices)

                    for rank in range(num_processes):
                        logger.info(
                            "Initializing embedding calculation process rank {}"
                            .format(rank))
                        device = cuda_devices[rank]
                        p = multiprocessing.Process(
                            target=embed_update,
                            args=(
                                copy.deepcopy(model_copy),
                                num_processes,
                                device,
                                rank,
                                self.custom_config.shard_dir,
                                self.custom_config.csv_path,
                            ),
                        )
                        processes.append(p)

                    for p in processes:
                        p.start()

                    isEmUpdateBusy = True

            if isEmUpdateBusy and (not isAddIndexBusy):
                index_process_list = [
                    processes[k].is_alive()
                    for k in range(self.custom_config.index_gpus)
                ]
                if (
                        sum(index_process_list) == 0
                ):  # If the entire list is False, all embedding calculation processes have finished
                    logger.info("Start adding the index")
                    threadHandle_index = multiprocessing.Process(
                        target=add_index,
                        args=(
                            self.custom_config.shard_dir,
                            self.config.index_path,
                        ),
                    )
                    threadHandle_index.start()
                    isAddIndexBusy = True

            # check when index building has started
            if isAddIndexBusy:

                # check still the index_building process is happening
                if not threadHandle_index.is_alive():

                    logger.info("Merging the dataset shards")
                    saved_dataset_shards = []

                    for address in glob(
                            str(self.custom_config.shard_dir) + "/*/"):
                        saved_dataset_shards.append(load_from_disk(address))

                    concat = concatenate_datasets(saved_dataset_shards)
                    concat.save_to_disk(
                        self.config.passages_path
                    )  # here we update the main passage file on the disk
                    logger.info("done updating the dataset")

                    # To Do (@Aaron) : Useful in the future dynamic memory implementation.
                    # if you load the index from the disk make sure to update the index file here, otherwise it is ok to update the index file from the worker.
                    # logger.info("then updating the index")
                    # shutil.copy(self.custom_config.temp_index, self.config.idex_path)

                    logger.info(
                        "Loading new passages and initializing new index")
                    self.trainer.model.module.module.model.rag.retriever.re_load()
                    self.trainer.model.module.module.model.rag.retriever.init_retrieval()

                    isEmUpdateBusy = False
                    isAddIndexBusy = False
        self.trainer.strategy.barrier("barrier")

        loss_tensors = self._step(batch)

        logs = {
            name: loss
            for name, loss in zip(self.loss_names, loss_tensors)
        }
        # tokens per batch
        tgt_pad_token_id = (self.tokenizer.generator.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        src_pad_token_id = (self.tokenizer.question_encoder.pad_token_id
                            if isinstance(self.tokenizer, RagTokenizer) else
                            self.tokenizer.pad_token_id)
        logs["tpb"] = (batch["input_ids"].ne(src_pad_token_id).sum() +
                       batch["decoder_input_ids"].ne(tgt_pad_token_id).sum())
        self.log("loss", loss_tensors[0])
        return loss_tensors[0]
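The free-GPU heuristic above treats a device as idle when its used memory is below roughly 15 MB. Isolated as a small hedged helper (the threshold mirrors the snippet and is otherwise arbitrary):

from pynvml import (nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlInit)

def find_idle_gpus(threshold_mb=15):
    # Returns the indices of GPUs whose used memory is below the threshold.
    nvmlInit()
    idle = []
    for i in range(nvmlDeviceGetCount()):
        info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
        if info.used / 1e6 < threshold_mb:
            idle.append(i)
    return idle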
Exemplo n.º 52
0
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            log("identify_cards() will probe %i cards", deviceCount)
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                log("identify_cards() handle(%i)=%s", i, handle)
                props = {}
                def meminfo(memory):
                    return {
                            "total"  : int(memory.total),
                            "free"   : int(memory.free),
                            "used"   : int(memory.used),
                            }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                       ("name",                     "nvmlDeviceGetName",                    (),     str),
                       ("serial",                   "nvmlDeviceGetSerial",                  (),     str),
                       ("uuid",                     "nvmlDeviceGetUUID",                    (),     str),
                       ("pci",                      "nvmlDeviceGetPciInfo",                 (),     pciinfo),
                       ("memory",                   "nvmlDeviceGetMemoryInfo",              (),     meminfo),
                       ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration",   (),     int),
                       ("pcie-link-width-max",      "nvmlDeviceGetMaxPcieLinkWidth",        (),     int),
                       ("pcie-link-generation",     "nvmlDeviceGetCurrPcieLinkGeneration",  (),     int),
                       ("pcie-link-width",          "nvmlDeviceGetCurrPcieLinkWidth",       (),     int),
                       ("clock-info-graphics",      "nvmlDeviceGetClockInfo",               (0,),   int),
                       ("clock-info-sm",            "nvmlDeviceGetClockInfo",               (1,),   int),
                       ("clock-info-mem",           "nvmlDeviceGetClockInfo",               (2,),   int),
                       ("clock-info-graphics-max",  "nvmlDeviceGetMaxClockInfo",            (0,),   int),
                       ("clock-info-sm-max",        "nvmlDeviceGetMaxClockInfo",            (1,),   int),
                       ("clock-info-mem-max",       "nvmlDeviceGetMaxClockInfo",            (2,),   int),
                       ("fan-speed",                "nvmlDeviceGetFanSpeed",                (),     int),
                       ("temperature",              "nvmlDeviceGetTemperature",             (0,),   int),
                       ("power-state",              "nvmlDeviceGetPowerState",              (),     int),
                       ("vbios-version",            "nvmlDeviceGetVbiosVersion",            (),     str),
                       ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s", prop, fn, i, handle, e)
                        continue
                log("identify_cards() [%i]=%s", i, props)
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
Exemplo n.º 53
0
	def do_GET(self):
		#checks if the server is alive
		if self.path == '/test':
			send_header(self)
			self.wfile.write(bytes('passed<br>', 'utf-8'))
			self.wfile.write(bytes('server is responding', 'utf-8'))
		#returns the running processes
		if self.path == '/runningProcesses':
			send_header(self)
			#send response:
			if modules['psutil']:
				for proc in psutil.process_iter():
					try:
						pinfo = proc.as_dict(attrs=['pid', 'name'])
					except psutil.NoSuchProcess:
						pass
					print(pinfo)
					self.wfile.write(bytes(str(pinfo), 'utf-8'))
			else:
				self.wfile.write(bytes('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8'))
		#returns the CPU utilization and number of cores
		elif self.path == '/cpuInfo':
			send_header(self)
			#get CPU info
			cpuInfo = {}
			if modules['psutil']:
				cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
				cpuInfo['CPU Cores'] = int(psutil.cpu_count())
			else:
				cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
			json_dump = json.dumps(cpuInfo)
			self.wfile.write(bytes(json_dump, 'utf-8'))
			#get GPU info
			if modules['pynvml']:
				try:
					pynvml.nvmlInit()
					gpus = pynvml.nvmlDeviceGetCount()
				except:
					gpus = 0
					self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
			else:
				gpus = 0
				self.wfile.write(bytes('I am sorry but the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
			for i in range(gpus):
				handle = pynvml.nvmlDeviceGetHandleByIndex(i)
				self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
				try:
					self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '&deg;C', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
				try:
					gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
					self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
					self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free/gpu_mem.total*100)) + '%', 'utf-8'))
				except:
					self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
			if gpus > 0:
				try:
					pynvml.nvmlShutdown()
				except:
					pass

		elif self.path == '/availableComputers':
			send_header(self)
			s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
			s.connect(('google.com', 0))
			global myownsocket
			myownsocket = s.getsockname()[0]
			port = 8003
			available_computers = []
			for i in range(1, 256):
				host = '192.168.178.' + str(i) 
				sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
				sock.settimeout(0.2)
				try:
					alive = sock.connect_ex((host, port))
				except:
					alive = -1
				if alive == 0:
					print('available')
					
					available_computers.append(host)
				else:
					print('not available')
				print(host)
			self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
			cmd_txt = """@echo off

call &quot;C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat&quot;

echo ##### start_rendering

xsibatch -render &quot;Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn&quot; -frames #1#-#2# -pass &quot;BEAUTY&quot; -skip on -verbose on

echo ##### rendering_done """
			self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
			self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
			self.wfile.write(bytes('<tr>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
			self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))

			available_cpus = {}
			for host in available_computers:
				available_cpus[host] = abs(get_cpu_cores(host))

			total_cpus = sum(available_cpus.values())

			frame_list = {}
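			# Split the 1-100 frame range between hosts in proportion to their CPU
			# core counts so that faster machines render a larger share.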
			start_frame = 0
			for host in available_computers:
				start_frame += 1
				frame_list[host] = [start_frame]
				start_frame =  start_frame + int(100 * (available_cpus[host] / total_cpus))
				if start_frame > 100:
					start_frame = 100
				frame_list[host].append(start_frame)
			index = 0
			for host in available_computers:
				index += 1
				self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
				self.wfile.write(bytes(host, 'utf-8'))
				self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
				self.wfile.write(bytes('</tr>', 'utf-8'))
			self.wfile.write(bytes('</table>\n', 'utf-8'))
			self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
			self.wfile.write(bytes('</form>\n', 'utf-8'))
			self.wfile.write(bytes('</body>\n', 'utf-8'))
			self.wfile.write(bytes('</html>\n', 'utf-8'))
		elif self.path == '/execute_job':
			send_header(self)
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)

		elif '/submit_job' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			parsed = urlparse(self.path)
			parameters = parse_qs(parsed.query)
			#print(parsed)
			print(parameters)
			self.wfile.write(bytes('<body>', 'utf-8'))
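			# Walk the submitted form fields (host1/start1/end1, host2/...) and build
			# one command per selected host by filling in the #1#/#2# frame placeholders.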
			for index in range(1, 100):
				if not parameters.get('host' + str(index), [''])[0].strip():
					pass
				elif not parameters.get('start' + str(index), [''])[0].strip():
					pass
				elif not parameters.get('end' + str(index), [''])[0].strip():
					pass
				elif parameters.get('command'):
					cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
					cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
					self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
					self.wfile.write(bytes('<br>', 'utf-8'))
					print(cmd_txt)
			self.wfile.write(bytes('</body></html>', 'utf-8'))
		elif '/shutdown' in self.path:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
			server.shutdown()
			sys.exit()

		else:
			send_header(self)
			self.wfile.write(bytes(str(self.client_address), 'utf-8'))
			self.wfile.write(bytes("<br>", 'utf-8'))
			self.wfile.write(bytes(self.path, 'utf-8'))
			print(self.path)
Exemplo n.º 54
0
 def __init__(self):
     nv.nvmlInit()
     self._device_count = nv.nvmlDeviceGetCount()
     self._specs = [DeviceSpec(i) for i in range(self._device_count)]
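A small companion sketch (hypothetical; it assumes the same nv alias for pynvml and that DeviceSpec is defined elsewhere in the same module) pairing the initialisation above with an explicit release of the NVML context:

 def shutdown(self):
     # Release the NVML context acquired in __init__ once the
     # DeviceSpec objects are no longer needed.
     nv.nvmlShutdown()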
Exemplo n.º 55
0
 def __calculate_GPU_index(self, nNodes):
     pv.nvmlInit()
     nGPUs = int(pv.nvmlDeviceGetCount())
     rank = self.new_comm.Get_rank()
     return int(rank / nNodes) % nGPUs
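A minimal worked example (hypothetical numbers; it reads the nNodes argument above as the ranks-per-node divisor) showing how the mapping assigns MPI ranks to GPUs:

 # With 2 ranks per node and 4 GPUs: ranks 0-1 -> GPU 0, ranks 2-3 -> GPU 1,
 # ranks 4-5 -> GPU 2, ranks 6-7 -> GPU 3, and ranks 8-9 wrap back to GPU 0.
 nNodes, nGPUs = 2, 4
 for rank in range(10):
     print(rank, int(rank / nNodes) % nGPUs)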
Exemplo n.º 56
0
def get_trainer(config, base_model, diff_attention_model, loss, device, logger,
                query_loader, gallery_loader, diff_optimizer, diff_scheduler):
    num_batch_images = config['dataset'].getint('num_batch_images')

    val_per_epochs = config['trainer'].getint('val_per_epochs')
    log_iteration = config['trainer'].getint('log_iteration')

    save = config['trainer'].getboolean('save')
    save_per_epochs = config['trainer'].getint('save_per_epochs')
    save_path = config['trainer']['save_path']
    save_path = os.path.join(save_path,
                             time.strftime("%Y%m%d", time.localtime()))

    torch.cuda.empty_cache()
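    # Keep an NVML handle to GPU 0 so per-epoch memory usage can be logged below.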
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)

    trainer = create_trainer(diff_attention_model,
                             diff_optimizer,
                             loss,
                             num_batch_images,
                             device=device)

    RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'avg_loss')
    if loss.do_loss('triplet'):
        RunningAverage(output_transform=lambda x: x[1]).attach(
            trainer, 'avg_triplet_loss')
    if loss.do_loss('reg'):
        RunningAverage(output_transform=lambda x: x[2]).attach(
            trainer, 'avg_reg_loss')

    if save:
        checkpointer = ModelCheckpoint(save_path,
                                       'supervised_offline',
                                       n_saved=10,
                                       require_empty=False)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED(every=save_per_epochs), checkpointer, {
                'base_model': base_model,
                'diff_attention_model': diff_attention_model
            })

    @trainer.on(Events.ITERATION_COMPLETED)
    def summary_iteration(engine):
        iteration = trainer.state.iteration
        if iteration % log_iteration == 0 and iteration != 0:
            logger.info('Epoch[{}/{}] Iteration[{}] Loss: {:.3f}'.format(
                trainer.state.epoch, trainer.state.max_epochs, iteration,
                trainer.state.metrics['avg_loss']))

    @trainer.on(Events.EPOCH_COMPLETED)
    def summary_epoch(engine):
        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
        logger.info('GPU Memory Used(GB): {:.3f} GB'.format(meminfo.used /
                                                            1024**3))
        logger.info('Epoch[{}] Loss: {:.3f} Base Lr: {:.2e}'.format(
            trainer.state.epoch, trainer.state.metrics['avg_loss'],
            diff_scheduler.get_last_lr()[0]))

        if loss.do_loss('triplet'):
            logger.info('Epoch[{}] Triplet_Loss: {:.3f}'.format(
                trainer.state.epoch,
                trainer.state.metrics['avg_triplet_loss']))
        if loss.do_loss('reg'):
            logger.info('Epoch[{}] Regularization_Loss: {:.3f}'.format(
                trainer.state.epoch, trainer.state.metrics['avg_reg_loss']))
        torch.cuda.empty_cache()

    @trainer.on(Events.EPOCH_COMPLETED)
    def change_lr(engine):
        diff_scheduler.step()

    @trainer.on(Events.EPOCH_COMPLETED(every=val_per_epochs))
    def val_per_val_epochs(engine):
        logger.info('Start validation every {} epochs at epoch: {}'.format(
            val_per_epochs, trainer.state.epoch))
        val.val(base_model, diff_attention_model, True, query_loader,
                gallery_loader, logger, device)

    return trainer
Exemplo n.º 57
0
    def check(self, instance):
        pynvml.nvmlInit()

        msg_list = []
        try:
            deviceCount = pynvml.nvmlDeviceGetCount()
        except pynvml.NVMLError:
            deviceCount = 0
        # Number of active GPUs
        self.gauge('nvml.gpus.number', deviceCount)
        for device_id in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
            tags = dict(name="{}-{}".format(name, device_id))
            d_tags = self._dict2list(tags)
            # temperature info
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
                self.gauge('nvml.temp.', temp, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
            # power info
            try:
                pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
                self.gauge('nvml.power.', pwr, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
            # fan info
            try:
                fan = pynvml.nvmlDeviceGetFanSpeed(handle)
                self.gauge('nvml.fan.', fan, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
            # memory info
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.gauge('nvml.mem.total', mem.total, tags=d_tags)
                self.gauge('nvml.mem.used', mem.used, tags=d_tags)
                self.gauge('nvml.mem.free', mem.free, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
            # utilization GPU/Memory info
            try:
                util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
                self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetUtilizationRates:{}'.format(err))
            # utilization Encoder info
            try:
                util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
                self.gauge('nvml.util.encoder', int(
                    util_encoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetEncoderUtilization:{}'.format(err))
            # utilization Decoder info
            try:
                util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
                self.gauge('nvml.util.decoder', int(
                    util_decoder[0]), tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetDecoderUtilization:{}'.format(err))
            # Compute running processes
            try:
                cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                for ps in cps:
                    p_tags = tags.copy()
                    p_tags['pid'] = ps.pid
                    p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                    p_tags = self._dict2list(p_tags)
                    self.gauge('nvml.process.used_gpu_memory',
                               ps.usedGpuMemory, tags=p_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
            # Clocks throttling info
            # Divide by the mask so that the value is either 0 or 1 per GPU
            try:
                throttle_reasons = (
                    pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
                self.gauge('nvml.throttle.appsettings', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                    pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                    tags=d_tags)
                self.gauge('nvml.throttle.display', (throttle_reasons &
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                    GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                    tags=d_tags)
                self.gauge('nvml.throttle.hardware', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                    pynvml.nvmlClocksThrottleReasonHwSlowdown,
                    tags=d_tags)
                self.gauge('nvml.throttle.idle', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonGpuIdle) /
                    pynvml.nvmlClocksThrottleReasonGpuIdle,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.hardware', (throttle_reasons &
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.power.software', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                    pynvml.nvmlClocksThrottleReasonSwPowerCap,
                    tags=d_tags)
                self.gauge('nvml.throttle.syncboost', (throttle_reasons &
                    GPU_THROTTLE_SYNCBOOST) / GPU_THROTTLE_SYNCBOOST,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.hardware', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.temp.software', (throttle_reasons &
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                    GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                    tags=d_tags)
                self.gauge('nvml.throttle.unknown', (throttle_reasons &
                    pynvml.nvmlClocksThrottleReasonUnknown) /
                    pynvml.nvmlClocksThrottleReasonUnknown,
                    tags=d_tags)
            except pynvml.NVMLError as err:
                msg_list.append(
                    'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))
        if msg_list:
            status = AgentCheck.CRITICAL
            msg = ','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = 'Ok'
        pynvml.nvmlShutdown()

        self.service_check('nvml.check', status, message=msg)