def cb(): nonlocal last_time nonlocal nvlink_state now = time.time() src_dict = {"time": [now * 1000]} nvlink_state["tx-ref"] = nvlink_state["tx"].copy() nvlink_state["rx-ref"] = nvlink_state["rx"].copy() nvlink_state["tx"] = [ sum([ pynvml.nvmlDeviceGetNvLinkUtilizationCounter( gpu_handles[i], j, counter)["tx"] for j in range(nlinks) ]) for i in range(ngpus) ] nvlink_state["rx"] = [ sum([ pynvml.nvmlDeviceGetNvLinkUtilizationCounter( gpu_handles[i], j, counter)["rx"] for j in range(nlinks) ]) for i in range(ngpus) ] tx_diff = [ max(a - b, 0.0) * 5.0 for (a, b) in zip(nvlink_state["tx"], nvlink_state["tx-ref"]) ] rx_diff = [ max(a - b, 0.0) * 5.0 for (a, b) in zip(nvlink_state["rx"], nvlink_state["rx-ref"]) ] for i in range(ngpus): src_dict["nvlink-tx-" + str(i)] = [tx_diff[i]] src_dict["nvlink-rx-" + str(i)] = [rx_diff[i]] source.stream(src_dict, 1000) last_time = now
def cb():
    """Refresh the ``count-tx`` / ``count-rx`` columns of ``source``.

    The previous per-GPU totals in ``nvlink_state`` become the reference
    values, fresh totals are read (summed over ``nlinks`` links of every
    GPU), and the non-negative increase multiplied by 5.0 is written to
    ``source.data`` for both directions.
    """

    def _read_totals():
        # One NVML query per (GPU, link): the returned dict carries both the
        # "tx" and the "rx" value, so both directions come from one reading.
        tx_totals, rx_totals = [], []
        for i in range(ngpus):
            readings = [
                pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                    gpu_handles[i], j, counter)
                for j in range(nlinks)
            ]
            tx_totals.append(sum(r["tx"] for r in readings))
            rx_totals.append(sum(r["rx"] for r in readings))
        return tx_totals, rx_totals

    # The totals from the previous tick become the reference for this one.
    nvlink_state["tx-ref"] = nvlink_state["tx"].copy()
    nvlink_state["rx-ref"] = nvlink_state["rx"].copy()
    nvlink_state["tx"], nvlink_state["rx"] = _read_totals()

    # Clamp at zero so a counter that went down never plots as negative.
    # NOTE(review): the factor 5.0 presumably converts a per-tick increase
    # into a per-second rate for a 200 ms callback period; confirm against
    # the period this callback is registered with.
    src_dict = {
        "count-tx": [
            max(a - b, 0.0) * 5.0
            for (a, b) in zip(nvlink_state["tx"], nvlink_state["tx-ref"])
        ],
        "count-rx": [
            max(a - b, 0.0) * 5.0
            for (a, b) in zip(nvlink_state["rx"], nvlink_state["rx-ref"])
        ],
    }
    source.data.update(src_dict)
def test_nvml_nvlink_counters(ngpus, handles, counter, control, driver):
    """Exercise the NVLink utilization-counter calls on every GPU and link.

    The test is marked as an expected failure when ``driver`` is greater
    than 450.0. Otherwise, for each of the first ``ngpus`` handles and each
    link index below ``pynvml.NVML_NVLINK_MAX_LINKS``, it resets the counter,
    applies ``control``, checks that the rx/tx readings are non-negative and
    that the control read back equals ``control``, then freezes and
    unfreezes the counter.
    """
    if driver > 450.0:
        pytest.xfail(XFAIL_LEGACY_NVLINK_MSG)

    reset = 0
    for gpu in range(ngpus):
        for link in range(pynvml.NVML_NVLINK_MAX_LINKS):
            handle = handles[gpu]

            # Reset must report success before the control is applied.
            status = pynvml.nvmlDeviceResetNvLinkUtilizationCounter(
                handle, link, counter
            )
            assert status == pynvml.NVML_SUCCESS

            pynvml.nvmlDeviceSetNvLinkUtilizationControl(
                handle, link, counter, control, reset
            )

            # Read the counter values and the control back.
            countdict = pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                handle, link, counter
            )
            ctl = pynvml.nvmlDeviceGetNvLinkUtilizationControl(
                handle, link, counter
            )
            assert countdict["rx"] >= 0
            assert countdict["tx"] >= 0
            assert ctl == control

            # Freeze (1) and then unfreeze (0); both must report success.
            for freeze in (1, 0):
                status = pynvml.nvmlDeviceFreezeNvLinkUtilizationCounter(
                    handle, link, counter, freeze
                )
                assert status == pynvml.NVML_SUCCESS
def test_nvml_nvlink_counters(ngpus, handles, counter, control):
    """Exercise the NVLink utilization-counter calls on every GPU and link.

    For each of the first ``ngpus`` handles and each link index below
    ``pynvml.NVML_NVLINK_MAX_LINKS`` the counter is reset, ``control`` is
    applied, the rx/tx readings are checked to be non-negative, the control
    read back is checked to equal ``control``, and the counter is frozen and
    then unfrozen.
    """
    reset = 0
    for gpu_index in range(ngpus):
        for link_index in range(pynvml.NVML_NVLINK_MAX_LINKS):
            device = handles[gpu_index]

            reset_status = pynvml.nvmlDeviceResetNvLinkUtilizationCounter(
                device, link_index, counter)
            assert reset_status == pynvml.NVML_SUCCESS

            pynvml.nvmlDeviceSetNvLinkUtilizationControl(
                device, link_index, counter, control, reset)

            countdict = pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                device, link_index, counter)
            ctl = pynvml.nvmlDeviceGetNvLinkUtilizationControl(
                device, link_index, counter)
            assert countdict["rx"] >= 0
            assert countdict["tx"] >= 0
            assert ctl == control

            # Freeze first, then unfreeze; each call must report success.
            frozen_status = pynvml.nvmlDeviceFreezeNvLinkUtilizationCounter(
                device, link_index, counter, 1)
            assert frozen_status == pynvml.NVML_SUCCESS
            thawed_status = pynvml.nvmlDeviceFreezeNvLinkUtilizationCounter(
                device, link_index, counter, 0)
            assert thawed_status == pynvml.NVML_SUCCESS
def total_nvlink_transfer():
    """Return the summed NVLink counter readings for one GPU.

    NVML is re-initialised, the device index is taken from the first entry
    of the ``CUDA_VISIBLE_DEVICES`` environment variable (device 0 when the
    variable is missing or its first entry is not an integer), and
    utilization counter 0 is summed over ``pynvml.NVML_NVLINK_MAX_LINKS``
    links of that device.

    Returns:
        tuple: ``(rx, tx)`` totals over all links.
    """
    import pynvml

    # Start from a fresh NVML session. Shutting down a library that was never
    # initialised in this process raises, which must not abort the
    # measurement; any real problem still surfaces from nvmlInit() below.
    try:
        pynvml.nvmlShutdown()
    except pynvml.NVMLError_Uninitialized:
        pass
    pynvml.nvmlInit()

    try:
        cuda_dev_id = int(os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0])
    except (KeyError, ValueError) as e:
        # KeyError: variable not set. ValueError: first entry is not an
        # integer index. Report it and fall back to device 0.
        print(e)
        cuda_dev_id = 0

    handle = pynvml.nvmlDeviceGetHandleByIndex(cuda_dev_id)
    rx = 0
    tx = 0
    for link in range(pynvml.NVML_NVLINK_MAX_LINKS):
        transfer = pynvml.nvmlDeviceGetNvLinkUtilizationCounter(handle, link, 0)
        rx += transfer["rx"]
        tx += transfer["tx"]
    return rx, tx
def nvlink_timeline(doc):
    """Add live TX and RX NVLink throughput timelines to a Bokeh document.

    Two stacked figures (TX and RX) are added to ``doc``, each with one line
    per GPU. A periodic callback samples the NVLink utilization counters of
    every GPU, converts the increase since the previous sample into a
    per-second rate, and streams it to the shared data source.

    Args:
        doc: Bokeh document that receives the figures and the callback.
    """
    update_ms = 200  # period of the sampling callback, in milliseconds

    # X Range
    x_range = DataRange1d(follow="end", follow_interval=20000, range_padding=0)
    tools = "reset,xpan,xwheel_zoom"

    # One "time" column plus a tx and an rx column per GPU.
    item_dict = {"time": []}
    for i in range(ngpus):
        item_dict["nvlink-tx-" + str(i)] = []
        item_dict["nvlink-rx-" + str(i)] = []
    source = ColumnDataSource(item_dict)

    color_list = (
        "blue",
        "red",
        "green",
        "black",
        "brown",
        "cyan",
        "orange",
        "pink",
        "purple",
        "gold",
    )

    def _get_color(ind):
        # Cycle through the palette when there are more GPUs than colors.
        return color_list[ind % len(color_list)]

    tx_fig = figure(
        title="TX NVLink (per Device) [B/s]",
        sizing_mode="stretch_both",
        x_axis_type="datetime",
        x_range=x_range,
        tools=tools,
    )
    rx_fig = figure(
        title="RX NVLink (per Device) [B/s]",
        sizing_mode="stretch_both",
        x_axis_type="datetime",
        x_range=x_range,
        tools=tools,
    )
    for i in range(ngpus):
        tx_fig.line(source=source,
                    x="time",
                    y="nvlink-tx-" + str(i),
                    color=_get_color(i))
        rx_fig.line(source=source,
                    x="time",
                    y="nvlink-rx-" + str(i),
                    color=_get_color(i))
    tx_fig.yaxis.formatter = NumeralTickFormatter(format="0.0 b")
    rx_fig.yaxis.formatter = NumeralTickFormatter(format="0.0 b")

    doc.title = "NVLink Throughput Timeline"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    counter = 1
    nlinks = pynvml.NVML_NVLINK_MAX_LINKS

    def _read_totals():
        # One NVML query per (GPU, link): the returned dict carries both the
        # "tx" and the "rx" value, so both directions come from one reading.
        tx_totals, rx_totals = [], []
        for i in range(ngpus):
            readings = [
                pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                    gpu_handles[i], j, counter)
                for j in range(nlinks)
            ]
            tx_totals.append(sum(r["tx"] for r in readings))
            rx_totals.append(sum(r["rx"] for r in readings))
        return tx_totals, rx_totals

    # Baseline reading so the first callback has a reference to diff against.
    initial_tx, initial_rx = _read_totals()
    nvlink_state = {
        "tx": initial_tx,
        "rx": initial_rx,
        "tx-ref": initial_tx.copy(),
        "rx-ref": initial_rx.copy(),
    }
    last_time = time.time()

    def cb():
        nonlocal last_time
        now = time.time()
        src_dict = {"time": [now * 1000]}

        # Scale by the time that really passed since the previous sample. If
        # the wall clock did not advance (or stepped backwards), fall back to
        # the nominal callback period.
        elapsed = now - last_time
        scale = 1.0 / elapsed if elapsed > 0 else 1000.0 / update_ms

        # The totals from the previous tick become the reference for this one.
        nvlink_state["tx-ref"] = nvlink_state["tx"].copy()
        nvlink_state["rx-ref"] = nvlink_state["rx"].copy()
        nvlink_state["tx"], nvlink_state["rx"] = _read_totals()

        # Clamp at zero so a counter that went down never plots as negative.
        tx_diff = [
            max(a - b, 0.0) * scale
            for (a, b) in zip(nvlink_state["tx"], nvlink_state["tx-ref"])
        ]
        rx_diff = [
            max(a - b, 0.0) * scale
            for (a, b) in zip(nvlink_state["rx"], nvlink_state["rx-ref"])
        ]

        for i in range(ngpus):
            src_dict["nvlink-tx-" + str(i)] = [tx_diff[i]]
            src_dict["nvlink-rx-" + str(i)] = [rx_diff[i]]
        # Keep at most 1000 rows in the data source.
        source.stream(src_dict, 1000)
        last_time = now

    doc.add_periodic_callback(cb, update_ms)
def nvlink(doc):
    """Add live per-GPU TX and RX NVLink bar charts to a Bokeh document.

    Two stacked figures (TX and RX) are added to ``doc``, each with one bar
    per GPU whose height and color follow the current throughput. The y-axis
    upper bound is the per-link bandwidth for the NVLink version reported by
    device 0 / link 0 multiplied by ``pynvml.NVML_NVLINK_MAX_LINKS``. A
    periodic callback refreshes the bars from the NVLink utilization
    counters.

    Args:
        doc: Bokeh document that receives the figures and the callback.
    """
    import subprocess as sp

    update_ms = 200  # period of the sampling callback, in milliseconds
    # Converts a per-tick counter increase into a per-second rate
    # (1000 / 200 == 5.0).
    rate_scale = 1000.0 / update_ms

    # Use device-0/link-0 to get "upper bound"
    counter = 1
    nlinks = pynvml.NVML_NVLINK_MAX_LINKS
    nvlink_ver = pynvml.nvmlDeviceGetNvLinkVersion(gpu_handles[0], 0)
    nvlink_link_bw = {
        # Keys = NVLink Version, Values = Max Link BW (per direction)
        # [Note: Using specs at https://en.wikichip.org/wiki/nvidia/nvlink]
        1: 20.0 * GB,  # GB/s
        2: 25.0 * GB,  # GB/s
    }
    # Max NVLink Throughput = BW-per-link * nlinks
    # (versions missing from the table fall back to 25 GB/s per link)
    max_bw = nlinks * nvlink_link_bw.get(nvlink_ver, 25.0 * GB)

    # nvmlDeviceSetNvLinkUtilizationControl seems limited, using smi:
    sp.call([
        "nvidia-smi",
        "nvlink",
        "--setcontrol",
        str(counter) + "bz",  # Get output in bytes
    ])

    def _read_totals():
        # One NVML query per (GPU, link): the returned dict carries both the
        # "tx" and the "rx" value, so both directions come from one reading.
        tx_totals, rx_totals = [], []
        for i in range(ngpus):
            readings = [
                pynvml.nvmlDeviceGetNvLinkUtilizationCounter(
                    gpu_handles[i], j, counter)
                for j in range(nlinks)
            ]
            tx_totals.append(sum(r["tx"] for r in readings))
            rx_totals.append(sum(r["rx"] for r in readings))
        return tx_totals, rx_totals

    # Baseline reading so the first callback has a reference to diff against.
    initial_tx, initial_rx = _read_totals()
    nvlink_state = {
        "tx": initial_tx,
        "rx": initial_rx,
        "tx-ref": initial_tx.copy(),
        "rx-ref": initial_rx.copy(),
    }

    # One bar per GPU, 0.8 wide, starting at the GPU index.
    left = list(range(ngpus))
    right = [edge + 0.8 for edge in left]
    source = ColumnDataSource({
        "left": left,
        "right": right,
        "count-tx": [0.0] * ngpus,
        "count-rx": [0.0] * ngpus,
    })
    mapper = LinearColorMapper(palette=all_palettes["RdYlBu"][4],
                               low=0,
                               high=max_bw)

    tx_fig = figure(title="TX NVLink [B/s]",
                    sizing_mode="stretch_both",
                    y_range=[0, max_bw])
    tx_fig.yaxis.formatter = NumeralTickFormatter(format="0.0 b")
    tx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="count-tx",
        color={
            "field": "count-tx",
            "transform": mapper
        },
    )
    tx_fig.toolbar_location = None

    rx_fig = figure(title="RX NVLink [B/s]",
                    sizing_mode="stretch_both",
                    y_range=[0, max_bw])
    rx_fig.yaxis.formatter = NumeralTickFormatter(format="0.0 b")
    rx_fig.quad(
        source=source,
        left="left",
        right="right",
        bottom=0,
        top="count-rx",
        color={
            "field": "count-rx",
            "transform": mapper
        },
    )
    rx_fig.toolbar_location = None

    doc.title = "NVLink Utilization Counters"
    doc.add_root(column(tx_fig, rx_fig, sizing_mode="stretch_both"))

    def cb():
        # The totals from the previous tick become the reference for this one.
        nvlink_state["tx-ref"] = nvlink_state["tx"].copy()
        nvlink_state["rx-ref"] = nvlink_state["rx"].copy()
        nvlink_state["tx"], nvlink_state["rx"] = _read_totals()

        # Clamp at zero so a counter that went down never plots as negative.
        src_dict = {
            "count-tx": [
                max(a - b, 0.0) * rate_scale
                for (a, b) in zip(nvlink_state["tx"], nvlink_state["tx-ref"])
            ],
            "count-rx": [
                max(a - b, 0.0) * rate_scale
                for (a, b) in zip(nvlink_state["rx"], nvlink_state["rx-ref"])
            ],
        }
        source.data.update(src_dict)

    doc.add_periodic_callback(cb, update_ms)