import time
from pathlib import Path

from gpuutils import GpuUtils
# Assumption: SummaryWriter comes from PyTorch's TensorBoard bindings;
# tensorboardX.SummaryWriter exposes the same add_scalar interface.
from torch.utils.tensorboard import SummaryWriter


def monitor(path: Path, period: int):
    """Performs monitoring of GPU utilization.

    Args:
        path: Path to write monitors to (TensorBoard log directory).
        period: Interval between writes, in seconds.
    """
    logger = SummaryWriter(str(path))
    k = 0
    while True:
        monitors = GpuUtils.analyzeSystem(pandas_format=False)
        if len(monitors['gpu_index']) == 0:
            print('No GPUs found')
            break
        for i, utilization, memory_available, memory_utilization \
                in zip(monitors['gpu_index'],
                       monitors['utilizations'],
                       monitors['available_memories_in_mb'],
                       monitors['memory_usage_percentage']):
            logger.add_scalar(f'Monitoring/GPU{i}/utilization', utilization, k)
            logger.add_scalar(f'Monitoring/GPU{i}/MB left', memory_available, k)
            logger.add_scalar(f'Monitoring/GPU{i}/memory utilization', memory_utilization, k)
        k += 1
        time.sleep(period)
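# Usage sketch (assumption, not part of the original code): run the monitor in a
# background process so it records GPU statistics while training runs in the main
# process. The log directory 'gpu_monitoring' and the 5-second period are
# illustrative placeholders.
if __name__ == '__main__':
    from multiprocessing import Process

    monitor_process = Process(
        target=monitor,
        args=(Path('gpu_monitoring'), 5),
        daemon=True,  # the monitor stops when the main process exits
    )
    monitor_process.start()
    # ... launch training here; TensorBoard then shows the scalars logged
    # under Monitoring/GPU<i>/.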
import numpy as np
import matplotlib.pyplot as plt
import itertools
import os

from gpuutils import GpuUtils

# Reserve one suitable GPU before TensorFlow is imported, so TensorFlow only
# sees the allocated device.
GpuUtils.allocate(gpu_count=1, framework='keras')

import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
    tf.config.experimental.set_memory_growth(device, True)

from tensorflow import keras
from generator import (list_of_file_ids_test, n_events_per_file, n_files_train,
                       n_files_val, batch_size, TestDataset)
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes, normalize=True,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    fig = plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
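# Sketch (assumption): the helper above is shown truncated; a typical
# continuation adds tick labels and per-cell values, and the call site builds
# the matrix with sklearn. `y_true`, `y_pred` and the class names below are
# placeholders, not values from the original script.
def annotate_confusion_matrix(cm, classes):
    """Add tick labels and per-cell values to the current confusion-matrix plot."""
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.0
    for row, col in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(col, row, format(cm[row, col], '.2f'),
                 horizontalalignment='center',
                 color='white' if cm[row, col] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Example call with placeholder predictions:
# cm = confusion_matrix(y_true, y_pred).astype(float)
# cm /= cm.sum(axis=1, keepdims=True)  # row-normalize, as normalize=True implies
# plot_confusion_matrix(cm, classes=['class 0', 'class 1'])
# annotate_confusion_matrix(cm, classes=['class 0', 'class 1'])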
from gpuutils import GpuUtils
import pandas as pd

# ------------------------------
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# ------------------------------

print("GPU analysis of a machine with 8 GPUs")

# Captured nvidia-smi output used as a mock response for analyzeSystem.
mock_response = 'Tue Apr 21 09:58:12 2020 \n+-----------------------------------------------------------------------------+\n| NVIDIA-SMI 418.67 Driver Version: 418.67 CUDA Version: 10.1 |\n|-------------------------------+----------------------+----------------------+\n| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n|===============================+======================+======================|\n| 0 Tesla V100-SXM2... Off | 00000000:15:00.0 Off | N/A |\n| N/A 34C P0 56W / 300W | 1280MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 1 Tesla V100-SXM2... Off | 00000000:16:00.0 Off | 0 |\n| N/A 34C P0 43W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 2 Tesla V100-SXM2... Off | 00000000:3A:00.0 Off | 0 |\n| N/A 33C P0 41W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 3 Tesla V100-SXM2... Off | 00000000:3B:00.0 Off | 0 |\n| N/A 35C P0 42W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 4 Tesla V100-SXM2... Off | 00000000:89:00.0 Off | 0 |\n| N/A 31C P0 42W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 5 Tesla V100-SXM2... Off | 00000000:8A:00.0 Off | 0 |\n| N/A 33C P0 41W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 6 Tesla V100-SXM2... Off | 00000000:B2:00.0 Off | 0 |\n| N/A 33C P0 43W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n| 7 Tesla V100-SXM2... Off | 00000000:B3:00.0 Off | 0 |\n| N/A 33C P0 43W / 300W | 11MiB / 32480MiB | 0% Default |\n+-------------------------------+----------------------+----------------------+\n \n+-----------------------------------------------------------------------------+\n| Processes: GPU Memory |\n| GPU PID Type Process name Usage |\n|=============================================================================|\n+-----------------------------------------------------------------------------+\n'

df = GpuUtils.analyzeSystem(mock_response=mock_response)
print(type(df))
print(df)

required_memory = 10000  # MB of free memory a GPU must have to be considered
gpu_count = 1
df = df[(df.available_memories_in_mb > required_memory)]

print("--------------------------------------------")
print("GPU analysis of this machine:")
GpuUtils.analyzeSystem()
print("--------------------------------------------")
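# Sketch (assumption, not part of the original script): the filtered DataFrame
# can be used to expose the chosen GPU(s) via CUDA_VISIBLE_DEVICES, which is
# roughly what GpuUtils.allocate automates. The column names 'gpu_index' and
# 'available_memories_in_mb' are inferred from the usage shown above.
import os

candidates = df.sort_values('available_memories_in_mb', ascending=False)
chosen = candidates.head(gpu_count)['gpu_index'].tolist()
if len(chosen) < gpu_count:
    raise RuntimeError(f'fewer than {gpu_count} GPU(s) have more than {required_memory} MB free')
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(i) for i in chosen)
print(f'Selected GPU(s): {chosen}')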