def gpu_info():
    """Describe every GPU device reported by ``GPUtil.getGPUs``.

    Returns:
        A list holding one ``GPUInfo`` namedtuple per device, with the
        fields ``name``, ``driver``, ``totalmem`` and ``freemem`` taken from
        the device's ``name``, ``driver``, ``memoryTotal`` and ``memoryFree``
        attributes. The list is empty when no device is reported.
    """
    GPUInfo = namedtuple('GPUInfo', ['name', 'driver', 'totalmem', 'freemem'])
    return [
        GPUInfo(dev.name, dev.driver, dev.memoryTotal, dev.memoryFree)
        for dev in GPUtil.getGPUs()
    ]
def gpu_load(wproc=0.5, wmem=0.5):
    """Report the current load of every GPU device.

    Each entry is a ``GPULoad`` namedtuple with the fields ``processor``
    (the device's ``load``), ``memory`` (the device's ``memoryUtil``) and
    ``weighted``, the average of the two weighted by `wproc` and `wmem`
    and normalised by ``wproc + wmem``. The processor and memory loads are
    fractions between 0 and 1.

    Args:
        wproc: Weight applied to the processor load.
        wmem: Weight applied to the memory load.

    Returns:
        A list with one ``GPULoad`` per device reported by
        ``GPUtil.getGPUs``.
    """
    GPULoad = namedtuple('GPULoad', ['processor', 'memory', 'weighted'])

    def _as_load(dev):
        # Fold one device's two readings into a single GPULoad record.
        proc, mem = dev.load, dev.memoryUtil
        return GPULoad(proc, mem, (wproc * proc + wmem * mem) / (wproc + wmem))

    return [_as_load(dev) for dev in GPUtil.getGPUs()]
def main():
    """Run ``train`` through ``xgb.dask.run`` on a synthetic regression set.

    A local Dask cluster is started with one worker per GPU that GPUtil
    reports as available (at most 16), then random ``X`` and ``y`` arrays
    are created and handed to ``xgb.dask.run`` together with the device list.
    """
    # Check which devices we have locally (no more than 16 are requested).
    available_devices = GPUtil.getAvailable(limit=16)

    # One worker per device, four threads each.
    client = Client(
        LocalCluster(n_workers=len(available_devices), threads_per_worker=4))

    # A relatively large regression problem: 10,000,000 rows, 100 columns.
    num_rows, num_cols = 10000000, 100
    partition_size = 100000
    X = da.random.random((num_rows, num_cols), partition_size)
    y = da.random.random(num_rows, partition_size)

    xgb.dask.run(client, train, X, y, available_devices)
def is_nvidia_gpu_present():
    """Tell whether at least one NVIDIA GPU can be detected.

    ``GPUtil`` is preferred; when it cannot be imported, ``gpu_dfcc`` is
    tried instead.

    Returns:
        True when the module that could be imported reports one or more
        devices. False when neither module is importable, or when
        ``GPUtil.getGPUs`` raises ``OSError`` (no ``nvidia-smi``).
    """
    try:
        import GPUtil
    except ImportError:  # py36 ModuleNotFoundError
        GPUtil = None

    if GPUtil is not None:
        try:
            gpus = GPUtil.getGPUs()
        except OSError:  # py3 FileNotFoundError
            # no `nvidia-smi`
            return False
        return len(gpus) > 0

    # GPUtil is unavailable: fall back to gpu_dfcc.
    try:
        import gpu_dfcc
    except ImportError:  # py36 ModuleNotFoundError
        # who knows?
        return False
    return gpu_dfcc.cudaGetDeviceCount() > 0
''' Created on 1 Mar 2018 @author: lbtanh ''' # usage gpu: import GPUtil GPUtil.showUtilization()
def run_evaluation_one_dataset(idx, area_ini, training_root_dir, template_dir):
    """Prepare a working folder for one area and launch its evaluation job.

    The folder is named after ``area_ini`` (extension stripped) plus
    ``_<idx>``. When it does not exist yet it is created and populated with
    parameter files and scripts copied from ``template_dir``. The job is then
    either submitted with ``sbatch`` (when ``machine_name`` contains 'login',
    'shas' or 'sgpu') or started locally as a ``Process`` on the first GPU
    that GPUtil reports as available.

    Relies on module-level names not visible in this block: ``machine_name``,
    ``curc_username``, ``max_run_jobs``, ``local_tasks``, ``run_exe_eval``,
    ``modify_parameter``.

    Args:
        idx: Integer used in the folder name and in the Slurm job name.
        area_ini: Path to the area ``.ini`` file to evaluate.
        training_root_dir: Not used by this function.
        template_dir: Folder holding the template parameter files/scripts.

    Side effects:
        Changes the working directory while running and restores it at the
        end; calls ``sys.exit(1)`` when job submission fails or when the local
        sub-process has already exited with a non-zero code.
    """
    curr_dir = os.getcwd()

    run_eval_dir = os.path.basename(area_ini)[:-4] + '_%d' % idx
    main_para = 'main_para_eval_on_testData.ini'
    area_ini_name = os.path.basename(area_ini)

    if os.path.isdir(run_eval_dir) is False:
        io_function.mkdir(run_eval_dir)
        os.chdir(run_eval_dir)

        # copy and modify parameters
        io_function.copy_file_to_dst(os.path.join(template_dir, main_para),
                                     main_para)
        io_function.copy_file_to_dst(area_ini, area_ini_name)
        # point 'training_regions' in the main parameter file at the copied
        # area ini file
        modify_parameter(main_para, 'training_regions', area_ini_name)
        io_function.copy_file_to_dst(
            os.path.join(template_dir, 'deeplabv3plus_xception65.ini'),
            'deeplabv3plus_xception65.ini')

        if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:
            # cluster run: copy the cluster scripts and name the Slurm job
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'exe_curc.sh'), 'exe_curc.sh')
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'run_INsingularity_curc_GPU_tf.sh'),
                'run_INsingularity_curc_GPU_tf.sh')
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'job_tf_GPU.sh'), 'job_tf_GPU.sh')

            job_name = 'eval_%d_area' % idx
            slurm_utility.modify_slurm_job_sh('job_tf_GPU.sh', 'job-name',
                                              job_name)
        else:
            # local run: only the evaluation script is needed
            io_function.copy_file_to_dst(
                os.path.join(template_dir, 'exe_eval.sh'), 'exe_eval.sh')
    else:
        # folder already prepared by an earlier call: just enter it
        os.chdir(run_eval_dir)

    # if run in curc cluster
    if 'login' in machine_name or 'shas' in machine_name or 'sgpu' in machine_name:
        # poll once a minute until fewer than max_run_jobs 'eval' jobs are
        # submitted
        while True:
            job_count = slurm_utility.get_submit_job_count(
                curc_username, job_name_substr='eval')
            if job_count >= max_run_jobs:
                print(
                    machine_name, datetime.now(),
                    'You have submitted %d or more jobs, wait ' % max_run_jobs)
                time.sleep(60)
                continue
            break

        # submit a job
        res = os.system('sbatch job_tf_GPU.sh')
        if res != 0:
            sys.exit(1)
    else:
        deviceIDs = []
        # poll once a minute until at least one GPU is available
        while True:
            # get available GPUs
            # https://github.com/anderskm/gputil
            deviceIDs = GPUtil.getAvailable(order='memory',
                                            limit=100,
                                            maxLoad=0.5,
                                            maxMemory=0.5,
                                            includeNan=False,
                                            excludeID=[],
                                            excludeUUID=[])
            basic.outputlogMessage('deviceIDs: %s' % str(deviceIDs))
            if len(deviceIDs) < 1:
                time.sleep(
                    60)  # wait one minute, then check the available GPUs again
                continue
            break

        # poll once a minute until fewer than max_run_jobs local tasks are
        # alive
        while True:
            job_count = basic.alive_process_count(local_tasks)
            if job_count >= max_run_jobs:
                print(
                    machine_name, datetime.now(),
                    '%d (>%d) jobs are running, wait ' %
                    (job_count, max_run_jobs))
                time.sleep(60)
                continue
            break

        job_sh = 'exe_eval.sh'
        gpuid = deviceIDs[0]
        # modify gpuid in exe_eval.sh: every line mentioning
        # CUDA_VISIBLE_DEVICES is replaced by an export of the chosen id
        with open(job_sh, 'r') as inputfile:
            list_of_all_the_lines = inputfile.readlines()
            for i in range(0, len(list_of_all_the_lines)):
                line = list_of_all_the_lines[i]
                if 'CUDA_VISIBLE_DEVICES' in line:
                    list_of_all_the_lines[
                        i] = 'export CUDA_VISIBLE_DEVICES=%d\n' % gpuid
                    print('Set %s' % list_of_all_the_lines[i])
        # write the new file and overwrite the old one
        with open(job_sh, 'w') as outputfile:
            outputfile.writelines(list_of_all_the_lines)
            # NOTE(review): redundant, the with-statement already closes it
            outputfile.close()

        # run
        sub_process = Process(target=run_exe_eval)
        sub_process.start()
        local_tasks.append(sub_process)

        # wait until the assigned GPU is no longer the first available one
        # (i.e. it is in use) or 100 seconds have passed
        t0 = time.time()
        while True:
            gpu_ids = GPUtil.getAvailable(order='memory',
                                          limit=100,
                                          maxLoad=0.5,
                                          maxMemory=0.5,
                                          includeNan=False,
                                          excludeID=[],
                                          excludeUUID=[])
            t1 = time.time()
            # print(gpu_ids, t1-t0)
            if len(gpu_ids) < 1 or gpu_ids[0] != gpuid or (t1 - t0) > 100:
                break
            else:
                time.sleep(0.5)

        # abort the whole program if the sub-process already failed
        if sub_process.exitcode is not None and sub_process.exitcode != 0:
            sys.exit(1)

    os.chdir(curr_dir)
def _run(self, _, frontend, sink, backend):
    """Bind the sockets, start sink and workers, then dispatch requests.

    The method blocks in a receive loop on ``frontend`` until a request whose
    message equals ``ServerCommand.terminate`` arrives.

    Args:
        _: Unused positional argument.
        frontend: Socket bound to ``tcp://*:<self.port>``; requests are read
            from it as 4-frame multipart messages
            ``(client, msg, req_id, msg_len)``.
        sink: Socket bound via ``auto_bind``; used to receive the sink's
            address and to send config replies / new-job registrations.
        backend: Socket bound via ``auto_bind``; encode jobs are pushed to it.

    Notes:
        * Workers are mapped to GPU ids when GPUtil reports available GPUs
          and ``self.args.cpu`` is false; otherwise every worker gets device
          id ``-1`` (logged as 'cpu').
        * A request that does not unpack into four frames (or whose numeric
          frames do not parse) raises ``ValueError``, which is logged and the
          loop continues.
    """
    # bind all sockets
    self.logger.info('bind all sockets')
    frontend.bind('tcp://*:%d' % self.port)
    addr_front2sink = auto_bind(sink)
    addr_backend = auto_bind(backend)

    # start the sink process
    self.logger.info('start the sink')
    proc_sink = BertSink(self.args, addr_front2sink)
    self.processes.append(proc_sink)
    proc_sink.start()
    addr_sink = sink.recv().decode('ascii')

    self.logger.info('get devices')
    run_on_gpu = False
    device_map = [-1] * self.num_worker
    if not self.args.cpu:
        try:
            import GPUtil
            num_all_gpu = len(GPUtil.getGPUs())
            avail_gpu = GPUtil.getAvailable(order='memory',
                                            limit=min(num_all_gpu,
                                                      self.num_worker))
            num_avail_gpu = len(avail_gpu)
            if num_avail_gpu >= self.num_worker:
                run_on_gpu = True
            elif 0 < num_avail_gpu < self.num_worker:
                self.logger.warning(
                    'only %d out of %d GPU(s) is available/free, but "-num_worker=%d"'
                    % (num_avail_gpu, num_all_gpu, self.num_worker))
                self.logger.warning(
                    'multiple workers will be allocated to one GPU, '
                    'may not scale well and may raise out-of-memory')
                run_on_gpu = True
            else:
                self.logger.warning('no GPU available, fall back to CPU')

            if run_on_gpu:
                # repeat the available ids so that every worker gets one
                device_map = (avail_gpu * self.num_worker)[:self.num_worker]
        except FileNotFoundError:
            self.logger.warning(
                'nvidia-smi is missing, often means no gpu on this machine. '
                'fall back to cpu!')

    self.logger.info(
        'device map: \n\t\t%s' %
        '\n\t\t'.join('worker %2d -> %s' %
                      (w_id, ('gpu %2d' % g_id) if g_id >= 0 else 'cpu')
                      for w_id, g_id in enumerate(device_map)))

    # start the backend processes
    for idx, device_id in enumerate(device_map):
        process = BertWorker(idx, self.args, addr_backend, addr_sink,
                             device_id, self.graph_path)
        self.processes.append(process)
        process.start()

    num_req = defaultdict(int)
    while True:
        try:
            request = frontend.recv_multipart()
            client, msg, req_id, msg_len = request
            if msg == ServerCommand.terminate:
                break
            elif msg == ServerCommand.show_config:
                num_req['config'] += 1
                self.logger.info(
                    'new config request\treq id: %d\tclient: %s' %
                    (int(req_id), client))
                status_runtime = {
                    'client': client.decode('ascii'),
                    'num_process': len(self.processes),
                    'ventilator -> worker': addr_backend,
                    'worker -> sink': addr_sink,
                    'ventilator <-> sink': addr_front2sink,
                    'server_current_time': str(datetime.now()),
                    'num_config_request': num_req['config'],
                    'num_data_request': num_req['data'],
                    'run_on_gpu': run_on_gpu
                }

                sink.send_multipart([
                    client, msg,
                    jsonapi.dumps({
                        **status_runtime,
                        **self.status_args,
                        **self.status_static
                    }), req_id
                ])
            else:
                num_req['data'] += 1
                self.logger.info(
                    'new encode request\treq id: %d\tsize: %d\tclient: %s' %
                    (int(req_id), int(msg_len), client))
                # register a new job at sink
                sink.send_multipart(
                    [client, ServerCommand.new_job, msg_len, req_id])

                job_id = client + b'#' + req_id
                if int(msg_len) > self.max_batch_size:
                    seqs = jsonapi.loads(msg)
                    # partition the large batch into small batches
                    s_idx = 0
                    while s_idx < int(msg_len):
                        tmp = seqs[s_idx:(s_idx + self.max_batch_size)]
                        if not tmp:
                            # msg_len claims more items than were decoded;
                            # an empty slice never advances s_idx, so stop
                            # here instead of spinning forever.
                            self.logger.warning(
                                'request declares %d item(s) but only %d were received'
                                % (int(msg_len), len(seqs)))
                            break
                        partial_job_id = job_id + b'@%d' % s_idx
                        backend.send_multipart(
                            [partial_job_id,
                             jsonapi.dumps(tmp)])
                        s_idx += len(tmp)
                else:
                    backend.send_multipart([job_id, msg])
        except ValueError:
            self.logger.error(
                'received a wrongly-formatted request (expected 4 frames, got %d)'
                % len(request))
            self.logger.error('\n'.join('field %d: %s' % (idx, k)
                                        for idx, k in enumerate(request)))

    self.logger.info('terminated!')
for iteration, (batch, c) in enumerate(tqdm.tqdm(dl)): with torch.no_grad(): batch = batch.cuda() c = c.cuda() preds = model(batch) pred_cat.append(preds) c_cat.append(c) pred_cat = torch.cat(pred_cat, dim=0) c_cat = torch.cat(c_cat, dim=0) return auc_check(pred_cat, c_cat) if __name__ == '__main__': if opt.cuda: base_gpu_list = GPUtil.getAvailable(order='memory', limit=8) if 5 in base_gpu_list: base_gpu_list.remove(5) base_gpu = base_gpu_list[0] cudnn.benchmark = True elif torch.cuda.is_available() and not opt.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) torch.cuda.set_device(base_gpu) for p in [3]: opt.dataset_index = p # 0 = mnist, 1 = fashion, 2 = celeb perf_vals = [] for seed in range(3): opt.epochs = epochs[opt.dataset_index] opt.channels = channels[opt.dataset_index]
!mv pubfig83lfw_raw_in_dirs rephrase-pubfig831/correct !rm -r rephrase-pubfig831/correct/distract !cp -r rephrase-pubfig831/correct rephrase-pubfig831/degraded for image_path in tqdm(glob('rephrase-pubfig831/degraded/*/*/*.jpg')): degrade(image_path) """# **Checking Free Memory** This block is just so that you can have an idea of the resources you have at hand on the Google Collab system. """ import psutil import humanize import os import GPUtil as GPU gpu = GPU.getGPUs()[0] process = psutil.Process(os.getpid()) print(f"Gen RAM: Free {humanize.naturalsize(psutil.virtual_memory().available)} | Proc size {humanize.naturalsize(process.memory_info().rss)}") print(f"GPU RAM: Free {gpu.memoryFree:.0f}MB | Used {gpu.memoryUsed:.0f}MB | Util {gpu.memoryUtil*100:.0f}% | Total {gpu.memoryTotal:.0f}MB") !pip install tensorflow-gpu==2.0.0 import os from glob import glob import cv2 import numpy as np from tqdm import tqdm """# **Main Code**
# NOTE: First install bert-as-service via # $ # $ pip install bert-serving-server # $ pip install bert-serving-client # $ # read and write TFRecord import os import GPUtil import tensorflow as tf from model_serving.client import bert_client os.environ['CUDA_VISIBLE_DEVICES'] = str(GPUtil.getFirstAvailable()[0]) tf.logging.set_verbosity(tf.logging.INFO) with open('README.md') as fp: data = [v for v in fp if v.strip()] bc = bert_client() list_vec = bc.encode(data) list_label = [0 for _ in data] # a dummy list of all-zero labels # write tfrecords with tf.python_io.TFRecordWriter('tmp.tfrecord') as writer: def create_float_feature(values): return tf.train.Feature(float_list=tf.train.FloatList(value=values))
def start(self):
    """Poll the first GPU and report its stats until ``self.running`` is cleared.

    Sets ``self.running`` to True, then on every iteration passes the first
    GPU's ``load``, ``memoryTotal`` and ``memoryUsed`` to ``self.on_stats``
    and sleeps for ``self.period`` seconds.
    """
    self.running = True
    while self.running:
        first_gpu = GPUtil.getGPUs()[0]
        self.on_stats(first_gpu.load, first_gpu.memoryTotal,
                      first_gpu.memoryUsed)
        time.sleep(self.period)
options = parser.parse_args() if not options.agent_profile: parser.error('Agent profile must be selected') if not options.agent_path: parser.error('Agent path must be selected') if not options.temp_path: parser.error('Out experience path must be selected') if not options.games_num: parser.error('Number of games must be selected') num_gpus = len(GPUtil.getGPUs()) if num_gpus < 1: throw_error("Host does not have GPU! Aborting...") if num_gpus == 1: print( "Single-gpu machine detected, starting in the synchronous mode...") iteration_memory_path = options.temp_path + '/' + generate_unique_memory_name( ) generate_self_play(options.agent_profile, options.agent_path, options.games_num, iteration_memory_path, options.max_steps, options.verbose, options.debug, options.exploration_decay_steps)
def get_gpu_count():
    """Return the number of entries in the list given by ``GPUtil.getGPUs()``."""
    detected = GPUtil.getGPUs()
    return len(detected)
import GPUtil as GPU

# Ask GPUtil for the available GPUs, ordered 'first', with the limit set to
# 999 so the result is effectively not capped.
GPUlist = GPU.getAvailable(order='first', limit=999)
# NOTE(review): `a` is not used anywhere in the visible code - confirm
# whether it is still needed.
a = 1
def main(options, args):
    """Predict every inference image of every region, one sub-process each.

    For each region listed under 'inference_regions' in the parameter file,
    the images matching that region's pattern are predicted by
    ``predict_one_image_deeplab`` running in its own ``Process``. With
    'b_use_multiGPUs' enabled a new process is started as soon as GPUtil
    reports an available GPU (restricted to ``CUDA_VISIBLE_DEVICES`` when that
    variable is set); otherwise images are processed strictly one at a time.

    Args:
        options: Parsed options; ``options.trained_model`` is forwarded to
            the prediction function.
        args: Positional arguments; ``args[0]`` is the parameter file.

    Raises:
        IOError: If the parameter file does not exist.
        ValueError: If a region has no image to infer.

    Side effects:
        Sets the module-level ``tf1x_python``; writes per-image list files
        and appends the total time cost to ``time_cost.txt``; calls
        ``sys.exit(1)`` when a sub-process is found to have failed.
    """
    print(
        "%s : prediction using the trained model (run parallel if use multiple GPUs) "
        % os.path.basename(sys.argv[0]))
    machine_name = os.uname()[1]
    start_time = datetime.datetime.now()

    para_file = args[0]
    if os.path.isfile(para_file) is False:
        raise IOError('File %s not exists in current folder: %s' %
                      (para_file, os.getcwd()))

    basic.setlogfile('parallel_predict_Log.txt')

    deeplab_inf_script = os.path.join(code_dir, 'deeplabBased',
                                      'deeplab_inference.py')
    network_setting_ini = parameters.get_string_parameters(
        para_file, 'network_setting_ini')

    global tf1x_python
    tf1x_python = parameters.get_file_path_parameters(network_setting_ini,
                                                      'tf1x_python')

    trained_model = options.trained_model

    outdir = parameters.get_directory(para_file, 'inf_output_dir')

    # remove previous results (let user remove this folder manually or in exe.sh folder)
    io_function.mkdir(outdir)

    # get name of inference areas
    multi_inf_regions = parameters.get_string_list_parameters(
        para_file, 'inference_regions')

    # max_parallel_inf_task = parameters.get_digit_parameters(para_file,'max_parallel_inf_task','int')

    b_use_multiGPUs = parameters.get_bool_parameters(para_file,
                                                     'b_use_multiGPUs')

    # loop each inference regions
    sub_tasks = []
    for area_idx, area_ini in enumerate(multi_inf_regions):

        area_name = parameters.get_string_parameters(area_ini, 'area_name')
        area_remark = parameters.get_string_parameters(area_ini, 'area_remark')
        area_time = parameters.get_string_parameters(area_ini, 'area_time')

        inf_image_dir = parameters.get_directory(area_ini, 'inf_image_dir')

        # it is ok consider a file name as pattern and pass it the following functions to get file list
        inf_image_or_pattern = parameters.get_string_parameters(
            area_ini, 'inf_image_or_pattern')

        inf_img_list = io_function.get_file_list_by_pattern(
            inf_image_dir, inf_image_or_pattern)
        img_count = len(inf_img_list)
        if img_count < 1:
            raise ValueError(
                'No image for inference, please check inf_image_dir and inf_image_or_pattern in %s'
                % area_ini)

        area_save_dir = os.path.join(
            outdir, area_name + '_' + area_remark + '_' + area_time)
        io_function.mkdir(area_save_dir)

        # parallel inference images for this area
        CUDA_VISIBLE_DEVICES = []
        if 'CUDA_VISIBLE_DEVICES' in os.environ:
            CUDA_VISIBLE_DEVICES = [
                int(item.strip())
                for item in os.environ['CUDA_VISIBLE_DEVICES'].split(',')
            ]
        idx = 0
        while idx < img_count:

            if b_use_multiGPUs:
                # get available GPUs
                # https://github.com/anderskm/gputil
                deviceIDs = GPUtil.getAvailable(order='first',
                                                limit=100,
                                                maxLoad=0.5,
                                                maxMemory=0.5,
                                                includeNan=False,
                                                excludeID=[],
                                                excludeUUID=[])
                # only use the one in CUDA_VISIBLE_DEVICES
                if len(CUDA_VISIBLE_DEVICES) > 0:
                    deviceIDs = [
                        item for item in deviceIDs
                        if item in CUDA_VISIBLE_DEVICES
                    ]
                    basic.outputlogMessage('on ' + machine_name +
                                           ', available GPUs:' +
                                           str(deviceIDs) +
                                           ', among visible ones:' +
                                           str(CUDA_VISIBLE_DEVICES))
                else:
                    basic.outputlogMessage('on ' + machine_name +
                                           ', available GPUs:' +
                                           str(deviceIDs))

                if len(deviceIDs) < 1:
                    time.sleep(
                        60
                    )  # wait one minute, then check the available GPUs again
                    continue
                # set only the first available visible
                gpuid = deviceIDs[0]
                basic.outputlogMessage(
                    '%d: predict image %s on GPU %d of %s' %
                    (idx, inf_img_list[idx], gpuid, machine_name))
            else:
                gpuid = None
                basic.outputlogMessage('%d: predict image %s on %s' %
                                       (idx, inf_img_list[idx], machine_name))

            # run inference
            img_save_dir = os.path.join(area_save_dir, 'I%d' % idx)
            inf_list_file = os.path.join(area_save_dir, '%d.txt' % idx)

            # if it already exist, then skip
            if os.path.isdir(img_save_dir) and is_file_exist_in_folder(
                    img_save_dir):
                basic.outputlogMessage(
                    'folder of %dth image (%s) already exist, '
                    'it has been predicted or is being predicted' %
                    (idx, inf_img_list[idx]))
                idx += 1
                continue

            with open(inf_list_file, 'w') as inf_obj:
                inf_obj.writelines(inf_img_list[idx] + '\n')

            sub_process = Process(target=predict_one_image_deeplab,
                                  args=(deeplab_inf_script, para_file,
                                        network_setting_ini, img_save_dir,
                                        inf_list_file, gpuid, trained_model))
            sub_process.start()
            sub_tasks.append(sub_process)

            if b_use_multiGPUs is False:
                # wait until previous one finished
                while sub_process.is_alive():
                    time.sleep(5)

            idx += 1

            # wait until predicted image patches exist or exceed 20 minutes
            time0 = time.time()
            elapsed_time = time.time() - time0
            while elapsed_time < 20 * 60:
                elapsed_time = time.time() - time0
                file_exist = is_file_exist_in_folder(img_save_dir)
                if file_exist is True or sub_process.is_alive() is False:
                    break
                else:
                    time.sleep(5)

            if sub_process.exitcode is not None and sub_process.exitcode != 0:
                sys.exit(1)

            close_remove_completed_task(sub_tasks)
            # if 'chpc' in machine_name:
            #     time.sleep(60)  # wait 60 second on ITSC services
            # else:
            #     time.sleep(10)

    # check all the tasks already finished
    while b_all_task_finish(sub_tasks) is False:
        basic.outputlogMessage('wait all tasks to finish')
        time.sleep(60)
    close_remove_completed_task(sub_tasks)

    end_time = datetime.datetime.now()

    diff_time = end_time - start_time
    # total_seconds() rather than .seconds: the latter wraps around every
    # 24 hours and would under-report runs longer than a day.
    out_str = "%s: time cost of total parallel inference on %s: %d seconds" % (
        str(end_time), machine_name, diff_time.total_seconds())
    basic.outputlogMessage(out_str)
    with open("time_cost.txt", 'a') as t_obj:
        t_obj.writelines(out_str + '\n')
def check_memory():
    """ Check usable system (and, when applicable, GPU) memory

    Warn the user if insufficient memory is available for
    the number of processes that the user have chosen.

    The available system memory is always checked. When ``args.method`` is
    'gpu' or 'cudnn' and nvidia-smi can be found, the free memory of the
    first GPU (total minus used) is checked as well. Depending on the user's
    answer, ``args.processes`` may be lowered in place.
    """

    memory_status = []
    # get system available memory
    system_memory_available = psutil.virtual_memory().available / (1024**3)
    memory_status.append(('system', system_memory_available))

    # check if Nvidia-smi is available
    # GPUtil requires nvidia-smi.exe to interact with GPU
    if args.method in ['gpu', 'cudnn']:
        if not (shutil.which('nvidia-smi') or pathlib.Path(
                r'C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe').
                is_file()):
            # Nvidia System Management Interface not available
            Avalon.warning(
                'Nvidia-smi not available, skipping available memory check')
            Avalon.warning(
                'If you experience error \"cudaSuccess out of memory\", try reducing number of processes you\'re using'
            )
        else:
            # IndexError is suppressed as well so that an empty GPU list
            # skips the GPU check instead of crashing.
            with contextlib.suppress(ValueError, IndexError):
                # "0" is GPU ID. Both waifu2x drivers use the first GPU available, therefore only 0 makes sense
                # Query GPUtil once so total and used come from the same reading.
                first_gpu = GPUtil.getGPUs()[0]
                gpu_memory_available = (first_gpu.memoryTotal -
                                        first_gpu.memoryUsed) / 1024
                memory_status.append(('GPU', gpu_memory_available))

    # go though each checkable memory type and check availability
    for memory_type, memory_available in memory_status:

        if memory_type == 'system':
            mem_per_process = SYS_MEM_PER_PROCESS
        else:
            mem_per_process = GPU_MEM_PER_PROCESS

        # if user doesn't even have enough memory to run even one process
        if memory_available < mem_per_process:
            Avalon.warning(
                f'You might have insufficient amount of {memory_type} memory available to run this program ({memory_available} GB)'
            )
            Avalon.warning('Proceed with caution')
            if args.processes > 1:
                if Avalon.ask('Reduce number of processes to avoid crashing?',
                              default=True,
                              batch=args.batch):
                    args.processes = 1
        # if memory available is less than needed, warn the user
        elif memory_available < (mem_per_process * args.processes):
            # report the requirement of the memory type being checked; the
            # text is unchanged for the 'system' entry
            Avalon.warning(
                f'Each waifu2x-caffe process will require up to {mem_per_process} GB of {memory_type} memory'
            )
            Avalon.warning(
                f'You demanded {args.processes} processes to be created, but you only have {round(memory_available, 4)} GB {memory_type} memory available'
            )
            Avalon.warning(
                f'{mem_per_process * args.processes} GB of {memory_type} memory is recommended for {args.processes} processes'
            )
            Avalon.warning(
                f'With your current amount of {memory_type} memory available, {int(memory_available // mem_per_process)} processes is recommended'
            )

            # ask the user if he / she wants to change to the recommended
            # number of processes
            if Avalon.ask('Change to the recommended value?',
                          default=True,
                          batch=args.batch):
                args.processes = int(memory_available // mem_per_process)
            else:
                Avalon.warning('Proceed with caution')
def get_args():
    """
    Parse the command line and pick the compute device.

    Arguments are read with argparse and, when
    ``--filepath_to_arguments_json_file`` is given, passed through
    ``extract_args_from_json``; a ``--gpu_id`` given on the command line is
    then written back onto the result. When ``use_gpu`` is true and GPUtil
    reports more available GPUs than were requested, ``CUDA_VISIBLE_DEVICES``
    is restricted to the first requested number of them.

    :return: A tuple ``(args, device)``. ``args`` is the arguments object with
        an extra ``use_cuda`` attribute; ``device`` is
        ``torch.cuda.current_device()`` when CUDA is available, otherwise
        ``torch.device('cpu')``.
    :raises SystemExit: with status 1 when the GPU re-check finds fewer
        available GPUs than requested.
    """
    parser = argparse.ArgumentParser(
        description=
        'Welcome to the MLP course\'s Pytorch training and inference helper script'
    )

    parser.add_argument('--batch_size', nargs="?", type=int, default=100,
                        help='Batch_size for experiment')
    parser.add_argument('--lstm_hidden_dim', nargs="?", type=int, default=512,
                        help='Hidden_dim for LSTM')
    parser.add_argument('--lr', nargs="?", type=float, default=0.01,
                        help='Learning rate')
    parser.add_argument('--encoder_output_size', nargs="?", type=int,
                        default=1024,
                        help='Size of the output of the encoder')
    parser.add_argument(
        '--fc1_size', nargs="?", type=int, default=512,
        help='Size of the output of the first layer of the siamese network')
    parser.add_argument(
        '--fc2_size', nargs="?", type=int, default=2048,
        help='Size of the output of the second layer of the siamese network')
    parser.add_argument('--model_name', nargs="?", type=str,
                        default="baseline", help='Model for the experiment')
    # The help text used to be a copy of the --batch_size one.
    parser.add_argument('--continue_from_epoch', nargs="?", type=int,
                        default=-1,
                        help='Epoch to continue the experiment from')
    parser.add_argument(
        '--dataset_name', type=str,
        help='Dataset on which the system will train/eval our model')
    parser.add_argument(
        '--seed', nargs="?", type=int, default=7112018,
        help='Seed to use for random number generator for experiment')
    parser.add_argument('--num_layers', nargs="?", type=int, default=4,
                        help='Number of LSTM layers')
    parser.add_argument('--num_epochs', nargs="?", type=int, default=100,
                        help='The experiment\'s epoch budget')
    parser.add_argument('--dropout_rate', nargs="?", type=float, default=0.0,
                        help='Dropout rate')
    parser.add_argument(
        '--experiment_name', nargs="?", type=str, default="exp_1",
        help='Experiment name - to be used for building the experiment folder')
    parser.add_argument(
        '--use_gpu', nargs="?", type=str2bool, default=False,
        help='A flag indicating whether we will use GPU acceleration or not')
    parser.add_argument('--gpu_id', type=str, default="None",
                        help="A string indicating the gpu to use")
    parser.add_argument('--weight_decay_coefficient', nargs="?", type=float,
                        default=1e-05, help='Weight decay to use for Adam')
    parser.add_argument('--filepath_to_arguments_json_file', nargs="?",
                        type=str, default=None, help='')

    args = parser.parse_args()
    # Remember the command-line gpu_id before the JSON file can replace args.
    gpu_id = str(args.gpu_id)
    if args.filepath_to_arguments_json_file is not None:
        args = extract_args_from_json(
            json_file_path=args.filepath_to_arguments_json_file,
            existing_args_dict=args)

    if gpu_id != "None":
        args.gpu_id = gpu_id

    arg_str = [(str(key), str(value)) for (key, value) in vars(args).items()]
    print(arg_str)

    if args.use_gpu == True:
        # Thresholds shared by every availability query below.
        gpu_query = dict(order='first', maxLoad=0.1, maxMemory=0.1,
                         includeNan=False, excludeID=[], excludeUUID=[])
        not_enough_msg = (
            "Not enough GPUs available, please try on another node now, or retry on this node later"
        )

        num_requested_gpus = len(args.gpu_id.split(","))
        num_received_gpus = len(GPUtil.getAvailable(limit=8, **gpu_query))

        if num_requested_gpus == 1 and num_received_gpus > 1:
            print("Detected Slurm problem with GPUs, attempting automated fix")
            # Query again: availability may have changed since the count.
            gpu_to_use = GPUtil.getAvailable(limit=num_received_gpus,
                                             **gpu_query)
            if len(gpu_to_use) > 0:
                os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_to_use[0])
                print("Using GPU with ID", gpu_to_use[0])
            else:
                print(not_enough_msg)
                # Non-zero status so the failure is visible to the caller.
                sys.exit(1)

        elif num_requested_gpus > 1 and num_received_gpus > num_requested_gpus:
            print("Detected Slurm problem with GPUs, attempting automated fix")
            gpu_to_use = GPUtil.getAvailable(limit=num_received_gpus,
                                             **gpu_query)

            if len(gpu_to_use) >= num_requested_gpus:
                os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                    str(gpu_idx)
                    for gpu_idx in gpu_to_use[:num_requested_gpus])
                print("Using GPU with ID", gpu_to_use[:num_requested_gpus])
            else:
                print(not_enough_msg)
                sys.exit(1)

    # torch is imported only here, after CUDA_VISIBLE_DEVICES may have been
    # set above - presumably so that the restriction is in place first.
    import torch
    args.use_cuda = torch.cuda.is_available()

    # Only CUDA availability is checked here; args.use_gpu is not consulted.
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        print("use {} GPU(s)".format(torch.cuda.device_count()),
              file=sys.stderr)
    else:
        print("use CPU", file=sys.stderr)
        device = torch.device('cpu')  # sets the device to be CPU

    return args, device
def available_gpu(*args, **kwargs):
    """Alias for ``GPUtil.getAvailable``: forward every argument unchanged.

    NOTE(review): this docstring previously stated that ``[0,]`` is returned
    as a default GPU ID when ``GPUtil`` is not installed. This definition
    contains no such fallback - presumably an alternative definition is
    used in that case; confirm against the surrounding module.
    """
    device_ids = GPUtil.getAvailable(*args, **kwargs)
    return device_ids
def get_remote_gpu():
    """Return the total memory of the first GPU scaled by ``BYTES_PER_MiB``.

    Returns:
        ``memoryTotal`` of the first GPU reported by ``GPUtil.getGPUs``
        multiplied by ``BYTES_PER_MiB`` (presumably a size in bytes - confirm
        against the constant's definition).

    Raises:
        IndexError: If GPUtil reports no GPU.
    """
    gpus = GPUtil.getGPUs()
    # GPUtil's GPU objects name this attribute ``memoryTotal``; the former
    # ``memory_total`` does not exist on them and raised AttributeError.
    total_mem_mb = gpus[0].memoryTotal
    return total_mem_mb * BYTES_PER_MiB
ds = xr.open_dataset(MITGCM_filename) tr_start = 0 tr_end = int(train_end_ratio * dataset_end_index) val_end = int(val_end_ratio * dataset_end_index) x_dim = (ds.isel(T=slice(0))).sizes['X'] y_dim = (ds.isel(T=slice(0))).sizes['Y'] z_dim = (ds.isel(T=slice(0))).sizes['Zld000038'] ds.close() logging.info('Model ; ' + args.name + '\n') device = 'cuda' if torch.cuda.is_available() else 'cpu' logging.info('Using device: ' + device + '\n') TimeCheck(tic, 'setting variables') logging.info(GPUtil.showUtilization()) #----------------------------------- # Read in mean and std #----------------------------------- data_mean, data_std, data_range = ReadMeanStd(MeanStd_prefix) TimeCheck(tic, 'getting mean & std') logging.info(GPUtil.showUtilization()) if args.dim == '2d': no_in_channels = args.histlen * ( 3 * z_dim + 1 ) + 3 * z_dim + 1 # Eta field, plus Temp, U, V through depth, for each past time, plus masks no_out_channels = 3 * z_dim + 1 # Eta field, plus Temp, U, V through depth, just once elif args.dim == '3d':
def GetStaticStatsDict(self):  # Gets Static Stats And Puts Them Into A Dictionary #
    """Collect static hardware/OS facts into ``self.SystemHardware``.

    The dictionary is rebuilt from scratch on every call and filled from
    ``platform.uname``, ``psutil``, ``cpuinfo.get_cpu_info`` and
    ``GPUtil.getGPUs``. ``self.NodeName`` is set to the platform node name.

    The ``Partition*`` lists are index-aligned: a partition whose usage
    cannot be read (``PermissionError``) is left out of all of them. The
    ``Net*`` lists hold one entry per interface address, and the ``GPU*``
    lists one entry per GPU.
    """

    # Create New Dictionary #
    self.SystemHardware = {}

    # Get System Name Info #
    PlatformInfo = platform.uname()

    self.SystemHardware.update({'OperatingSystemName': PlatformInfo.system})
    self.SystemHardware.update({'NodeName': PlatformInfo.node})
    self.NodeName = PlatformInfo.node
    self.SystemHardware.update({'OperatingSystemRelease': PlatformInfo.release})
    self.SystemHardware.update({'OperatingSystemVersion': PlatformInfo.version})

    # Get Last Boot Info #
    BootTimeInfo = psutil.boot_time()
    BootTimeDateTimeObject = datetime.datetime.fromtimestamp(BootTimeInfo)
    self.SystemHardware.update({
        'BootTimeDateString':
        f'{BootTimeDateTimeObject.year}/{BootTimeDateTimeObject.month}/{BootTimeDateTimeObject.day} {BootTimeDateTimeObject.hour}:{BootTimeDateTimeObject.minute}:{BootTimeDateTimeObject.second}'
    })

    # Get System CPU Info #
    CPUInfo = cpuinfo.get_cpu_info()
    self.SystemHardware.update({'PythonVersion': CPUInfo.get('python_version')})
    self.SystemHardware.update({'CPUInfoVersion': CPUInfo.get('cpuinfo_version_string')})
    self.SystemHardware.update({'CPUArchitecture': CPUInfo.get('arch')})
    self.SystemHardware.update({'CPUBits': CPUInfo.get('bits')})
    # Key Was Misspelled 'count})' And Therefore Always Returned None #
    self.SystemHardware.update({'CPUThreads': CPUInfo.get('count')})
    self.SystemHardware.update({'CPUCores': psutil.cpu_count(logical=False)})
    self.SystemHardware.update({'CPUVendor': CPUInfo.get('vendor_id_raw')})
    self.SystemHardware.update({'CPUName': CPUInfo.get('brand_raw')})
    # Key Was Misspelled 'hz_advertized_friendly' #
    self.SystemHardware.update({'CPUBaseClock': CPUInfo.get('hz_advertised_friendly')})
    self.SystemHardware.update({'CPUInstructionSet': CPUInfo.get('flags')})
    self.SystemHardware.update({'CPUL3CacheSize': CPUInfo.get('l3_cache_size')})
    # Key Was Misspelled 'l2_cache_Size' (Lookups Are Case Sensitive) #
    self.SystemHardware.update({'CPUL2CacheSize': CPUInfo.get('l2_cache_size')})
    # NOTE(review): Confirm That The Installed cpuinfo Version Provides 'l1_cache_size' #
    self.SystemHardware.update({'CPUL1CacheSize': CPUInfo.get('l1_cache_size')})

    # Get System Ram Info #
    RamInfo = psutil.virtual_memory()
    self.SystemHardware.update({'TotalSystemRAM': RamInfo.total})

    SwapInfo = psutil.swap_memory()
    self.SystemHardware.update({'TotalSystemSwap': SwapInfo.total})

    # Get System Disk Info #
    Partitions = psutil.disk_partitions()
    PartitionDevices = []
    PartitionMountPoints = []
    PartitionFileSystemType = []
    PartitionTotal = []
    PartitionUsed = []
    PartitionFree = []
    PartitionUsagePercent = []

    for Partition in Partitions:

        # Read Usage First So An Unreadable Partition Is Skipped In Every List, Keeping Them Aligned #
        try:
            PartitionUsage = psutil.disk_usage(Partition.mountpoint)
        except PermissionError:  # Catch Exception Thrown If Partition Is Unreadable #
            continue

        PartitionDevices.append(Partition.device)
        PartitionMountPoints.append(Partition.mountpoint)
        PartitionFileSystemType.append(Partition.fstype)

        PartitionTotal.append(PartitionUsage.total)
        PartitionUsed.append(PartitionUsage.used)
        PartitionFree.append(PartitionUsage.free)
        PartitionUsagePercent.append(PartitionUsage.percent)

    self.SystemHardware.update({'PartitionDevices': PartitionDevices})
    self.SystemHardware.update({'PartitionMountPoints': PartitionMountPoints})
    self.SystemHardware.update({'PartitionFileSystemType': PartitionFileSystemType})
    self.SystemHardware.update({'PartitionTotal': PartitionTotal})
    self.SystemHardware.update({'PartitionUsed': PartitionUsed})
    self.SystemHardware.update({'PartitionFree': PartitionFree})
    self.SystemHardware.update({'PartitionUsagePercent': PartitionUsagePercent})

    # Get Network Info #
    NetNames = []
    NetAddresses = []
    NetMasks = []
    NetBroadcasts = []

    IFAddresses = psutil.net_if_addrs()

    for InterfaceName, InterfaceAddresses in IFAddresses.items():
        for Address in InterfaceAddresses:
            NetNames.append(InterfaceName)
            NetAddresses.append(Address.address)
            NetMasks.append(Address.netmask)
            NetBroadcasts.append(Address.broadcast)

    self.SystemHardware.update({'NetNames': NetNames})
    self.SystemHardware.update({'NetAddresses': NetAddresses})
    self.SystemHardware.update({'NetMasks': NetMasks})
    self.SystemHardware.update({'NetBroadcasts': NetBroadcasts})

    # GPU Info #
    GPUIds = []
    GPUNames = []
    GPUTotalMemory = []

    GPUs = GPUtil.getGPUs()

    for GPU in GPUs:
        GPUIds.append(GPU.id)
        GPUNames.append(GPU.name)
        GPUTotalMemory.append(GPU.memoryTotal)

    self.SystemHardware.update({'GPUIds': GPUIds})
    self.SystemHardware.update({'GPUNames': GPUNames})
    self.SystemHardware.update({'GPUTotalMemory': GPUTotalMemory})
def _extract_move_lines(child):
    """Return the stripped lines of the child's last output that contain '->'.

    Only lines with the '->' substring describe candidate moves. The output
    buffer is text under wexpect (Windows) but bytes under pexpect (Linux),
    so it is decoded first when necessary.
    """
    output = child.before
    if isinstance(output, bytes):
        output = output.decode("utf-8")
    return [line.strip() for line in output.split("\n") if "->" in line]


def get_csv_output(executable, playouts, weights, communicate_string):
    """Run Leela Zero over a game and return one analysed row per move.

    Args:
        executable: Name of the Leela Zero binary inside ``./leela-zero-0.17``.
        playouts: Value passed to Leela Zero's ``-p`` option.
        weights: Name of the weights file inside ``./leela-zero-0.17``.
        communicate_string: Newline-separated commands, three per move: the
            primary ``lz-analyze`` command, a second ``lz-analyze`` command
            containing a ``__`` placeholder for the allowed moves, and the
            ``play <color> <coordinate>`` command.

    Returns:
        A pandas DataFrame with one row per analysed move (at most 180),
        holding the AI's and the human's V/N/LCB values.
    """
    global b_player
    global w_player

    # Extra argument to add at the end
    final_args = "--noponder"

    # If the machine is Linux-based, add the folders to the paths; if it is
    # Windows, just change the current working directory
    if os.name == 'posix':
        executable = "./leela-zero-0.17/" + executable
        weights = "./leela-zero-0.17/" + weights
    else:
        os.chdir('./leela-zero-0.17')

    # Check if the user's computer has one or more GPUs - if not, use CPUs only
    if not GPUtil.getGPUs():
        final_args += " --cpu-only"

    # Key command - configure the actual Leela Zero run string and print it
    # out on-screen for ease of testing
    run_string = "{} -g -r 0 -d -p {} -w {} {}".format(executable, playouts,
                                                       weights, final_args)
    print(run_string)

    # On Windows, use wexpect, on Linux, use pexpect. Slightly different
    # commands for each to begin Leela Zero
    if os.name == 'nt':
        child = wexpect.spawn('cmd.exe')
        child.expect('>', timeout=120)
        child.sendline(run_string)
    else:
        child = pexpect.spawn('/bin/bash -c "{}"'.format(run_string))
    child.expect('Setting max tree', timeout=120)

    # Once Leela Zero is loaded, run these three commands first and foremost.
    # Basic Leela Zero commands always end with a '=' on success
    # (not including lz-analyze).
    starting_commands = ["boardsize 19", "clear_board", "komi 7.5"]
    for command in starting_commands:
        child.sendline(command)
        child.expect('=', timeout=120)

    # Convert our giant string of commands into a list of commands
    communicate_string_list = communicate_string.split("\n")

    # Output the full communicate_string to command_log.log for debug review
    with open("command_log.log", "w") as my_file:
        my_file.write("\n".join(communicate_string_list))

    # Counter for the current move number
    y = 0
    # all_moves will eventually become our final dataframe
    all_moves = []
    bar = pb.ProgressBar()
    colors = ['white', 'black']

    # Execute the commands three-by-three, since each move has three
    # associated commands (2x 'lz-analyze' plus 'play')
    for x in bar(range(0, len(communicate_string_list), 3)):
        y += 1
        # If the game is going longer than 180 moves, we can exit Leela Zero
        if y == 181:
            break

        # Extract the human's move from the 'play <color> <coordinate>' command
        human_move = communicate_string_list[x + 2].split(" ")[2]

        # Send the primary lz-analyze command; 'max depth' appears at the end
        # of Leela Zero's output
        child.sendline(communicate_string_list[x])
        child.expect(" max depth", timeout=120)
        before_text = _extract_move_lines(child)

        # The first line is the move with the highest LCB winrate, which is
        # what Leela thinks is the "best" option
        ai_first_choice_move = before_text[0]

        # Extract move coordinates and other values from the line of text
        ai_move_coords = ai_first_choice_move.split("->")[0].strip().lower()
        ai_v_value = ai_first_choice_move.split("(V: ")[1].split("%")[0]
        ai_n_value = ai_first_choice_move.split("(N: ")[1].split("%")[0]
        ai_lcb_value = ai_first_choice_move.split("(LCB: ")[1].split("%")[0]

        if colors[y % 2] == 'black':
            player = b_player
        else:
            player = w_player

        # One row of data in our output spreadsheet
        move_info = {
            'move_number': y,
            'ai_move': ai_move_coords,
            'ai_v_value': ai_v_value,
            'ai_n_value': ai_n_value,
            'ai_lcb_value': ai_lcb_value,
            'human_move': human_move,
            'color': colors[y % 2],
            'player': player
        }

        # As a default, assume the human's move was NOT one of those
        # identified by Leela Zero.
        is_match_found = False
        top_10_moves = extract_top_10_moves(before_text)

        # Check whether any move Leela Zero looked at was the human's move
        for top_10_move in top_10_moves:
            if top_10_move['move_coord'] == human_move:
                move_info['is_requery_needed'] = 0
                move_info['human_v_value'] = top_10_move['v_value']
                move_info['human_n_value'] = top_10_move['n_value']
                move_info['human_lcb_value'] = top_10_move['lcb_value']
                is_match_found = True
                break

        # However, if the human's move is NOT found among the top moves...
        if not is_match_found:
            human_command = communicate_string_list[x + 1]
            # Still zero - only set to 1 if this second attempt fails
            move_info['is_requery_needed'] = 0

            # Sort the top moves in ascending order by n_value, then build the
            # allowed-move list from the human's move plus the higher-N moves.
            # NOTE(review): the slice skips the TWO lowest-N moves although the
            # original comment spoke of keeping "the other 9" - confirm intent.
            sorted_top_10 = sorted(top_10_moves, key=lambda i: i['n_value'])
            allowed_moves = human_move
            lowest_n = sorted_top_10[0]['n_value']  # possibly used below
            for top_10_move in sorted_top_10[2:]:
                allowed_moves += "," + top_10_move['move_coord']

            # Replace the "__" placeholder with our new list so that Leela
            # gives the human's move proper attention
            human_command = human_command.replace("__", allowed_moves)
            child.sendline(human_command)
            child.expect(" max depth", timeout=120)
            before_text = _extract_move_lines(child)
            top_10_moves = extract_top_10_moves(before_text)
            is_match_found = False

            # In theory the move should always be present now; in practice it
            # is not always (about 95% of the time it is).
            for top_10_move in top_10_moves:
                if top_10_move['move_coord'] == human_move:
                    move_info['human_v_value'] = top_10_move['v_value']
                    move_info['human_n_value'] = top_10_move['n_value']
                    move_info['human_lcb_value'] = top_10_move['lcb_value']
                    is_match_found = True
                    break

            # Last resort: force Leela Zero to give the V and LCB values by
            # re-running the command with the human's move as the only
            # allowable move. The N value is lost that way (it becomes about
            # 99.96%), so the lowest N among the earlier top moves is used.
            if not is_match_found:
                human_command = human_command.replace(allowed_moves,
                                                      human_move)
                child.sendline(human_command)
                child.expect(" max depth", timeout=120)
                # The shared helper decodes bytes output; this third query
                # previously split the raw buffer and failed under pexpect.
                before_text = _extract_move_lines(child)
                top_10_moves = extract_top_10_moves(before_text)
                move_info['human_v_value'] = top_10_moves[0]['v_value']
                move_info['human_n_value'] = lowest_n
                move_info['human_lcb_value'] = top_10_moves[0]['lcb_value']
                move_info['is_requery_needed'] = 1

        # Add the "row" of data to the all_moves list
        all_moves.append(move_info)

        # Execute the 3rd command - play the human's move on the board
        child.sendline(communicate_string_list[x + 2])
        child.expect('=', timeout=120)

    child.sendline('exit')

    # Generate the dataframe, organize the columns, and return it
    df = pd.DataFrame(all_moves)
    column_order = [
        "move_number", "color", "human_move", "ai_move", "human_v_value",
        "ai_v_value", "human_n_value", "ai_n_value", "human_lcb_value",
        "ai_lcb_value", 'is_requery_needed', 'player'
    ]
    df = df[column_order]
    return df
async def get_gpu_state(request):
    """Respond with the highest temperature among the GPUs GPUtil reports.

    Args:
        request: Incoming request object (not inspected).

    Returns:
        A ``web.Response`` whose text is the maximum GPU temperature, or a
        503 response when no GPU is reported (previously ``max()`` raised
        ``ValueError`` on the empty list).
    """
    temps = [float(gpu.temperature) for gpu in GPUtil.getGPUs()]
    if not temps:
        return web.Response(status=503,
                            text="no GPU detected",
                            content_type="text/html")
    return web.Response(text=str(max(temps)), content_type="text/html")
while True:
    count = count + 1
    ser = serial.Serial(DEVICE, 115200)  # open serial port

    # CPU Stats
    cpu_percent = str(
        psutil.cpu_percent(interval=1))  # Get CPU percent (Takes 1 second)
    cpu_temp = "..."  # Haven't figured out how to get CPU temp yet

    # RAM Stats: take one snapshot so percent/used/total describe the same
    # instant instead of three separate readings
    ram = psutil.virtual_memory()
    ram_percent = str(ram.percent)
    ram_used = str(round(ram.used / (2**30), 1))
    ram_total = str(round(ram.total / (2**30), 1))

    # Get GPU Stats (Windows Only)
    # Query GPUtil once per iteration and reuse the result for both fields
    gpus = GPUtil.getGPUs()
    if gpus:
        gpu_percent = f"{round(gpus[0].load*100, 2)}"
        gpu_temp = f"{gpus[0].temperature}"
    else:
        gpu_percent = ""
        gpu_temp = ""

    # Get Disk Stats
    storage_total, storage_used, storage_free = shutil.disk_usage("/")
    disk_percent = str(
        round((storage_used / (2**30)) / (storage_total / (2**30)) * 100, 1))
    disk_total = str(round(storage_total / (2**30)))
    disk_used = str(round(storage_used / (2**30)))

    # Create JSON Data
    command = '{"cpu":{"percent": "' + cpu_percent + '", "temp": "' + cpu_temp + '"},"ram":{"percent": "' + ram_percent + '", "used":"' + ram_used + '", "total":"' + ram_total + '"},"gpu":{"percent":"' + gpu_percent + '", "temp":"' + gpu_temp + '"},"disk":{"percent":"' + disk_percent + '","used":"' + disk_used + '","total":"' + disk_total + '"}}\n\r'
def model(train_x, train_y, dev_x, dev_y, test_x, test_y, overal_maxlen, qwks):
    """Build, train and evaluate one RWA model for a single search trial.

    The double-brace expressions below are search-space placeholders that are
    substituted before this function runs (presumably by a hyperas-style
    wrapper - TODO confirm); they must stay exactly as written.

    Args:
        train_x, train_y: Training inputs and targets.
        dev_x, dev_y: Development inputs and targets (``dev_y`` is unused here).
        test_x, test_y: Test inputs and targets (``test_y`` is unused here).
        overal_maxlen: Unused in this function.
        qwks: Sequence whose first element is the reference kappa used for the
            early-stopping ratio.

    Returns:
        A dict with ``loss`` (1 - best dev kappa), ``status``, the model YAML
        and the pickled weights. When ``GPUtil.avail_mem()`` is below 0.1 a
        placeholder result with loss 1 is returned without training.

    Note:
        ``dev_df``, ``test_df`` and ``STATUS_OK`` are not defined in this
        function and are expected to exist in the surrounding scope.
    """
    from keras.models import Sequential
    from keras.layers import Dense, Dropout, Activation, GlobalAveragePooling1D
    from keras.layers.embeddings import Embedding
    from keras.layers.recurrent import LSTM
    from keras.initializers import Constant
    from keras import optimizers
    import keras.backend as K
    from deepats.my_layers import MeanOverTime
    from deepats.rwa import RWA
    import pickle as pk
    import numpy as np
    import string
    import random
    import os
    from deepats.optimizers import get_optimizer
    from deepats.ets_evaluator import Evaluator
    import deepats.ets_reader as dataset
    from deepats.ets_config import get_args
    import GPUtil

    def random_id(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    def kappa_metric(t, x):
        u = 0.5 * K.sum(K.square(x - t))
        v = K.dot(K.transpose(x), t - K.mean(t))
        return v / (v + u)

    def kappa_loss(t, x):
        u = K.sum(K.square(x - t))
        v = K.dot(K.squeeze(x, 1), K.squeeze(t - K.mean(t), 1))
        return u / (2 * v + u)

    # Seed the RNG from the wall clock so each trial gets a fresh model id
    import time
    ms = int(round(time.time() * 1000))
    rand_seed = ms % (2**32 - 1)
    random.seed(rand_seed)

    args = get_args()
    model_id = random_id()

    abs_vocab_file = os.path.join(args.abs_out, 'vocab.pkl')
    with open(abs_vocab_file, 'rb') as vocab_file:
        vocab = pk.load(vocab_file)
    vocab_size = len(vocab)

    acts = ['tanh', 'relu', 'hard_sigmoid']
    emb_dim = {{choice([50, 100, 200, 300])}}
    rnn_dim = {{uniform(50, 500)}}
    rnn_dim = int(rnn_dim)
    rec_act = {{choice([0, 1, 2])}}
    rec_act = acts[rec_act]
    dropout = {{uniform(0.2, 0.95)}}

    epochs = args.epochs
    n_emb = vocab_size * emb_dim
    n_rwa = (903 + 2 * rnn_dim) * rnn_dim
    n_tot = n_emb + n_rwa + rnn_dim + 1

    lr = {{lognormal(-3 * 2.3, .8)}}
    lr = 1.5 * lr
    rho = {{normal(.875, .04)}}
    clipnorm = {{uniform(1, 15)}}
    eps = {{loguniform(-8 * 2.3, -5 * 2.3)}}

    opt = optimizers.RMSprop(lr=lr, rho=rho, clipnorm=clipnorm, epsilon=eps)
    loss = kappa_loss
    metric = kappa_metric

    evl = Evaluator(dataset,
                    args.prompt_id,
                    args.abs_out,
                    dev_x,
                    test_x,
                    dev_df,
                    test_df,
                    model_id=model_id)

    train_y_mean = train_y.mean(axis=0)
    if train_y_mean.ndim == 0:
        # A 0-d mean only has axis 0 available; axis=1 is out of range for
        # np.expand_dims on current NumPy.
        train_y_mean = np.expand_dims(train_y_mean, axis=0)
    num_outputs = len(train_y_mean)

    mask_zero = False

    model = Sequential()
    model.add(Embedding(vocab_size, emb_dim, mask_zero=mask_zero))
    model.add(RWA(rnn_dim, recurrent_activation=rec_act))
    model.add(Dropout(dropout))
    # Start the output bias at the logit of the mean training target
    bias_value = (np.log(train_y_mean) - np.log(1 - train_y_mean)).astype(
        K.floatx())
    model.add(Dense(num_outputs, bias_initializer=Constant(value=bias_value)))
    model.add(Activation('tanh'))
    model.emb_index = 0

    # Overwrite the embedding layer's weights with pre-trained vectors
    from deepats.w2vEmbReader import W2VEmbReader as EmbReader
    emb_reader = EmbReader(args.emb_path, emb_dim)
    emb_reader.load_embeddings(vocab)
    emb_wts = emb_reader.get_emb_matrix_given_vocab(
        vocab, model.layers[model.emb_index].get_weights()[0])
    wts = model.layers[model.emb_index].get_weights()
    wts[0] = emb_wts
    model.layers[model.emb_index].set_weights(wts)

    model.compile(loss=loss, optimizer=opt, metrics=[metric])
    model_yaml = model.to_yaml()

    # NOTE(review): avail_mem is not part of the GPUtil API used elsewhere;
    # presumably a project-specific helper returning a free-memory fraction.
    if GPUtil.avail_mem() < 0.1:
        return {'loss': 1, 'status': STATUS_OK, 'model': '', 'weights': None}

    print('model_id: %s' % (model_id))
    print(model_yaml)
    print('PARAMS\t\ %s\t\ lr= %.4f\t\ rho= %.4f\t\ clip= %.4f\t\ eps= %.4f\t\ embDim= %.4f\t\ rnnDim= %.4f\t\ drop= %.4f\t\ recAct= %s' %
          (model_id, lr, rho, clipnorm, np.log(eps) / 2.3, emb_dim, rnn_dim,
           dropout, rec_act))

    for i in range(epochs):
        model.fit(train_x,
                  train_y,
                  batch_size=args.batch_size,
                  epochs=1,
                  verbose=0)
        evl.evaluate(model, i)
        evl.output_info()

        # Stop early once past epoch 10 if this trial lags the reference kappa
        p = evl.stats[3] / qwks[0]
        if i > 10 and p < 0.9:
            break

    i = evl.comp_idx
    j = i + 2
    best_dev_kappa = evl.best_dev[i]
    # NOTE(review): read from evl.best_dev and never used; the print below is
    # labelled "Test kappa" but shows the dev value - confirm intent.
    best_test_kappa = evl.best_dev[j]

    print('Test kappa:', best_dev_kappa)
    return {
        'loss': 1 - best_dev_kappa,
        'status': STATUS_OK,
        'model': model.to_yaml(),
        'weights': pk.dumps(model.get_weights())
    }
try: # get IO statistics since boot net_io = psutil.net_io_counters() print(f"Total Bytes Sent: {get_size(net_io.bytes_sent)}") print(f"Total Bytes Received: {get_size(net_io.bytes_recv)}") except Exception as e: print(e) # GPU information import GPUtil from tabulate import tabulate try: print("=" * 40, "GPU Details", "=" * 40) gpus = GPUtil.getGPUs() list_gpus = [] for gpu in gpus: # get the GPU id gpu_id = gpu.id # name of GPU gpu_name = gpu.name # get % percentage of GPU usage of that GPU gpu_load = f"{gpu.load*100}%" # get free memory in MB format gpu_free_memory = f"{gpu.memoryFree}MB" # get used memory gpu_used_memory = f"{gpu.memoryUsed}MB" # get total memory gpu_total_memory = f"{gpu.memoryTotal}MB" # get GPU temperature in Celsius
default.val_vis, default.val_shuffle, default.val_has_rpn, default.proposal, default.val_max_box, default.val_thresh) prop_file = 'proposals_%s_%s.mat' % (default.test_image_set, default.exp_name) savemat(prop_file, default.res_dict) default.testing = False if __name__ == '__main__': config_file = cfg_from_file('config.yml') merge_a_into_b(config_file, config) config.NUM_ANCHORS = len(config.ANCHOR_SCALES) * len(config.ANCHOR_RATIOS) default_file = cfg_from_file('default.yml') merge_a_into_b(default_file, default) default.e2e_prefix = 'model/' + default.exp_name if default.gpus == '': # auto select import GPUtil deviceIDs = GPUtil.getAvailable(order='lowest', limit=1, maxLoad=0.5, maxMemory=0.5) GPUs = GPUtil.getGPUs() default.gpus = str(len(GPUs)-1-deviceIDs[0]) logger.info('using gpu '+default.gpus) default.val_gpu = default.gpus default.prefetch_thread_num = min(default.prefetch_thread_num, config.TRAIN.SAMPLES_PER_BATCH) print config print default test_net(default.e2e_prefix, default.begin_epoch)
def __init__(
    self,
    models: List[nn.Module],
    model_connection: Dict[Tuple[int, int], int],
    devices: List[Union[t.device, str]] = None,
    model_size_multiplier=2,
    max_mem_ratio=0.5,
    cpu_weight=0,
    connection_weight=2,
    size_match_weight=1e-2,
    complexity_match_weight=1,
    entropy_weight=1,
    iterations=500,
    update_rate=0.01,
    gpu_gpu_distance=1,
    cpu_gpu_distance=10,
    move_models=True,
):
    """
    Assign models to different devices. In the scope of a single process.
    Assigner assumes all GPUs have the **same processing power**.

    Assignment is based on four aspects:

    1. Distance and model connections. Connection is usually indicated
       by the amount of data transmitted between two models.
    2. Compute complexity.
    3. Model size.
    4. Entropy.

    Four aspects are controlled by four weights:

    1. ``connection_weight``, assigner will try to reduce the total
       ``distance * connection`` if this weight is larger.
    2. ``size_match_weight``, this weight controls the total memory
       space used on a single device, only works if total assigned
       memory of models exceeds allowed device memory size
       (internally it uses a relu activation), the larger,
       the tighter and more restricted the fit.
    3. ``complexity_match_weight``, this weight balances the model
       computation cost across devices, assigner will try to even
       the ``computation cost / compute power`` ratio for each device
       if this weight is larger.
    4. ``entropy_weight``, this weight minimizes the uncertainty of
       model placement probability, so ``model i`` will have a close to 1
       probability of locating on some ``device j`` if this weight is
       larger.

    Assignment uses gradient descent to compute the probability matrix
    of each ``model i`` locating on each available ``device j``.

    See Also:
        :class:`.ModelSizeEstimator`

    Note:
        When the sum of your model size is very close to the capacity of
        your device memory, `ModelAssigner` does not respond very well
        to the ``size_match_weight``, therefore, please consider
        increasing ``model_size_multiplier`` or decreasing
        ``max_mem_ratio``.

    Args:
        models: Models to assign.
        model_connection: Connection weight between modules.
            **Must be positive**
        devices: Available devices. When ``None``, the CUDA devices
            returned by ``GPUtil.getAvailable(order="load")`` are used.
        model_size_multiplier: Size multiplier of models, used to reserve
            enough space for models,
        max_mem_ratio: Maximum percent of memory allowed.
        cpu_weight: Weight of cpu. Relative to the computing power of one
            GPU. By default it is 0 so no computation will be performed on
            CPU. **Must be positive**
        connection_weight: Weight of connection between models.
        size_match_weight: Weight of size match.
        complexity_match_weight: Weight of complexity match.
        entropy_weight: Weight of entropy.
        iterations: Number of optimization iterations.
        update_rate: Learning rate of the adam optimizer.
        gpu_gpu_distance: Estimated distance cost between gpu-gpu.
            **Must be positive**
        cpu_gpu_distance: Estimated distance cost between cpu-gpu.
            **Must be positive**
        move_models: Whether to automatically move the models after
            assignment.

    Raises:
        RuntimeError: If the estimated total model size exceeds the total
            allowed memory of the selected devices.
    """
    # Query GPU availability once so that the default device list and the
    # availability filter below are based on the same snapshot.
    available_devices = [
        t.device(type="cuda", index=i)
        for i in GPUtil.getAvailable(order="load")
    ]
    if devices is None:
        devices = list(available_devices)
    else:
        devices = [t.device(d) for d in devices]

    # Drop requested CUDA devices that are not currently available
    used_devices = []
    for dev in devices:
        if dev.type == "cuda" and dev not in available_devices:
            default_logger.info(
                f"Warning: device {dev} not available, removed.")
        else:
            used_devices.append(dev)
    devices = used_devices

    # Fall back to the CPU when nothing else is left
    if not devices:
        devices = [t.device("cpu")]

    default_logger.info(f"Using these devices: {devices}")

    sizes = [
        ModelSizeEstimator(model, model_size_multiplier).estimate_size()
        for model in models
    ]
    device_size_capacity = []
    device_complexity_capacity = []

    gpus = GPUtil.getGPUs()
    for dev in devices:
        if dev.type == "cpu":
            device_size_capacity.append(
                int(psutil.virtual_memory().available / 1024**2) *
                max_mem_ratio)
            device_complexity_capacity.append(cpu_weight)
        elif dev.type == "cuda":
            device_size_capacity.append(gpus[dev.index].memoryFree *
                                        max_mem_ratio)
            # A busier GPU contributes less remaining compute capacity
            device_complexity_capacity.append(1 - gpus[dev.index].load)

    if np.sum(np.array(sizes)) > np.sum(device_size_capacity):
        raise RuntimeError(
            f"Estimated model will use {np.sum(np.array(sizes)):.2f} MB, "
            f"but only have {np.sum(device_size_capacity):.2f} MB allowed memory "
            "in total.")

    # assign model to devices
    # using heuristic and gradient descent
    device_num = len(devices)
    model_num = len(models)

    # Important, the placement probability matrix! this matrix
    # describes the probability of placement of:
    # model i on device j
    placement = t.randn([model_num, device_num], requires_grad=True)

    optimizer = t.optim.Adam([placement], lr=update_rate)
    model_size = t.tensor(sizes, dtype=t.float).view([1, model_num])
    size_capacity = t.tensor(device_size_capacity,
                             dtype=t.float).view([1, device_num])
    # Model size doubles as the complexity estimate
    model_complexity = model_size

    # complexity_capacity is basically the estimated computing power
    # of devices.
    complexity_capacity = t.tensor(device_complexity_capacity,
                                   dtype=t.float).view([1, device_num])

    # model connection indicates the amount of data transmitted between
    # each pair of models, a weighted adjacency matrix.
    model_conn = t.zeros([model_num, model_num])

    for direction, conn in model_connection.items():
        model_conn[direction[0], direction[1]] = conn

    # device distance matrix
    device_distance = t.zeros([device_num, device_num])

    for i in range(device_num):
        for j in range(i):
            if (devices[i].type == "cpu" and devices[j].type == "cuda"
                    or devices[i].type == "cuda"
                    and devices[j].type == "cpu"):
                device_distance[i, j] = device_distance[j, i] = cpu_gpu_distance
            elif (devices[i].type == "cuda" and devices[j].type == "cuda"
                  and devices[i].index != devices[j].index):
                device_distance[i, j] = device_distance[j, i] = gpu_gpu_distance

    # optimize
    for _ in range(iterations):
        self.optimize_placement(
            optimizer,
            placement,
            model_size,
            size_capacity,
            model_complexity,
            complexity_capacity,
            model_conn,
            device_distance,
            connection_weight,
            size_match_weight,
            complexity_match_weight,
            entropy_weight,
        )

    # Each model goes to the device with the highest placement score
    self._assignment = [
        devices[d] for d in t.argmax(placement, dim=1).tolist()
    ]
    if move_models:
        for model, ass_device in zip(models, self._assignment):
            model.to(ass_device)
# Command-line entry: parse options, pick a GPU, and build the runner.
parser = argparse.ArgumentParser()
parser.add_argument('--points_batch', type=int, default=16384,
                    help='point batch size')
parser.add_argument('--nepoch', type=int, default=100000,
                    help='number of epochs to train for')
parser.add_argument('--conf', type=str, default='setup.conf')
parser.add_argument('--expname', type=str, default='single_shape')
# The help text previously advertised "auto" as the default although the
# default is a fixed index; let argparse print the real default instead.
parser.add_argument('--gpu', type=str, default='2',
                    help='GPU index to use, or "auto" to pick a free one '
                         '[default: %(default)s]')
parser.add_argument('--is_continue', default=False, action="store_true",
                    help='continue')
parser.add_argument('--timestamp', default='latest', type=str)
parser.add_argument('--checkpoint', default='latest', type=str)
parser.add_argument('--eval', default=False, action="store_true")

args = parser.parse_args()

if args.gpu == "auto":
    deviceIDs = GPUtil.getAvailable(order='memory',
                                    limit=1,
                                    maxLoad=0.5,
                                    maxMemory=0.5,
                                    includeNan=False,
                                    excludeID=[],
                                    excludeUUID=[])
    if not deviceIDs:
        # Fail with a readable message instead of an IndexError
        parser.error('--gpu auto: no GPU satisfies the load/memory limits')
    gpu = deviceIDs[0]
else:
    gpu = args.gpu

trainrunner = ReconstructionRunner(
    conf=args.conf,
    points_batch=args.points_batch,
    nepochs=args.nepoch,
    expname=args.expname,
    gpu_index=gpu,
    is_continue=args.is_continue,
    timestamp=args.timestamp,
    checkpoint=args.checkpoint,
    eval=args.eval
)
!pip install GPUtil !echo $LD_LIBRARY_PATH # !ls /usr/lib64-nvidia !ls /usr/local/cuda/extras/CUPTI/lib64 !export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64 from google.colab import drive drive.mount('/content/drive') import psutil import humanize import os import GPUtil as GPU import GPUtil as GPU GPUs = GPU.getGPUs() # XXX: only one GPU on Colab and isn’t guaranteed gpu = GPUs[0] def printm(): process = psutil.Process(os.getpid()) print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss)) print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal)) printm() !nvidia-smi import os import pickle import pandas as pd import tensorflow as tf from tensorflow import keras
import os
import subprocess
import sys

import GPUtil

# Up to three GPUs that are currently below 50% load and 50% memory use
deviceIDs = GPUtil.getAvailable(order='first', limit=3, maxLoad=0.5, maxMemory=0.5)
print(','.join(str(e) for e in deviceIDs))

# expanduser resolves the home directory even when $HOME is unset
task_queue_file = os.path.join(os.path.expanduser("~"), "task_queue.txt")
if not os.path.isfile(task_queue_file):
    sys.exit(0)

with open(task_queue_file) as queue:
    task_list = queue.readlines()
print(task_list)

# Pair each queued command with a free GPU. The actual device id is passed
# along; the loop index would ignore which GPUs are really free.
for device_id, line in zip(deviceIDs, task_list):
    task = line.strip() + " --gpu-id=%d" % device_id
    print(task)
    # NOTE: the command line is run through the shell, so the queue file
    # must only ever contain trusted content.
    subprocess.Popen(task, shell=True, cwd="/home/xyang22/project/research/active-learning-dnn")
try:
    # cupy and its scipy.linalg counterpart must both import cleanly
    import cupy as cp
    import cupyx.scipy.linalg as cpxl

    # Reading a device attribute raises if no device can be accessed
    cp.cuda.Device(0).compute_capability

    # Reaching this point means cupy is usable
    have_cupy = True

    # Pull in the cupy-backed utility functions
    from ._cp_util import *

    try:
        # GPUtil counts as functional only if it imports and reports a GPU
        import GPUtil
        gpus = GPUtil.getGPUs()
        have_gputil = bool(gpus)
    except (ImportError, ValueError):
        have_gputil = False

    if have_gputil:
        from ._gputil import *
    else:
        from ._nogputil import *

except Exception:
    # cupy import or device access failed: expose numpy under the same alias
    import numpy as cp
import os.path import sys sys.path.append( os.path.dirname(os.path.abspath(__file__)) + (os.path.sep + '..') * 2) import numpy as np import time, timeit # is there a working GPU around ? import GPUtil try: gpu_available = len(GPUtil.getGPUs()) > 0 except: gpu_available = False N = 1500 M = 300 D = 3 E = 3 # declare numpy from pykeops.numpy.utils import differences, squared_distances, grad_np_kernel, chain_rules a = np.random.rand(N, E).astype('float32') x = np.random.rand(N, D).astype('float32') y = np.random.rand(M, D).astype('float32') b = np.random.rand(M, E).astype('float32') sigma = np.array([0.4]).astype('float32') # declare the torch counterpart try: import torch
def main(conf_name, gpu):
    """Run ZSSR on every non-ground-truth input file of a configuration.

    Args:
        conf_name: Name of a configuration attribute of the ``configs``
            module (e.g. ``"X2_REAL_CONF"``), or ``None`` for the default
            ``configs.Config()``.
        gpu: ``'all'`` to launch one background process per file on whichever
            GPU GPUtil reports as available, otherwise the GPU passed to a
            sequential in-process run.

    Raises:
        ValueError: If ``conf_name`` does not name an attribute of ``configs``.
    """
    # Initialize configs and prepare result dir with date
    if conf_name is None:
        conf = configs.Config()
    else:
        # Look the configuration up by name. Only two names used to be
        # recognised and any other left `conf` unbound.
        conf = getattr(configs, conf_name, None)
        if conf is None:
            raise ValueError('unknown configuration name: %r' % (conf_name,))
    res_dir = prepare_result_dir(conf)
    local_dir = os.path.dirname(__file__)

    # We take all png files that are not ground truth
    files = [
        file_path for file_path in glob.glob(
            '%s/*.%s' % (conf.input_path, conf.input_file_ext))
        if not file_path[-7:-4] == '_gt'
    ]
    print("files", res_dir, local_dir)
    print(files)

    # Loop over all the files
    for file_ind, input_file in enumerate(files):

        # Ground-truth file needs to be like the input file with _gt (if exists)
        ground_truth_file = input_file[:-4] + '_gt.png'
        if not os.path.isfile(ground_truth_file):
            ground_truth_file = '0'

        # Numeric kernel files need to be like the input file with serial number
        kernel_files = [
            '%s_%d.mat;' % (input_file[:-4], ind)
            for ind in range(len(conf.scale_factors))
        ]
        kernel_files_str = ''.join(kernel_files)
        for kernel_file in kernel_files:
            # [:-1] strips the trailing ';' separator before the file check
            if not os.path.isfile(kernel_file[:-1]):
                kernel_files_str = '0'
                print('no kernel loaded')
                break

        print(kernel_files)

        # This option uses all the gpu resources efficiently
        if gpu == 'all':

            # Stay stuck in this loop until GPUtil reports an available gpu
            gpus = []
            while not gpus:
                gpus = GPUtil.getAvailable(order='memory')

            # Take the last id of the returned list.
            # NOTE(review): the original comment said "most free memory";
            # confirm the ordering GPUtil uses for order='memory'.
            cur_gpu = gpus[-1]

            # Run ZSSR from command line, open xterm for each run
            os.system(
                "xterm -hold -e " + conf.python_path +
                " %s/run_ZSSR_single_input.py '%s' '%s' '%s' '%s' '%s' '%s' alias python &"
                % (local_dir, input_file, ground_truth_file, kernel_files_str,
                   cur_gpu, conf_name, res_dir))

            # Verbose
            print('Ran file #%d: %s on GPU %d\n' %
                  (file_ind, input_file, cur_gpu))

            # Wait 5 seconds for the previous process to start using GPU. if
            # we wouldn't wait then GPU memory will not yet be taken and all
            # process will start on the same GPU at once and later collapse.
            sleep(5)

        # The other option is just to run sequentially on a chosen GPU.
        else:
            run_ZSSR_single_input.main(input_file, ground_truth_file,
                                       kernel_files_str, gpu, conf, res_dir)
def run(self):
    """Bind the router sockets, start the workers and dispatch jobs forever.

    Clients connect to the TCP frontend and workers to the IPC backend.
    One ``BertWorker`` is started per device id. Requests larger than
    ``self.max_batch_size`` are split into parts, and a client's result is
    sent back once all of its parts have been returned by workers.
    """

    def get_a_worker():
        return self.workers.pop(0)

    def free_a_worker(w):
        self.workers.append(w)

    def register_job(c, num_part=1):
        # Remember how many partial results this client is waiting for
        job_checksum[c] = num_part
        finish_jobs[c] = []

    def unregister_job(c):
        job_checksum.pop(c)
        finish_jobs.pop(c)

    self.context = zmq.Context.instance()
    self.frontend = self.context.socket(zmq.ROUTER)
    self.frontend.bind('tcp://*:%d' % self.port)
    self.backend = self.context.socket(zmq.ROUTER)
    self.backend.bind('ipc:///tmp/bert.service')

    # Default device ids; replaced by GPUtil's answer when it can be queried
    available_gpus = range(self.num_worker)
    try:
        import GPUtil
        available_gpus = GPUtil.getAvailable(limit=self.num_worker)
        if len(available_gpus) < self.num_worker:
            logger.warning('only %d GPU(s) is available, but ask for %d' %
                           (len(available_gpus), self.num_worker))
    except FileNotFoundError:
        # logger.warning replaces the deprecated logger.warn alias
        logger.warning(
            'nvidia-smi is missing, often means no gpu found on this machine. '
            'will run service on cpu instead')

    for i in available_gpus:
        process = BertWorker(i, self.args)
        self.processes.append(process)
        process.start()

    poller = zmq.Poller()
    # Only poll for requests from backend until workers are available
    poller.register(self.backend, zmq.POLLIN)

    job_queue, finish_jobs, job_checksum = [], {}, {}

    while True:
        sockets = dict(poller.poll(2))

        if self.backend in sockets:
            msg = self.backend.recv_multipart()
            worker, _, client = msg[:3]
            free_a_worker(worker)
            if client != b'READY' and len(msg) > 3:
                # Frame 4 holds the array metadata and frame 7 the raw buffer
                arr_info, arr_val = jsonapi.loads(msg[4]), msg[7]
                X = np.frombuffer(memoryview(arr_val),
                                  dtype=arr_info['dtype'])
                finish_jobs[client].append(X.reshape(arr_info['shape']))
            else:
                poller.register(self.frontend, zmq.POLLIN)

        # check if there are finished jobs, send them back to their clients
        finished = [(k, v) for k, v in finish_jobs.items()
                    if len(v) == job_checksum[k]]
        for client, tmp in finished:
            send_ndarray(self.frontend, client, np.concatenate(tmp, axis=0))
            unregister_job(client)

        if self.frontend in sockets:
            client, _, msg = self.frontend.recv_multipart()
            if msg == b'SHOW_CONFIG':
                self.frontend.send_multipart([
                    client, b'',
                    jsonapi.dumps({
                        **{
                            'client': client.decode('ascii')
                        },
                        **self.args_dict
                    })
                ])
                continue

            # NOTE: the payload is unpickled as received, so the frontend
            # must only be reachable by trusted clients.
            seqs = pickle.loads(msg)
            num_seqs = len(seqs)

            if num_seqs > self.max_batch_size:
                # divide the large batch into small batches
                s_idx = 0
                n = 0
                while s_idx < num_seqs:
                    tmp = seqs[s_idx:(s_idx + self.max_batch_size)]
                    if tmp:
                        job_queue.append(
                            (client, pickle.dumps(tmp, protocol=-1)))
                        n += 1
                    s_idx += len(tmp)
                register_job(client, num_part=n)
            else:
                register_job(client)
                job_queue.append((client, msg))

        # non-empty job queue and free workers, pop the last one and send it to a worker
        while self.workers and job_queue:
            client, tmp = job_queue.pop()
            worker = get_a_worker()
            self.backend.send_multipart([worker, b'', client, b'', tmp])
            logger.info(
                'available workers: %2d\tjob queue: %3d\tpending clients: %3d'
                % (len(self.workers), len(job_queue), len(job_checksum)))
import GPUtil
import subprocess
from sklearn.cluster import DBSCAN
import MDAnalysis as mda
from MDAnalysis.analysis.rms import RMSD
from utils import start_rabbit, start_worker, start_flower_monitor, read_h5py_file, cm_to_cvae, job_on_gpu
from utils import find_frame, write_pdb_frame, make_dir_p, job_list, outliers_from_latent, predict_from_cvae
from utils import omm_job, cvae_job
from CVAE import CVAE

# n_gpus = 16
# number of cvae jobs, starting from hyper_dim 3
n_cvae = 4

# Ids of every GPU returned by GPUtil.getGPUs(); no load or memory filter is
# applied here even though the message below says "Available".
GPU_ids = [gpu.id for gpu in GPUtil.getGPUs()]
print('Available GPUs', GPU_ids)

# RabbitMQ data/log locations and TensorFlow C++ log level, set through the
# environment.
# NOTE(review): '~' is stored literally here; confirm the consumer expands it.
os.environ["RABBITMQ_MNESIA_BASE"] = "~/.rabbit_base"
os.environ["RABBITMQ_LOG_BASE"] = "~/.rabbit_base/"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# top_file = os.path.abspath('../P27-all/C1B48/C1B48.top.gz')
# pdb_file = os.path.abspath('../P27-all/C1B48/C1B48.pdb.gz')
# Input structure files, resolved to absolute paths; no topology file is set
top_file = None
pdb_file = os.path.abspath('./pdb/100-fs-peptide-400K.pdb')
ref_pdb_file = os.path.abspath('./pdb/fs-peptide.pdb')

# Absolute path of the current working directory
work_dir = os.path.abspath('./')

# create folders to store results
return new_hidden_state def forward(self, x): x = torch.transpose(x, 0, 1) output, (final_hidden_state, final_cell_state) = self.encoder(x, None) attn_output = self.attention_net(output, output[-1]) attn_output = self.dropout(attn_output) fc_output = self.fc1(attn_output) output = self.softmax(fc_output) # output layer using softmax function return output print("model done") if cuda_gpu: device_ids = GPUtil.getAvailable(limit=4) print(device_ids) if torch.cuda.device_count() == 1: lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout).cuda() else: torch.backends.cudnn.benchmark = True lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout).cuda(device_ids[0]) lstmattn = nn.DataParallel(lstmattn, device_ids=device_ids) else: lstmattn = AttentionLSTM(embedding_dim, hidden_dim, num_layers, output_size, dropout) print(lstmattn) tokenizer = ''