def initialize(self, args):
    """Set up per-instance state; invoked a single time while the model loads.

    Implementing this hook is optional. It gives the model a place to
    initialize whatever state it needs.

    Parameters
    ----------
    args : dict
      Both keys and values are strings. The dictionary keys and values are:
      * model_config: A JSON string containing the model configuration
      * model_instance_kind: A string containing model instance kind
      * model_instance_device_id: A string containing model instance device ID
      * model_repository: Model repository path
      * model_version: Model version
      * model_name: Model name
    """
    # model_config arrives as a raw JSON string and has to be decoded here.
    self.model_config = json.loads(args['model_config'])
    device_id = json.loads(args['model_instance_device_id'])
    self.model_instance_device_id = device_id

    # The device is selected before cudf is imported; this order is kept
    # from the original (presumably so cudf initializes on that device --
    # confirm before reordering).
    import numba.cuda as cuda
    cuda.select_device(device_id)

    import cudf
    from cudf.core.subword_tokenizer import SubwordTokenizer

    # The vocabulary hash file is looked up next to this source file.
    vocab_hash_path = Path(__file__).with_name('vocab_hash.txt')
    self.cudf_tokenizer = SubwordTokenizer(vocab_hash_path,
                                           do_lower_case=True)
    self.cudf_lib = cudf
    self.seq_len = 256
def box2d_rotate_iou(boxes2d, gt_boxes2d, device_id=0):
    """Pairwise IoU of two sets of rotated 2D boxes, computed on the GPU.

    Inputs:
        boxes2d: (N1, 5) x,y,w,l,r
        gt_boxes2d: (N2, 5) x,y,w,l,r
        device_id: CUDA device selected before launching the kernel.
    Outputs:
        iou: (N1, N2) float32 array. If either input has no rows the
        all-zero matrix is returned without touching the GPU.
    """
    boxes2d = boxes2d.astype(np.float32)
    gt_boxes2d = gt_boxes2d.astype(np.float32)
    num_boxes, num_gt = boxes2d.shape[0], gt_boxes2d.shape[0]
    iou = np.zeros((num_boxes, num_gt), dtype=np.float32)
    if not (num_boxes and num_gt):
        return iou

    block_size = 8 * 8
    cuda.select_device(device_id)
    grid_size = (DIVUP(num_boxes, block_size), DIVUP(num_gt, block_size))

    # Flat view sharing memory with `iou`: the copy back fills `iou` itself.
    iou_flat = iou.reshape([-1])
    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes2d.reshape([-1]), stream)
        gt_boxes_dev = cuda.to_device(gt_boxes2d.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou_flat, stream)
        rotate_iou_kernel[grid_size, block_size, stream](
            num_boxes, num_gt, boxes_dev, gt_boxes_dev, iou_dev)
        iou_dev.copy_to_host(iou_flat, stream=stream)
    return iou.astype(boxes2d.dtype)
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """
    rotated box iou running in gpu. 8x faster than cpu version
    (take 5ms in one example with numba.cuda code).
    convert from [this project](https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).

    :param boxes: rbboxes, format: centers, dims, angles(clockwise when positive), FloatTensor[N, 5]
    :param query_boxes: FloatTensor[K, 5]
    :param criterion: optional, default: -1; forwarded unchanged to the kernel
    :param device_id: int, optional, default: 0
    :return: float32 array of shape (N, K); all zeros (and no GPU work) when
        either input has no rows
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    num_boxes, num_queries = boxes.shape[0], query_boxes.shape[0]
    iou = np.zeros((num_boxes, num_queries), dtype=np.float32)
    if not (num_boxes and num_queries):
        return iou

    block_size = 8 * 8
    cuda.select_device(device_id)
    grid_size = (div_up(num_boxes, block_size),
                 div_up(num_queries, block_size))

    # Flat view sharing memory with `iou`: the copy back fills `iou` itself.
    iou_flat = iou.reshape([-1])
    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou_flat, stream)
        rotate_iou_kernel_eval[grid_size, block_size, stream](
            num_boxes, num_queries, boxes_dev, query_boxes_dev, iou_dev,
            criterion)
        iou_dev.copy_to_host(iou_flat, stream=stream)
    return iou.astype(boxes.dtype)
def gpuWork(stuff):
    """Process one batch of `data` on one GPU using a pool of worker threads.

    Parameters
    ----------
    stuff : tuple
        ``(batch, batch_size, gpu_id, data_len, data, parser)``. The batch
        covers ``data[batch * batch_size : min((batch + 1) * batch_size,
        data_len)]``.

    Returns
    -------
    (images, energies) : tuple of float32 numpy arrays built from the
        ``(image, energy)`` pairs returned by ``randRotateAndTranslateWrap``.
    """
    batch, batch_size, gpu_id, data_len, data, parser = stuff

    # Share the CPU cores between the GPUs. Floor division keeps the worker
    # count an int (true division yields a float on Python 3, which
    # ThreadPool rejects); the max() guards avoid a zero divisor and a
    # zero-sized pool.
    device_count = max(1, len(cuda.list_devices()))
    workers = max(1, multiprocessing.cpu_count() // device_count)

    pool = ThreadPool(workers)
    try:
        cuda.select_device(gpu_id)
        start = batch * batch_size
        end = min((batch + 1) * batch_size, data_len)
        chunk = data[start:end]
        count = len(chunk)
        results = pool.map(
            randRotateAndTranslateWrap,
            zip([parser] * count, chunk, [gpu_id] * count, [True] * count))
    finally:
        # Always release the worker threads, even if a task raised.
        pool.close()
        pool.join()

    images = np.array([image for image, _ in results], dtype=np.float32)
    energies = np.array([energy for _, energy in results], dtype=np.float32)
    return images, energies
def gpu_dmt_timeseries(dedisp_times, psr_data, max_delay, device=0):
    """Accumulate delay-shifted samples into a 2-D float32 array on the GPU.

    For every row ``ii`` of ``psr_data.T``, output column ``jj`` and delay
    column ``kk`` the kernel adds
    ``psr_data.T[ii, jj + dedisp_times[ii, kk]]`` to ``out[kk, jj]``.

    :param dedisp_times: 2-D array of per-row sample offsets, indexed as
        ``[ii, kk]``; used as an index, so presumably an integer array
        (confirm against the caller)
    :param psr_data: 2-D array; it is transposed before upload
    :param max_delay: number of trailing samples excluded from the output
        (output width is ``psr_data.shape[0] - max_delay``)
    :param device: GPU id
    :return: float32 array of shape
        ``(dedisp_times.shape[1], psr_data.shape[0] - max_delay)``
    """
    cuda.select_device(device)
    out_shape = (dedisp_times.shape[1], int(psr_data.shape[0] - max_delay))

    @cuda.jit(fastmath=True)
    def gpu_dmt(cand_data_in, all_delays, cand_data_out):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_out.shape[1] and kk < all_delays.shape[1]:
            cuda.atomic.add(cand_data_out, (kk, jj), cand_data_in[ii, (jj + all_delays[ii, kk])])

    all_delays = cuda.to_device(dedisp_times)
    # The kernel only ever *adds* into the output, so it must start at zero.
    # cuda.device_array() returns uninitialised memory (like numpy.empty),
    # hence the explicit zero-filled upload.
    dmt_return = cuda.to_device(np.zeros(out_shape, dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(psr_data.T, dtype=psr_data.dtype))

    threadsperblock = (4, 8, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    # The y extent follows the input length; threads beyond the output
    # width are rejected by the bounds check inside the kernel.
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dedisp_times.shape[1] / threadsperblock[2])
    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

    gpu_dmt[blockspergrid, threadsperblock](cand_data_in, all_delays, dmt_return)
    dm_time = dmt_return.copy_to_host()
    cuda.close()
    return dm_time
def gpu_dmt(cand, device=0):
    """Compute ``cand.dmt`` on the GPU over 256 trial DMs in [0, 2 * cand.dm].

    Each channel is rolled (with wrap-around) by its per-DM delay and the
    channels are summed into a (256, cand.data.shape[0]) float32 array.

    :param cand: Candidate object; reads ``chan_freqs``, ``dm``, ``data``
        and ``tsamp`` and receives the result in ``dmt``
    :param device: GPU id
    :return: the same ``cand`` object
    """
    num_dms = 256
    cuda.select_device(device)
    freqs_dev = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dms_dev = cuda.to_device(
        np.linspace(0, 2 * cand.dm, num_dms, dtype=np.float32))
    dmt_dev = cuda.to_device(
        np.zeros((num_dms, cand.data.shape[0]), dtype=np.float32))
    data_dev = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def dmt_kernel(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1] and kk < dms.shape[0]:
            disp_time = int(
                -1 * 4148808.0 * dms[kk] * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cuda.atomic.add(cand_data_out, (kk, jj),
                            cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]])

    block_dim = (16, 8, 8)
    grid_dim = (
        math.ceil(data_dev.shape[0] / block_dim[0]),
        math.ceil(data_dev.shape[1] / block_dim[1]),
        math.ceil(dms_dev.shape[0] / block_dim[2]),
    )

    dmt_kernel[grid_dim, block_dim](data_dev, freqs_dev, dms_dev, dmt_dev,
                                    float(cand.tsamp))
    cand.dmt = dmt_dev.copy_to_host()
    cuda.close()
    return cand
def relase_GPU_memory():
    """Clear the Keras session, close the CUDA context on device 0 and
    install a fresh TensorFlow session with a default ``ConfigProto``."""
    K.clear_session()

    cuda.select_device(0)
    cuda.close()

    # Called for its side effect only; the returned session was never used.
    K.get_session()
    K.tensorflow_backend.set_session(tf.Session(config=tf.ConfigProto()))
def cuda_select_device(dev_i):
    """Close the current CUDA context (best effort), then select `dev_i`.

    A failure while closing is printed and otherwise ignored; a failure
    while selecting the device propagates to the caller.
    """
    try:
        cuda.close()
    except Exception as close_error:
        print(close_error)

    cuda.select_device(dev_i)
def inference(file_path, file_urls):
    """Run PSE inference, crop, run OCR, and collect per-file results.

    Parameters
    ----------
    file_path : str
        Path prefix passed to the inference objects and ``CropPSE``; each
        file name is appended to it to locate the image.
    file_urls : iterable of str
        File names to report on.

    Returns
    -------
    dict
        ``{file_name: {'df': <html table str>, 'img': <connect_and_save
        result>}}``.
    """
    # Read the config inside a context manager so the file handle is closed.
    with open("./assets/config.yaml", 'r') as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    pse_params = config["pse_evaluation_parameter"]

    pse = InferencePSE(pse_params, file_path)
    ocr = InferenceOCR(config["ocr_evaluation_parameter"], file_path)

    # Inference PSE
    pse.run()

    # Release the gpu memory held on the PSE device before the OCR stage.
    cuda.select_device(int(pse_params["gpu_list"]))
    cuda.close()

    # Crop image based on PSE output
    print(file_path)
    CropPSE(file_path)

    # Inference OCR
    ocr.run()

    # Combining Result
    result = dict()
    for file_name in file_urls:
        # NOTE(review): str.replace swaps every "jpg" occurrence in the
        # name, not only the extension -- kept as in the original.
        txt_file = "./assets/demo/text/" + file_name.replace("jpg", "txt")
        img_file = file_path + file_name
        df, _, _, _ = create_df(txt_file)
        dict_cells, list_infos = create_cells(df)
        result[file_name] = {
            'df': create_DB(dict_cells, list_infos).drop(
                'idx', axis=1).to_html(header="true"),
            # Visualizer
            'img': connect_and_save(img_file, dict_cells, list_infos),
        }
    return result
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """Pairwise rotated-box IoU computed on the GPU.

    :param boxes: this is the gt box array
    :param query_boxes: this is the det box array
    :param criterion: forwarded unchanged to the kernel (the original note
        for this parameter was left unfinished)
    :param device_id: CUDA device selected before launching the kernel
    :return: float32 array of shape (len(boxes), len(query_boxes)); all
        zeros (and no GPU work) when either input has no rows
    """
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    num_gt, num_det = boxes.shape[0], query_boxes.shape[0]
    iou = np.zeros((num_gt, num_det), dtype=np.float32)
    if not (num_gt and num_det):
        return iou

    block_size = 8 * 8
    cuda.select_device(device_id)
    grid_size = (div_up(num_gt, block_size), div_up(num_det, block_size))

    # Flat view sharing memory with `iou`: the copy back fills `iou` itself.
    iou_flat = iou.reshape([-1])
    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou_flat, stream)
        rotate_iou_kernel_eval[grid_size, block_size, stream](
            num_gt, num_det, boxes_dev, query_boxes_dev, iou_dev, criterion)
        iou_dev.copy_to_host(iou_flat, stream=stream)
    # `boxes` was cast above, so this yields a float32 copy.
    return iou.astype(boxes.dtype)
def stupidconv_gpu(img, filt, padval):
    """
    Apply `filt` to `img` on the GPU by direct (non-FFT) computation.

    The image is padded on every side by the filter's size, processed by
    ``stupidconv_gpu_helper``, cropped back to the input size and divided
    by the number of non-zero filter entries.

    :param img: 2D image array
    :param filt: 2D filter array
    :param padval: value used to fill the padding border
    :return: array with the same height/width as `img`
    """
    # Runs on CUDA device 1, after closing whatever context was current.
    cuda.close()
    cuda.select_device(1)

    # number of nonzero filter entries, used to average the result
    nonzero_taps = np.count_nonzero(filt)

    img_h, img_w = img.shape[0], img.shape[1]
    filt_h, filt_w = filt.shape[0], filt.shape[1]

    # pad with one full filter size all around the image
    padded = np.ones((img_h + (2 * filt_h), img_w + (2 * filt_w)),
                     dtype=np.float32) * padval
    padded[filt_h: img_h + filt_h, filt_w: img_w + filt_w] = img

    d_padded = cuda.to_device(padded)
    d_filt = cuda.to_device(filt)
    d_result = cuda.to_device(np.zeros(padded.shape, dtype=np.float32))

    stupidconv_gpu_helper(d_padded, d_filt, img_h, img_w, filt_h, filt_w,
                          d_result)

    # strip the padding border again
    result = d_result.copy_to_host()
    cropped = result[filt_h:filt_h + img_h, filt_w:filt_w + img_w]
    return cropped / nonzero_taps
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """nms in gpu.

    Args:
        dets (np.ndarray): 2-D array with one detection per row; column 4
            is used as the score. The remaining columns are passed to the
            kernel unchanged (presumably the box coordinates -- confirm
            against ``nms_kernel``).
        nms_overlap_thresh (float): overlap threshold forwarded to the
            kernel.
        device_id (int, optional): Defaults to 0. CUDA device to run on.

    Returns:
        list: indices into `dets` of the kept detections, as produced by
        ``nms_postprocess`` over the score-descending order. Empty when
        `dets` has no rows.
    """
    boxes_num = dets.shape[0]
    if boxes_num == 0:
        # Nothing to suppress; also avoids launching a zero-sized grid.
        return []

    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    # Highest score first.
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    # One uint64 per (box, column block) pair, filled in by the kernel.
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (col_blocks, col_blocks)

    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)

    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    return list(order[keep])
def gpu_dedisperse(cand, device=0):
    """Dedisperse ``cand.data`` on the GPU at ``cand.dm``.

    Every channel is rolled (with wrap-around) by its dispersion delay;
    the result is stored, transposed back, in ``cand.dedispersed``.

    :param cand: Candidate object; reads ``chan_freqs``, ``data``, ``dm``
        and ``tsamp``
    :param device: GPU id
    :return: the same ``cand`` object
    """
    cuda.select_device(device)
    freqs_dev = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    data_in_dev = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))
    data_out_dev = cuda.to_device(np.zeros_like(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def dedisp_kernel(cand_data_in, chan_freqs, dm, cand_data_out, tsamp):
        ii, jj = cuda.grid(2)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1]:
            disp_time = int(-4148808.0 * dm * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cand_data_out[ii, jj] = cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]]

    block_dim = (32, 32)
    grid_dim = (
        math.ceil(data_in_dev.shape[0] / block_dim[0]),
        math.ceil(data_in_dev.shape[1] / block_dim[1]),
    )

    dedisp_kernel[grid_dim, block_dim](data_in_dev, freqs_dev,
                                       float(cand.dm), data_out_dev,
                                       float(cand.tsamp))
    cand.dedispersed = data_out_dev.copy_to_host().T
    cuda.close()
    return cand
def CREATE_MODEL(self):
    """Reset GPU/Keras state, load hyper-parameters, then build and train.

    Hyper-parameters are read from ``self.dict['OTHERS']['1']``. The data
    set is (re)built on the first call and whenever
    ``self.VARS_EX['OTHERS']`` holds keys other than the learning-rate /
    epoch combinations listed below. Afterwards the model is built,
    trained and plotted, and the epoch count is printed.
    """
    # Best-effort cleanup: neither step may prevent the model from being
    # (re)built, so failures are deliberately ignored.
    try:
        cuda.select_device(0)
        cuda.close()
    except Exception:
        pass
    try:
        tf.keras.backend.clear_session()
    except Exception:
        pass

    params = self.dict['OTHERS']['1']
    self.outsize = params['OUT_SIZE']
    self.windowlength = params['WINDOW_LEN']
    self.MAX_window = params['WINDOW_LEN']
    self.batch = params['BATCH_SIZE']
    self.period = params['PERIOD']
    self.optimizer.learning_rate = params['LR']
    self.epochz = params['EPOCHS']

    if self.FIRST_ITER:
        self.CREATE_DATA()
        self.FIRST_ITER = False

    # Key sets that do not require the data to be rebuilt.
    # NOTE(review): 'EPOCHs' (lower-case s) is kept verbatim from the
    # original; it looks like a typo for 'EPOCHS' -- confirm.
    changed_keys = list(self.VARS_EX['OTHERS'].keys())
    data_still_valid = (
        len(changed_keys) == 0
        or changed_keys == ['LR']
        or changed_keys == ['LR', 'EPOCHS']
        or changed_keys == ['EPOCHs']
    )
    # `not`, not `~`: bitwise NOT of a bool is -1 or -2, which is always
    # truthy and made this branch run unconditionally.
    if not data_still_valid:
        self.CREATE_DATA()

    self.model_parallel()
    self.trainingz()
    self.SAVE_PLOTS()
    print(self.epochz)
def GPUWrapper(data_out, device_id, photons_req_per_device,
               max_photons_per_device, muA, muS, g, source_type,
               source_param1, source_param2, detector_params, max_N,
               max_distance_from_det, target_type, target_mask,
               target_gridsize, z_target, z_bounded, z_range, ret_cols,
               absorb_threshold, absorb_chance):
    """Run ``propPhotonGPU`` on one device and store its output in `data_out`.

    The requested and maximum photon counts are split evenly (rounded up)
    over ``256 * 64`` threads. All simulation parameters (``muA`` through
    ``z_range``, ``absorb_threshold``, ``absorb_chance``) are forwarded to
    the kernel unchanged.

    Results are written to ``data_out[device_id]``:
      * ``[0]`` -- the per-photon records, flattened to 2-D and restricted
        to the columns in `ret_cols`;
      * ``[1]`` -- the per-thread photon counters summed over all threads.
    """
    # TODO: These numbers can be optimized based on the device /
    # architecture / number of photons
    threads_per_block = 256
    blocks = 64
    total_threads = threads_per_block * blocks
    photons_per_thread = int(
        np.ceil(float(photons_req_per_device) / total_threads))
    max_photons_per_thread = int(
        np.ceil(float(max_photons_per_device) / total_threads))

    cuda.select_device(device_id)
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # TODO: ideally the kernel would be compiled with an explicit signature
    # (cuda.jit(signature)(propPhotonGPU)) under a compiler lock; the
    # signature of rng_states has not been worked out yet.

    # Host buffers are left uninitialised; they are overwritten by the
    # copies back from the device below.
    data = np.ndarray(shape=(total_threads, photons_per_thread, 12),
                      dtype=np.float32)
    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; `int` selects the same dtype.
    photon_counters = np.ndarray(shape=(total_threads, 5), dtype=int)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters,
                                                    stream=stream)

    # Used to initialize the threads random states.
    rng_states = create_xoroshiro128p_states(
        total_threads,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device,
        photons_per_thread, max_photons_per_thread, muA, muS, g,
        source_type, source_param1, source_param2, detector_params, max_N,
        max_distance_from_det, target_type, target_mask, target_gridsize,
        z_target, z_bounded, z_range, absorb_threshold, absorb_chance)

    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    # Collapse (thread, photon) into a single axis, then select columns.
    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
def get_prediction_real_time(sparkEngine, model=None, url_weight="", dim=15, prediction_weight="", encoder_length=24, decoder_length=24, attention_length=24, is_close_cuda=True):
    """Fetch the most recent data window and run the PM2.5 / PM10 forecast.

    The window ends one hour before "now" and starts 23 hours earlier,
    rounded down to the full hour. Vectors for that window come from
    ``sparkEngine.process_vectors``; they are padded to the encoder /
    attention lengths, fed to the model, and two predictions are produced
    with ``realtime_execute`` (``forecast_factor`` is set to 1 before the
    second one).

    :param sparkEngine: object providing ``process_vectors(start, end, dim)``
    :param model: model to use; when None an ``APGan`` is constructed
    :param url_weight: not referenced in this function
    :param dim: passed to ``process_vectors`` and used as the last axis of
        the zero decoder input
    :param prediction_weight: not referenced in this function (the weights
        come from ``p.prediction_weight_pm25`` / ``p.prediction_weight_pm10``)
    :param encoder_length: minimum number of encoder steps (front-padded
        with zeros)
    :param decoder_length: number of decoder steps
    :param attention_length: minimum length of ``china_vectors``
        (front-padded with zeros)
    :param is_close_cuda: when true, the CUDA context on device 0 is closed
        after prediction
    :return: ``((preds_pm25, preds_pm10), timestamp, china)`` where
        ``china`` is the transpose of the first two columns of
        ``china_vectors`` times 500; ``(([], []), [], [])`` when no vectors
        were returned for the window

    NOTE(review): the indentation of this function was lost in the source
    under review; the nesting below (in particular what belongs to
    ``if model is None`` and to the ``with`` blocks) is the reading under
    which every name is defined on every path -- confirm against the
    original file.
    """
    # continuously crawl aws and aqi & weather
    end = utils.get_datetime_now()
    end = end - timedelta(hours=1)
    # end = datetime.strptime("2018-06-19 11:01:00", p.fm)
    # e_ = end.strftime(p.fm)
    start = end - timedelta(hours=23)
    start = start.replace(minute=0, second=0, microsecond=0)
    # s_ = start.strftime(p.fm)
    # 2. process normalize data
    vectors, w_pred, china_vectors, timestamp = sparkEngine.process_vectors(start, end, dim)
    v_l = len(vectors)
    if v_l:
        sp_vectors = psv.convert_data_to_grid_exe(vectors)
        if v_l < encoder_length:
            # zero-pad at the front of the first axis up to encoder_length
            sp_vectors = np.pad(sp_vectors, ((encoder_length - v_l,0), (0,0), (0,0), (0, 0)), 'constant', constant_values=0)
        # repeat for 25 districts
        if w_pred:
            w_pred = np.repeat(np.expand_dims(w_pred, 1), p.grid_size, 1)
            de_vectors = psv.convert_data_to_grid_exe(w_pred)
            # pad to fill top elements of decoder vectors
            de_vectors = np.pad(de_vectors, ((0, 0), (0, 0), (0, 0), (6, 0)), 'constant', constant_values=0)
        else:
            # know nothing about future weather forecast
            de_vectors = np.zeros((decoder_length, p.grid_size, p.grid_size, dim))
        # encoder steps followed by decoder steps along the first axis
        sp_vectors = np.concatenate((sp_vectors, de_vectors), axis=0)
        c_l = len(china_vectors)
        if c_l < attention_length:
            # print(attention_length - c_l)
            china_vectors = np.pad(china_vectors, ((attention_length - c_l, 0), (0, 0)), 'constant', constant_values=0)
        # 4. Feed to model
        if model is None:
            # model = BaselineModel(encoder_length=encoder_length, encode_vector_size=12, batch_size=1, decoder_length=decoder_length, rnn_layers=1,
            #                       dtype='grid', grid_size=25, use_cnn=True)
            # model.set_data(sp_vectors, [0], None)
            # model = MaskGan(encoder_length=encoder_length, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, use_cnn=True)
            model = APGan(encoder_length=24, decoder_length=24, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, forecast_factor=0)
            # model = APNet(encoder_length=24, decoder_length=24, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, forecast_factor=0)
        model.set_data(sp_vectors, [0], None, china_vectors)
        with tf.device('/%s' % p.device):
            model.init_ops(is_train=False)
            saver = tf.train.Saver()
        tconfig = get_gpu_options(False)
        with tf.Session(config=tconfig) as session:
            model.assign_datasets(session)
            preds_pm25 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm25)
            # switch the model's forecast factor before the second run
            model.forecast_factor = 1
            preds_pm10 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm10)
            china_vectors = np.array(china_vectors)
            # print("china", china_vectors.shape)
            # tf.reset_default_graph()
            # session.close()
        if is_close_cuda:
            cuda.select_device(0)
            cuda.close()
        return (preds_pm25, preds_pm10), timestamp, np.transpose(china_vectors[:,:2] * 500)
    else:
        return ([],[]), [], []
def clear(self):
    """Drop the model and close the CUDA context on every listed GPU.

    Clears the Keras session, runs a garbage-collection pass, deletes
    ``self.model`` and then selects and closes each device in
    ``cuda.gpus`` in turn.
    """
    K.clear_session()
    gc.collect()
    del self.model

    gpu_count = len(cuda.gpus)
    for gpu_index in range(gpu_count):
        cuda.select_device(gpu_index)
        cuda.close()
def init(self):
    """Select this object's GPU (when one is configured) and create the
    ``Nccl`` handle stored in ``self.nk``."""
    # NOTE: do this at first of all
    # NOTE(review): indentation was lost in the source under review; the
    # Nccl handle is assumed to be created whether or not a GPU index is
    # set -- confirm.
    device_index = self.gpu_i
    if device_index is not None:
        cuda.select_device(device_index)

    self.nk = Nccl()
def worker(input_q, output_q):
    """Detect faces on frames from `input_q`, pushing boxes to `output_q`.

    Loads the frozen graph at ``PATH_TO_CKPT``, builds the MTCNN detector
    and then loops forever: each frame taken from `input_q` is converted
    from BGR to RGB, run through the detector, and a dict
    ``{'face_boxes': padded_bounding_boxes}`` is put on `output_q`.

    The loop has no exit condition; it ends only when an exception
    propagates. The FPS counter, the TensorFlow session and the CUDA
    context on device 0 are released on the way out.
    """
    # Load a (frozen) Tensorflow model into memory.
    # NOTE(review): indentation was lost in the source under review; the
    # session and detector are assumed to be created inside the graph
    # context -- confirm.
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
        sess = tf.Session(graph=detection_graph)
        mtcnn = detect_and_align.create_mtcnn(sess, None)

    fps = FPS().start()
    try:
        while True:
            fps.update()
            frame = input_q.get()
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Only the padded boxes are forwarded; face patches and
            # landmarks are discarded.
            _, padded_bounding_boxes, _ = detect_and_align.detect_faces(
                frame_rgb, mtcnn)
            output_q.put(dict(face_boxes=padded_bounding_boxes))
    finally:
        # Previously these statements sat after the infinite loop and
        # could never run; `finally` makes the cleanup reachable.
        fps.stop()
        sess.close()
        cuda.select_device(0)
        cuda.close()
def crack_gpu(password_list, target_hash: str, gpu_id: int) -> typing.List[str]:
    """Search `password_list` on one GPU for an entry matching `target_hash`.

    :param password_list: array of candidates (one per row); uploaded to
        the device as-is
    :param target_hash: hexadecimal digest string; it is split into 8-digit
        words, each byte-swapped before upload
    :param gpu_id: CUDA device to run on
    :return: a one-element list with the matching password (decoded with
        ``int_arr_to_str``), or an empty list when the kernel reports no
        match
    """
    cuda.select_device(gpu_id)

    # Hex string -> 32-bit words -> byte-swapped words (packed
    # little-endian, read back big-endian).
    hex_words = [target_hash[pos:pos + 8]
                 for pos in range(0, len(target_hash), 8)]
    word_values = [int(chunk, 16) for chunk in hex_words]
    swapped_words = [struct.unpack(">I", struct.pack("<I", value))[0]
                     for value in word_values]

    threads_per_block = 512
    # Ceiling division so every candidate gets a thread; only 1 "Grid".
    blocks_per_grid = (
        password_list.shape[0] + (threads_per_block - 1)) // threads_per_block

    target_dev = cuda.to_device(np.array(swapped_words, dtype=np.uint32))
    # Single-slot result: -1 means "no match found".
    match_dev = cuda.to_device(np.array([-1], dtype=np.int32))
    candidates_dev = cuda.to_device(password_list)

    print(f"cracking phase (gpu{gpu_id})")
    crack_password[blocks_per_grid, threads_per_block](
        candidates_dev, target_dev, match_dev)
    print(f"finished (gpu{gpu_id})")

    match_index = match_dev.copy_to_host()[0]
    if match_index == -1:
        return []
    return [int_arr_to_str(password_list[match_index])]


# cracker = PasswordCracker(['a'])
# cracker.crack_gpu()
def parse(self):
    """Parse the command-line options, pick the device, and log the options.

    Initializes the parser on first use, stores the parsed namespace in
    ``self.opt`` and sets ``self.opt.device`` to ``cuda:<gpu_id>`` when
    torch reports CUDA, else ``cpu``. The options are printed and also
    written to ``<checkpoints_dir>/<name>/opt.txt``.

    :return: the parsed options (``self.opt``)
    """
    if not self.initialized:
        self.initialize()
    self.opt = self.parser.parse_args()

    self.opt.device = torch.device(
        "cuda:%d" % (self.opt.gpu_id) if torch.cuda.is_available() else "cpu")
    # The device falls back to "cpu" on machines without CUDA, so the
    # explicit device selection must be skipped there as well instead of
    # raising.
    if cuda.is_available():
        cuda.select_device(self.opt.gpu_id)
    # torch.cuda.set_device(self.opt.gpu_id)

    args = vars(self.opt)
    # Build the banner once; it is both printed and saved.
    lines = ['------------ Options -------------']
    lines.extend('%s: %s' % (str(k), str(v)) for k, v in sorted(args.items()))
    lines.append('-------------- End ----------------')
    for line in lines:
        print(line)

    # save to the disk
    expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.name)
    util.mkdirs(expr_dir)
    file_name = os.path.join(expr_dir, 'opt.txt')
    with open(file_name, 'wt') as opt_file:
        for line in lines:
            opt_file.write(line + '\n')
    return self.opt
def detect(self, image):
    """Run the face-feature Mask R-CNN on one image.

    A TensorFlow session (GPU memory growth enabled) is opened on CUDA
    device 0, the model is built in inference mode with weights loaded
    from a fixed path, and a single-image detection is performed.

    :param image: image passed to ``model.detect`` as a one-element batch
    :return: the first (only) entry of the ``model.detect`` result list
    """
    cuda.select_device(0)
    session_config = ConfigProto()
    session_config.gpu_options.allow_growth = True
    session = InteractiveSession(config=session_config)
    try:
        ROOT_DIR = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/"
        MODEL_DIR = os.path.join(ROOT_DIR, "logsFaceFeatureDetection")
        COCO_MODEL_PATH = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/mask_rcnn_face_feature_detection_0029.h5"

        inference_config = InferenceConfig()
        inference_config.display()

        # Create model object in inference mode.
        model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR,
                                  config=inference_config)
        # Load the trained weights from COCO_MODEL_PATH.
        model.load_weights(COCO_MODEL_PATH, by_name=True)

        # Class labels of the network; kept for reference, not used below.
        class_names = ["bg", "iris_l", "inner_eye_l", "outer_eye_l", "eye_brow_l", "cheek_l", "iris_r", "inner_eye_r", "outer_eye_r", "eye_brow_r", "cheek_r", "nose_tip", "nose", "mouth", "chin", "face", "head", "distortion"]

        results = model.detect([image], verbose=1)
        return results[0]
    finally:
        # Release the session and the CUDA context even when loading or
        # detection raises, so a failed call does not pin GPU memory.
        session.close()
        cuda.close()
def configure_node(dist_backend, init_method):
    """Derive rank / GPU / world size, select the GPU and join the process group.

    Under SLURM (detected through ``SLURM_NTASKS``) the values come from
    the ``SLURM_*`` environment variables; otherwise a single local process
    on GPU 0 is assumed. The chosen GPU is selected for numba, the torch
    process group is initialised, and free CPU / GPU memory is logged.

    :param dist_backend: backend name passed to ``dist.init_process_group``
    :param init_method: init method passed to ``dist.init_process_group``
    :return: a ``Param`` object with ``dist_backend``, ``rank``, ``gpu``,
        ``world_size``, ``is_distributed``, ``scheduler``, ``device`` and
        (under SLURM) ``slurm_tasks_per_node`` set
    """

    def _env_int(name):
        # Integer value of an environment variable, or None when unset, so
        # the fallbacks below can actually apply (int(None) used to raise
        # before they were ever consulted).
        raw = os.environ.get(name)
        return None if raw is None else int(raw)

    args = Param()
    args.dist_backend = dist_backend

    is_using_slurm = os.environ.get('SLURM_NTASKS') is not None
    if is_using_slurm:
        local_id = _env_int('SLURM_LOCALID')
        proc_id = _env_int('SLURM_PROCID')
        n_tasks = _env_int('SLURM_NTASKS')
        tasks_per_node = _env_int('SLURM_NTASKS_PER_NODE')

        args.slurm_tasks_per_node = tasks_per_node if tasks_per_node is not None else 0
        args.rank = proc_id if proc_id is not None else 0
        # Without a local id, fall back to using the rank as GPU index.
        args.gpu = local_id if local_id is not None else args.rank
        args.world_size = n_tasks if n_tasks is not None else 1
        args.is_distributed = True
        args.scheduler = 'slurm'
    else:
        args.rank = 0
        args.world_size = 1
        args.gpu = 0
        args.is_distributed = False
        args.scheduler = 'local'

    args.device = th.device(f'cuda:{args.gpu}')

    # let numba know which device we are running the kernels on
    logging.info(
        f'rank {args.rank} avail gpus: {[x.id for x in GPUtil.getGPUs()]}')
    logging.info(f'Selecting device: {args.gpu}')
    cuda.select_device(args.gpu)

    dist.init_process_group(backend=args.dist_backend, rank=args.rank,
                            world_size=args.world_size,
                            init_method=init_method)

    ram_cpu_free_GB = psutil.virtual_memory().available / 2**30
    gpus = GPUtil.getGPUs()
    gpu = gpus[args.gpu]
    ram_gpu_free_GB = gpu.memoryFree / 1000

    # NOTE(review): indentation was lost in the source under review; the
    # header lines are assumed to be rank-0 only and the "Rank N ..." lines
    # to be logged by every rank -- confirm.
    if args.rank == 0:
        logging.info(f'Scheduler: {args.scheduler}')
        logging.info('System resources:')
    logging.info(f'Rank {args.rank} Free CPU RAM: {ram_cpu_free_GB} GB')
    logging.info(f'Rank {args.rank} Free GPU RAM: {ram_gpu_free_GB} GB')
    logging.info(
        f'Rank {args.rank} is using device {args.gpu}/{len(gpus)}: {gpu.name} driver: v{gpu.driver}'
    )
    return args
def __exit__(self, *args):
    """Record the end event on this timer's GPU and print the elapsed time.

    The time between ``self.start`` and ``self.end`` is printed truncated
    to an integer, followed by ``ms`` and, when a label is set, the label
    in parentheses.
    """
    cuda.select_device(self.gpu)

    if self.label:
        suffix = 'ms (' + self.label + ')'
    else:
        suffix = 'ms'

    self.end.record()
    # Wait for the end event before reading the timing.
    self.end.synchronize()
    elapsed = cuda.event_elapsed_time(self.start, self.end)
    print('elapsed time:', int(elapsed), suffix)
def __init__(self, x, box, r_cut, gpu=0):
    """Select the GPU, store the inputs and run the first ``update()``.

    :param x: stored as ``self.x``
    :param box: stored as ``self.box``
    :param r_cut: stored as ``self.r_cut``; its square is cached in
        ``self.r_cut2``
    :param gpu: CUDA device selected before anything else is stored
    """
    cuda.select_device(gpu)

    self.gpu, self.x, self.box, self.r_cut = gpu, x, box, r_cut
    self.r_cut2 = self.r_cut ** 2

    self.update()
def __init__(self, label='', gpu=0):
    """Create the start/end CUDA events on `gpu` and record the start.

    :param label: optional text stored in ``self.label``
    :param gpu: CUDA device the events are created and recorded on
    """
    self.label = label
    self.gpu = gpu
    # Select the device *before* creating the events so they belong to the
    # context of `gpu` rather than whichever context happened to be
    # current.
    cuda.select_device(self.gpu)
    self.start = cuda.event()
    self.end = cuda.event()
    self.start.record()
def cleanup():
    """Clear the Keras session, then close the CUDA context on device 0."""
    from keras import backend as keras_backend
    keras_backend.clear_session()

    from numba import cuda as numba_cuda
    numba_cuda.select_device(0)
    numba_cuda.close()
def newthread():
    """Select device 0, upload a small array on a stream, then tear down.

    The device array and the stream are deleted explicitly before the
    context is closed.
    """
    cuda.select_device(0)
    work_stream = cuda.stream()

    host_values = np.arange(100)
    device_values = cuda.to_device(host_values, stream=work_stream)
    work_stream.synchronize()

    del device_values
    del work_stream
    cuda.close()
def newthread(exception_queue):
    """Select and close every device twice, reporting any failure.

    :param exception_queue: queue that receives the exception object if
        anything in the body raises; nothing is put on success
    """
    try:
        device_ids = tuple(range(driver.get_device_count()))
        # Two full passes over all devices, in order.
        for device_id in device_ids * 2:
            cuda.select_device(device_id)
            cuda.close()
    except Exception as error:
        exception_queue.put(error)
def newthread():
    """Select and close every device twice, printing each step."""
    device_ids = range(driver.get_device_count())
    print('Devices', device_ids)

    passes = 2
    for _pass in range(passes):
        for device_id in device_ids:
            cuda.select_device(device_id)
            print('Selected device', device_id)
            cuda.close()
            print('Closed device', device_id)
def gpu_dmt(cand, device=0):
    """
    GPU DM-Time bow-tie (by rolling the array)

    256 trial DMs evenly spaced over [0, 2 * cand.dm] are evaluated; the
    result, of shape (256, cand.data.shape[0]), is stored in ``cand.dmt``.

    Args:
        cand: Candidate instance; reads ``chan_freqs``, ``dm``, ``data``
            and ``your_header.tsamp``
        device (int): GPU ID

    Returns:
        candidate object
    """
    num_dms = 256
    cuda.select_device(device)
    freqs_dev = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dms_dev = cuda.to_device(
        np.linspace(0, 2 * cand.dm, num_dms, dtype=np.float32))
    dmt_dev = cuda.to_device(
        np.zeros((num_dms, cand.data.shape[0]), dtype=np.float32))
    data_dev = cuda.to_device(np.array(cand.data.T, dtype=cand.data.dtype))

    @cuda.jit
    def dmt_kernel(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if (
            ii < cand_data_in.shape[0]
            and jj < cand_data_in.shape[1]
            and kk < dms.shape[0]
        ):
            disp_time = int(
                -1
                * 4148808.0
                * dms[kk]
                * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2)
                / 1000
                / tsamp
            )
            cuda.atomic.add(
                cand_data_out,
                (kk, jj),
                cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]],
            )

    block_dim = (16, 8, 8)
    grid_dim = (
        math.ceil(data_dev.shape[0] / block_dim[0]),
        math.ceil(data_dev.shape[1] / block_dim[1]),
        math.ceil(dms_dev.shape[0] / block_dim[2]),
    )

    dmt_kernel[grid_dim, block_dim](
        data_dev, freqs_dev, dms_dev, dmt_dev, float(cand.your_header.tsamp)
    )
    cand.dmt = dmt_dev.copy_to_host()
    cuda.close()
    return cand
def newthread(exception_queue):
    """Select and close every device twice, printing each step.

    :param exception_queue: queue that receives the exception object if
        anything in the body raises; nothing is put on success
    """
    try:
        device_ids = range(driver.get_device_count())
        print('Devices', device_ids)

        passes = 2
        for _pass in range(passes):
            for device_id in device_ids:
                cuda.select_device(device_id)
                print('Selected device', device_id)
                cuda.close()
                print('Closed device', device_id)
    except Exception as error:
        exception_queue.put(error)
def newthread(exception_queue):
    """Upload a small array on a stream on device 0, then tear down.

    :param exception_queue: queue that receives the exception object if
        anything in the body raises; nothing is put on success
    """
    try:
        cuda.select_device(0)
        work_stream = cuda.stream()

        host_values = np.arange(100)
        device_values = cuda.to_device(host_values, stream=work_stream)
        work_stream.synchronize()

        # Drop the device array and stream before closing the context.
        del device_values
        del work_stream
        cuda.close()
    except Exception as error:
        exception_queue.put(error)
def device_controller(cid):
    """Run the module's ``kernel`` in place on device `cid` and verify it.

    An int32 array of 12345 values, scaled by ``cid + 1``, is uploaded,
    passed to the kernel as both input and output, copied back and checked
    to equal the original plus one.

    :param cid: CUDA device id; also used to vary the test data
    :raises ValueError: if the result does not equal the input plus one
    """
    cuda.select_device(cid)  # bind device to thread
    device = cuda.get_current_device()  # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)
    max_thread = device.MAX_THREADS_PER_BLOCK

    # Compilation is serialised through the shared lock; the jitted CUDA
    # kernel is loaded into the current context.
    with compiler_lock:
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N < max_thread:
        ngrid, nthread = 1, N
    else:
        ngrid, nthread = int(ceil(float(N) / max_thread)), max_thread
    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)  # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)  # compute inplace
    d_data.copy_to_host(data)  # transfer to host

    # check result
    if (data != orig + 1).any():
        raise ValueError
from __future__ import print_function from timeit import default_timer as timer import math import numpy as np import pylab from numba import cuda # For machine with multiple devices cuda.select_device(0) @cuda.jit('float32(float32, float32)', device=True) def core(a, b): return 1 @cuda.jit('void(float32[:], float32[:], float32[:])') def vec_add(a, b, c): i = cuda.grid(1) c[i] = core(a[i], b[i]) @cuda.jit('void(float32[:], float32[:], float32[:])') def vec_add_ilp_x2(a, b, c): # read i = cuda.grid(1) ai = a[i] bi = b[i]