# Project-local helpers assumed throughout this module: `md` loads the SR
# models, `ut` provides logging/GPU/image utilities, and `predict` runs a
# TensorRT execution context (all defined elsewhere in this repository).
import copy
import math
import os
import subprocess
import sys
import time

import numpy as np
import pyfiglet
import toml
import torch
from tqdm import tqdm

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # noqa: F401 -- creates the CUDA context used below


def binary_search_helper(dimension, logger, model_name="EDSR", device="cuda"):
    """
    Process a random image and measure the processing time.

    Parameters
    ----------
    dimension : int
        random image dimension.
    logger : logger
        keeps logs.
    model_name : str, optional
        SR model to load ("EDSR" or "RRDB"). The default is "EDSR".
    device : str, optional
        GPU or CPU. The default is 'cuda'.

    Returns
    -------
    total_time : float
        model processing time.
    """
    print("Before loading model: ")
    subprocess.run("gpustat", shell=True)
    print()
    total_time = 0
    try:
        model = None
        if model_name == "EDSR":
            model = md.load_edsr(device=device)
            print("After loading model: ")
            subprocess.run("gpustat", shell=True)
            print()
        elif model_name == "RRDB":
            model = md.load_rrdb(device=device)
        else:
            raise Exception("Unknown model...")
        model.eval()
        input_image = ut.random_image(dimension)
        if model_name == "RRDB":
            input_image = input_image[:, 2:, :, :]
        input_image = input_image.to(device)
        with torch.no_grad():
            start = time.time()
            print("Before processing: ")
            subprocess.run("gpustat", shell=True)
            output_image = model(input_image)
            print("After processing: ")
            subprocess.run("gpustat", shell=True)
            end = time.time()
            total_time = end - start
            ut.clear_cuda(input_image, output_image)
        model.cpu()
        del model
        print("After model shifting and deleting: ")
        subprocess.run("gpustat", shell=True)
    except RuntimeError as err:
        logger.error(
            "Runtime error for dimension: {}x{}: {}".format(dimension, dimension, err)
        )
        sys.exit(1)
    return total_time
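# The search drivers below run binary_search_helper in a fresh process via
# "python3 binarysearch_helper.py <dimension> <model_name>", parse the first
# line of stdout as the timing, and treat a non-zero return code as the
# out-of-memory signal. A minimal sketch of that CLI wrapper (hypothetical:
# it assumes this file is binarysearch_helper.py):
def _binary_search_helper_cli(argv=None):
    argv = sys.argv[1:] if argv is None else argv
    cli_logger = ut.get_logger()
    total_time = binary_search_helper(int(argv[0]), cli_logger,
                                      model_name=argv[1])
    print(total_time)  # the parent process reads this as out[0]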
def trt_forward_chop_iterative(
    x,
    trt_engine_path=None,
    shave=10,
    min_size=1024,
    device="cuda",
    print_result=True,
    scale=4,
    use_fp16=False,
):
    """
    Forward chopping in an iterative way with a TensorRT engine

    Parameters
    ----------
    x : tensor
        input image.
    trt_engine_path : str, optional
        path to the serialized TensorRT engine. The default is None.
    shave : int, optional
        patch shave value. The default is 10.
    min_size : int, optional
        total patch size (dimension x dimension). The default is 1024.
    device : str, optional
        GPU or CPU. The default is 'cuda'.
    print_result : bool, optional
        print result or not. The default is True.
    scale : int, optional
        SR scale factor (hr = scale * lr). The default is 4.
    use_fp16 : bool, optional
        run the engine in FP16 precision. The default is False.

    Returns
    -------
    output : tensor
        output image.
    total_time : float
        total execution time.
    total_crop_time : float
        total cropping time.
    total_shift_time : float
        total GPU-to-CPU shifting time (not measured in this TRT path).
    total_clear_time : float
        total GPU clearing time.
    """
    dim = int(math.sqrt(min_size))  # getting patch dimension
    b, c, h, w = x.size()  # current image batch, channel, height, width
    patch_count = 0
    output = torch.tensor(np.zeros((b, c, h * 4, w * 4))).numpy()
    total_time = 0
    total_crop_time = 0
    total_shift_time = 0
    total_clear_time = 0

    with open(trt_engine_path, "rb") as f:
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    target_dtype = np.float16 if use_fp16 else np.float32
    output_folder = "output_images"
    file_name = "data/test7.jpg".split("/")[-1].split(".")[0]
    new_i_s = 0
    stream = cuda.Stream()
    for i in range(0, h, dim - 2 * shave):
        new_j_s = 0
        new_j_e = 0
        for j in range(0, w, dim - 2 * shave):
            patch_count += 1
            h_s, h_e = i, min(h, i + dim)  # patch height start and end
            w_s, w_e = j, min(w, j + dim)  # patch width start and end
            lr = x[:, :, h_s:h_e, w_s:w_e]
            ba, ch, ht, wt = lr.shape
            ut.save_image(lr[0].int(), output_folder, ht, wt, 4,
                          output_file_name=file_name + f"input_{i}_{j}_x4")
            lr = lr.numpy()

            # TensorRT processing
            start = time.time()
            # input and output must both be FP16 to fully enable half precision
            lr = np.ascontiguousarray(lr, dtype=target_dtype)
            p_output = np.empty([b, c, ht * scale, wt * scale], dtype=target_dtype)
            # allocate device memory for the patch and its SR output
            d_input = cuda.mem_alloc(1 * lr.nbytes)
            d_output = cuda.mem_alloc(1 * p_output.nbytes)
            bindings = [int(d_input), int(d_output)]
            sr = predict(context, lr, d_input, stream, bindings, p_output, d_output)
            ut.save_image(torch.tensor(sr)[0].int(), output_folder, ht, wt, 4,
                          output_file_name=file_name + f"{i}_{j}_x4")
            end = time.time()
            total_time += end - start

            # coordinates of the shaved crop inside the SR patch (scaled by 4)
            n_h_s = 0 if h_s == 0 else (shave * 4)
            n_h_e = ((h_e - h_s) * 4) if h_e == h else (((h_e - h_s) - shave) * 4)
            new_i_e = new_i_s + n_h_e - n_h_s
            n_w_s = 0 if w_s == 0 else (shave * 4)
            n_w_e = ((w_e - w_s) * 4) if w_e == w else (((w_e - w_s) - shave) * 4)
            new_j_e = new_j_e + n_w_e - n_w_s

            # cropping the shaved border off the SR patch
            crop_start = time.time()
            sr_small = sr[:, :, n_h_s:n_h_e, n_w_s:n_w_e]
            total_crop_time += time.time() - crop_start

            output[:, :, new_i_s:new_i_e, new_j_s:new_j_e] = sr_small
            del sr_small

            clear_start = time.time()
            if device == "cuda":
                ut.clear_cuda(None, None)
            total_clear_time += time.time() - clear_start

            if w_e == w:
                break
            new_j_s = new_j_e
        new_i_s = new_i_e
        if h_e == h:
            break
    return output, total_time, total_crop_time, total_shift_time, total_clear_time
def trt_forward_chop_iterative_v2(
    x,
    trt_engine_path=None,
    shave=10,
    min_size=1024,
    device="cuda",
    print_result=True,
    scale=4,
    use_fp16=False,
):
    """
    Forward chopping where every patch keeps the full dimension: boundary
    patches are slid back into the image and the extra overlap is shaved off.

    Parameters
    ----------
    x : 4d array
        input image.
    trt_engine_path : str, optional
        path of the trt engine. The default is None.
    shave : int, optional
        shave value. The default is 10.
    min_size : int, optional
        total size of a patch (dimension x dimension). The default is 1024.
    device : str, optional
        device cuda or cpu. The default is "cuda".
    print_result : bool, optional
        print result or not. The default is True.
    scale : int, optional
        hr = scale * lr. The default is 4.
    use_fp16 : bool, optional
        choose precision. The default is False.

    Raises
    ------
    Exception
        if the shave value is too big for the patch dimension, so that no
        patch is ever processed.

    Returns
    -------
    output : 4d array
        output SR image.
    """
    patch_count = 0
    row_count = 0
    column_count = 0
    dim = int(math.sqrt(min_size))  # getting patch dimension
    b, c, img_height, img_width = x.size()  # batch, channel, height, width
    output = torch.tensor(np.zeros((b, c, img_height * 4, img_width * 4))).numpy()

    with open(trt_engine_path, "rb") as f:
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()

    target_dtype = np.float16 if use_fp16 else np.float32
    new_i_s = 0  # new patch height start
    for patch_height_start in range(0, img_height, dim - 2 * shave):
        row_count += 1
        right_most = False
        bottom_most = False
        left_increased = 0
        top_increased = 0
        new_j_s = 0
        new_j_e = 0
        for patch_width_start in range(0, img_width, dim - 2 * shave):
            column_count += 1
            patch_count += 1
            patch_height_end = min(img_height, patch_height_start + dim)
            patch_width_end = min(img_width, patch_width_start + dim)

            # boundary patches: slide the window back so it stays dim x dim
            # and remember by how much it moved
            if img_height < patch_height_start + dim:
                bottom_most = True
                old_patch_height_start = patch_height_start
                patch_height_start = ((img_height - dim)
                                      if (img_height - dim) >= 0 else 0)
                top_increased = old_patch_height_start - patch_height_start
            if img_width < patch_width_start + dim:
                right_most = True
                old_patch_width_start = patch_width_start
                patch_width_start = ((img_width - dim)
                                     if (img_width - dim) >= 0 else 0)
                left_increased = old_patch_width_start - patch_width_start

            # how much to shave off each side of the SR patch
            left_crop, top_crop, right_crop, bottom_crop = (
                0,
                0,
                shave * scale,
                shave * scale,
            )
            if patch_width_start != 0:
                left_crop = ((shave + left_increased) if right_most else shave) * scale
            if patch_height_start != 0:
                top_crop = ((shave + top_increased) if bottom_most else shave) * scale
            if patch_width_end == img_width:
                right_crop = 0
            if patch_height_end == img_height:
                bottom_crop = 0

            h_s, h_e, w_s, w_e = (
                0 + top_crop,
                dim * scale - bottom_crop,
                0 + left_crop,
                dim * scale - right_crop,
            )
            # images smaller than the patch in one or both directions
            if dim >= img_height and dim >= img_width:
                h_s, h_e, w_s, w_e = 0, img_height * scale, 0, img_width * scale
            elif dim < img_height and dim >= img_width:
                w_s, w_e = 0, img_width * scale
            elif dim >= img_height and dim < img_width:
                h_s, h_e = 0, img_height * scale

            lr = x[:, :, patch_height_start:patch_height_end,
                   patch_width_start:patch_width_end]
            ba, ch, ht, wt = lr.shape
            # input and output must both be FP16 to fully enable half precision
            lr = np.ascontiguousarray(lr.numpy(), dtype=target_dtype)

            # TensorRT processing
            p_output = np.empty([b, c, ht * scale, wt * scale], dtype=target_dtype)
            # note: these per-patch device buffers are never freed explicitly
            d_input = cuda.mem_alloc(1 * lr.nbytes)
            d_output = cuda.mem_alloc(1 * p_output.nbytes)
            bindings = [int(d_input), int(d_output)]
            stream = cuda.Stream()
            sr = predict(context, lr, d_input, stream, bindings, p_output, d_output)

            new_i_e = new_i_s + h_e - h_s
            new_j_e = new_j_s + w_e - w_s

            sr_small = sr[:, :, h_s:h_e, w_s:w_e]
            output[:, :, new_i_s:new_i_e, new_j_s:new_j_e] = sr_small
            del sr_small

            if device == "cuda":
                ut.clear_cuda(None, None)

            new_j_s = new_j_e
            if patch_width_end == img_width:
                break
        new_i_s = new_i_e
        column_count = 0
        if patch_height_end == img_height:
            break
    if patch_count == 0:
        raise Exception("Shave size too big for given patch dimension")
    return output
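# v2 differs from the function above at the image border: every patch stays
# exactly dim x dim because boundary patches are slid back into the image,
# and the overlap created by the slide is shaved off as well. A minimal 1-D
# sketch of that boundary rule (illustrative names only):
def _v2_boundary_crop_1d(length, start, dim, shave, scale=4):
    """Return (patch_start, crop_lo, crop_hi) for one axis position,
    mirroring the right_most/left_increased logic above."""
    increased = 0
    if length < start + dim:  # boundary patch: slide it back into the image
        old_start = start
        start = max(length - dim, 0)
        increased = old_start - start
    lo = 0 if start == 0 else (shave + increased) * scale
    hi = dim * scale if start + dim >= length else (dim - shave) * scale
    return start, lo, hi

# Example: length=90, dim=40, shave=10, step=20. The patch at start=60
# slides back to start=50 (increased=10), so its crop begins at
# (10 + 10) * 4 = 80 and pixels already written are not overwritten.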
def forward_chop_iterative(x, model=None, shave=10, min_size=1024,
                           device="cuda", print_result=True):
    """
    Forward chopping in an iterative way

    Parameters
    ----------
    x : tensor
        input image.
    model : nn.Module, optional
        SR model. The default is None.
    shave : int, optional
        patch shave value. The default is 10.
    min_size : int, optional
        total patch size (dimension x dimension). The default is 1024.
    device : str, optional
        GPU or CPU. The default is 'cuda'.
    print_result : bool, optional
        print result or not. The default is True.

    Returns
    -------
    output : tensor
        output image.
    total_time : float
        total execution time.
    total_crop_time : float
        total cropping time.
    total_shift_time : float
        total GPU-to-CPU shifting time.
    total_clear_time : float
        total GPU clearing time.
    """
    dim = int(math.sqrt(min_size))  # getting patch dimension
    b, c, h, w = x.size()  # current image batch, channel, height, width
    patch_count = 0
    output = torch.tensor(np.zeros((b, c, h * 4, w * 4)))
    total_time = 0
    total_crop_time = 0
    total_shift_time = 0
    total_clear_time = 0
    new_i_s = 0
    for i in range(0, h, dim - 2 * shave):
        new_j_s = 0
        new_j_e = 0
        for j in range(0, w, dim - 2 * shave):
            patch_count += 1
            h_s, h_e = i, min(h, i + dim)  # patch height start and end
            w_s, w_e = j, min(w, j + dim)  # patch width start and end
            lr = x[:, :, h_s:h_e, w_s:w_e]
            if device == "cuda":
                torch.cuda.synchronize()
                lr = lr.to(device)
            with torch.no_grad():
                # model processing, timed between synchronization barriers
                start = time.time()
                torch.cuda.synchronize()
                sr = model(lr)
                torch.cuda.synchronize()
                end = time.time()
                total_time += end - start

                # shifting the SR patch back to the CPU
                shift_start = time.time()
                torch.cuda.synchronize()
                sr = sr.cpu()
                torch.cuda.synchronize()
                shift_end = time.time()
                shift_time = shift_end - shift_start

            # new cropped patch's coordinates (h and w, scaled by 4)
            n_h_s = 0 if h_s == 0 else (shave * 4)
            n_h_e = ((h_e - h_s) * 4) if h_e == h else (((h_e - h_s) - shave) * 4)
            new_i_e = new_i_s + n_h_e - n_h_s
            n_w_s = 0 if w_s == 0 else (shave * 4)
            n_w_e = ((w_e - w_s) * 4) if w_e == w else (((w_e - w_s) - shave) * 4)
            new_j_e = new_j_e + n_w_e - n_w_s

            # cropping the shaved border off the SR patch
            crop_start = time.time()
            sr_small = sr[:, :, n_h_s:n_h_e, n_w_s:n_w_e]
            crop_end = time.time()
            total_crop_time += crop_end - crop_start
            total_shift_time += shift_time

            output[:, :, new_i_s:new_i_e, new_j_s:new_j_e] = sr_small
            del sr_small

            clear_start = time.time()
            if device == "cuda":
                ut.clear_cuda(lr, sr)
            clear_end = time.time()
            total_clear_time += clear_end - clear_start

            if w_e == w:
                break
            new_j_s = new_j_e
        new_i_s = new_i_e
        if h_e == h:
            break
    if print_result:
        print("Patch dimension: {}x{}".format(dim, dim))
        print("Total patches: ", patch_count)
        print("Total EDSR Processing time: ", total_time)
        print("Total crop time: ", total_crop_time)
        print("Total shift time: ", total_shift_time)
        print("Total clear time: ", total_clear_time)
    return output, total_time, total_crop_time, total_shift_time, total_clear_time
def forward_chop_iterative(x, model=None, shave=10, min_size=1024):
    """Early chopped forward pass: tiles the input, upsamples each patch, and
    stitches the shave-cropped results back together (scale x4)."""
    dim = round(math.sqrt(min_size))
    b, c, h, w = x.size()
    device = "cuda"
    count = 0
    output = torch.tensor(np.zeros((b, c, h * 4, w * 4)))
    total_time = 0
    new_i_s = 0
    x = x.to(device)
    for i in tqdm(range(0, h, dim - 2 * shave)):
        new_j_s = 0
        new_j_e = 0
        for j in range(0, w, dim - 2 * shave):
            count += 1
            h_s = i
            h_e = min(h, i + dim)
            w_s = j
            w_e = min(w, j + dim)
            lr = x[:, :, h_s:h_e, w_s:w_e]
            with torch.no_grad():
                start = time.time()
                sr = model(lr)
                end = time.time()
                total_time += end - start
            # coordinates of the shaved crop inside the SR patch
            n_h_s = 0 if h_s == 0 else shave * 4
            n_h_e = ((h_e - h_s) * 4) if h_e == h else ((h_e - h_s) - shave) * 4
            new_i_e = new_i_s + n_h_e - n_h_s
            n_w_s = 0 if w_s == 0 else shave * 4
            n_w_e = ((w_e - w_s) * 4) if w_e == w else ((w_e - w_s) - shave) * 4
            new_j_e = new_j_e + n_w_e - n_w_s
            sr_small = sr[:, :, n_h_s:n_h_e, n_w_s:n_w_e]
            sr_small = sr_small.to("cpu")
            output[:, :, new_i_s:new_i_e, new_j_s:new_j_e] = sr_small
            del sr_small
            if w_e == w:
                break
            new_j_s = new_j_e
            ut.clear_cuda(lr, sr)
        new_i_s = new_i_e
        if h_e == h:
            break
    print("Patch dimension: {}x{}".format(dim, dim))
    print("Total patches: ", count)
    print("Total EDSR Processing time: ", total_time)
    return output
def batch_forward_chop(
    patch_list,
    batch_size,
    channel,
    img_height,
    img_width,
    dim,
    shave,
    scale,
    model,
    device="cuda",
    print_timer=True,
):
    """
    Create an SR image from batches of patches

    Parameters
    ----------
    patch_list : list
        list of patches.
    batch_size : int
        batch size.
    channel : int
        input image channel.
    img_height : int
        input image height.
    img_width : int
        input image width.
    dim : int
        patch dimension.
    shave : int
        shave value for patch.
    scale : int
        scale for LR to SR.
    model : nn.Module
        SR model.
    device : str, optional
        GPU or CPU. The default is 'cuda'.
    print_timer : bool, optional
        print result or not. The default is True.

    Raises
    ------
    Exception
        re-raised from a RuntimeError during batch processing (CUDA memory is
        cleared first).

    Returns
    -------
    3D matrix, tuple
        output_image, tuple of timings.
    """
    logger = ut.get_logger()
    total_patches = len(patch_list)
    if batch_size > total_patches:
        logger.error("Batch size greater than total number of patches")
        sys.exit(2)
    output_image = torch.tensor(
        np.zeros((channel, img_height * scale, img_width * scale)))
    cpu_to_gpu_time = 0
    gpu_to_cpu_time = 0
    batch_creating_time = 0
    total_EDSR_time = 0
    cuda_clear_time = 0
    merging_time = 0
    for start in range(1, total_patches + 1, batch_size):
        info = ""
        try:
            batch_creating_timer = ut.timer()
            batch = []
            end = min(start + batch_size, total_patches + 1)
            for p in range(start, end):
                batch.append(patch_list[p][4])
            batch_creating_time += batch_creating_timer.toc()

            torch.cuda.synchronize()
            cpu_to_gpu_timer = ut.timer()
            batch = torch.stack(batch).to(device)
            torch.cuda.synchronize()
            cpu_to_gpu_time += cpu_to_gpu_timer.toc()
            info += ("C2G Starts: " + str(cpu_to_gpu_timer.t0)
                     + "\tC2G total: " + str(cpu_to_gpu_time))

            with torch.no_grad():
                torch.cuda.synchronize()
                start_time = time.time()
                sr_batch = model(batch)
                torch.cuda.synchronize()
                end_time = time.time()
                total_EDSR_time += end_time - start_time
                info += ("\tModel Starts: " + str(start_time)
                         + "\tModel total: " + str(total_EDSR_time))

                torch.cuda.synchronize()
                gpu_to_cpu_timer = ut.timer()
                sr_batch = sr_batch.to("cpu")
                torch.cuda.synchronize()
                gpu_to_cpu_time += gpu_to_cpu_timer.toc()
                info += ("\tGPU 2 CPU Starts: " + str(gpu_to_cpu_timer.t0)
                         + "\tG2C total: " + str(gpu_to_cpu_time))
            _, _, patch_height, patch_width = sr_batch.size()
            logger.info(info)

            # merge every SR patch of the batch into the output image
            batch_id = 0
            merging_timer = ut.timer()
            for p in range(start, end):
                crop = patch_list[p][2]      # crop window inside the SR patch
                out_pos = patch_list[p][3]   # destination window in the output
                output_image[
                    :, out_pos[0]:out_pos[1], out_pos[2]:out_pos[3]
                ] = sr_batch[batch_id][:, crop[0]:crop[1], crop[2]:crop[3]]
                batch_id += 1
            merging_time += merging_timer.toc()

            cuda_clear_timer = ut.timer()
            ut.clear_cuda(batch, None)
            cuda_clear_time += cuda_clear_timer.toc()
        except RuntimeError as err:
            ut.clear_cuda(batch, None)
            raise Exception(err)
    model = model.to("cpu")
    if print_timer:
        print("Total upsampling time: {}\n".format(total_EDSR_time))
        print("Total CPU to GPU shifting time: {}\n".format(cpu_to_gpu_time))
        print("Total GPU to CPU shifting time: {}\n".format(gpu_to_cpu_time))
        print("Total batch creation time: {}\n".format(batch_creating_time))
        print("Total merging time: {}\n".format(merging_time))
        print("Total CUDA clear time: {}\n".format(cuda_clear_time))
        print("Total time: {}\n".format(
            total_EDSR_time + cpu_to_gpu_time + gpu_to_cpu_time
            + batch_creating_time + cuda_clear_time + merging_time))
    return output_image, (
        total_EDSR_time,
        cpu_to_gpu_time,
        gpu_to_cpu_time,
        batch_creating_time,
        cuda_clear_time,
        merging_time,
    )
def maximum_acceptable_dimension(device, logger, model,
                                 max_unacceptable_dimension, model_name="EDSR"):
    """
    Get the maximum acceptable dimension

    Parameters
    ----------
    device : str
        device type.
    logger : logger
        keeps logs.
    model : torch.nn.Module
        SR model (may be None; the actual run happens in a subprocess).
    max_unacceptable_dimension : int
        maximum unacceptable dimension which is a power of 2.
    model_name : str, optional
        SR model name. The default is "EDSR".

    Returns
    -------
    last : int
        maximum acceptable dimension.
    """
    print("\nGetting maximum acceptable dimension...\n")
    result2 = {}
    dimension = max_unacceptable_dimension
    maxm = math.inf
    minm = -math.inf
    last = 0
    last_used_memory = 0
    iteration = 0
    while True:
        # Printing iteration status
        iteration += 1
        _, used_memory, _ = ut.get_gpu_details(device, None, logger,
                                               print_details=False)
        leaked_memory = (used_memory - last_used_memory
                         if used_memory > last_used_memory else 0)
        print(
            "Patch Dimension: {:04}x{:04} | Used Memory: {:09.3f} | "
            "Leaked Memory: {:09.3f} | Iteration: {}".format(
                dimension, dimension, used_memory, leaked_memory, iteration))
        last_used_memory = used_memory

        # Clearing cuda cache
        ut.clear_cuda(None, None)

        # Binary search
        if last == dimension:
            break
        process_output = subprocess.run(
            ["python3", "binarysearch_helper.py", str(dimension), model_name],
            stdout=subprocess.PIPE,
            text=True,
        )
        if process_output.returncode == 0:
            # helper succeeded: the dimension fits, so search upward
            out = process_output.stdout.split("\n")
            total_time = out[0]
            last = dimension
            if dimension in result2.keys():
                result2[dimension].append(total_time)
            else:
                result2[dimension] = [total_time]
            minm = copy.copy(dimension)
            if maxm == math.inf:
                dimension *= 2
            else:
                dimension = dimension + (maxm - minm) // 2
            ut.clear_cuda(None, None)
        else:
            # helper failed (out of memory): search downward
            ut.get_gpu_details(
                device,
                "Runtime error for dimension: {}x{}".format(dimension, dimension),
                logger,
            )
            maxm = copy.copy(dimension)
            if dimension in result2.keys():
                result2[dimension].append(math.inf)
            else:
                result2[dimension] = [math.inf]
            if minm == -math.inf:
                dimension = dimension // 2
            else:
                dimension = minm + (maxm - minm) // 2
            ut.clear_cuda(None, None)
    return last
def build_onnx_trt(model_name, patch_dim, use_precision, verbose):
    """Export the SR model to ONNX and build a TensorRT engine from it,
    shrinking the patch dimension until the export fits in GPU memory."""
    if patch_dim is None:
        config = toml.load("../config.toml")
        patch_dim = int(config["max_dim"])
    else:
        patch_dim = int(patch_dim)

    # PyTorch to ONNX model
    if verbose:
        print("Building ONNX model from the PyTorch model...")
    onnx_model_name = f"{model_name.lower()}_{use_precision}_{patch_dim}.onnx"
    command1 = [
        "python3", "onnx_model_builder.py",
        str(model_name), str(patch_dim), str(onnx_model_name),
    ]
    while True:
        process_output = subprocess.run(
            command1,
            stdout=subprocess.PIPE,
            text=True,
        )
        if process_output.returncode != 0:
            # out of memory: retry with a smaller patch dimension
            ut.clear_cuda(None, None)
            patch_dim -= 1
            print("Memory out. Decreasing patch size. New patch_size = {}".format(
                patch_dim))
            onnx_model_name = f"{model_name.lower()}_{use_precision}_{patch_dim}.onnx"
            command1 = [
                "python3", "onnx_model_builder.py",
                str(model_name), str(patch_dim), str(onnx_model_name),
            ]
        else:
            ut.clear_cuda(None, None)
            # persist the working dimension for the linear search
            config = toml.load("../config.toml")
            config["max_dim"] = patch_dim
            with open("../config.toml", "w") as f:
                toml.dump(config, f)
            break

    # ONNX to TRT engine
    if verbose:
        print("Building TRT engine from the ONNX model...")
    trt_model = ("inference_models/"
                 + f"{model_name.lower()}_{use_precision}_{patch_dim}.trt")
    if use_precision == "fp32":
        command2 = ("python3 onnx_trt_util.py inference_models/"
                    + onnx_model_name + " " + trt_model + " 0")
    elif use_precision == "fp16":
        command2 = ("python3 onnx_trt_util.py inference_models/"
                    + onnx_model_name + " " + trt_model + " 1")
    else:
        raise Exception("Unknown precision: {}".format(use_precision))
    subprocess.run(command2, shell=True)
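# A minimal usage sketch for the ONNX/TRT build step, assuming the binary
# search already wrote "max_dim" into ../config.toml:
#
#     build_onnx_trt("EDSR", None, "fp16", verbose=True)
#
# This emits inference_models/edsr_fp16_<dim>.onnx and .trt, shrinking <dim>
# one pixel at a time until the export fits in GPU memory, and writes the
# final dimension back to ../config.toml.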
def maximum_unacceptable_dimension_2n(device, logger, start_dim=2,
                                      model_name="EDSR"):
    """
    Get the maximum unacceptable dimension which is a power of 2

    Parameters
    ----------
    device : str
        device type.
    logger : logger
        keeps logs.
    start_dim : int, optional
        dimension to start doubling from. The default is 2.
    model_name : str, optional
        SR model name. The default is "EDSR".

    Returns
    -------
    last_dimension : int
        smallest unacceptable dimension that is a power of 2.
    """
    print("\nGetting maximum unacceptable dimension which is a power of two...\n")
    result1 = {}
    last_dimension = 0
    dimension = start_dim
    last_used_memory = 0
    iteration = 0
    while True:
        # Printing loop status
        iteration += 1
        _, used_memory, _ = ut.get_gpu_details(device, None, logger,
                                               print_details=False)
        leaked_memory = (used_memory - last_used_memory
                         if used_memory > last_used_memory else 0)
        print(
            "Patch Dimension: {:04}x{:04} | Used Memory: {:09.3f} | "
            "Leaked Memory: {:09.3f} | Iteration: {}".format(
                dimension, dimension, used_memory, leaked_memory, iteration))
        last_used_memory = used_memory

        # Calling the SR model on the current dimension in a fresh process
        process_output = subprocess.run(
            ["python3", "binarysearch_helper.py", str(dimension), model_name],
            stdout=subprocess.PIPE,
            text=True,
        )
        if process_output.returncode == 0:
            # dimension fits: record the timing and double it
            out = process_output.stdout.split("\n")
            total_time = out[0]
            if dimension in result1.keys():
                result1[dimension].append(total_time)
            else:
                result1[dimension] = [total_time]
            dimension *= 2
        else:
            # dimension does not fit: stop here
            ut.get_gpu_details(
                device,
                "Runtime error for dimension: {}x{}".format(dimension, dimension),
                logger,
            )
            if dimension in result1.keys():
                result1[dimension].append(math.inf)
            else:
                result1[dimension] = [math.inf]
            last_dimension = dimension
            ut.clear_cuda(None, None)
            break
    return last_dimension
def do_binary_search(model_name, start_dim):
    """
    Run the two-phase binary search for the given model and persist the
    maximum acceptable patch dimension.

    Returns
    -------
    None.
    """
    # Print the header banner
    banner = pyfiglet.figlet_format("Binary Search: " + model_name)
    print(banner)

    # Getting logger
    logger = ut.get_logger()

    # Check whether the model is valid
    if model_name not in ["EDSR", "RRDB"]:
        logger.exception("{} model is unknown".format(model_name))
        raise Exception("Unknown model...")

    # Device type: cpu or cuda
    device = ut.get_device_type()
    if device == "cpu" and model_name not in ["EDSR"]:
        logger.exception("{} model cannot be run in CPU".format(model_name))
        raise Exception("{} model cannot be run in CPU".format(model_name))

    # Device information
    _, device_name = ut.get_device_details()
    if device == "cuda":
        logger.info("Device: {}, Device Name: {}".format(device, device_name))
        ut.get_gpu_details(
            device,
            "Before binary search: {}".format(model_name),
            logger,
            print_details=True,
        )
    else:
        logger.info("Device: {}, Device Name: {}".format(device, device_name))

    # Clearing cuda cache
    ut.clear_cuda(None, None)

    # Phase 1: the smallest power-of-2 dimension that fails
    max_unacceptable_dimension = maximum_unacceptable_dimension_2n(
        device, logger, start_dim=start_dim, model_name=model_name)
    print("\nMaximum unacceptable dimension: {}\n".format(
        max_unacceptable_dimension))

    # Clearing cuda cache
    ut.clear_cuda(None, None)

    # Phase 2: bisect down to the maximum acceptable dimension
    max_dim = maximum_acceptable_dimension(device, logger, None,
                                           max_unacceptable_dimension,
                                           model_name=model_name)
    print("\nMaximum acceptable dimension: {}\n".format(max_dim))

    # Clearing cuda cache
    ut.clear_cuda(None, None)

    # For batch processing
    config = toml.load("../batch_processing.toml")
    config["end_patch_dimension"] = max_dim
    with open("../batch_processing.toml", "w") as f:
        toml.dump(config, f)

    # For linear search
    config = toml.load("../config.toml")
    config["max_dim"] = max_dim
    with open("../config.toml", "w") as f:
        toml.dump(config, f)
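# The two-phase search above is easier to see without a GPU. A minimal,
# self-contained sketch with a mocked "fits in memory" predicate standing in
# for the binarysearch_helper subprocess (illustrative only; the real code
# also records timings and clears CUDA between probes):
def _demo_two_phase_search(fits, start_dim=2):
    """Phase 1 doubles until failure; phase 2 bisects the (ok, fail) bounds."""
    dim = start_dim
    while fits(dim):  # phase 1: maximum_unacceptable_dimension_2n
        dim *= 2
    maxm, minm, last = dim, dim // 2, dim // 2
    while True:  # phase 2: maximum_acceptable_dimension
        dim = minm + (maxm - minm) // 2
        if dim == last:
            return last
        if fits(dim):
            minm = last = dim
        else:
            maxm = dim

# Example: _demo_two_phase_search(lambda d: d <= 100) returns 100.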
def do_linear_search(test=False, test_dim=32):
    """
    Profile every patch dimension from 1 up to the maximum acceptable one and
    plot/save the timing and memory statistics.

    Returns
    -------
    None.
    """
    logger = ut.get_logger()
    device = "cuda"
    model_name = "EDSR"
    config = toml.load("../config.toml")
    run = config["run"]
    scale = int(config["scale"]) if config["scale"] else 4

    # Device information
    _, device_name = ut.get_device_details()
    total, _, _ = ut.get_gpu_details(
        device, "\nDevice info:", logger, print_details=False
    )
    log_message = (
        "\nDevice: " + device
        + "\tDevice name: " + device_name
        + "\tTotal memory: " + str(total)
    )
    logger.info(log_message)

    ut.clear_cuda(None, None)

    state = "Before loading model: "
    total, used, _ = ut.get_gpu_details(device, state, logger, print_details=True)
    model = md.load_edsr(device=device)
    state = "After loading model: "
    total, used, _ = ut.get_gpu_details(device, state, logger, print_details=True)

    config = toml.load("../config.toml")
    max_dim = int(config["max_dim"])
    if not test:
        detailed_result, memory_used, memory_free = result_from_dimension_range(
            device, logger, config, model, 1, max_dim
        )
    else:
        detailed_result, memory_used, memory_free = result_from_dimension_range(
            device, logger, config, model, test_dim, test_dim
        )
    if not test:
        # Get mean and standard deviation of every series
        mean_time, std_time = ut.get_mean_std(detailed_result)
        mean_memory_used, std_memory_used = ut.get_mean_std(memory_used)
        mean_memory_free, std_memory_free = ut.get_mean_std(memory_free)

        # Make folder for saving results
        plt_title = "Model: {} | GPU: {} | Memory: {} MB".format(
            model_name, device_name, total
        )
        date = "_".join(str(time.ctime()).split())
        date = "_".join(date.split(":"))
        foldername = date
        os.mkdir("results/" + foldername)

        # Plot data
        ut.plot_data(
            foldername,
            "dimension_vs_meantime",
            mean_time,
            "Dimension n of Patch(nxn)",
            "Mean Processing Time: LR -> SR, Scale: {} ( {} runs )".format(scale, run),
            mode="mean time",
            title=plt_title,
        )
        ut.plot_data(
            foldername,
            "dimension_vs_stdtime",
            std_time,
            "Dimension n of Patch(nxn)",
            "Std of Processing Time: LR -> SR, Scale: {} ( {} runs )".format(
                scale, run
            ),
            mode="std time",
            title=plt_title,
        )
        ut.plot_data(
            foldername,
            "dimension_vs_meanmemoryused",
            mean_memory_used,
            "Dimension n of Patch(nxn)",
            "Mean Memory used: LR -> SR, Scale: {} ( {} runs )".format(scale, run),
            mode="mean memory used",
            title=plt_title,
        )
        ut.plot_data(
            foldername,
            "dimension_vs_stdmemoryused",
            std_memory_used,
            "Dimension n of Patch(nxn)",
            "Std Memory Used: LR -> SR, Scale: {} ( {} runs )".format(scale, run),
            mode="std memory used",
            title=plt_title,
        )
        ut.plot_data(
            foldername,
            "dimension_vs_meanmemoryfree",
            mean_memory_free,
            "Dimension n of Patch(nxn)",
            "Mean Memory Free: LR -> SR, Scale: {} ( {} runs )".format(scale, run),
            mode="mean memory free",
            title=plt_title,
        )
        ut.plot_data(
            foldername,
            "dimension_vs_stdmemoryfree",
            std_memory_free,
            "Dimension n of Patch(nxn)",
            "Std Memory Free: LR -> SR, Scale: {} ( {} runs )".format(scale, run),
            mode="std memory free",
            title=plt_title,
        )

        # Save data
        ut.save_csv(
            foldername,
            "total_stat",
            device,
            device_name,
            total,
            mean_time,
            std_time,
            mean_memory_used,
            std_memory_used,
            mean_memory_free,
            std_memory_free,
        )
def result_from_dimension_range(device, logger, config, model, first, last):
    """
    Get a detailed result for every dimension from first to last

    Parameters
    ----------
    device : str
        device type.
    logger : logger
        keeps logs.
    config : dict
        loaded configuration; config["run"] is the number of runs to average.
    model : torch.nn.Module
        SR model.
    first : int
        starting dimension.
    last : int
        last acceptable dimension.

    Returns
    -------
    result3 : dictionary
        time for every dimension.
    memory_used : dictionary
        memory used per dimension.
    memory_free : dictionary
        memory free per dimension.
    """
    run = config["run"]
    print("\nPreparing detailed data... ")
    result3 = {}
    memory_used = {}
    memory_free = {}
    for i in range(run):
        print("\nRun: ", i + 1)
        print()
        for dim in tqdm(range(first, last + 1)):
            dimension = dim
            input_image = ut.random_image(dimension)
            input_image = input_image.to(device)
            with torch.no_grad():
                try:
                    print("\n")
                    print(input_image.shape)
                    print(input_image[0, 0, 0, 0:5])
                    start = time.time()
                    output_image = model(input_image)
                    end = time.time()
                    total_time = end - start
                    print("Processing time: ", total_time)
                    print("\n")
                    if dimension in result3.keys():
                        result3[dimension].append(total_time)
                        _, used, free = ut.get_gpu_details(
                            device, "", None, print_details=False
                        )
                        memory_used[dimension].append(used)
                        memory_free[dimension].append(free)
                    else:
                        result3[dimension] = [total_time]
                        _, used, free = ut.get_gpu_details(
                            device, "", None, print_details=False
                        )
                        memory_used[dimension] = [used]
                        memory_free[dimension] = [free]
                    ut.clear_cuda(input_image, output_image)
                except RuntimeError:
                    logger.exception("\nDimension NOT OK!")
                    state = "\nGPU usage after dimension exception...\n"
                    ut.get_gpu_details(device, state, logger, print_details=True)
                    output_image = None
                    ut.clear_cuda(input_image, output_image)
                    state = ("\nGPU usage after clearing the image "
                             f"{dimension}x{dimension}...\n")
                    ut.get_gpu_details(device, state, logger, print_details=True)
                    break
    ut.clear_cuda(None, None)
    subprocess.run("gpustat", shell=True)
    return result3, memory_used, memory_free
def forward_chop_iterative(x, model=None, shave=10, min_size=1024, device="cuda", print_result=True): dim = int(math.sqrt(min_size)) # getting patch dimension b, c, h, w = x.size() # current image batch, channel, height, width device = device patch_count = 0 output = torch.tensor(np.zeros((b, c, h * 4, w * 4))) total_time = 0 total_crop_time = 0 total_shift_time = 0 total_clear_time = 0 if device == "cuda": x = x.to(device) new_i_s = 0 for i in range(0, h, dim - 2 * shave): new_j_s = 0 new_j_e = 0 for j in range(0, w, dim - 2 * shave): patch_count += 1 h_s, h_e = i, min(h, i + dim) # patch height start and end w_s, w_e = j, min(w, j + dim) # patch width start and end lr = x[:, :, h_s:h_e, w_s:w_e] with torch.no_grad(): # EDSR processing start = time.time() sr = model(lr) end = time.time() processing_time = end - start total_time += processing_time # new cropped patch's dimension (h and w) n_h_s, n_h_e, n_w_s, n_w_e = 0, 0, 0, 0 n_h_s = 0 if h_s == 0 else (shave * 4) n_h_e = ((h_e - h_s) * 4) if h_e == h else (((h_e - h_s) - shave) * 4) new_i_e = new_i_s + n_h_e - n_h_s n_w_s = 0 if w_s == 0 else (shave * 4) n_w_e = ((w_e - w_s) * 4) if w_e == w else (((w_e - w_s) - shave) * 4) new_j_e = new_j_e + n_w_e - n_w_s # corpping image in crop_start = time.time() sr_small = sr[:, :, n_h_s:n_h_e, n_w_s:n_w_e] crop_end = time.time() crop_time = crop_end - crop_start total_crop_time += crop_time # ============================================================================= # print('Crop time: ', crop_time) # ============================================================================= shift_start = time.time() if device == "cuda": sr_small = sr_small.to("cpu") shift_end = time.time() shift_time = shift_end - shift_start total_shift_time += shift_time # ============================================================================= # print('Shift time: ', shift_time) # ============================================================================= output[:, :, new_i_s:new_i_e, new_j_s:new_j_e] = sr_small del sr_small if w_e == w: break new_j_s = new_j_e clear_start = time.time() if device == "cuda": ut.clear_cuda(lr, sr) clear_end = time.time() clear_time = clear_end - clear_start total_clear_time += clear_time new_i_s = new_i_e if h_e == h: break if print_result == True: print("Patch dimension: {}x{}".format(dim, dim)) print("Total pacthes: ", patch_count) print("Total EDSR Processing time: ", total_time) print("Total crop time: ", total_crop_time) print("Total shift time: ", total_shift_time) print("Total clear time: ", total_clear_time) return output, total_time, total_crop_time, total_shift_time, total_clear_time