def depth_Estimation(args):
    """Predict a depth map for one image using pretrained encoder/decoder weights.

    Reads ``args.model_name`` and ``args.image_path``. The networks run on the
    CPU. The predicted disparity is resized to the input resolution, converted
    with ``disp_to_depth`` and shown next to the RGB image via ``plot``.

    Returns:
        tuple: ``(rgb, depth)`` where ``rgb`` is the image as read by OpenCV
        (converted to RGB channel order) and ``depth`` is the converted depth
        array reshaped to the image's height and width.
    """
    model_name = args.model_name

    # Setting up the network
    print("Loading model....")
    download_model_if_doesnt_exist(model_name)
    encoder_path = os.path.join("models", model_name, "encoder.pth")
    depth_decoder_path = os.path.join("models", model_name, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
    # The checkpoint also carries non-weight entries (e.g. 'height'/'width'),
    # so keep only keys the encoder knows. Build the state dict once rather
    # than once per key.
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)

    loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
    depth_decoder.load_state_dict(loaded_dict)

    encoder.eval()
    depth_decoder.eval()

    # Loading image
    print("Loading image....")
    image_path = args.image_path
    input_image = pil.open(image_path).convert('RGB')
    original_width, original_height = input_image.size

    # Resize to the resolution stored alongside the encoder weights.
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    input_image_resized = input_image.resize((feed_width, feed_height), pil.LANCZOS)
    input_image_pytorch = transforms.ToTensor()(input_image_resized).unsqueeze(0)

    # Prediction of disparity image
    with torch.no_grad():
        features = encoder(input_image_pytorch)
        outputs = depth_decoder(features)
    disp = outputs[("disp", 0)]

    # Interpolate the disparity back to the resolution of the input image.
    disp_resized = torch.nn.functional.interpolate(
        disp, (original_height, original_width), mode="bilinear", align_corners=False)
    disp_resized_np = disp_resized.squeeze().cpu().numpy()
    print("resized disp" + str(disp_resized_np.shape))

    print("Range of Depth in image")
    # Convert disparity to depth using the bounds 0.1 and 1000.
    _, dep = disp_to_depth(disp_resized_np, 0.1, 1000)
    print("min->" + str(dep.min()) + "mx->" + str(dep.max()))

    # Preview of the RGB and depth images
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    depth = dep.reshape((rgb.shape[0], rgb.shape[1]), order='C')
    plot(rgb, depth)
    return rgb, depth
def __init__(self, model_path=os.path.dirname(__file__) + "/models/", model_name="stereo_640x192"):
    """Load the pretrained encoder/decoder and register the ROS endpoints.

    Args:
        model_path: Directory that contains (or will receive) the model
            folders. A trailing path separator is optional.
        model_name: Name of the model folder inside ``model_path``.

    Side effects:
        Disables autograd globally via ``torch.set_grad_enabled(False)``,
        calls ``download_model_if_doesnt_exist`` for the model, and creates
        the ``MonoDepthService`` service and the ``mono_depth_img`` publisher.
    """
    self.model_name = model_name
    # os.path.join rather than string concatenation, so a model_path given
    # without a trailing separator still resolves to the right folder.
    self.model_path = os.path.join(model_path, self.model_name)

    torch.set_grad_enabled(False)

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    download_model_if_doesnt_exist(self.model_name, self.model_path)
    self.encoder_path = os.path.join(self.model_path, "encoder.pth")
    self.depth_decoder_path = os.path.join(self.model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    self.encoder = ResnetEncoder(18, False)
    self.loaded_dict_enc = torch.load(self.encoder_path, map_location=self.device)

    # Extract the height and width of image that this model was trained with
    self.feed_height = self.loaded_dict_enc['height']
    self.feed_width = self.loaded_dict_enc['width']

    # Keep only checkpoint entries that are encoder parameters. The state
    # dict is built once instead of once per checkpoint key.
    encoder_keys = self.encoder.state_dict()
    self.filtered_dict_enc = {
        k: v
        for k, v in self.loaded_dict_enc.items() if k in encoder_keys
    }
    self.encoder.load_state_dict(self.filtered_dict_enc)
    self.encoder.to(self.device)
    self.encoder.eval()

    self.depth_decoder = DepthDecoder(num_ch_enc=self.encoder.num_ch_enc,
                                      scales=range(4))
    self.loaded_dict = torch.load(self.depth_decoder_path, map_location=self.device)
    self.depth_decoder.load_state_dict(self.loaded_dict)
    self.depth_decoder.to(self.device)
    self.depth_decoder.eval()

    # Set up service calls
    self.mono_depth_service = rospy.Service(
        "MonoDepthService", MonoDepth, self.mono_depth_service_callback)
    self.mono_depth_publisher = rospy.Publisher('mono_depth_img', Image, queue_size=10)
def setup_network(model_name="mono_640x192"):
    """Build the encoder and depth decoder and load their pretrained weights.

    The weights are read from ``models/<model_name>/`` after
    ``download_model_if_doesnt_exist`` has been called for the model. Both
    networks are loaded onto the CPU and switched to eval mode.

    Returns:
        tuple: ``(encoder, depth_decoder, loaded_dict_enc)`` where
        ``loaded_dict_enc`` is the unfiltered encoder checkpoint dictionary.
    """
    download_model_if_doesnt_exist(model_name)
    weights_dir = os.path.join("models", model_name)

    # Instantiate both networks before touching the checkpoints.
    encoder = networks.ResnetEncoder(18, False)
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    # Encoder: the checkpoint may hold entries that are not parameters of
    # the encoder, so copy across only the keys it recognises.
    loaded_dict_enc = torch.load(os.path.join(weights_dir, "encoder.pth"),
                                 map_location='cpu')
    encoder_weights = {}
    for key, value in loaded_dict_enc.items():
        if key in encoder.state_dict():
            encoder_weights[key] = value
    encoder.load_state_dict(encoder_weights)

    # Decoder: the checkpoint is loaded as-is.
    decoder_weights = torch.load(os.path.join(weights_dir, "depth.pth"),
                                 map_location='cpu')
    depth_decoder.load_state_dict(decoder_weights)

    for net in (encoder, depth_decoder):
        net.eval()

    return encoder, depth_decoder, loaded_dict_enc
def _recv_exact(conn, num_bytes):
    """Read exactly ``num_bytes`` bytes from the connected socket ``conn``.

    A single ``recv`` call may return fewer bytes than requested, so keep
    reading until the whole payload has arrived.

    Returns:
        bytearray | None: the payload, or ``None`` if the peer closed the
        connection before everything was received.
    """
    payload = bytearray()
    while len(payload) < num_bytes:
        chunk = conn.recv(num_bytes - len(payload))
        if not chunk:
            return None
        payload.extend(chunk)
    return payload


def test_cam(args):
    """Predict depth for encoded images received over a TCP socket.

    The first iteration runs on ``assets/test_image.jpg``. After that the
    function accepts one client on port 5015 and, per frame, expects a header
    starting with ``SIZE`` followed by the payload length, answers ``yes`` and
    then reads that many bytes of encoded image data. For every frame a 3x4
    grid of average depths is computed and a coarse obstacle warning is
    printed. Pressing 'q' ends the loop.

    Reads ``args.model_name``, ``args.no_cuda``, ``args.no_display``,
    ``args.no_process`` and ``args.no_blend``.

    Raises:
        OSError: if the listening socket cannot be bound.
    """
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    image_display = None
    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    def on_release(key):
        nonlocal quit_inference
        if key == keyboard.KeyCode.from_char('q'):
            quit_inference = True
            return False  # returned to the keyboard listener on 'q'

    keyboard.Listener(on_release=on_release).start()

    # Number of frames to capture to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)

    host = "0.0.0.0"
    port = 5015
    server = socket.socket()
    conn = None
    try:
        try:
            server.bind((host, port))
        except socket.error as e:
            # Without a bound socket no client can ever connect, so report
            # the error and stop instead of continuing.
            print(str(e))
            raise
        server.listen(10)
        print("Socket setup")

        first_loop = True
        with torch.no_grad():
            print("Loop has started")
            while True:
                if quit_inference:
                    if args.no_display:
                        print('-> Done')
                    break

                if first_loop:
                    frame = cv2.imread('assets/test_image.jpg')
                    print("Read test image")
                    first_loop = False
                elif conn is None:
                    # Also used after a lost connection: wait for the next
                    # client on the listening socket.
                    conn, addr = server.accept()
                    print("Connected to: ", addr[0], ":", addr[1])
                    continue
                else:
                    try:
                        # The header is read with a single recv because it
                        # may be shorter than 11 bytes.
                        header = conn.recv(11)
                        if not header:
                            raise ConnectionError("peer closed the connection")
                        if not header.startswith(b'SIZE'):
                            print("Ignoring unexpected header: " + str(header))
                            continue
                        payload_size = int(header.split()[1])
                        conn.sendall("yes".encode())
                        payload = _recv_exact(conn, payload_size)
                        if payload is None:
                            raise ConnectionError("peer closed the connection")
                    except (IndexError, ValueError):
                        print("Malformed SIZE header: " + str(header))
                        continue
                    except socket.error:
                        print("Connection lost, reconnecting")
                        conn.close()
                        conn = None
                        continue

                    if payload:
                        frame = cv2.imdecode(
                            np.frombuffer(payload, dtype=np.uint8),
                            cv2.IMREAD_COLOR)
                    else:
                        frame = None

                if frame is None:
                    # imread/imdecode return None for missing or undecodable data.
                    print("Could not decode frame, skipping")
                    continue

                # Calculate the fps
                print("Got frame")
                curr_time[1:] = curr_time[:-1]
                curr_time[0] = time.time()
                fps = num_frames / (curr_time[0] - curr_time[-1])

                # OpenCV delivers BGR; convert so the network sees RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_image = pil.fromarray(rgb_frame).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height),
                                                 pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                print("Prediction starting")
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="nearest")

                # Get the predicted depth
                _, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
                pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

                # Initialize a 3x4 depth map
                depth_map = np.zeros([3, 4])
                grid_width = original_width // 4
                grid_height = original_height // 3
                # NOTE(review): i indexes the 3 rows but is scaled by
                # grid_width (and j by grid_height). Confirm against
                # get_avg_depth's argument order before changing.
                for i in range(depth_map.shape[0]):
                    for j in range(depth_map.shape[1]):
                        depth_map[i][j] = get_avg_depth(pred_depth_np,
                                                        grid_width * i,
                                                        grid_height * j,
                                                        grid_width * (i + 1),
                                                        grid_height * (j + 1))

                # Giving a simple decision logic
                if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or depth_map[
                        0, 2] <= 1 or depth_map[1, 2] <= 1:
                    if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                        print("Dangerous!!! AHEAD")
                    else:
                        if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                            print("Dangerous!!! LEFT")
                        if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                            print("Dangerous!!! RIGHT")
                else:
                    left_sum = np.sum(depth_map[0:2, 0:1])
                    right_sum = np.sum(depth_map[0:2, 2:3])
                    # Check both sides here; testing the right side twice
                    # would report "Clear" for a left-only hazard.
                    if left_sum <= 7 or right_sum <= 7:
                        if left_sum <= 7:
                            print("Careful!! LEFT")
                        if right_sum <= 7:
                            print("Careful!! RIGHT")
                    else:
                        print("Clear")

                if not args.no_display:
                    # DISPLAY
                    # Generate color-mapped depth image
                    disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                    image_display.display(frame,
                                          disp_resized_np,
                                          fps,
                                          original_width,
                                          original_height,
                                          blended=not args.no_blend)
                else:
                    print(f"FPS: {fps}")
    finally:
        # When everything is done, release the network resources.
        if conn is not None:
            conn.close()
        server.close()
def test_simple(args):
    """Function to predict for a single image or folder of images.

    Reads ``args.model_name``, ``args.no_cuda``, ``args.image_path``,
    ``args.ext`` and ``args.dump_path``. For every input image it writes
    ``<name>_disp.npy`` (scaled disparity at network resolution) and
    ``<name>_disp.jpeg`` (colormapped disparity at input resolution) into the
    output directory, which is ``args.dump_path`` when set and otherwise the
    image's own directory.

    If ``<image stem>_depth.npy`` exists next to an image, the root-mean-square
    difference between the min-max-normalised reference and prediction
    (divided by 255) is added to a running total that is printed under the
    label ``mse`` at the end.

    Raises:
        Exception: if ``args.image_path`` is neither a file nor a directory.
    """
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc["height"]
    feed_width = loaded_dict_enc["width"]
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = (os.path.dirname(args.image_path)
                            if not args.dump_path else args.dump_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, "*.{}".format(args.ext)))
        output_directory = args.image_path if not args.dump_path else args.dump_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    # A dump path may not exist yet. An empty string means the current
    # directory, which needs no creation.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        mse = 0
        for idx, image_path in enumerate(paths):

            # Don't try to predict disparity for a disparity image. Outputs
            # are written as "_disp.jpeg", so that suffix must be skipped too.
            if image_path.endswith(("_disp.jpg", "_disp.jpeg")):
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert("RGB")
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp,
                (original_height, original_width),
                mode="bilinear",
                align_corners=False,
            )

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            vmin = disp_resized_np.min()
            normalizer = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap="magma")
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            # Calc error
            # Replace only the final extension. A regex over the whole path
            # would also rewrite dotted directory or file-name components.
            correct_file = os.path.splitext(image_path)[0] + "_depth.npy"
            if os.path.exists(correct_file):
                correct = np.load(correct_file)[:, :, 0]
                disp_np = disp_resized.cpu().detach().numpy()
                disp_np = disp_np[0, 0, :, :]
                # Min-max normalise both arrays to the range 0..255.
                correct = ((correct - correct.min()) /
                           (correct.max() - correct.min()) * 255)
                disp_np = ((disp_np - disp_np.min()) /
                           (disp_np.max() - disp_np.min()) * 255)
                mse = mse + ((correct - disp_np)**2).mean()**0.5 / 255

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print(f"mse: {mse}")
    print("-> Done!")
def test_simple(args):
    """Function to predict for a single image or folder of images.

    Reads ``args.model_name``, ``args.no_cuda``, ``args.image_path`` and
    ``args.ext``. For every input image it writes three files next to the
    input: ``<name>_disp.npy`` (scaled disparity), ``<name>_disp.ply`` (an
    ASCII coloured point cloud built from the depth map) and
    ``<name>_disp.jpeg`` (colormapped disparity).

    Raises:
        AssertionError: if ``args.model_name`` is ``None``.
        Exception: if ``args.image_path`` is neither a file nor a directory.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    # [fx, fy, cx, cy] in pixels, fixed for a 1242x375 image.
    # See datasets/kitti_dataset.py
    # TODO: improve loading intrinsics from file ?
    # NOTE(review): these values are applied regardless of the actual image
    # size; confirm that is intended for non-1242x375 inputs.
    camera_intrinsics_px = [1242*0.58, 375*1.92, 1242*0.5, 375*0.5]
    fx, fy, cx, cy = camera_intrinsics_px

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpeg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image_original = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image_original.size
            input_image = input_image_original.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
            scaled_disp, depth = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Save PLY pointcloud from depth map
            # "nearest" on purpose: do not interpolate depth values.
            depth_resized = torch.nn.functional.interpolate(
                depth, (original_height, original_width), mode="nearest")
            depth_resized_np = depth_resized.cpu().numpy()[0][0]

            # Collect one text line per point and join once at the end;
            # repeated string concatenation per pixel is quadratic.
            ply_lines = []
            for v in range(original_height):
                for u in range(original_width):
                    d = depth_resized_np[v][u]
                    if d <= 0.0:
                        continue
                    r, g, b = input_image_original.getpixel((u, v))
                    # Back-project pixel (u, v) at depth d.
                    x = d * (float(u) - cx) / fx
                    y = d * (float(v) - cy) / fy
                    z = d * 1.0
                    ply_lines.append(" ".join(map(str, (x, y, z, r, g, b))) + "\n")

            plyhead = "".join([
                "ply\n",
                "format ascii 1.0\n",
                "element vertex " + str(len(ply_lines)) + "\n",
                "property float x\n",
                "property float y\n",
                "property float z\n",
                "property uchar red\n",
                "property uchar green\n",
                "property uchar blue\n",
                "end_header\n",
            ])
            ply_path = os.path.join(output_directory, "{}_disp.ply".format(output_name))
            # The context manager closes the file even if writing fails.
            with open(ply_path, "w") as ply_file:
                ply_file.write(plyhead + "".join(ply_lines) + "\n")

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".format(
                idx + 1, len(paths), name_dest_im))

    print('-> Done!')
def test_simple():
    """Predict disparity for one hard-coded image and save the results.

    Loads the ``mono_1024x320`` weights, runs a single pass of the processing
    loop on a fixed image path and writes ``<name>_disp.npy`` and
    ``<name>_disp.jpg`` next to that image. The elapsed time of the pass is
    printed at the end.
    """
    # model_name='mono_640x192'
    model_name = 'mono_1024x320'
    no_cuda = False

    if torch.cuda.is_available() and not no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # TODO: remove this local flag and declare the global variables at the
    # top of the namespace instead.
    run_glob = True

    # Flight loop
    while run_glob:  # TODO: make this a global variable.
        # TODO: do the imwrite here and define the path.
        t = time.time()
        path_ = '/home/edern/Documents/TIPE/traitement/mesures/test_image_000062.jpg'
        output_directory = os.path.dirname(path_)

        with torch.no_grad():
            if path_.endswith("_disp.jpg"):
                # Don't try to predict disparity for a disparity image. The
                # path does not change between iterations, so leave the loop;
                # `continue` here would spin forever.
                break

            # Load image and preprocess
            input_image = pil.open(path_).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(path_))[0]
            # MAP ?
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

        run_glob = False  # tests only
        print('-> Done in {}'.format(time.time() - t))
import glob
import os
import re

import cv2
import torch
from google.colab.patches import cv2_imshow
from PIL import Image

import networks
from utils import download_model_if_doesnt_exist

# Name of the pretrained model; its weights are expected under models/<model_name>/.
model_name = "mono_640x192"

download_model_if_doesnt_exist(model_name)
encoder_path = os.path.join("models", model_name, "encoder.pth")
depth_decoder_path = os.path.join("models", model_name, "depth.pth")

# LOADING PRETRAINED MODEL
encoder = networks.ResnetEncoder(18, False)
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))

loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
# Keep only checkpoint entries that are encoder parameters. The state dict is
# built once instead of once per checkpoint key.
encoder_keys = encoder.state_dict()
filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
encoder.load_state_dict(filtered_dict_enc)

loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
depth_decoder.load_state_dict(loaded_dict)

# Put both networks into inference mode; the decoder was previously left in
# training mode.
encoder.eval()
depth_decoder.eval()
def test_cam(args):
    """Function to predict for a camera image stream.

    Reads ``args.model_name``, ``args.no_cuda``, ``args.webcam``,
    ``args.no_display``, ``args.no_process`` and ``args.no_blend``. For every
    frame a 3x4 grid of average depths is computed and a coarse obstacle
    warning is printed. Pressing 'q' ends the loop. The video stream is
    stopped when the function exits, including on error.
    """
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Initialize camera to capture image stream
    # Change the value to 0 when using default camera
    video_stream = WebcamVideoStream(src=args.webcam).start()

    try:
        if not args.no_display:
            # Object to display images
            image_display = DisplayImage(not args.no_process)

        # Flag that records when 'q' is pressed to break out of inference loop below
        quit_inference = False

        def on_release(key):
            nonlocal quit_inference
            if key == keyboard.KeyCode.from_char('q'):
                quit_inference = True
                return False  # returned to the keyboard listener on 'q'

        keyboard.Listener(on_release=on_release).start()

        # Number of frames to capture to calculate fps
        num_frames = 5
        curr_time = np.zeros(num_frames)

        with torch.no_grad():
            while True:
                if quit_inference:
                    if args.no_display:
                        print('-> Done')
                    break

                # Capture frame-by-frame
                frame = video_stream.read()

                # Calculate the fps
                curr_time[1:] = curr_time[:-1]
                curr_time[0] = time.time()
                fps = num_frames / (curr_time[0] - curr_time[-1])

                # Our operations on the frame come here
                # NOTE(review): if video_stream.read() returns OpenCV-style
                # BGR frames, the channels should be swapped to RGB before
                # this point. Confirm against WebcamVideoStream.
                input_image = pil.fromarray(frame).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height),
                                                 pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="nearest")

                # Get the predicted depth
                _, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
                pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

                # Initialize a 3x4 depth map
                depth_map = np.zeros([3, 4])
                for i in range(depth_map.shape[0]):
                    for j in range(depth_map.shape[1]):
                        # Cut and store the average value of depth information
                        # of 640x480 into a 3x4 grid of 160-pixel cells.
                        depth_map[i][j] = get_avg_depth(pred_depth_np, 160 * i,
                                                        160 * j, 160 * i + 160,
                                                        160 * j + 160)

                # Giving a simple decision logic
                if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or depth_map[
                        0, 2] <= 1 or depth_map[1, 2] <= 1:
                    if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                        print("Dangerous!!! AHEAD")
                    else:
                        if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                            print("Dangerous!!! LEFT")
                        if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                            print("Dangerous!!! RIGHT")
                else:
                    left_sum = np.sum(depth_map[0:2, 0:1])
                    right_sum = np.sum(depth_map[0:2, 2:3])
                    # Check both sides here; testing the right side twice
                    # would report "Clear" for a left-only hazard.
                    if left_sum <= 7 or right_sum <= 7:
                        if left_sum <= 7:
                            print("Careful!! LEFT")
                        if right_sum <= 7:
                            print("Careful!! RIGHT")
                    else:
                        print("Clear")

                if not args.no_display:
                    # DISPLAY
                    # Generate color-mapped depth image
                    disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                    image_display.display(frame,
                                          disp_resized_np,
                                          fps,
                                          original_width,
                                          original_height,
                                          blended=not args.no_blend)
                else:
                    print(f"FPS: {fps}")
    finally:
        # When everything is done (or on error), stop camera stream
        video_stream.stop()
def test_simple(args):
    """Show live disparity predictions for a camera or a video file.

    Reads ``args.model_name``, ``args.no_cuda`` and ``args.video_path``.
    ``args.video_path`` is opened as a camera index when it parses as an
    integer and as a file path otherwise. Each frame is shown in a window
    below its colormapped disparity. Pressing 'q' quits and pressing 'c'
    saves the current frame to ``assets/test_image2.jpeg``. The loop also
    ends when no further frame can be read.

    Raises:
        AssertionError: if ``args.model_name`` is ``None``.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    try:
        camera_index = int(args.video_path)
    except ValueError:
        # Not an integer, so treat it as a file path. Print the path itself;
        # converting it to int again here would raise a second time.
        cap = cv2.VideoCapture(args.video_path)
        print(f"Loaded video file {args.video_path}")
    else:
        cap = cv2.VideoCapture(camera_index)
        print(f"Loaded camera {camera_index}")

    # PREDICTING ON EACH FRAME IN TURN
    try:
        with torch.no_grad():
            while True:
                ok, image = cap.read()
                if not ok:
                    # End of the video or camera failure: no frame to process.
                    break

                # Load image and preprocess. OpenCV delivers BGR; convert so
                # the network sees RGB.
                input_image = pil.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="bilinear", align_corners=False)

                # Colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)

                # cv2.imshow expects BGR, so reverse the RGB channel axis of
                # the colormap before stacking it above the original frame.
                preview = np.concatenate((
                    colormapped_im[:, :, ::-1],
                    cv2.resize(image, tuple(colormapped_im.shape[:2][::-1])),
                ))
                cv2.imshow("", preview)
                key = cv2.waitKey(10)
                if key == ord('q'):
                    break
                elif key == ord('c'):
                    cv2.imwrite("assets/test_image2.jpeg", image)

        print('-> Done!')
    finally:
        cap.release()
        cv2.destroyAllWindows()
def __init__(self, _host_frame, _target_frame):
    """Set up the randpattern based photometric residual wrapper.

    Parses the monodepth options, loads the pretrained ``mono_1024x320``
    depth and pose networks on the CPU in eval mode, stores both frames
    together with their image gradients and defines the residual pattern.

    :param _host_frame: numpy ndarray H x W x 3 image.
    :param _target_frame: numpy ndarray image, same dimension as above.
    """
    # options
    self.opt = MonodepthOptions().parse()
    self.num_input_frames = len(self.opt.frame_ids)

    # model files
    self.model_name = "mono_1024x320"
    download_model_if_doesnt_exist(self.model_name)
    weights_dir = os.path.join("models", self.model_name)
    self.encoder_path = os.path.join(weights_dir, "encoder.pth")
    self.depth_decoder_path = os.path.join(weights_dir, "depth.pth")
    self.pose_encoder_path = os.path.join(weights_dir, "pose_encoder.pth")
    self.pose_decoder_path = os.path.join(weights_dir, "pose.pth")

    # network skeletons
    self.encoder = networks.ResnetEncoder(18, False)
    self.depth_decoder = networks.DepthDecoder(
        num_ch_enc=self.encoder.num_ch_enc, scales=range(4))
    self.pose_encoder = networks.ResnetEncoder(self.opt.num_layers, False, 2)
    self.pose_decoder = networks.PoseDecoder(
        self.pose_encoder.num_ch_enc, 1, 2)

    # depth encoder weights: keep only the entries the network knows about
    self.loaded_dict_enc = torch.load(self.encoder_path, map_location='cpu')
    depth_enc_keys = self.encoder.state_dict()
    self.filtered_dict_enc = {}
    for name, value in self.loaded_dict_enc.items():
        if name in depth_enc_keys:
            self.filtered_dict_enc[name] = value
    self.encoder.load_state_dict(self.filtered_dict_enc)

    # pose encoder weights, filtered the same way
    self.loaded_dict_pose_enc = torch.load(self.pose_encoder_path,
                                           map_location='cpu')
    pose_enc_keys = self.pose_encoder.state_dict()
    self.filtered_dict_pose_enc = {}
    for name, value in self.loaded_dict_pose_enc.items():
        if name in pose_enc_keys:
            self.filtered_dict_pose_enc[name] = value
    self.pose_encoder.load_state_dict(self.filtered_dict_pose_enc)

    # decoders are loaded unfiltered
    self.loaded_dict = torch.load(self.depth_decoder_path,
                                  map_location='cpu')
    self.depth_decoder.load_state_dict(self.loaded_dict)
    self.loaded_dict_pose = torch.load(self.pose_decoder_path,
                                       map_location='cpu')
    self.pose_decoder.load_state_dict(self.loaded_dict_pose)

    for net in (self.encoder, self.depth_decoder,
                self.pose_encoder, self.pose_decoder):
        net.eval()

    self.isgood = []

    # frames and their gradients
    self.host_frame = _host_frame
    self.target_frame = _target_frame
    host_grads = image_gradients(self.host_frame)
    self.host_frame_dx, self.host_frame_dy = host_grads
    target_grads = image_gradients(self.target_frame)
    self.target_frame_dx, self.target_frame_dy = target_grads

    # dso's pattern:
    pattern_offsets = [
        [0, 0],
        [-2, 0],
        [2, 0],
        [-1, -1],
        [1, 1],
        [-1, 1],
        [1, -1],
        [0, 2],
        [0, -2],
    ]
    self.residual_pattern = np.array(pattern_offsets)
def test_cam(args):
    """Predict depth (and, when needed, objects) for an image stream.

    Loads the depth encoder/decoder named by ``args.model_name``, builds or
    loads the TensorRT detection engine described by the module-level
    ``model`` object, then loops over frames from an ``ImageStreamThread``.
    For every frame the danger level and side are computed with
    ``predict_depth``; object detection only runs when the danger level is
    above 0. The textual result is sent back through the stream thread.
    Releasing the ``q`` key ends the loop.

    :param args: parsed options providing ``model_name``, ``no_cuda``,
        ``no_display``, ``no_process`` and ``no_blend``.
    """
    # Determine where to run inference
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Download model given in args if it doesn't exist
    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Get coco labels
    ctypes.CDLL("../TRT_object_detection/lib/libflattenconcat.so")
    COCO_LABELS = coco.COCO_CLASSES_LIST

    # initialize
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    runtime = trt.Runtime(TRT_LOGGER)

    # compile model into TensorRT (only when no serialized engine exists yet)
    if not os.path.isfile(model.TRTbin):
        dynamic_graph = model.add_plugin(gs.DynamicGraph(model.path))
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                                        model.output_name,
                                        output_filename='tmp.uff')

        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.UffParser() as parser:
            builder.max_workspace_size = 1 << 28
            builder.max_batch_size = 1
            builder.fp16_mode = True

            parser.register_input('Input', model.dims)
            parser.register_output('MarkOutput_0')
            parser.parse('tmp.uff', network)
            engine = builder.build_cuda_engine(network)

            buf = engine.serialize()
            with open(model.TRTbin, 'wb') as f:
                f.write(buf)

    # create engine
    with open(model.TRTbin, 'rb') as f:
        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

    # create one page-locked host buffer and one device buffer per binding
    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []
    stream = cuda.Stream()

    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)
    context = engine.create_execution_context()

    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    # Listener for key board presses and updates quit_inference
    def on_release(key):
        if key == keyboard.KeyCode.from_char('q'):
            nonlocal quit_inference
            quit_inference = True
            return False

    # Initialize listener
    keyboard.Listener(on_release=on_release).start()

    status_socket_thread = SocketStatusThread()
    status_socket_thread.start()

    image_stream_thread = ImageStreamThread()
    image_stream_thread.start()

    # Number of timestamps kept to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)
    frames_seen = 0

    try:
        with torch.no_grad():
            while True:
                if quit_inference:
                    print('-> Done')
                    break

                frame = image_stream_thread.read_frame()

                # Calculate the fps. Only timestamps that were really
                # recorded are used, and N timestamps span N - 1 frame
                # intervals.
                curr_time[1:] = curr_time[:-1]
                curr_time[0] = time.time()
                frames_seen += 1
                window = min(frames_seen, num_frames)
                elapsed = curr_time[0] - curr_time[window - 1]
                fps = (window - 1) / elapsed if elapsed > 0 else 0.0

                # Do depth inference
                (disp_resized, danger_level, danger_side,
                 original_width, original_height) = predict_depth(
                    frame, feed_width, feed_height, device, encoder,
                    depth_decoder)

                # Only do object detection if danger level is above 0
                # (i.e. Careful or Dangerous)
                print(f"Danger level: {danger_level}")
                detections_str = ""
                if danger_level > 0:
                    detections = detect_objects(frame, host_inputs,
                                                host_outputs, cuda_inputs,
                                                cuda_outputs, bindings,
                                                stream, context, COCO_LABELS)
                    # Each detection becomes one '$'-separated line
                    detections_str = '\n' + '\n'.join(
                        '$'.join(map(str, obj)) for obj in detections)
                    print(str(detections))
                print(f"Detections: {detections_str}")

                # Construct string with danger level and END signal.
                # Separate each piece (i.e. danger level, each detection,
                # END) with new line so client socket knows where each item
                # ends.
                result = str(
                    danger_level
                ) + "\n" + danger_side + detections_str + "\nEND\n"
                print("Sending result...")
                image_stream_thread.send_result(result)

                if not args.no_display:
                    # Generate color-mapped depth image and display
                    # alongside original frame and blended, if chosen
                    disp_resized_np = (
                        disp_resized.squeeze().cpu().detach().numpy())
                    image_display.display(frame,
                                          disp_resized_np,
                                          fps,
                                          original_width,
                                          original_height,
                                          blended=not args.no_blend)
                    cv2.waitKey(1)
                else:
                    print(f"FPS: {fps}")
        print("Outside of with statement")
    finally:
        # Stop the stream exactly once, whether the loop ended normally or
        # an exception escaped from inference.
        image_stream_thread.stop()
        if not args.no_display:
            cv2.destroyAllWindows()
def video_test_simple(args):
    """Predict disparity for every frame of a video and write it to disk.

    Frames are read from ``args.video_path``; the colour-mapped disparity of
    each frame is written to ``args.video_path_output`` as an MJPG video at
    30 fps with the same resolution as the input.

    :param args: parsed options providing ``model_name``, ``no_cuda``,
        ``video_path`` and ``video_path_output``.
    :raises Exception: if ``args.video_path`` is neither a file nor a
        directory.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    # Validate the input before any model or capture resources are created.
    if not (os.path.isfile(args.video_path)
            or os.path.isdir(args.video_path)):
        raise Exception("Can not find args.video_path: {}".format(
            args.video_path))

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    vs = cv2.VideoCapture(args.video_path)
    writer = None

    try:
        # The frame count is only used for the completion-time estimate.
        try:
            prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
                else cv2.CAP_PROP_FRAME_COUNT
            total = int(vs.get(prop))
            print(" {} total frames in video".format(total))
        except Exception:
            print(" Could not determine # of frames in video")
            print(" No approx. completion time can be provided")
            total = -1

        # PREDICTING
        with torch.no_grad():
            while True:
                # Load frame and preprocess
                (grabbed, frame) = vs.read()
                if not grabbed:
                    break
                original_height, original_width = frame.shape[:2]

                # OpenCV delivers BGR; the network expects RGB input.
                input_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_image = cv2.resize(input_image,
                                         (feed_width, feed_height),
                                         interpolation=cv2.INTER_LANCZOS4)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                start = time.time()
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)

                # Colour-map the disparity, clipping at the 95th percentile
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(
                    vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (
                    mapper.to_rgba(disp_resized_np)[:, :, :3] * 255
                ).astype(np.uint8)
                colormapped_im = cv2.cvtColor(colormapped_im,
                                              cv2.COLOR_RGB2BGR)
                end = time.time()

                if writer is None:
                    # Initialize our video writer on the first frame, once
                    # the output size is known
                    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
                    writer = cv2.VideoWriter(
                        args.video_path_output, fourcc, 30,
                        (colormapped_im.shape[1], colormapped_im.shape[0]),
                        True)

                    if total > 0:
                        elap = (end - start)
                        print(" Single frame took {:.4f} seconds".format(
                            elap))
                        print(" Estimated total time to finish: {:.4f}"
                              .format(elap * total))

                # Write the output frame to disk
                writer.write(colormapped_im)
    finally:
        # Releasing the writer is what finalises the output file.
        if writer is not None:
            writer.release()
        vs.release()

    print('-> Done!')
def test_webcam(args):
    """Show live colour-mapped disparity for the webcam ``webcam_index``.

    Frames are read from the capture device selected by the module-level
    ``webcam_index`` until no frame can be read or ``q`` is pressed in the
    preview window.

    :param args: parsed options providing ``model_name`` and ``no_cuda``.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    video_capture = cv2.VideoCapture(webcam_index)

    # PREDICTING ON EACH FRAME IN TURN
    try:
        with torch.no_grad():
            while True:
                ret, frame = video_capture.read()
                # A failed read returns (False, None); stop instead of
                # dereferencing the missing frame.
                if not ret or frame is None or frame.shape[0] == 0:
                    break

                # Load image and preprocess (OpenCV is BGR, the net is RGB)
                input_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                input_image = cv2.resize(input_image,
                                         (feed_width, feed_height))
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (frame.shape[0], frame.shape[1]),
                    mode="bilinear",
                    align_corners=False)

                # Colour-map the disparity; without an explicit norm the
                # mapper autoscales to each frame's own range.
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                mapper = cm.ScalarMappable(cmap='magma')
                colormapped_im = (
                    mapper.to_rgba(disp_resized_np)[:, :, :3] * 255
                ).astype(np.uint8)

                # matplotlib yields RGB, cv2.imshow interprets BGR.
                cv2.imshow('out', cv2.cvtColor(colormapped_im,
                                               cv2.COLOR_RGB2BGR))
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        video_capture.release()
        cv2.destroyAllWindows()
def test_simple(args):
    """Predict disparity and relative pose for every image of a ``TSDataset``.

    For each image the colour-mapped disparity is collected in memory, and
    the pose network is run on the (previous, current) image pair; the first
    image is paired with itself. All predicted pose matrices are
    concatenated and saved to ``poses.npy``.

    :param args: parsed options providing ``model_name`` and ``no_cuda``.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items()
                         if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("Loading pose networks")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # The pose networks follow the same device as the depth networks so the
    # function also works on CPU-only machines and with --no_cuda.
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(
        torch.load(pose_encoder_path, map_location=device))
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(
        torch.load(pose_decoder_path, map_location=device))

    pose_encoder.to(device)
    pose_encoder.eval()
    pose_decoder.to(device)
    pose_decoder.eval()

    bag_name = '2019-12-17-13-24-03'
    begin = '0:36:00'
    end = '0:37:00'
    dataset = TSDataset(bag_name, begin, end)

    pred_depth = []
    pred_poses = []
    last_img = None

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, input_image in enumerate(dataset):
            # Load image and preprocess
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear", align_corners=False)

            # Colour-map the disparity, clipping at the 95th percentile
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3]
                              * 255).astype(np.uint8)
            pred_depth.append(pil.fromarray(colormapped_im))

            # Handle pose: the very first image is paired with itself
            if last_img is None:
                last_img = input_image
            all_color_aug = torch.cat([last_img, input_image], 1)
            last_img = input_image

            pose_features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(pose_features)
            pose = transformation_from_parameters(
                axisangle[:, 0], translation[:, 0]).cpu().numpy()
            pred_poses.append(pose)

            print(" Processed {:d} of {:d} images".format(
                idx + 1, len(dataset)))

    if pred_poses:
        pred_poses = np.concatenate(pred_poses, axis=0)
        print(pred_poses.shape)
        np.save("poses.npy", pred_poses)
    else:
        # np.concatenate rejects an empty list; nothing to save in that case.
        print(" No images in dataset - poses.npy not written")
    # save_video(pred_depth)

    print('-> Done!')
def test_depth_pose(args):
    """Predict depth and relative pose for consecutive image pairs.

    Image pairs ``(i, i + 1)`` for ``i`` in ``range(107)`` are read from
    ``./kitti_data/01``. Depth is predicted for the first image of each pair
    and the pose between both images is predicted by the pose network. When
    the module-level flag ``RECONSTRUCTION`` is true, the second image is
    warped into the first view and saved under ``kitti_data/01/warped``.
    Depths and poses are saved to ``kitti_data/pred_depth_01.npy`` and
    ``kitti_data/pred_pose_01.npy``.

    :param args: parsed options providing ``model_name`` and ``no_cuda``.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    print(" Loading pretrained pose encoder")
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    loaded_dict_pose_enc = torch.load(pose_encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    pose_encoder.load_state_dict(loaded_dict_pose_enc)

    encoder.to(device)
    pose_encoder.to(device)
    encoder.eval()
    pose_encoder.eval()

    print(" Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    print(" Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    loaded_dict_pose = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(loaded_dict_pose)

    depth_decoder.to(device)
    pose_decoder.to(device)
    depth_decoder.eval()
    pose_decoder.eval()

    print("-> Predicting on test images")

    pred_depths = []
    pred_poses = []

    backproject_depth = BackprojectDepth(1, feed_height, feed_width)
    backproject_depth.to(device)
    project_3d = Project3D(1, feed_height, feed_width)
    project_3d.to(device)

    # Normalised intrinsics, scaled to the network input resolution
    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
    K[0, :] *= feed_width
    K[1, :] *= feed_height
    inv_K = np.linalg.pinv(K)

    K = torch.from_numpy(K)
    K = K.unsqueeze(0).to(device)
    inv_K = torch.from_numpy(inv_K)
    inv_K = inv_K.unsqueeze(0).to(device)

    warped_dir = os.path.join("kitti_data/01", "warped")
    if RECONSTRUCTION:
        # Image.save does not create missing directories.
        os.makedirs(warped_dir, exist_ok=True)

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for i in range(107):
            # Load image and preprocess
            image_0_path = './kitti_data/01/{:010d}.jpg'.format(i)
            input_image_0 = Image.open(image_0_path).convert('RGB')
            original_width, original_height = input_image_0.size
            input_image_0 = input_image_0.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_0 = transforms.ToTensor()(input_image_0).unsqueeze(0)

            image_1_path = './kitti_data/01/{:010d}.jpg'.format(i + 1)
            input_image_1 = Image.open(image_1_path).convert('RGB')
            input_image_1 = input_image_1.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_1 = transforms.ToTensor()(input_image_1).unsqueeze(0)

            # PREDICTION for depth
            input_image_0 = input_image_0.to(device)
            features = encoder(input_image_0)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]

            # Keep the depth as a tensor on the device for the warping step;
            # only the copy stored for saving is converted to NumPy.
            _, depth_tensor = disp_to_depth(disp, 0.1, 100)
            pred_depths.append(depth_tensor.cpu()[:, 0].numpy()[0])

            print(" Predict Depth {:d}".format(i))

            # PREDICTION for pose
            input_image_1 = input_image_1.to(device)
            input_image_pose = torch.cat([input_image_0, input_image_1], 1)
            features_pose = [pose_encoder(input_image_pose)]
            axisangle, translation = pose_decoder(features_pose)

            pred_pose = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])

            pred_poses.append(pred_pose.cpu()[0].numpy())

            print(" Predict Pose {:d}".format(i))
            print(pred_pose)

            # WARPED image
            if RECONSTRUCTION:
                print(" Reconstruct image {:d}".format(i))
                cam_points = backproject_depth(depth_tensor, inv_K)
                pix_coords = project_3d(cam_points, K, pred_pose)
                reconstruct_image_0 = torch.nn.functional.grid_sample(
                    input_image_1, pix_coords, padding_mode="border")

                print(" Saving resonstructed image...")

                reconstruct_image_0 = torch.nn.functional.interpolate(
                    reconstruct_image_0, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)
                warped_np = reconstruct_image_0.squeeze().cpu().numpy()
                warped_np = (warped_np * 255).astype(np.uint8)
                # channels-first (3, H, W) -> channels-last (H, W, 3)
                warped_np = np.ascontiguousarray(
                    np.transpose(warped_np, (1, 2, 0)))
                im = Image.fromarray(warped_np, mode='RGB')
                name_dest_im = os.path.join(
                    warped_dir, "{:010d}_warped.jpg".format(i))
                im.save(name_dest_im)
            print("...")

    np.save('kitti_data/pred_depth_01.npy', np.array(pred_depths))
    np.save('kitti_data/pred_pose_01.npy', np.array(pred_poses))
    print('-> Done!')
def test_simple(args):
    """Predict disparity for every image of the listed MOT17 sequences.

    For each sequence ``d`` in ``datasetMOT`` the images matching
    ``*.<args.ext>`` in ``<args.image_path>/<d>/img1`` are processed. The
    scaled disparity (``<name>_disp.npy``) and the colour-mapped disparity
    (``<name>_disp.jpeg``) are written next to the input images.

    :param args: parsed options providing ``model_name``, ``no_cuda``,
        ``image_path`` and ``ext``.
    :raises Exception: if a sequence folder does not exist.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # Kept for switching the loop below to the VOT sequences; currently
    # unused.
    datasetVOT = [
        'bag', 'ball1', 'ball2', 'basketball', 'birds1', 'birds2', 'blanket',
        'bmx', 'bolt1', 'bolt2', 'book', 'butterfly', 'car1', 'car2',
        'crossing', 'dinosaur', 'fernando', 'fish1', 'fish2', 'fish3',
        'fish4', 'girl', 'glove', 'godfather', 'graduate', 'gymnastics1',
        'gymnastics2', 'gymnastics3', 'gymnastics4', 'hand', 'handball1',
        'handball2', 'helicopter', 'iceskater1', 'iceskater2', 'leaves',
        'marching', 'matrix', 'motocross1', 'motocross2', 'nature', 'octopus',
        'pedestrian1', 'pedestrian2', 'rabbit', 'racing', 'road', 'shaking',
        'sheep', 'singer1', 'singer2', 'singer3', 'soccer1', 'soccer2',
        'soldier', 'sphere', 'tiger', 'traffic', 'tunnel', 'wiper'
    ]
    datasetMOT = [
        'MOT17-02', 'MOT17-04', 'MOT17-05', 'MOT17-09', 'MOT17-10',
        'MOT17-11', 'MOT17-13'
    ]

    for d in datasetMOT:
        # os.path.join keeps the path valid on every platform.
        new_path = os.path.join(args.image_path, d, "img1")
        print(new_path, d)

        if not os.path.isdir(new_path):
            raise Exception("Can not find args.image_path: {}".format(
                new_path))

        # Searching folder for images; results are written to the same folder
        paths = glob.glob(os.path.join(new_path, '*.{}'.format(args.ext)))
        output_directory = new_path

        print("-> Predicting on {:d} test images".format(len(paths)))

        # PREDICTING ON EACH IMAGE IN TURN
        with torch.no_grad():
            for idx, image_path in enumerate(paths):

                if image_path.endswith(("_disp.jpg", "_disp.jpeg")):
                    # don't try to predict disparity for a disparity image!
                    continue

                # Load image and preprocess
                input_image = pil.open(image_path).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height),
                                                 pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)

                # Saving numpy file
                output_name = os.path.splitext(
                    os.path.basename(image_path))[0]
                name_dest_npy = os.path.join(
                    output_directory, "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

                # Saving colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(
                    vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (
                    mapper.to_rgba(disp_resized_np)[:, :, :3] * 255
                ).astype(np.uint8)
                im = pil.fromarray(colormapped_im)

                name_dest_im = os.path.join(
                    output_directory, "{}_disp.jpeg".format(output_name))
                im.save(name_dest_im)

                print(
                    " Processed {:d} of {:d} images - saved prediction to {}"
                    .format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
def image_demo(image_path, model_name):
    """Predict disparity for one image and show it in an OpenCV window.

    The colour-mapped disparity is displayed at the input image's
    resolution; the call blocks until a key is pressed and then closes the
    window.

    :param image_path: path of the image to run the prediction on.
    :param model_name: name of the pretrained model under ``models/``.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    print("-> Loading model from ", model_path)
    print("-> encoder_path = ", encoder_path)
    print("-> depth_decoder_path = ", depth_decoder_path)

    # LOADING PRETRAINED MODEL
    print("-> Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder_keys
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("-> Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Predicting test image : ", image_path)

    with torch.no_grad():
        # Load image and preprocess
        input_image = pil.open(image_path).convert('RGB')
        print(input_image)
        original_width, original_height = input_image.size
        print("-> input image size ", input_image.size)
        input_image = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
        print("-> resize to ", input_image.size)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)
        print(input_image)

        # PREDICTION
        input_image = input_image.to(device)
        print(input_image)
        features = encoder(input_image)
        outputs = depth_decoder(features)

        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        # Colour-map the disparity, clipping at the 95th percentile
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                          vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                          255).astype(np.uint8)

    # matplotlib yields RGB, cv2.imshow interprets BGR.
    cv2.imshow('img', cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR))
    cv2.waitKey(0)
    cv2.destroyAllWindows()
disp, (frame.shape[0], frame.shape[1]), mode="bilinear", align_corners=False) return disp_resized.squeeze().cpu().numpy() # Initialize SSD net = build_ssd('test', 300, 21) net.load_state_dict(torch.load('data/weights/ssd_300_VOC0712.pth')) transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0)) # Initialize Monodepth2 if torch.cuda.is_available(): device = torch.device("cuda") print("GPU BOiii") else: device = torch.device("cpu") download_model_if_doesnt_exist(DEPTH_MODEL_NAME) model_path = os.path.join("models", DEPTH_MODEL_NAME) print("-> Loading model from ", model_path) encoder_path = os.path.join(model_path, "encoder.pth") depth_decoder_path = os.path.join(model_path, "depth.pth") # LOADING PRETRAINED MODEL print(" Loading pretrained encoder") encoder = networks.ResnetEncoder(18, False) loaded_dict_enc = torch.load(encoder_path, map_location=device) # extract the height and width of image that this model was trained with feed_height = loaded_dict_enc['height'] feed_width = loaded_dict_enc['width'] filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()} encoder.load_state_dict(filtered_dict_enc)
def video_demo(video_path, model_name):
    """Play a video as a stream of colormapped disparity predictions.

    The pretrained encoder/decoder pair is read from ``models/<model_name>``
    (``download_model_if_doesnt_exist`` is called first). Every frame read
    from ``video_path`` is resized to the input size stored in the encoder
    checkpoint, pushed through the network, and the resulting disparity is
    shown in an OpenCV window. Playback stops at the end of the stream or when
    ``q`` is pressed.

    Args:
        video_path: source handed to ``cv2.VideoCapture``.
        model_name: name of the model directory under ``models``.

    Raises:
        IOError: if the video source cannot be opened.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    print("-> Loading model from ", model_path)
    print("-> encoder_path = ", encoder_path)
    print("-> depth_decoder_path = ", depth_decoder_path)

    # LOADING PRETRAINED MODEL
    print("-> Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # The checkpoint also carries the image size the model was trained with.
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']

    # Drop checkpoint entries (such as 'height'/'width') that are not encoder
    # parameters. The state dict is fetched once rather than once per key.
    encoder_state = encoder.state_dict()
    filtered_dict_enc = {
        k: v for k, v in loaded_dict_enc.items() if k in encoder_state
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("-> Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # play video
    vid = cv2.VideoCapture(video_path)
    if not vid.isOpened():
        raise IOError("Couldn't open webcam or video")

    try:
        with torch.no_grad():
            while True:
                ok, frame = vid.read()
                if not ok:
                    break

                # OpenCV frames are BGR; the network input is built from RGB.
                input_image = pil.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height),
                                                 pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width),
                    mode="bilinear", align_corners=False)

                # Colormap the disparity, clipping the top 5% of values.
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                                  vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)

                # to_rgba() yields RGB while cv2.imshow() expects BGR.
                cv2.imshow('img', cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR))

                # A single waitKey both paces playback and reads the keyboard;
                # two back-to-back calls would throw away any key pressed
                # during the first one.
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        # Always give the capture device/file and the window back, even when
        # inference fails part-way through the stream.
        vid.release()
        cv2.destroyAllWindows()
def test_simple(args):
    """Predict disparity for a single image or for every image in a folder.

    For each input image two files are written next to it: ``<name>_disp.npy``
    (the output of ``disp_to_depth(disp, 0.1, 100)`` on the network-resolution
    disparity) and ``<name>_disp.jpeg`` (the magma-colormapped disparity at
    the original image resolution).

    Args:
        args: options object providing ``model_name``, ``no_cuda``,
            ``image_path`` (a file or a directory) and ``ext`` (the file
            extension to glob for when ``image_path`` is a directory).

    Raises:
        AssertionError: if ``args.model_name`` is None.
        Exception: if ``args.image_path`` is neither a file nor a directory.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    # REVIEW: networks.PackNeSt_encoder() was noted as an alternative encoder.
    encoder = networks.ResnetEncoder(26, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # The checkpoint also carries the image size the model was trained with.
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']

    # Drop checkpoint entries (such as 'height'/'width') that are not encoder
    # parameters. The state dict is fetched once rather than once per key.
    encoder_state = encoder.state_dict()
    filtered_dict_enc = {
        k: v for k, v in loaded_dict_enc.items() if k in encoder_state
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    # REVIEW: networks.PackNeSt_decoder() was noted as an alternative decoder.
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # Outputs are written as "<name>_disp.jpeg"; "_disp.jpg" is kept in the
    # skip list as well so results named the older way are not re-processed.
    disparity_suffixes = ("_disp.jpg", "_disp.jpeg")

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith(disparity_suffixes):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image (top 5% of values clipped)
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
def _first_number_key(filename):
    """Sort key ordering names by the first integer they contain.

    Names without any digit sort after all numbered names (alphabetically),
    and names sharing the same number are ordered alphabetically.
    """
    match = re.search(r'\d+', filename)
    if match is None:
        return (1, 0, filename)
    return (0, int(match.group()), filename)


def _write_video(frame_paths, video_path, fps=10):
    """Encode the images at ``frame_paths`` into an MJPG video file.

    The writer is created lazily from the size of the first readable frame,
    and frames are streamed one at a time instead of being held in memory.
    Returns the number of frames written (0 means no file was created).
    """
    writer = None
    written = 0
    try:
        for frame_path in frame_paths:
            frame = cv2.imread(frame_path)
            if frame is None:
                # cv2.imread signals an unreadable file by returning None.
                print("   Skipping unreadable image {}".format(frame_path))
                continue
            if writer is None:
                height, width = frame.shape[0:2]
                writer = cv2.VideoWriter(
                    video_path, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                    fps, (width, height))
            writer.write(frame)
            written += 1
    finally:
        if writer is not None:
            writer.release()
    return written


def test_simple(args):
    """Predict disparity for a single image or for every image in a folder.

    For each input image ``<name>_disp.npy`` (the output of
    ``disp_to_depth(disp, 0.1, 100)`` on the network-resolution disparity) and
    ``<name>_disp.jpg`` (the magma-colormapped disparity at the original
    resolution) are written. For a single image they go next to the image;
    for a folder they go to its ``disp_images`` sub-folder, and two videos
    (``orig_video.avi`` and ``depth_video.avi``) are assembled in its
    ``videos`` sub-folder, with frames ordered by the first number in each
    file name.

    Args:
        args: options object providing ``model_name``, ``no_cuda``,
            ``image_path`` (a file or a directory) and ``ext`` (the file
            extension of the input images when ``image_path`` is a directory).

    Raises:
        AssertionError: if ``args.model_name`` is None.
        Exception: if ``args.image_path`` is neither a file nor a directory.
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # The checkpoint also carries the image size the model was trained with.
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']

    # Drop checkpoint entries (such as 'height'/'width') that are not encoder
    # parameters. The state dict is fetched once rather than once per key.
    encoder_state = encoder.state_dict()
    filtered_dict_enc = {
        k: v for k, v in loaded_dict_enc.items() if k in encoder_state
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    video_directory = None  # only set (and videos only built) for a folder
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = os.path.join(args.image_path, 'disp_images')
        os.makedirs(output_directory, exist_ok=True)
        video_directory = os.path.join(args.image_path, 'videos')
        os.makedirs(video_directory, exist_ok=True)
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image (top 5% of values clipped)
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    # Create videos if folder
    if video_directory is not None:
        print('-> Building the original video from the inputted images')
        # Filter by extension *before* sorting so unrelated files in the
        # folder (which may contain no number at all) cannot break ordering.
        input_suffix = '.{}'.format(args.ext)
        input_frames = sorted(
            (name for name in os.listdir(args.image_path)
             if name.endswith(input_suffix)
             and os.path.isfile(os.path.join(args.image_path, name))),
            key=_first_number_key)
        written = _write_video(
            [os.path.join(args.image_path, name) for name in input_frames],
            os.path.join(video_directory, 'orig_video.avi'))
        if not written:
            print('   No input frames found - original video not written')

        print('-> Building the depth video')
        depth_frames = sorted(
            (name for name in os.listdir(output_directory)
             if name.endswith('.jpg')),
            key=_first_number_key)
        written = _write_video(
            [os.path.join(output_directory, name) for name in depth_frames],
            os.path.join(video_directory, 'depth_video.avi'))
        if not written:
            print('   No depth frames found - depth video not written')

    print('-> Done!')