def __init__(self, model_name="mono_1024x320"):
    """Fetch (if missing) and load a pretrained encoder/decoder pair on the CPU.

    Args:
        model_name: Name of the pretrained model. Its ``encoder.pth`` and
            ``depth.pth`` files are read from
            ``./monodepth2/models/<model_name>/``.
    """
    self.model_name = model_name
    download_model_if_doesnt_exist(model_name)

    weights_dir = os.path.join("./monodepth2/models", model_name)
    encoder_path = os.path.join(weights_dir, "encoder.pth")
    depth_decoder_path = os.path.join(weights_dir, "depth.pth")

    # Build both networks first, then fill in the pretrained weights.
    self.encoder = networks.ResnetEncoder(18, False)
    self.depth_decoder = networks.DepthDecoder(
        num_ch_enc=self.encoder.num_ch_enc,
        scales=range(4),
    )

    # The encoder checkpoint holds more than the network weights (the
    # 'height' and 'width' entries are read below), so keep only the keys
    # that the encoder itself knows about before loading.
    encoder_checkpoint = torch.load(encoder_path, map_location='cpu')
    known_keys = self.encoder.state_dict()
    encoder_weights = {
        key: value
        for key, value in encoder_checkpoint.items()
        if key in known_keys
    }
    self.encoder.load_state_dict(encoder_weights)

    decoder_weights = torch.load(depth_decoder_path, map_location='cpu')
    self.depth_decoder.load_state_dict(decoder_weights)

    # Inference only: switch both networks to eval mode.
    self.encoder.eval()
    self.depth_decoder.eval()

    # Input image size stored alongside the encoder weights.
    self.feed_height = encoder_checkpoint['height']
    self.feed_width = encoder_checkpoint['width']
def load_model(model_name):
    """Load a pretrained encoder and depth decoder by model name.

    The model is downloaded first if it is not present, then read from
    ``models/<model_name>/``. Both networks are moved to CUDA when it is
    available (CPU otherwise) and put in eval mode.

    Args:
        model_name: One of:
            "mono_640x192",
            "stereo_640x192",
            "mono+stereo_640x192",
            "mono_no_pt_640x192",
            "stereo_no_pt_640x192",
            "mono+stereo_no_pt_640x192",
            "mono_1024x320",
            "stereo_1024x320",
            "mono+stereo_1024x320"

    Returns:
        A tuple ``(encoder, depth_decoder, (feed_width, feed_height))`` where
        the last element is the image size the model was trained with.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # --- Encoder ---
    print(" Loading pretrained encoder")
    encoder = monodepth2.networks.ResnetEncoder(18, False)
    encoder_checkpoint = torch.load(encoder_path, map_location=device)

    # Height and width of the images this model was trained with.
    feed_height = encoder_checkpoint['height']
    feed_width = encoder_checkpoint['width']

    # Keep only the checkpoint entries that belong to the encoder itself.
    known_keys = encoder.state_dict()
    encoder_weights = {
        key: value
        for key, value in encoder_checkpoint.items()
        if key in known_keys
    }
    encoder.load_state_dict(encoder_weights)
    encoder.to(device)
    encoder.eval()

    # --- Decoder ---
    print(" Loading pretrained decoder")
    depth_decoder = monodepth2.networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc,
        scales=range(4),
    )
    decoder_weights = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(decoder_weights)
    depth_decoder.to(device)
    depth_decoder.eval()

    feed_size = (feed_width, feed_height)
    return encoder, depth_decoder, feed_size
def __init__(self, model_name, no_cuda):
    """Load the pretrained depth network and create the ROS publisher.

    Args:
        model_name: Name of the pretrained model. Weights are read from
            ``monodepth2/models/<model_name>/`` next to this source file.
        no_cuda: When true, stay on the CPU even if CUDA is available.
    """
    # Execution device
    cuda_enabled = torch.cuda.is_available() and not no_cuda
    self._device = torch.device("cuda" if cuda_enabled else "cpu")

    # Locate the model files relative to this file, not the working dir.
    download_model_if_doesnt_exist(model_name)
    here = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.join(here, "monodepth2", "models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # Encoder
    self._encoder = networks.ResnetEncoder(18, False)
    encoder_checkpoint = torch.load(encoder_path, map_location=self._device)

    # Height and width of the images this model was trained with.
    self._feed_height = encoder_checkpoint['height']
    self._feed_width = encoder_checkpoint['width']

    # Keep only the checkpoint entries that belong to the encoder itself.
    known_keys = self._encoder.state_dict()
    encoder_weights = {
        key: value
        for key, value in encoder_checkpoint.items()
        if key in known_keys
    }
    self._encoder.load_state_dict(encoder_weights)
    self._encoder.to(self._device)
    self._encoder.eval()

    # Decoder
    self._depth_decoder = networks.DepthDecoder(
        num_ch_enc=self._encoder.num_ch_enc,
        scales=range(4),
    )
    decoder_weights = torch.load(depth_decoder_path, map_location=self._device)
    self._depth_decoder.load_state_dict(decoder_weights)
    self._depth_decoder.to(self._device)
    self._depth_decoder.eval()

    # ROS image subscriber and publisher
    # NOTE(review): rospy.Publisher normally also takes a message class
    # (data_class) as its second argument - confirm this call is complete.
    self._img_pub = rospy.Publisher('monodepth2')
def get_depth_model(model_name):
    """Load a pretrained encoder and depth decoder by model name.

    The model is downloaded first if it is not present, then read from
    ``trained_models/<model_name>/``. Both networks are moved to CUDA when it
    is available (CPU otherwise) and put in eval mode.

    Args:
        model_name: Name of the pretrained model to load.

    Returns:
        A tuple ``(encoder, depth_decoder, feed_width, feed_height)``; the last
        two are the image size the model was trained with.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("trained_models", model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # --- Encoder ---
    print(" Loading pretrained encoder")
    encoder = ResnetEncoder(18, False)
    encoder_checkpoint = torch.load(encoder_path, map_location=device)

    # Height and width of the images this model was trained with.
    feed_height = encoder_checkpoint['height']
    feed_width = encoder_checkpoint['width']

    # Keep only the checkpoint entries that belong to the encoder itself.
    known_keys = encoder.state_dict()
    encoder_weights = {
        key: value
        for key, value in encoder_checkpoint.items()
        if key in known_keys
    }
    encoder.load_state_dict(encoder_weights)
    encoder.to(device)
    encoder.eval()

    # --- Decoder ---
    print(" Loading pretrained decoder")
    depth_decoder = DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))
    decoder_weights = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(decoder_weights)
    depth_decoder.to(device)
    depth_decoder.eval()

    return encoder, depth_decoder, feed_width, feed_height
) # For some unknown reason the camera input is on BGR format, so we change it into RGBA. x = x.transpose((2, 0, 1)) x = torch.from_numpy(x).float() x = normalize(x) x = x.to(device) x = x[None, ...] return x # ## Setting up Monodepth model # We build our monocular depth estimation model from the Monodepth module # Define which model to use and download if not found model_name = "mono_640x192" download_model_if_doesnt_exist(model_name) # Build paths to coders and instantiate from path encoder_path = os.path.join("models", model_name, "encoder.pth") depth_decoder_path = os.path.join("models", model_name, "depth.pth") encoder = networks.ResnetEncoder(18, False).cuda() depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4)).cuda() # Encoder and Decoder loading loaded_dict_enc = torch.load(encoder_path, map_location='cpu') filtered_dict_enc = { k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict() } encoder.load_state_dict(filtered_dict_enc)
def generate_depth(output_path, mode, model_name='mono+stereo_1024x320', ext='jpg', no_cuda=False):
    """Predict disparity for every image in every scene under ``output_path/mode``.

    Each entry matched by ``<output_path>/<mode>/*`` is treated as a scene
    folder, and every ``*.<ext>`` file inside it is run through the pretrained
    network. Two files are written next to each input image:

    * ``<name>_disp.npy`` - the first value returned by
      ``disp_to_depth(disp, 0.1, 100)`` at the original image resolution.
    * ``<name>_disp.jpg`` - the disparity colour-mapped with ``magma``.

    ``<name>`` is the input file stem with a trailing ``_image`` removed when
    present.

    Args:
        output_path: Root directory that contains the ``mode`` sub-directory.
        mode: Sub-directory of ``output_path`` whose entries are the scenes.
        model_name: Pretrained model name. ``encoder.pth`` and ``depth.pth``
            are read from a directory of that name.
        ext: Extension (without the dot) of the input images.
        no_cuda: When true, run on the CPU even if CUDA is available.
    """
    # Load model
    use_cuda = torch.cuda.is_available() and not no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join('', model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print(" Loading pretrained encoder")
    encoder = monodepth2.networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']

    # Build the encoder's state dict once instead of once per checkpoint key.
    encoder_keys = encoder.state_dict()
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder_keys}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print(" Loading pretrained decoder")
    depth_decoder = monodepth2.networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    # Loop invariants: one tensor converter and the suffix stripped from names.
    to_tensor = transforms.ToTensor()
    image_suffix = '_image'

    for scene_dir in glob(os.path.join(output_path, mode, '*')):
        print('Scene: {}'.format(scene_dir))

        # Searching folder for images
        paths = glob(os.path.join(scene_dir, '*.{}'.format(ext)))
        # Results are written next to the inputs, inside the scene folder.
        output_directory = scene_dir

        print("-> Predicting on {:d} test images".format(len(paths)))

        # PREDICTING ON EACH IMAGE IN TURN
        with torch.no_grad():
            for idx, image_path in enumerate(paths):
                if image_path.endswith("_disp.jpg"):
                    # don't try to predict disparity for a disparity image!
                    continue

                # Load image and preprocess
                input_image = pil.open(image_path).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
                input_image = to_tensor(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width),
                    mode="bilinear", align_corners=False)

                # Saving numpy file
                output_name = os.path.splitext(os.path.basename(image_path))[0]
                # Strip the '_image' suffix only when it is really there;
                # blindly cutting six characters mangles other file names and
                # can make several inputs overwrite the same output file.
                if output_name.endswith(image_suffix):
                    output_name = output_name[:-len(image_suffix)]
                name_dest_npy = os.path.join(
                    output_directory, "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp_resized, 0.1, 100)
                scaled_disp = scaled_disp.view(original_height, original_width)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

                # Saving colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                # Cap the colour range at the 95th percentile of the values.
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
                im = pil.fromarray(colormapped_im)

                name_dest_im = os.path.join(
                    output_directory, "{}_disp.jpg".format(output_name))
                im.save(name_dest_im)

                print(" Processed {:d} of {:d} images - saved prediction to {}".format(
                    idx + 1, len(paths), name_dest_im))

    print('-> Done!')