def forward(self, imgLeft, imgRight):
    """Estimate disparity for a stereo image pair.

    When ``self.is_quarter_size_cost_volume_gcnet`` is set, both images are
    first bilinearly downsampled by 2 and the aggregated cost volume is
    trilinearly upsampled back to ``[self.maxdisp, H, W]`` before the
    softmax, so the regression always runs over ``self.maxdisp`` bins.

    Args:
        imgLeft: left images; 4-D with exactly 3 channels (asserted).
        imgRight: right images; presumably the same shape as ``imgLeft``
            -- TODO confirm against callers.

    Returns:
        The result of ``self.disparityregression`` applied to the softmax
        (over dim 1) of the cost volume.
    """
    n_batch, n_chan, height, width = imgLeft.size()[:]
    assert n_chan == 3, 'should be RGB images as input'

    use_quarter = self.is_quarter_size_cost_volume_gcnet

    # Optional extra 2x downsampling of the inputs (quarter-size cost volume).
    if use_quarter:
        scale = 2
        half_size = [height // 2, width // 2]
        left_in = F.interpolate(imgLeft, half_size, mode='bilinear',
                                align_corners=True)
        right_in = F.interpolate(imgRight, half_size, mode='bilinear',
                                 align_corners=True)
    else:
        scale = 1
        left_in, right_in = imgLeft, imgRight

    # Feature extraction (same module applied to both views).
    feat_left = self.feature_extraction(left_in)
    feat_right = self.feature_extraction(right_in)

    # Cost volume over maxdisp / (2 * scale) disparity levels.
    volume = cost_volume_faster(feat_left, feat_right,
                                d=self.maxdisp // (2 * scale))

    # Cost volume aggregation; only the selected aggregator is looked up.
    aggregate = (self.cost_aggregation_kendall
                 if self.is_kendall_version else self.cost_aggregation)
    logits = aggregate(volume)
    logits = logits.view(n_batch, self.maxdisp // scale,
                         height // scale, width // scale)

    # Upsample the cost volume itself (not the regressed disparity) so the
    # soft-argmin below covers [0, maxdisp) rather than a reduced range.
    if use_quarter:
        logits = logits.unsqueeze(1)  # [N, D, H, W] -> [N, 1, D, H, W]
        logits = F.interpolate(logits, [self.maxdisp, height, width],
                               mode='trilinear', align_corners=True)
        logits = logits.squeeze(1)  # back to [N, D, H, W]

    prob = F.softmax(logits, dim=1)
    return self.disparityregression(prob, maxdisp=self.maxdisp)
def forward(self, imgLeft, imgRight):
    """Estimate disparity for a stereo image pair (layer-by-layer variant).

    When ``self.is_quarter_size_cost_volume_gcnet`` is set, both images are
    bilinearly downsampled by 2 before feature extraction. The resulting
    reduced-size cost volume is then trilinearly upsampled to
    ``[self.maxdisp, H, W]`` *before* the softmax / disparity regression.

    The previous implementation regressed disparity on the reduced volume
    (``maxdisp // 2`` bins) and bilinearly upsampled the disparity map
    without rescaling its values, so predictions were limited to the reduced
    disparity range. Upsampling the cost volume first matches the other
    ``forward`` implementations in this file. Behaviour with the flag unset
    is unchanged.

    Args:
        imgLeft: left images; 4-D with exactly 3 channels (asserted).
        imgRight: right images; presumably the same shape as ``imgLeft``
            -- TODO confirm against callers.

    Returns:
        The result of ``self.disparityregression`` applied to the softmax
        (over dim 1) of the cost volume.
    """
    N, C, H, W = imgLeft.size()[:]
    assert C == 3, 'should be RGB images as input'

    quarter_size = self.is_quarter_size_cost_volume_gcnet

    # Optional extra 2x downsampling of the inputs (quarter-size cost volume).
    if quarter_size:
        img_ds_scale = 2
        imgl = F.interpolate(imgLeft, [H // 2, W // 2], mode='bilinear',
                             align_corners=True)
        imgr = F.interpolate(imgRight, [H // 2, W // 2], mode='bilinear',
                             align_corners=True)
    else:
        img_ds_scale = 1
        imgl = imgLeft
        imgr = imgRight

    # 2-D feature extraction; call order (left, then right) is kept as in
    # the original code.
    imgl0 = self.relu(self.convbn0(imgl))
    imgr0 = self.relu(self.convbn0(imgr))

    imgl_block = self.res_block(imgl0)
    imgr_block = self.res_block(imgr0)

    imgl1 = self.conv1(imgl_block)
    imgr1 = self.conv1(imgr_block)

    # Cost volume over maxdisp / (2 * img_ds_scale) disparity levels.
    cv = cost_volume_faster(imgl1, imgr1,
                            d=self.maxdisp // (2 * img_ds_scale))

    # 3-D encoder; intermediate outputs are kept for the skip connections.
    out = self.relu(self.conv3dbn_1(cv))   # conv3d_19
    out = self.relu(self.conv3dbn_2(out))  # conv3d_20
    res_l20 = out
    out = self.block_3d_1(out)             # conv3d_21,22,23
    res_l23 = out
    out = self.block_3d_2(out)             # conv3d_24,25,26
    res_l26 = out
    out = self.block_3d_3(out)             # conv3d_27,28,29
    res_l29 = out
    out = self.block_3d_4(out)             # conv3d_30,31,32

    # 3-D decoder with residual (skip) additions.
    out = self.relu(self.deconvbn1(out) + res_l29)
    out = self.relu(self.deconvbn2(out) + res_l26)
    out = self.relu(self.deconvbn3(out) + res_l23)
    out = self.relu(self.deconvbn4(out) + res_l20)

    # Last deconv3d: no BN or ReLU.
    out = self.deconv5(out)
    out = out.view(N, self.maxdisp // img_ds_scale,
                   H // img_ds_scale, W // img_ds_scale)

    # Upsample the cost volume itself (not the regressed disparity) so the
    # soft-argmin below covers [0, maxdisp) rather than [0, maxdisp / 2).
    if quarter_size:
        out = out[:, None, ...]  # [N, D, H, W] -> [N, 1, D, H, W]
        out = F.interpolate(out, [self.maxdisp, H, W], mode='trilinear',
                            align_corners=True)
        out = torch.squeeze(out, 1)  # back to [N, D, H, W]

    prob = F.softmax(out, 1)
    disp = self.disparityregression(prob, maxdisp=self.maxdisp)
    return disp
def forward(self, imgLeft, imgRight):
    """Estimate disparity with an optional DFN filter on the cost volume.

    When ``self.isDFN`` is set, ``self.dfn_generator`` produces a filter and
    bias from a downscaled left image, and ``self.dfn_layer`` is applied to
    every disparity slice of the cost volume (gradient tracking for that
    step is controlled by ``self.cost_filter_grad``).

    When ``self.is_quarter_size_cost_volume_gcnet`` is set, the inputs are
    downsampled by 2 and the aggregated cost volume is trilinearly upsampled
    to ``[self.maxdisp, H, W]`` before the softmax.

    Args:
        imgLeft: left images; 4-D with exactly 3 channels (asserted).
        imgRight: right images; presumably the same shape as ``imgLeft``
            -- TODO confirm against callers.

    Returns:
        ``(disp, dfn_filter, dfn_bias)`` in training mode, otherwise
        ``(disp, [dfn_filter, dfn_bias])``. ``dfn_filter`` and ``dfn_bias``
        are ``None`` when ``self.isDFN`` is false.
    """
    N, C, H, W = imgLeft.size()[:]
    assert C == 3, 'should be RGB images as input'

    quarter_size = self.is_quarter_size_cost_volume_gcnet

    # Optional extra 2x downsampling of the inputs (quarter-size cost volume).
    if quarter_size:
        img_ds_scale = 2
        imgl = F.interpolate(imgLeft, [H // 2, W // 2], mode='bilinear',
                             align_corners=True)
        imgr = F.interpolate(imgRight, [H // 2, W // 2], mode='bilinear',
                             align_corners=True)
    else:
        img_ds_scale = 1
        imgl = imgLeft
        imgr = imgRight

    # Feature extraction (same module applied to both views).
    f_imgl = self.feature_extraction(imgl)
    f_imgr = self.feature_extraction(imgr)

    # Cost volume over maxdisp / (2 * img_ds_scale) disparity levels.
    cv = cost_volume_faster(f_imgl, f_imgr,
                            d=self.maxdisp // (2 * img_ds_scale))

    if not self.isDFN:
        dfn_filter = None
        dfn_bias = None
    else:
        # The guidance image is downscaled by 2 * img_ds_scale; per the
        # original author's note this is meant to match the spatial size of
        # the cost volume.
        left_scale = F.interpolate(
            imgLeft,
            [H // (2 * img_ds_scale), W // (2 * img_ds_scale)],
            mode='bilinear', align_corners=True)
        dfn_filter, dfn_bias = self.dfn_generator(left_scale)

        # NOTE(review): the source's indentation was lost; the filtering
        # loop is assumed to run only when DFN is enabled -- confirm.
        # The original author flagged this loop as potentially
        # memory-hungry.
        with torch.set_grad_enabled(self.cost_filter_grad):
            for d in range(cv.size()[2]):
                # Filter one disparity slice and write it back in place.
                cv_d_slice = cv[:, :, d, :, :].contiguous()
                cv[:, :, d, :, :] = self.dfn_layer(cv_d_slice, dfn_filter,
                                                   dfn_bias)
    cv = cv.contiguous()

    # Cost volume aggregation.
    if self.is_kendall_version:
        out = self.cost_aggregation_kendall(cv)
    else:
        out = self.cost_aggregation(cv)
    out = out.view(N, self.maxdisp // img_ds_scale,
                   H // img_ds_scale, W // img_ds_scale)

    # Upsample the cost volume itself (not the regressed disparity) so the
    # soft-argmin below covers [0, maxdisp) rather than a reduced range.
    if quarter_size:
        out = out[:, None, ...]  # [N, D, H, W] -> [N, 1, D, H, W]
        out = F.interpolate(out, [self.maxdisp, H, W], mode='trilinear',
                            align_corners=True)
        out = torch.squeeze(out, 1)  # back to [N, D, H, W]

    prob = F.softmax(out, 1)
    disp = self.disparityregression(prob, maxdisp=self.maxdisp)

    # NOTE(review): the condition was missing in the source (``if return``);
    # restored as ``self.training`` based on the train/eval return pattern
    # -- confirm against the upstream file.
    if self.training:
        return disp, dfn_filter, dfn_bias
    else:
        return disp, [dfn_filter, dfn_bias]
def forward(self, left, right):
    """Estimate disparity with an optional embedding-guided cost filter.

    A cost volume is built from features of both views. When
    ``self.isEmbed`` is set, ``self.embednet`` is run on the left image
    downscaled by 4 and ``self.bifilter`` is applied to every disparity
    slice of the cost volume (gradient tracking for that step is controlled
    by ``self.cost_filter_grad``). The volume then goes through the
    ``dres*`` / ``classif*`` stages; each classifier output is trilinearly
    upsampled to ``[self.maxdisp, H, W]`` and regressed to a disparity map.

    Args:
        left: left images; dims 2 and 3 are used as the output height/width.
        right: right images; presumably the same shape as ``left``
            -- TODO confirm against callers.

    Returns:
        ``(pred1, pred2, pred3, embed)`` in training mode, otherwise
        ``(pred3, embed)``. ``embed`` is ``None`` when ``self.isEmbed`` is
        false.
    """
    x = self.feature_extraction(left)   # left feature
    y = self.feature_extraction(right)  # right feature

    # Matching volume; per the original note, [N, 2C, D/4, H/4, W/4].
    cost = cost_volume_faster(x, y, self.maxdisp // 4)

    if not self.isEmbed:
        embed = None
    else:
        # The guidance image is downscaled by 4; per the original author's
        # note this is meant to match the spatial size of the cost volume.
        left_scale = F.interpolate(
            left, [left.size()[2] // 4, left.size()[3] // 4],
            mode='bilinear', align_corners=True)
        embed = self.embednet(left_scale)

        # NOTE(review): the source's indentation was lost; the filtering
        # loop is assumed to run only when the embedding is enabled --
        # confirm. The original author flagged this loop as potentially
        # memory-hungry.
        with torch.set_grad_enabled(self.cost_filter_grad):
            for d in range(cost.size()[2]):
                # Filter one disparity slice and write it back in place.
                cv_d_slice = cost[:, :, d, :, :].contiguous()
                cost[:, :, d, :, :] = self.bifilter(embed, cv_d_slice)
    cost = cost.contiguous()

    cost0 = self.dres0(cost)
    cost0 = self.dres1(cost0) + cost0

    out1, pre1, post1 = self.dres2(cost0, None, None)
    out1 = out1 + cost0

    out2, _, post2 = self.dres3(out1, pre1, post1)
    out2 = out2 + cost0

    # dres4 is fed pre1 (not the pre-output of dres3), as in the original.
    out3, _, _ = self.dres4(out2, pre1, post2)
    out3 = out3 + cost0

    cost1 = self.classif1(out1)
    cost2 = self.classif2(out2) + cost1
    cost3 = self.classif3(out3) + cost2

    full_size = [self.maxdisp, left.size()[2], left.size()[3]]

    def _regress(volume):
        # Upsample to full size, drop the channel dim, soft-argmin.
        volume = F.interpolate(volume, full_size, mode='trilinear',
                               align_corners=True)
        volume = torch.squeeze(volume, 1)
        # softmax(c) vs softmax(-c): per the original note the sign does
        # not affect performance with a feature-based cost volume.
        return disparityregression(self.maxdisp)(F.softmax(volume, dim=1))

    # NOTE(review): both ``if`` conditions were missing in the source;
    # restored as ``self.training`` (intermediate predictions are only
    # returned by the first return branch) -- confirm against upstream.
    if self.training:
        pred1 = _regress(cost1)
        pred2 = _regress(cost2)

    pred3 = _regress(cost3)

    if self.training:
        return pred1, pred2, pred3, embed
    else:
        return pred3, embed
def forward(self, imgLeft, imgRight):
    """Estimate disparity with SGA-based cost aggregation.

    The cost volume is passed through ``self.sga_costAgg`` together with a
    guidance input ``g_in`` (``None`` when ``self.is_sga_guide_from_img`` is
    set, otherwise the output of ``self.embednet`` on a downscaled left
    image) and the full-size left image. Gradient tracking for that step is
    controlled by ``self.cost_filter_grad``.

    When ``self.is_quarter_size_cost_volume_gcnet`` is set, the inputs are
    downsampled by 2 and the aggregated cost volume is trilinearly upsampled
    to ``[self.maxdisp, H, W]`` before the softmax.

    Args:
        imgLeft: left images; 4-D with exactly 3 channels (asserted).
        imgRight: right images; presumably the same shape as ``imgLeft``
            -- TODO confirm against callers.

    Returns:
        ``(disp, g_in)`` where ``disp`` is the result of
        ``self.disparityregression`` on the softmax of the cost volume.
    """
    n_batch, n_chan, height, width = imgLeft.size()[:]
    assert n_chan == 3, 'should be RGB images as input'

    use_quarter = self.is_quarter_size_cost_volume_gcnet

    # Optional extra 2x downsampling of the inputs (quarter-size cost volume).
    if use_quarter:
        scale = 2
        half_size = [height // 2, width // 2]
        left_in = F.interpolate(imgLeft, half_size, mode='bilinear',
                                align_corners=True)
        right_in = F.interpolate(imgRight, half_size, mode='bilinear',
                                 align_corners=True)
    else:
        scale = 1
        left_in, right_in = imgLeft, imgRight

    # Feature extraction (same module applied to both views).
    feat_left = self.feature_extraction(left_in)
    feat_right = self.feature_extraction(right_in)

    # Cost volume over maxdisp / (2 * scale) disparity levels.
    volume = cost_volume_faster(feat_left, feat_right,
                                d=self.maxdisp // (2 * scale))

    # Guidance for SGA: either derived from the image inside sga_costAgg
    # (g_in is None) or computed here by the embedding network.
    if self.is_sga_guide_from_img:
        g_in = None
    else:
        # NOTE(review): the factor 4 is hard-coded and does not depend on
        # the downsample scale chosen above -- confirm this is intended
        # when the quarter-size flag is off.
        guide_img = F.interpolate(imgLeft, [height // 4, width // 4],
                                  mode='bilinear', align_corners=True)
        g_in = self.embednet(guide_img)

    # SGA cost aggregation; flagged by the original author as potentially
    # memory-hungry.
    with torch.set_grad_enabled(self.cost_filter_grad):
        volume = self.sga_costAgg(volume, g_in, img_for_g=imgLeft)
    volume = volume.contiguous()

    # Cost volume aggregation; only the selected aggregator is looked up.
    aggregate = (self.cost_aggregation_kendall
                 if self.is_kendall_version else self.cost_aggregation)
    logits = aggregate(volume)
    logits = logits.view(n_batch, self.maxdisp // scale,
                         height // scale, width // scale)

    # Upsample the cost volume itself (not the regressed disparity) so the
    # soft-argmin below covers [0, maxdisp) rather than a reduced range.
    if use_quarter:
        logits = logits.unsqueeze(1)  # [N, D, H, W] -> [N, 1, D, H, W]
        logits = F.interpolate(logits, [self.maxdisp, height, width],
                               mode='trilinear', align_corners=True)
        logits = logits.squeeze(1)  # back to [N, D, H, W]

    prob = F.softmax(logits, dim=1)
    disp = self.disparityregression(prob, maxdisp=self.maxdisp)
    return disp, g_in
def forward(self, left, right):
    """Estimate disparity with SGA-based cost aggregation (stacked variant).

    A cost volume is built from features of both views and passed through
    ``self.sga_costAgg`` together with a guidance input ``g_in`` (``None``
    when ``self.is_sga_guide_from_img`` is set, otherwise the output of
    ``self.embednet`` on the left image downscaled by 4) and the full-size
    left image. Gradient tracking for that step is controlled by
    ``self.cost_filter_grad``. The volume then goes through the ``dres*`` /
    ``classif*`` stages; each classifier output is trilinearly upsampled to
    ``[self.maxdisp, H, W]`` and regressed to a disparity map.

    Args:
        left: left images; dims 2 and 3 are used as the output height/width.
        right: right images; presumably the same shape as ``left``
            -- TODO confirm against callers.

    Returns:
        ``(pred1, pred2, pred3, g_in)`` in training mode, otherwise
        ``(pred3, g_in)``.
    """
    x = self.feature_extraction(left)   # left feature
    y = self.feature_extraction(right)  # right feature

    # Matching volume; per the original note, [N, 2C, D/4, H/4, W/4].
    cost = cost_volume_faster(x, y, self.maxdisp // 4)

    # Guidance for SGA: either derived from the image inside sga_costAgg
    # (g_in is None) or computed here by the embedding network.
    if self.is_sga_guide_from_img:
        g_in = None
    else:
        # The guidance image is downscaled by 4; per the original author's
        # note this is meant to match the spatial size of the cost volume.
        left_scale = F.interpolate(
            left, [left.size()[2] // 4, left.size()[3] // 4],
            mode='bilinear', align_corners=True)
        g_in = self.embednet(left_scale)

    # SGA cost aggregation; flagged by the original author as potentially
    # memory-hungry.
    with torch.set_grad_enabled(self.cost_filter_grad):
        cost = self.sga_costAgg(cost, g_in, img_for_g=left)

    cost0 = self.dres0(cost)
    cost0 = self.dres1(cost0) + cost0

    out1, pre1, post1 = self.dres2(cost0, None, None)
    out1 = out1 + cost0

    out2, _, post2 = self.dres3(out1, pre1, post1)
    out2 = out2 + cost0

    # dres4 is fed pre1 (not the pre-output of dres3), as in the original.
    out3, _, _ = self.dres4(out2, pre1, post2)
    out3 = out3 + cost0

    cost1 = self.classif1(out1)
    cost2 = self.classif2(out2) + cost1
    cost3 = self.classif3(out3) + cost2

    full_size = [self.maxdisp, left.size()[2], left.size()[3]]

    def _regress(volume):
        # Upsample to full size, drop the channel dim, soft-argmin.
        volume = F.interpolate(volume, full_size, mode='trilinear',
                               align_corners=True)
        volume = torch.squeeze(volume, 1)
        # softmax(c) vs softmax(-c): per the original note the sign does
        # not affect performance with a feature-based cost volume.
        return disparityregression(self.maxdisp)(F.softmax(volume, dim=1))

    # NOTE(review): both ``if`` conditions were missing in the source;
    # restored as ``self.training`` (intermediate predictions are only
    # returned by the first return branch) -- confirm against upstream.
    if self.training:
        pred1 = _regress(cost1)
        pred2 = _regress(cost2)

    pred3 = _regress(cost3)

    if self.training:
        return pred1, pred2, pred3, g_in
    else:
        return pred3, g_in