def __init__(
    self,
    texture_channels=3,
    cloth_channels=19,
    num_roi=12,
    norm_type="batch",
    dropout=0.5,
    unet_type="pix2pix",
    img_size=128,
):
    super(TextureModule, self).__init__()
    self.roi_align = ROIAlign(
        output_size=(128, 128), spatial_scale=1, sampling_ratio=1
    )

    self.num_roi = num_roi
    channels = texture_channels * num_roi
    self.encode = UNetDown(channels, channels)

    # U-Net
    if unet_type == "pix2pix":
        # fast integer log2 of img_size; e.g. img_size=128 => num_downs=7
        num_downs = math.frexp(img_size)[1] - 1
        use_dropout = dropout is not None
        norm_layer = get_norm_layer(norm_type=norm_type)
        self.unet = pix2pix_modules.UnetGenerator(
            channels + cloth_channels,
            texture_channels,
            num_downs,
            norm_layer=norm_layer,
            use_dropout=use_dropout,
        )
    else:
        self.unet = nn.Sequential(
            UNetDown(channels + cloth_channels, 64, normalize=False),
            UNetDown(64, 128),
            UNetDown(128, 256),
            UNetDown(256, 512, dropout=dropout),
            UNetDown(512, 1024, dropout=dropout),
            UNetDown(1024, 1024, normalize=False, dropout=dropout),
            UNetUp(1024, 1024, dropout=dropout),
            UNetUp(2 * 1024, 512, dropout=dropout),
            UNetUp(2 * 512, 256),
            UNetUp(2 * 256, 128),
            UNetUp(2 * 128, 64),
            # upsample and pad to restore the full input resolution
            nn.Upsample(scale_factor=2),
            nn.ZeroPad2d((1, 0, 1, 0)),
            nn.Conv2d(128, texture_channels, 4, padding=1),
            nn.Tanh(),
        )
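# A minimal standalone check of the math.frexp log2 trick used above (the
# helper name below is hypothetical, not part of the module). For a
# power-of-two img_size, math.frexp returns (0.5, log2(size) + 1), so
# subtracting 1 gives the number of halvings needed to reach a 1x1 bottleneck.
import math

def _num_downs(img_size: int) -> int:
    # math.frexp(128) == (0.5, 8) -> 8 - 1 == 7 downsamplings for 128 -> 1
    return math.frexp(img_size)[1] - 1

assert _num_downs(128) == 7
assert all(_num_downs(2 ** k) == k for k in range(1, 11))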
def __init__(self, output_w, output_h, dropout=0.5, channels=3):
    super(MaskTryOnModule, self).__init__()

    self.face_down1 = UNetDown(channels, 64, normalize=False)
    self.face_down2 = UNetDown(64, 128)
    self.face_down3 = UNetDown(128, 256)
    self.face_down4 = UNetDown(256, 512)
    self.face_down5 = UNetDown(512, 1024, dropout=dropout)
    self.face_down6 = UNetDown(1024, 1024, normalize=False, dropout=dropout)

    # the UNetUp's below are used WITHOUT concatenation,
    # hence their input sizes do not double
    self.face_up1 = UNetUp(1024, 1024)
    self.face_up2 = UNetUp(1024, 512)
    self.face_up3 = UNetUp(512, 256)
    self.face_up4 = UNetUp(256, 128)
    self.face_up5 = UNetUp(128, 64)

    self.upsample_and_pad = nn.Sequential(
        nn.Upsample(scale_factor=2),
        nn.ZeroPad2d((1, 0, 1, 0)),
        nn.Conv2d(3 * 64, 3, 4, padding=1),
        nn.Tanh(),
    )
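# A hedged shape walk-through for MaskTryOnModule (assumes each UNetDown
# halves H/W and each UNetUp doubles H/W, the usual pix2pix-style behavior;
# the helper below is illustrative, not part of the module).
def _mask_tryon_spatial_size(input_size: int = 128) -> int:
    size = input_size
    for _ in range(6):  # face_down1 .. face_down6
        size //= 2
    for _ in range(5):  # face_up1 .. face_up5 (no skip concatenation)
        size *= 2
    return size

# 6 downs but only 5 ups leave the feature map at half resolution, which is
# why upsample_and_pad starts with nn.Upsample(scale_factor=2).
assert _mask_tryon_spatial_size(128) == 64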
def __init__(self, body_channels=3, cloth_channels=19, dropout=0.5):
    super(WarpModule, self).__init__()

    ######################
    # Body pre-encoding
    # (top left of SwapNet diagram)
    ######################
    self.body_down1 = UNetDown(body_channels, 64, normalize=False)
    self.body_down2 = UNetDown(64, 128)
    self.body_down3 = UNetDown(128, 256)
    self.body_down4 = UNetDown(256, 512, dropout=dropout)

    ######################
    # Cloth pre-encoding
    # (bottom left of SwapNet diagram)
    ######################
    self.cloth_down1 = UNetDown(cloth_channels, 64, normalize=False)
    self.cloth_down2 = UNetDown(64, 128)
    self.cloth_down3 = UNetDown(128, 256)
    self.cloth_down4 = UNetDown(256, 512)
    self.cloth_down5 = UNetDown(512, 1024, dropout=dropout)
    self.cloth_down6 = UNetDown(1024, 1024, normalize=False, dropout=dropout)
    # the two UNetUp's below are used WITHOUT concatenation,
    # hence their input sizes do not double
    self.cloth_up1 = UNetUp(1024, 1024)
    self.cloth_up2 = UNetUp(1024, 512)

    ######################
    # Resblocks
    # (middle of SwapNet diagram)
    ######################
    self.resblocks = nn.Sequential(
        # placing dropout inside the residual blocks is a guess;
        # the paper does not specify where it belongs
        ResidualBlock(1024, dropout=dropout),
        ResidualBlock(1024, dropout=dropout),
        ResidualBlock(1024, dropout=dropout),
        ResidualBlock(1024, dropout=dropout),
    )

    ######################
    # Dual Decoding
    # (right of SwapNet diagram)
    ######################
    # The SwapNet diagram only labels a single "cloth" decoder, so it is
    # unclear whether the original authors decode both paths. The dual
    # decoding used here follows "Multi-view Image Generation from a
    # Single-View".
    # ---------------------
    # input encoded (512) & cat body_d4 (512) cloth_d4 (512)
    self.dual_up1 = DualUNetUp(1024, 256)
    # input dual_up1 (256) & cat body_d3 (256) cloth_d3 (256)
    self.dual_up2 = DualUNetUp(3 * 256, 128)
    # input dual_up2 (128) & cat body_d2 (128) cloth_d2 (128)
    self.dual_up3 = DualUNetUp(3 * 128, 64)

    # There is no DualUNetUp paired with down1 because the U-Net decoder
    # stops at half the original image size; upsample once more, pad, and
    # project back to cloth_channels.
    self.upsample_and_pad = nn.Sequential(
        nn.Upsample(scale_factor=2),
        nn.ZeroPad2d((1, 0, 1, 0)),
        nn.Conv2d(3 * 64, cloth_channels, 4, padding=1),
        nn.Tanh(),
    )
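# A hedged channel-count walk-through for the dual decoder above
# (illustrative only; assumes each DualUNetUp's output is concatenated with
# the matching body and cloth encoder features before the next stage, per
# the inline comments).
_dual_stages = [
    # (in_channels, out_channels) as constructed above
    (1024, 256),     # dual_up1
    (3 * 256, 128),  # dual_up2: 256 (up1) + 256 (body_d3) + 256 (cloth_d3)
    (3 * 128, 64),   # dual_up3: 128 (up2) + 128 (body_d2) + 128 (cloth_d2)
]
for (_, out_ch), (next_in, _) in zip(_dual_stages, _dual_stages[1:]):
    # each stage's output, tripled by the two skip concatenations,
    # must equal the next stage's expected input
    assert 3 * out_ch == next_in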