Example No. 1
    def __init__(self,
                 backbone,
                 img_size=224,
                 feature_size=None,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(
                    torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)
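Only the constructor is shown above. A forward pass consistent with it would run the image through the CNN backbone, take the last feature map, flatten the spatial dimensions into a token sequence, and apply the linear projection. The following is a minimal sketch of such a method, not necessarily the project's exact code:

    def forward(self, x):
        # Take the last feature map returned by the backbone.
        x = self.backbone(x)[-1]
        # (B, C, H, W) -> (B, H*W, C): one token per spatial position.
        x = x.flatten(2).transpose(1, 2)
        # Project backbone channels (feature_dim) to the embedding dimension.
        x = self.proj(x)
        return x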
Example No. 2
    def __init__(self,
                 backbone,
                 img_size=224,
                 feature_size=None,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(
                    torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)
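The FIXME comment refers to the dummy forward pass used to discover the backbone's output resolution and channel count when feature_size is not given. The same trick can be demonstrated in isolation; the toy backbone below is hypothetical and stands in for any CNN that returns a (B, C, H, W) feature map:

    import torch
    import torch.nn as nn

    # Hypothetical stand-in backbone; any convolutional feature extractor works.
    backbone = nn.Sequential(
        nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3),
        nn.ReLU(inplace=True),
        nn.Conv2d(64, 256, kernel_size=3, stride=2, padding=1),
    )

    with torch.no_grad():
        o = backbone(torch.zeros(1, 3, 224, 224))  # dummy forward pass

    feature_size = o.shape[-2:]  # spatial size of the final feature map (28x28 here)
    feature_dim = o.shape[1]     # channel dimension (256 here)
    print(feature_size, feature_dim)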
Example No. 3
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
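
Here the Conv2d, with kernel size and stride both equal to the patch size, splits the image into non-overlapping patches and embeds each one in a single operation. A forward pass consistent with this constructor would look roughly like the sketch below (the standard reshape, not necessarily the exact upstream code):

    def forward(self, x):
        B, C, H, W = x.shape
        # The fixed patch grid assumes the input matches the configured image size.
        assert H == self.img_size[0] and W == self.img_size[1], \
            "Input size doesn't match the img_size the module was built with."
        # (B, embed_dim, H/P, W/P) -> (B, num_patches, embed_dim)
        x = self.proj(x).flatten(2).transpose(1, 2)
        return x

With the defaults (img_size=224, patch_size=16), a 224x224 RGB image produces 14 * 14 = 196 patch tokens, each of dimension 768.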