Example #1
def model():
    model = MetaSequential(
        MetaLinear(2, 3, bias=True),
        nn.ReLU(),
        MetaLinear(3, 1, bias=False))

    return model
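A minimal sketch of how such a fixture is typically consumed (an illustrative MAML-style inner step, not taken from the examples on this page): torchmeta modules accept an explicit params OrderedDict, so adapted parameters can be passed through the forward call. The step size, loss, and random data below are assumptions.

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmeta.modules import MetaLinear, MetaSequential

net = MetaSequential(
    MetaLinear(2, 3, bias=True),
    nn.ReLU(),
    MetaLinear(3, 1, bias=False))

inputs, targets = torch.randn(4, 2), torch.randn(4, 1)
named = list(net.meta_named_parameters())

# One gradient step on the meta-parameters, kept differentiable (create_graph=True)
loss = F.mse_loss(net(inputs), targets)
grads = torch.autograd.grad(loss, [p for _, p in named], create_graph=True)
params = OrderedDict((n, p - 0.1 * g) for (n, p), g in zip(named, grads))

# Forward pass with the adapted, task-specific parameters
adapted_outputs = net(inputs, params=params)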
Example #2
    def __init__(self):
        super(MIL, self).__init__()
        h1, w1 = 200, 200
        h2, w2 = calcHW(h1, w1, kernel_size=8, stride=2)
        h3, w3 = calcHW(h2, w2, kernel_size=4, stride=2)
        h4, w4 = calcHW(h3, w3, kernel_size=4, stride=2)

        self.features = MetaSequential(
            MetaConv2d(in_channels=3, out_channels=32, kernel_size=8,
                       stride=2), MetaLayerNorm([32, h2, w2]), nn.ReLU(),
            MetaConv2d(in_channels=32,
                       out_channels=64,
                       kernel_size=4,
                       stride=2), MetaLayerNorm([64, h3, w3]), nn.ReLU(),
            MetaConv2d(in_channels=64,
                       out_channels=64,
                       kernel_size=4,
                       stride=2), MetaLayerNorm([64, h4, w4]), nn.ReLU(),
            SpatialSoftmax(h4, w4))

        self.policy = MetaSequential(
            MetaLinear(2 * 64 + 3, 128),
            nn.ReLU(),
            MetaLinear(128, 128),
            nn.ReLU(),
            MetaLinear(128, 128),
            nn.ReLU(),
            MetaLinear(128, 4),
        )
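The MIL example above relies on two helpers from its own codebase, calcHW and SpatialSoftmax. A plausible sketch of calcHW, assuming the standard output-size formula for a convolution with padding=0 and dilation=1 (the actual helper may differ):

def calcHW(h, w, kernel_size, stride):
    # out = floor((in - kernel_size) / stride) + 1, for padding=0 and dilation=1
    h_out = (h - kernel_size) // stride + 1
    w_out = (w - kernel_size) // stride + 1
    return h_out, w_out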
Example #3
    def __init__(self,
                 out_features,
                 in_channels=1,
                 hidden_size=32,
                 mid_feats=256,
                 feature_size=25088):
        super(MetaMNISTConvModel, self).__init__()

        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size
        self.feature_size = feature_size
        self.mid_feats = mid_feats

        self.features = MetaSequential(
            OrderedDict([
                ('layer1',
                 conv_block(in_channels,
                            hidden_size,
                            kernel_size=3,
                            stride=1,
                            padding=1,
                            bias=True)),
                ('layer2',
                 conv_block(hidden_size,
                            hidden_size,
                            kernel_size=3,
                            stride=1,
                            padding=1,
                            bias=True)),
            ]))
        self.classifier_first = MetaLinear(feature_size, mid_feats, bias=True)
        self.classifier = MetaLinear(mid_feats, out_features, bias=True)
Example #4
    def __init__(self, l_obs, n_action, l1=64, l2=64):
        super(MetaNet_PG, self).__init__()
        self.l_obs = l_obs
        self.n_action = n_action
        self.l1 = l1
        self.l2 = l2
        self.actor_net = MetaSequential(
            MetaLinear(self.l_obs, self.l1), nn.ReLU(),
            MetaLinear(self.l1, self.l2), nn.ReLU(),
            MetaLinear(self.l2, self.n_action), nn.Sigmoid(),
            MetaLinear(self.n_action, self.n_action))
Example #5
    def __init__(self, in_channels, hidden1_size=40, hidden2_size=80):
        super(RegressionNeuralNetwork, self).__init__()
        self.in_channels = in_channels
        self.hidden1_size = hidden1_size
        self.hidden2_size = hidden2_size

        self.regressor = MetaSequential(MetaLinear(in_channels, hidden1_size),
                                        nn.ReLU(),
                                        MetaLinear(hidden1_size, hidden2_size),
                                        nn.ReLU(),
                                        MetaLinear(hidden2_size, hidden1_size),
                                        nn.ReLU(), MetaLinear(hidden1_size, 1))
Example #6
    def __init__(self, in_features, out_features, hidden_sizes):
        super(MetaMLPModel, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_sizes = hidden_sizes

        layer_sizes = [in_features] + hidden_sizes
        self.features = MetaSequential(OrderedDict([
            ('layer{0}'.format(i + 1),
             MetaSequential(OrderedDict([
                 ('linear', MetaLinear(hidden_size, layer_sizes[i + 1], bias=True)),
                 ('relu', nn.ReLU())])))
            for (i, hidden_size) in enumerate(layer_sizes[:-1])]))
        self.classifier = MetaLinear(hidden_sizes[-1], out_features, bias=True)
Example #7
def test_ridge_regression_requires_grad(reg_lambda, use_woodbury, scale, bias):
    # Numpy
    num_classes = 3
    embeddings_np = np.random.randn(5, 7).astype(np.float32)
    targets_np = np.random.randint(0, num_classes, size=(5, ))

    # PyTorch
    embeddings_th = torch.as_tensor(embeddings_np).requires_grad_()
    targets_th = torch.as_tensor(targets_np)
    model = MetaLinear(7, 3, bias=bias)

    solution = ridge_regression(embeddings_th,
                                targets_th,
                                reg_lambda,
                                num_classes,
                                use_woodbury=use_woodbury,
                                scale=scale,
                                bias=bias)
    params = OrderedDict([('weight', solution.weight)])
    if bias:
        params['bias'] = solution.bias

    # Compute loss on test/query samples
    test_embeddings = torch.randn(11, 7)
    test_logits = model(test_embeddings, params=params)
    test_targets = torch.randint(num_classes, size=(11, ))
    loss = F.cross_entropy(test_logits, test_targets)

    # Backpropagation
    loss.backward()

    assert embeddings_th.grad is not None
Example #8
    def __init__(self,
                 in_dim,
                 out_dim,
                 num_layers=3,
                 hidden_size=64,
                 nonlinearity="relu"):
        super(MultiLayerPerceptron, self).__init__()
        self.in_dim = in_dim
        self.hidden_size = hidden_size
        self.out_dim = out_dim

        if nonlinearity == "relu":
            self.activation = nn.ReLU
        elif nonlinearity == "swish":
            self.activation = swish
        elif nonlinearity == "sigmoid":
            self.activation = nn.Sigmoid
        else:
            raise ValueError("unknown nonlinearity: {}".format(nonlinearity))

        self.layer_list = [
            nn.Flatten(),
            nn.Linear(in_dim, hidden_size),
            self.activation()
        ]
        for _ in range(num_layers):
            self.layer_list.extend(
                [nn.Linear(hidden_size, hidden_size),
                 self.activation()])

        # Should be able to add variable layers
        self.features = MetaSequential(*self.layer_list)

        self.classifier = MetaLinear(hidden_size, out_dim)
Example #9
    def __init__(self,
                 out_features,
                 in_channels=1,
                 hidden_size=64,
                 feature_size=64):
        super(MetaToyConvModel, self).__init__()

        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size
        self.feature_size = feature_size

        self.features = MetaSequential(
            OrderedDict([
                ('layer1',
                 conv_block(in_channels,
                            hidden_size,
                            kernel_size=3,
                            stride=1,
                            padding=1,
                            bias=True)),
                ('layer2',
                 conv_block(hidden_size,
                            hidden_size,
                            kernel_size=3,
                            stride=1,
                            padding=1,
                            bias=True)),
                # ('layer3', conv_block(hidden_size, hidden_size, kernel_size=3,
                #                       stride=1, padding=1, bias=True)),
                # ('layer4', conv_block(hidden_size, hidden_size, kernel_size=3,
                #                       stride=1, padding=1, bias=True))
            ]))
        self.classifier = MetaLinear(feature_size, out_features, bias=True)
Example #10
def test_metasequential_params():
    meta_model = MetaSequential(
        nn.Linear(2, 3, bias=True),
        nn.ReLU(),
        MetaLinear(3, 5, bias=True))
    model = nn.Sequential(
        nn.Linear(2, 3, bias=True),
        nn.ReLU(),
        nn.Linear(3, 5, bias=True))

    # Set same weights for both models (first layer)
    weight0 = torch.randn(3, 2)
    meta_model[0].weight.data.copy_(weight0)
    model[0].weight.data.copy_(weight0)

    bias0 = torch.randn(3)
    meta_model[0].bias.data.copy_(bias0)
    model[0].bias.data.copy_(bias0)

    params = OrderedDict()
    params['2.weight'] = torch.randn(5, 3)
    model[2].weight.data.copy_(params['2.weight'])

    params['2.bias'] = torch.randn(5)
    model[2].bias.data.copy_(params['2.bias'])

    inputs = torch.randn(5, 2)

    outputs_torchmeta = meta_model(inputs, params=params)
    outputs_nn = model(inputs)

    np.testing.assert_equal(outputs_torchmeta.detach().numpy(),
                            outputs_nn.detach().numpy())
Example #11
    def __init__(self,
                 in_channels,
                 out_features,
                 hidden_size=64,
                 fc_in_size=None,
                 conv_kernel=[3, 3, 3, 3],
                 strides=None):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size

        if strides is None:
            strides = [None] * len(conv_kernel)
        else:
            assert (len(strides) == len(conv_kernel))

        self.features = MetaSequential(
            conv3x3(in_channels, hidden_size, conv_kernel[0], strides[0]), )
        for k in range(1, len(conv_kernel)):
            self.features.add_module(
                'block_' + str(k),
                conv3x3(hidden_size, hidden_size, conv_kernel[k], strides[k]))

        if fc_in_size is None:
            fc_in_size = hidden_size
        self.classifier = MetaLinear(fc_in_size, out_features)
Example #12
    def __init__(self,
                 out_features=10,
                 input_size=100,
                 hidden_size1=300,
                 hidden_size2=200):

        super(MLP, self).__init__()
        self.out_features = out_features

        self.features = MetaSequential(nn.Linear(input_size, hidden_size1),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size1, hidden_size2),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size2, hidden_size1),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size1, hidden_size2),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size2, hidden_size1),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size1, hidden_size2),
                                       nn.ReLU(),
                                       nn.Linear(hidden_size2, hidden_size1),
                                       nn.ReLU())

        self.classifier = MetaLinear(hidden_size1, out_features)
Example #13
    def __init__(self, backbone, in_features, num_ways):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.in_features = in_features
        self.num_ways = num_ways

        self.encoder = get_meta_backbone(backbone)
        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        self.classifier = MetaLinear(in_features, num_ways) #1600
Example #14
    def __init__(self, nc, num_classes, block, num_blocks):
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.conv1 = MetaConv2d(nc, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = MetaBatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = MetaLinear(512 * block.expansion, num_classes)
Example #15
    def __init__(self, model_dict, dataset, device):
        super().__init__()

        self.dataset = dataset

        self.model_dict = model_dict
        if self.model_dict['name'] == 'resnet18':
            if self.model_dict['pretrained']:
                self.net = models.resnet18(pretrained=True)
                self.net.fc = nn.Linear(512, self.dataset.n_classes)
            else:
                self.net = models.resnet18(num_classes=self.dataset.n_classes)

        elif self.model_dict['name'] == 'resnet18_meta':
            if self.model_dict.get('pretrained', True):
                self.net = resnet_meta.resnet18(pretrained=True)
                self.net.fc = MetaLinear(512, self.dataset.n_classes)
            else:
                self.net = resnet_meta.resnet18(
                    num_classes=self.dataset.n_classes)
        elif self.model_dict['name'] == 'resnet18_meta_2':
            self.net = resnet_meta_2.ResNet18(nc=3,
                                              nclasses=self.dataset.n_classes)

        elif self.model_dict['name'] == 'resnet18_meta_old':
            self.net = resnet_meta_old.ResNet18(
                nc=3, nclasses=self.dataset.n_classes)

        else:
            raise ValueError('network %s does not exist' % model_dict['name'])

        if (device.type == 'cuda'):
            self.net = DataParallel(self.net)
        self.net.to(device)
        # set optimizer
        self.opt_dict = model_dict['opt']
        self.lr_init = self.opt_dict['lr']
        if self.model_dict['opt']['name'] == 'sps':
            n_batches_per_epoch = 120
            self.opt = sps.Sps(self.net.parameters(),
                               n_batches_per_epoch=n_batches_per_epoch,
                               c=0.5,
                               adapt_flag='smooth_iter',
                               eps=0,
                               eta_max=None)
        else:
            self.opt = optim.SGD(self.net.parameters(),
                                 lr=self.opt_dict['lr'],
                                 momentum=self.opt_dict['momentum'],
                                 weight_decay=self.opt_dict['weight_decay'])

        # variables
        self.device = device
Example #16
    def __init__(self, in_channels, out_features, hidden_size=64):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size

        self.features = MetaSequential(conv3x3(in_channels, hidden_size),
                                       conv3x3(hidden_size, hidden_size),
                                       conv3x3(hidden_size, hidden_size),
                                       conv3x3(hidden_size, hidden_size))

        self.classifier = MetaLinear(hidden_size, out_features)
Example #17
    def __init__(self,
                 blocks,
                 keep_prob=1.0,
                 avg_pool=False,
                 drop_rate=0.0,
                 dropblock_size=5,
                 out_features=5,
                 wh_size=None):
        self.inplanes = 3
        super(ResNet, self).__init__()

        self.layer1 = self._make_layer(blocks[0],
                                       64,
                                       stride=2,
                                       drop_rate=drop_rate,
                                       drop_block=True,
                                       block_size=dropblock_size)
        self.layer2 = self._make_layer(blocks[1],
                                       128,
                                       stride=2,
                                       drop_rate=drop_rate,
                                       drop_block=True,
                                       block_size=dropblock_size)
        self.layer3 = self._make_layer(blocks[2],
                                       256,
                                       stride=2,
                                       drop_rate=drop_rate,
                                       drop_block=True,
                                       block_size=dropblock_size)
        self.layer4 = self._make_layer(blocks[3],
                                       512,
                                       stride=2,
                                       drop_rate=drop_rate,
                                       drop_block=True,
                                       block_size=dropblock_size)
        if avg_pool:
            self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.keep_prob = keep_prob
        self.keep_avg_pool = avg_pool
        self.dropout = nn.Dropout(p=1 - self.keep_prob, inplace=False)
        self.drop_rate = drop_rate

        self.classifier = MetaLinear(512 * wh_size * wh_size, out_features)

        for m in self.modules():
            if isinstance(m, MetaConv2d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='leaky_relu')
            elif isinstance(m, MetaBatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
Example #18
    def __init__(self, in_channels, out_features, hidden_size=64):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size

        self.features = nn.Sequential(conv3x3(in_channels, hidden_size),
                                      conv3x3(hidden_size, hidden_size),
                                      conv3x3(hidden_size, hidden_size),
                                      conv3x3(hidden_size, hidden_size))

        # Only the last (linear) layer is used for adaptation in ANIL
        self.classifier = MetaLinear(hidden_size, out_features)
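As the comment notes, ANIL adapts only the last linear layer. A minimal, self-contained sketch of that idea with a standalone MetaLinear classifier (the 5-way task, step size 0.4, and random stand-in features are illustrative assumptions):

from collections import OrderedDict

import torch
import torch.nn.functional as F
from torchmeta.modules import MetaLinear

classifier = MetaLinear(64, 5)
support_feats = torch.randn(10, 64)        # stand-in for pooled conv features
support_targets = torch.randint(5, size=(10,))

# One differentiable gradient step on the classifier's parameters only
named = list(classifier.meta_named_parameters())
loss = F.cross_entropy(classifier(support_feats), support_targets)
grads = torch.autograd.grad(loss, [p for _, p in named], create_graph=True)
params = OrderedDict((n, p - 0.4 * g) for (n, p), g in zip(named, grads))

# Query-set predictions with the adapted classifier; the feature extractor is untouched
query_logits = classifier(torch.randn(15, 64), params=params)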
Example #19
def test_ridge_regression(reg_lambda, use_woodbury, scale, bias):
    # Numpy
    num_classes = 3
    embeddings_np = np.random.randn(5, 7).astype(np.float32)
    targets_np = np.random.randint(0, num_classes, size=(5, ))

    # PyTorch
    embeddings_th = torch.as_tensor(embeddings_np)
    targets_th = torch.as_tensor(targets_np)
    model = MetaLinear(7, 3, bias=bias)

    solution = ridge_regression(embeddings_th,
                                targets_th,
                                reg_lambda,
                                num_classes,
                                use_woodbury=use_woodbury,
                                scale=scale,
                                bias=bias)

    assert solution.weight.shape == (3, 7)
    if bias:
        assert solution.bias is not None
        assert solution.bias.shape == (3, )
    else:
        assert solution.bias is None

    # Optimality criterion
    # Check if the gradient of the L2-regularized MSE at the solution
    # is close to 0
    params = OrderedDict([('weight', solution.weight.requires_grad_())])
    if bias:
        params['bias'] = solution.bias.requires_grad_()

    logits = model(embeddings_th, params=params)
    targets_binary = F.one_hot(targets_th, num_classes=num_classes).float()

    # Least-square
    loss = F.mse_loss(logits, targets_binary, reduction='sum')
    if scale:
        loss /= embeddings_th.size(0)

    # L2-regularization
    loss += reg_lambda * torch.sum(solution.weight**2)
    if bias:
        loss += reg_lambda * torch.sum(solution.bias**2)
    loss.backward()

    np.testing.assert_allclose(solution.weight.grad.numpy(), 0., atol=1e-4)
    if bias:
        np.testing.assert_allclose(solution.bias.grad.numpy(), 0., atol=1e-4)
Example #20
    def __init__(self,
                 channels,
                 init_block_channels,
                 bottleneck,
                 conv1_stride,
                 in_channels=3,
                 in_size=(224, 224),
                 num_classes=1000,
                 mode='',
                 linear=True):
        super(ResNet, self).__init__()
        self.in_size = in_size
        self.num_classes = num_classes
        self.mode = mode
        self.linear = linear

        self.features = MetaSequential()
        self.features.add_module(
            "init_block",
            ResInitBlock(in_channels=in_channels,
                         out_channels=init_block_channels,
                         mode=self.mode))
        in_channels = init_block_channels
        for i, channels_per_stage in enumerate(channels):
            stage = MetaSequential()
            for j, out_channels in enumerate(channels_per_stage):
                stride = 2 if (j == 0) and (i != 0) else 1
                stage.add_module(
                    "unit{}".format(j + 1),
                    ResUnit(in_channels=in_channels,
                            out_channels=out_channels,
                            stride=stride,
                            bottleneck=bottleneck,
                            conv1_stride=conv1_stride,
                            mode=self.mode))
                in_channels = out_channels
            self.features.add_module("stage{}".format(i + 1), stage)
        self.features.add_module("final_pool", nn.AdaptiveAvgPool2d(1))
        # self.features.add_module("final_pool", nn.AvgPool2d(kernel_size=7, stride=1))

        if self.mode == 'maml':
            self.output = MetaLinear(in_features=in_channels,
                                     out_features=num_classes)
        else:
            self.output = nn.Linear(in_features=in_channels,
                                    out_features=num_classes)

        self._init_params()
Example #21
    def __init__(self, in_channels, input_dim, out_features, hidden_size=64):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.in_channels = in_channels
        self.out_features = out_features

        self.features = MetaSequential(conv3x3(in_channels, hidden_size),
                                       conv3x3(hidden_size, hidden_size),
                                       conv3x3(hidden_size, hidden_size),
                                       conv3x3(hidden_size, hidden_size))

        # The spatial size is halved once per conv block (four blocks total),
        # which determines the flattened input size of the classifier below.
        idim = input_dim
        for _ in range(4):
            idim = math.floor(idim / 2)

        self.linear_input = hidden_size * idim * idim

        self.classifier = MetaLinear(self.linear_input, out_features)
Example #22
    def __init__(self,
                 in_channels,
                 out_features,
                 hidden_size=64,
                 feature_size=64,
                 embedding=False):
        super(MetaConvModel, self).__init__()
        self.in_channels = in_channels
        self.out_features = out_features
        self.hidden_size = hidden_size
        self.feature_size = feature_size
        self.embedding = embedding

        self.features = MetaSequential(
            OrderedDict([('layer1', conv_block(in_channels, hidden_size)),
                         ('layer2', conv_block(hidden_size, hidden_size)),
                         ('layer3', conv_block(hidden_size, hidden_size)),
                         ('layer4', conv_block(hidden_size, hidden_size))]))

        self.classifier = MetaLinear(feature_size, out_features)
Example #23
def test_metasequential():
    meta_model = MetaSequential(
        nn.Linear(2, 3, bias=True),
        nn.ReLU(),
        MetaLinear(3, 5, bias=True))
    model = nn.Sequential(
        nn.Linear(2, 3, bias=True),
        nn.ReLU(),
        nn.Linear(3, 5, bias=True))

    assert isinstance(meta_model, MetaModule)
    assert isinstance(meta_model, nn.Sequential)

    params = OrderedDict(meta_model.meta_named_parameters())
    assert set(params.keys()) == set(['2.weight', '2.bias'])

    # Set same weights for both models
    weight0 = torch.randn(3, 2)
    meta_model[0].weight.data.copy_(weight0)
    model[0].weight.data.copy_(weight0)

    bias0 = torch.randn(3)
    meta_model[0].bias.data.copy_(bias0)
    model[0].bias.data.copy_(bias0)

    weight2 = torch.randn(5, 3)
    meta_model[2].weight.data.copy_(weight2)
    model[2].weight.data.copy_(weight2)

    bias2 = torch.randn(5)
    meta_model[2].bias.data.copy_(bias2)
    model[2].bias.data.copy_(bias2)

    inputs = torch.randn(5, 2)

    outputs_torchmeta = meta_model(inputs, params=None)
    outputs_nn = model(inputs)

    np.testing.assert_equal(outputs_torchmeta.detach().numpy(),
                            outputs_nn.detach().numpy())
Example #24
    def __init__(self, in_dims, out_dims, hidden_size=100):
        super(PolicyNetwork, self).__init__()
        self.in_dims = in_dims
        self.out_dims = out_dims * 2  # diagonal Gaussian distribution
        self.hidden_size = hidden_size
        fc1 = MetaLinear(in_dims, hidden_size)
        fc2 = MetaLinear(hidden_size, hidden_size)
        fc3 = MetaLinear(hidden_size, self.out_dims)

        self.features = MetaSequential(
            fc1,
            nn.Tanh(),
            fc2,
            nn.Tanh(),
            fc3,
            nn.Tanh(),
        )

        self.activation = {}

        def get_activation(name):
            def hook(model, input, output):
                self.activation[name] = output.detach()

            return hook

        fc1.register_forward_hook(get_activation('fc1'))
        fc2.register_forward_hook(get_activation('fc2'))
        fc3.register_forward_hook(get_activation('fc3'))
Example #25
def model():
    model = MetaLinear(3, 1, bias=False)
    model.weight.data = torch.tensor([[2., 3., 5.]])
    return model
Example #26
    def __init__(self, node_embedding_dim, hidden_dim, num_classes):
        super(GraphClassificationOutputModule, self).__init__()
        self.linear1 = MetaLinear(node_embedding_dim, hidden_dim)
        self.linear2 = MetaLinear(hidden_dim, num_classes)
Example #27
    def __init__(self, node_embedding_dim):
        super(LinkPredictionOutputModule, self).__init__()
        self.linear_a = MetaLinear(node_embedding_dim, node_embedding_dim)
        self.linear = MetaLinear(2 * node_embedding_dim, 1)
Example #28
    def __init__(self, node_embedding_dim, num_classes):
        super(NodeClassificationOutputModule, self).__init__()
        self.linear = MetaLinear(node_embedding_dim, num_classes)
Example #29
def linear_model():
    return MetaLinear(2, 1)
Example #30
    def __init__(self,
                 block,
                 layers,
                 num_classes=1000,
                 zero_init_residual=False,
                 groups=1,
                 width_per_group=64,
                 replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = MetaBatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(
                                 replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = MetaConv2d(3,
                                self.inplanes,
                                kernel_size=7,
                                stride=2,
                                padding=3,
                                bias=False)
        # self.conv1 = MetaConv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1,
        #                        bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block,
                                       128,
                                       layers[1],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block,
                                       256,
                                       layers[2],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block,
                                       512,
                                       layers[3],
                                       stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = MetaLinear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, MetaConv2d):
                nn.init.kaiming_normal_(m.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m, (MetaBatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)