import torch.nn as nn

import Layers  # repo-local module providing the GlobalLocalPred sparse layer
# Assumption: BasicBlock / Bottleneck are the standard torchvision residual blocks.
from torchvision.models.resnet import BasicBlock, Bottleneck


def __init__(self, codeword_multiplier, out_dims, block, layers, num_classes=1000,
             zero_init_residual=False, groups=1, width_per_group=64,
             replace_stride_with_dilation=None, norm_layer=None):
    super(GlobalLocalPredResNet, self).__init__()
    if norm_layer is None:
        norm_layer = nn.BatchNorm2d
    self._norm_layer = norm_layer
    in_dims = []
    self.inplanes = 64
    self.dilation = 1
    if replace_stride_with_dilation is None:
        # each element in the tuple indicates if we should replace
        # the 2x2 stride with a dilated convolution instead
        replace_stride_with_dilation = [False, False, False]
    if len(replace_stride_with_dilation) != 3:
        raise ValueError("replace_stride_with_dilation should be None "
                         "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
    self.groups = groups
    self.base_width = width_per_group
    self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                           bias=False)
    self.bn1 = norm_layer(self.inplanes)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

    # Record the output width of every stage that feeds a sparse layer.
    self.layer1 = self._make_layer(block, 64, layers[0])
    in_dims.append(self.inplanes)
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                   dilate=replace_stride_with_dilation[0])
    in_dims.append(self.inplanes)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                   dilate=replace_stride_with_dilation[1])
    in_dims.append(self.inplanes)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                   dilate=replace_stride_with_dilation[2])
    in_dims.append(self.inplanes)
    # The final width is appended twice, so two sparse layers are attached
    # to the last stage.
    in_dims.append(self.inplanes)

    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
    self.fc = nn.Linear(512 * block.expansion, num_classes)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    # Zero-initialize the last BN in each residual branch, so that the
    # residual branch starts with zeros and each residual block behaves like
    # an identity. This improves the model by 0.2~0.3% according to
    # https://arxiv.org/abs/1706.02677
    if zero_init_residual:
        for m in self.modules():
            if isinstance(m, Bottleneck):
                nn.init.constant_(m.bn3.weight, 0)
            elif isinstance(m, BasicBlock):
                nn.init.constant_(m.bn2.weight, 0)

    self.sparse_layers = nn.ModuleList()
    for input_dim, output_dim in zip(in_dims, out_dims):
        self.sparse_layers.append(
            Layers.GlobalLocalPred(input_dim, output_dim * codeword_multiplier))
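# Usage sketch, not from the repo: assumes GlobalLocalPredResNet is the class
# owning the __init__ above and otherwise mirrors torchvision's ResNet
# (_make_layer, forward, etc.). out_dims must provide one entry per in_dims
# append, i.e. five here (layer1-layer4 plus the duplicated final stage);
# each entry is scaled by codeword_multiplier to size its sparse layer.
#
#     model = GlobalLocalPredResNet(
#         codeword_multiplier=2,
#         out_dims=[64, 128, 256, 512, 512],
#         block=BasicBlock,
#         layers=[2, 2, 2, 2],  # ResNet-18 layout
#     )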
def __init__(self, nb_codewords, scale, nb_class=10, return_output=False):
    super(PredAllCNN, self).__init__()
    self.scale = scale
    self.return_output = return_output
    in_dims = []

    # Stem. Spatial-size comments assume a 224x224 input and scale=1.
    nb_conv = int(48 * scale)
    self.conv1 = nn.Conv2d(3, nb_conv, kernel_size=7, stride=2, padding=3, bias=True)
    self.bn1 = nn.BatchNorm2d(nb_conv)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    in_dims.append(nb_conv)  # 56x56x48

    in_conv = nb_conv  # conv1's (scaled) output width
    nb_conv = int(96 * scale)
    self.block1 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))
    in_dims.append(nb_conv)  # 28x28x96

    in_conv = nb_conv
    nb_conv = int(192 * scale)
    self.block2 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))
    in_dims.append(nb_conv)  # 14x14x192

    in_conv = nb_conv
    nb_conv = int(256 * scale)
    self.block3 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))  # 7x7x256
    # No sparse layer is attached to block3, so its width is not recorded.

    in_conv = nb_conv
    nb_conv = int(384 * scale)
    self.block4 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 1, 1, 0))))
    # The final width is appended twice: two sparse layers on the last block.
    in_dims.append(nb_conv)
    in_dims.append(nb_conv)

    self.output_layer = nn.Linear(nb_conv, nb_class)

    self.sparse_layers = nn.ModuleList()
    for idx, in_dim in enumerate(in_dims):
        self.sparse_layers.append(Layers.GlobalLocalPred(in_dim, nb_codewords[idx]))

    self.initialize()
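# Usage sketch, not from the repo: assumes PredAllCNN is the class owning the
# __init__ above. nb_codewords needs one entry per in_dims append, i.e. five
# (stem, block1, block2, and block4 twice); block3 has no sparse layer. The
# codeword counts below are illustrative, not values from the paper.
#
#     model = PredAllCNN(
#         nb_codewords=[96, 192, 384, 512, 512],
#         scale=1.0,
#         nb_class=10,
#         return_output=True,
#     )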
def __init__(self, codeword_multiplier, centers, intercept, scale, nb_class=10,
             return_output=False):
    super(GlobalLocalPred, self).__init__()
    self.scale = scale
    self.return_output = return_output
    in_dims = []

    in_conv = 3
    nb_conv = int(96 * scale)
    self.block1 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))
    in_dims.append(nb_conv)

    in_conv = nb_conv
    nb_conv = int(192 * scale)
    self.block2 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))
    in_dims.append(nb_conv)

    in_conv = nb_conv
    nb_conv = int(256 * scale)
    self.block3 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 2, 1))))
    # No sparse layer is attached to block3.

    in_conv = nb_conv
    nb_conv = int(384 * scale)
    self.block4 = nn.Sequential(*self.block_(((in_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 3, 1, 1),
                                              (nb_conv, nb_conv, 1, 1, 0))))
    in_dims.append(nb_conv)

    self.output_layer = nn.Linear(nb_conv, nb_class)

    # One centers/intercept entry per sparse layer (three layers in total).
    if centers is None:
        centers = [None] * 3
    if intercept is None:
        intercept = [None] * 3

    self.sparse_layers = nn.ModuleList()
    teacher_in_dims = [128, 256, 1024]  # widths of the matching teacher layers
    for i in range(3):
        self.sparse_layers.append(
            Layers.GlobalLocalPred(in_dims[i], teacher_in_dims[i] * codeword_multiplier,
                                   centers=centers[i], intercept=intercept[i]))

    self.initialize()
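# Usage sketch, not from the repo: assumes GlobalLocalPred is the student
# network owning the __init__ above, trained against a teacher whose tapped
# layers have widths 128, 256 and 1024 (the hard-coded teacher_in_dims).
# Passing centers=None / intercept=None presumably lets each sparse layer
# initialize its own codebook; precomputed per-layer lists can be passed instead.
#
#     model = GlobalLocalPred(
#         codeword_multiplier=1,
#         centers=None,
#         intercept=None,
#         scale=0.5,
#         nb_class=10,
#     )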