def _build_block(self, c_in: int, c_inner: int, c_out: int, has_first_act=False) -> nn.Module:
    """
    Build a block of: [act ->] conv -> bn -> act -> conv -> bn

    The first convolution uses the stride, padding and dilation of this module,
    the second one always has stride 1 and dilation 1.

    :param c_in: num input channels of the first convolution
    :param c_inner: num channels between the two convolutions
    :param c_out: num output channels of the second convolution
    :param has_first_act: whether to put an additional activation function in front of the block
    :return: the operations, wrapped in an nn.Sequential
    """
    padding0 = get_padding(self.padding, self.k_size, self.stride, self.dilation)
    # the second convolution is created with dilation 1 (see below), so its 'same'
    # padding has to be computed for dilation 1 too, otherwise the output grows for dilation > 1
    padding1 = get_padding('same', self.k_size, 1, 1)
    ops = [
        nn.Conv2d(c_in, c_inner, self.k_size, self.stride, padding0, self.dilation, bias=False),
        nn.BatchNorm2d(c_inner, affine=self.bn_affine),
        Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
        nn.Conv2d(c_inner, c_out, self.k_size, 1, padding1, 1, bias=False),
        nn.BatchNorm2d(c_out, affine=self.bn_affine),
    ]
    if has_first_act:
        # the extra activation goes first, so the indices of the other ops shift by one
        ops.insert(0, Register.act_funs.get(self.act_fun)(inplace=self.act_inplace))
    return nn.Sequential(*ops)
def _build(self, s_in: Shape, c_out: int) -> Shape:
    """
    Build the two branches of this layer and return the probed output shape.

    With stride >= 2 a projection branch (depthwise + pointwise conv) is created and both
    branches take all input features; otherwise each branch is sized for half of them.
    The main branch is a stack of three depthwise/pointwise pairs, optionally followed
    by an attention module, wrapped in a DropPathModule.

    :param s_in: input shape
    :param c_out: num output features
    :return: output shape, as returned by self.probe_outputs
    """
    assert not (c_out <= s_in.num_features() and self.stride > 1), "must increase num features when stride is >1"
    assert s_in.num_features() % 4 == 0 and c_out % 2 == 0, \
        "num input features must be divisible by 4, num output features by 2"
    # both paddings are computed for self.dilation, so every depthwise conv below
    # has to be created with the same dilation to keep the spatial sizes consistent
    padding = get_padding(self.padding, self.k_size, self.stride, self.dilation)
    padding2 = get_padding(self.padding, self.k_size, 1, self.dilation)

    if self.stride >= 2:
        c_side = c_main_in = s_in.num_features()
        self.branch_proj = nn.Sequential(*[
            # dw
            nn.Conv2d(c_side, c_side, self.k_size, self.stride, padding,
                      dilation=self.dilation, groups=c_side, bias=False),
            nn.BatchNorm2d(c_side, affine=self.bn_affine),
            # pw
            nn.Conv2d(c_side, c_side, 1, 1, 0, bias=False),
            nn.BatchNorm2d(c_side, affine=self.bn_affine),
            Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
        ])
    else:
        c_side = c_main_in = s_in.num_features() // 2

    # the main branch provides whatever the side branch does not
    c_main_out = c_out - c_side
    c_main_mid = int(c_out // 2 * self.expansion)

    bm = [
        # dw 1
        nn.Conv2d(c_main_in, c_main_in, self.k_size, self.stride, padding,
                  dilation=self.dilation, groups=c_main_in, bias=False),
        nn.BatchNorm2d(c_main_in, affine=self.bn_affine),
        # pw 1
        nn.Conv2d(c_main_in, c_main_mid, 1, 1, 0, bias=False),
        nn.BatchNorm2d(c_main_mid, affine=self.bn_affine),
        Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
        # dw 2
        nn.Conv2d(c_main_mid, c_main_mid, self.k_size, 1, padding2,
                  dilation=self.dilation, groups=c_main_mid, bias=False),
        nn.BatchNorm2d(c_main_mid, affine=self.bn_affine),
        # pw 2
        nn.Conv2d(c_main_mid, c_main_mid, 1, 1, 0, bias=False),
        nn.BatchNorm2d(c_main_mid, affine=self.bn_affine),
        Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
        # dw 3
        nn.Conv2d(c_main_mid, c_main_mid, self.k_size, 1, padding2,
                  dilation=self.dilation, groups=c_main_mid, bias=False),
        nn.BatchNorm2d(c_main_mid, affine=self.bn_affine),
        # pw 3
        nn.Conv2d(c_main_mid, c_main_out, 1, 1, 0, bias=False),
        nn.BatchNorm2d(c_main_out, affine=self.bn_affine),
        Register.act_funs.get(self.act_fun)(inplace=self.act_inplace),
    ]

    # optional attention module
    if isinstance(self.att_dict, dict):
        bm.append(AbstractAttentionModule.module_from_dict(c_main_out, c_substitute=c_main_in,
                                                           att_dict=self.att_dict))

    self.branch_main = DropPathModule(nn.Sequential(*bm))
    return self.probe_outputs(s_in)
def _build(self, s_in: Shape, c_out: int, weight_functions=()) -> Shape:
    """
    Append a pooling op and a 1x1 convolution to the given weight functions
    and hand everything to the parent class' _build.

    :param s_in: input shape
    :param c_out: num output features of the 1x1 convolution
    :param weight_functions: operations that are placed in front of the pooling
    :return: output shape, as returned by the parent class
    """
    pool_padding = get_padding(self.padding, self.k_size, self.stride, 1)
    # everything but 'avg' falls back to max pooling
    if self.pool_type == 'avg':
        pool_cls = nn.AvgPool2d
    else:
        pool_cls = nn.MaxPool2d
    pool_op = pool_cls(self.k_size, self.stride, pool_padding)
    # the 1x1 convolution only changes the number of features
    conv_op = nn.Conv2d(s_in.num_features(), c_out, kernel_size=1, stride=1, padding=0, bias=self.bias)
    all_functions = [*weight_functions, pool_op, conv_op]
    return super()._build(s_in, c_out, weight_functions=all_functions)
def get_conv2d(c_in: int, c_out: int, k_size, stride=1, groups=-1, dilation=1, padding='same') -> nn.Module:
    """
    Create a bias-free 2D convolution, either a regular one or a MixConvModule.

    :param c_in: num input channels
    :param c_out: num output channels
    :param k_size: kernel size, or a tuple/list of kernel sizes;
                   more than one kernel size results in a MixConvModule
    :param stride: stride of the convolution
    :param groups: num groups, -1 uses c_in groups for the regular convolution
    :param dilation: dilation of the convolution
    :param padding: padding specification that is passed on to get_padding, e.g. 'same'
    :return: the convolution module
    """
    # multiple kernel sizes, mix conv
    if isinstance(k_size, (tuple, list)):
        if len(k_size) > 1:
            return MixConvModule(c_in, c_out, k_size=k_size, stride=stride, dilation=dilation, padding=padding,
                                 groups=groups, bias=False, mode='even', divisible=1)
        k_size = k_size[0]
    # one kernel size, regular conv
    padding = get_padding(padding, k_size, stride, dilation)
    groups = c_in if groups == -1 else groups
    # the padding is computed for the given dilation, so the convolution must use it as well
    return nn.Conv2d(c_in, c_out, k_size, stride, padding, dilation=dilation, groups=groups, bias=False)
def _build(self, s_in: Shape, c_out: int, weight_functions=()) -> Shape:
    """
    Append a convolution, configured by this module's attributes, to the given
    weight functions and hand everything to the parent class' _build.

    :param s_in: input shape
    :param c_out: num output features of the convolution
    :param weight_functions: operations that are placed in front of the convolution
    :return: output shape, as returned by the parent class
    """
    conv_kwargs = dict(
        kernel_size=self.k_size,
        stride=self.stride,
        padding=get_padding(self.padding, self.k_size, self.stride, self.dilation),
        dilation=self.dilation,
        groups=get_number(self.groups, s_in.num_features()),
        bias=self.bias,
    )
    conv_op = nn.Conv2d(s_in.num_features(), c_out, **conv_kwargs)
    return super()._build(s_in, c_out, weight_functions=[*weight_functions, conv_op])
def __init__(self, c_in: int, c_out: int, k_size=(3, 5, 7), stride=1, dilation=1, groups=-1, bias=False, padding='same', mode='even', divisible=1):
    """
    Create one convolution per kernel size, each responsible for one split of the channels.

    :param c_in: num input channels, split across the convolutions via get_splits
    :param c_out: num output channels, split across the convolutions via get_splits
    :param k_size: tuple/list of kernel sizes, one convolution is created for each
    :param stride: stride of every convolution
    :param dilation: dilation of every convolution
    :param groups: num groups of every convolution if > 0,
                   otherwise each convolution uses as many groups as it has input channels
    :param bias: whether the convolutions have a bias
    :param padding: padding specification that is passed on to get_padding, e.g. 'same'
    :param mode: passed on to get_splits
    :param divisible: passed on to get_splits
    """
    super().__init__()
    assert isinstance(k_size, (tuple, list))
    assert c_in == c_out or groups == 1
    self.splits_in = get_splits(c_in, len(k_size), mode=mode, divisible=divisible)
    self.splits_out = get_splits(c_out, len(k_size), mode=mode, divisible=divisible)
    groups = [groups] * len(k_size) if groups > 0 else self.splits_in
    # the padding is computed for the given dilation, so each convolution must use it as well
    self.ops = nn.ModuleList([
        nn.Conv2d(si, so, k, stride=stride, padding=get_padding(padding, k, stride, dilation),
                  dilation=dilation, groups=g, bias=bias)
        for k, g, si, so in zip(k_size, groups, self.splits_in, self.splits_out)
    ])
def _build(self, s_in: Shape, c_out: int, weight_functions=()) -> Shape:
    """
    Append a pooling op to the given weight functions
    and hand everything to the parent class' _build.

    :param s_in: input shape
    :param c_out: num output features, passed on to the parent class
    :param weight_functions: operations that are placed in front of the pooling
    :return: output shape, as returned by the parent class
    """
    pool_padding = get_padding(self.padding, self.k_size, self.stride, 1)
    # everything but 'avg' falls back to max pooling
    if self.pool_type == 'avg':
        pool_cls = nn.AvgPool2d
    else:
        pool_cls = nn.MaxPool2d
    pool_op = pool_cls(self.k_size, self.stride, pool_padding)
    return super()._build(s_in, c_out, weight_functions=[*weight_functions, pool_op])
def __init__(self, c_in: int, c_out: int, name: str, strategy_name='default', k_sizes=(3, 5), c_multipliers=(0.5, 1.0), dilation=1, stride=1, padding='same', groups=-1, bias=False):
    """
    A super-kernel: one convolution weight of the largest kernel size and channel count,
    together with fixed masks for every smaller kernel size / channel count.
    Architecture weights for the kernel size and for the channel count are
    requested from the StrategyManager under '<name>/k' and '<name>/c'.

    :param c_in: num input channels
    :param c_out: num output channels
    :param name: name under which to register architecture weights
    :param strategy_name: name of the strategy for architecture weights
    :param k_sizes: kernel sizes, the weight is created for the largest one
    :param c_multipliers: fractions of c_out that can be selected, none may exceed 1.0
    :param dilation: dilation for the kernel
    :param stride: stride for the kernel
    :param padding: 'same' or number
    :param groups: num groups, resolved via get_number with respect to c_out
    :param bias: whether to create a bias parameter
    """
    super().__init__()
    self.name_k = '%s/k' % name
    self.name_c = '%s/c' % name
    self.k_sizes = k_sizes
    self.c_multipliers = c_multipliers
    assert max(c_multipliers) <= 1.0, "Can only reduce max channels, choose a higher c_in/c_out"
    self._stride = stride
    self._dilation = dilation
    self._groups = get_number(groups, c_out)
    assert c_in % self._groups == 0
    largest_k = max(k_sizes)
    channel_choices = [int(c_out * multiplier) for multiplier in sorted(c_multipliers)]

    # arc weights
    # NOTE(review): both calls bind their result to self.ws, so only the second result
    # stays referenced here - confirm that this is intended
    self.ws = StrategyManager().make_weight(strategy_name, self.name_k, only_single_path=True,
                                            num_choices=len(k_sizes))
    self.ws = StrategyManager().make_weight(strategy_name, self.name_c, only_single_path=True,
                                            num_choices=len(channel_choices))

    # conv weight, sized for the largest kernel
    self._padding = get_padding(padding, largest_k, stride, 1)
    weight_shape = (c_out, c_in // self._groups, largest_k, largest_k)
    self.weight = nn.Parameter(torch.Tensor(*weight_shape), requires_grad=True)
    nn.init.kaiming_normal_(self.weight, mode='fan_out')

    # bias
    if not bias:
        self.bias = None
    else:
        self.bias = nn.Parameter(torch.Tensor(c_out))
        nn.init.zeros_(self.bias)

    def channel_mask(num_active: int) -> torch.Tensor:
        # ones for the first num_active output channels, zeros for the rest
        c_mask = torch.ones(size=(c_out, 1, 1, 1), dtype=self.weight.dtype)
        c_mask[num_active:c_out, :, :, :].zero_()
        return c_mask

    def kernel_mask(k: int) -> torch.Tensor:
        # ones in the centered k x k window of the largest kernel, zeros around it
        k_mask = torch.zeros(size=(1, 1, largest_k, largest_k), dtype=self.weight.dtype)
        border = (largest_k - k) // 2
        k_mask[:, :, border:largest_k - border, border:largest_k - border] += 1
        return k_mask

    # mask c, one per channel choice in ascending order
    self.register_buffer('masks_c', torch.stack([channel_mask(cs) for cs in channel_choices], dim=0))
    # mask k, one per kernel size in ascending order
    self.register_buffer('masks_k', torch.stack([kernel_mask(k) for k in sorted(k_sizes)], dim=0))
def __init__(self, c_in: int, c_out: int, k_sizes=(3, 5, 7), c_multipliers=(0.5, 1.0), dilation=1, stride=1, padding='same', groups=1, bias=False):
    """
    A super-kernel: one convolution weight of the largest kernel size and channel count,
    together with two TrainableMask modules (channels, kernel sizes).
    Each TrainableMask receives disjoint masks: the first covers the smallest choice,
    every further one only the part that the next larger choice adds.

    :param c_in: num input channels
    :param c_out: num output channels
    :param k_sizes: kernel sizes, the weight is created for the largest one
    :param c_multipliers: fractions of c_out that can be selected, the largest must be 1.0
    :param dilation: dilation for the kernel
    :param stride: stride for the kernel
    :param padding: 'same' or number
    :param groups: num groups, resolved via get_number with respect to c_out
    :param bias: whether to use a bias
    """
    super().__init__()
    k_sizes = sorted(k_sizes)
    c_multipliers = sorted(c_multipliers)
    largest_k = max(k_sizes)
    assert max(c_multipliers) == 1.0, "Can only reduce max channels, choose a higher c_in/c_out"

    self.c_in = c_in
    self.c_out = c_out
    self.k_sizes = k_sizes
    self.c_multipliers = c_multipliers
    self.c_out_list = [int(multiplier * c_out) for multiplier in c_multipliers]
    self._stride = stride
    self._dilation = dilation
    self._padding = get_padding(padding, largest_k, stride, 1)
    self._groups = get_number(groups, c_out)
    assert c_in % self._groups == 0

    # conv and bias weights
    weight_shape = (c_out, c_in // self._groups, largest_k, largest_k)
    self.weight = nn.Parameter(torch.zeros(*weight_shape), requires_grad=True)
    if bias:
        self.bias = nn.Parameter(torch.zeros(c_out), requires_grad=True)
    else:
        self.bias = None
    nn.init.kaiming_normal_(self.weight, mode='fan_out')

    def only_new_parts(full_masks: list) -> list:
        # remove from each mask everything that the masks before it cover already
        parts = []
        for full_mask in full_masks:
            part = full_mask
            if parts:
                part = full_mask - torch.stack(parts, dim=0).sum(dim=0)
            parts.append(part)
        return parts

    # channel masks
    full_masks_c = []
    for num_active in self.c_out_list:
        c_mask = torch.ones(size=(c_out, 1, 1, 1), dtype=self.weight.dtype)
        c_mask[num_active:c_out, :, :, :].zero_()
        full_masks_c.append(c_mask)
    self.mask_c = TrainableMask(only_new_parts(full_masks_c))

    # kernel masks
    full_masks_k = []
    for k in k_sizes:
        k_mask = torch.zeros(size=(1, 1, largest_k, largest_k), dtype=self.weight.dtype)
        border = (largest_k - k) // 2
        # centered k x k window, the whole kernel if there is no border
        k_mask[:, :, border:largest_k - border, border:largest_k - border] += 1
        full_masks_k.append(k_mask)
    self.mask_k = TrainableMask(only_new_parts(full_masks_k))