def resnet_model(ctx, x, inmaps=64, act=F.relu, test=False):
    # Conv -> BN -> Relu
    with nn.context_scope(ctx):
        with nn.parameter_scope("conv1"):
            h = PF.convolution(x, inmaps, kernel=(3, 3), pad=(1, 1),
                               with_bias=False)
            h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test)
            h = act(h)
        h = res_unit(h, "conv2", act, False)  # -> 32x32
        h = res_unit(h, "conv3", act, True)   # -> 16x16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)
        h = res_unit(h, "conv4", act, False)  # -> 16x16
        h = res_unit(h, "conv5", act, True)   # -> 8x8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)
        h = res_unit(h, "conv6", act, False)  # -> 8x8
        h = res_unit(h, "conv7", act, True)   # -> 4x4
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)
        h = res_unit(h, "conv8", act, False)  # -> 4x4
        h = F.average_pooling(h, kernel=(4, 4))  # -> 1x1
        pred = PF.affine(h, 10)
    return pred
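# A minimal usage sketch for resnet_model, not part of the original snippet:
# it assumes the imports below, the res_unit helper defined elsewhere in this
# collection, and a CIFAR-10-sized (3, 32, 32) input, as the shape comments
# above imply.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
from nnabla.ext_utils import get_extension_context

ctx = get_extension_context('cpu')   # 'cudnn' also works if available
x = nn.Variable((8, 3, 32, 32))      # batch of 8 CIFAR-10-like images
pred = resnet_model(ctx, x, test=True)
x.d = np.random.randn(*x.shape)
pred.forward()
print(pred.d.shape)                  # (8, 10)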
def get_loss(l1, l2, x, t, w_init, b_init, num_words, batch_size,
             state_size, dropout=False, dropout_rate=0.5,
             embed_name='embed', pred_name='pred'):
    e_list = [PF.embed(x_elm, num_words, state_size, name=embed_name)
              for x_elm in F.split(x, axis=1)]
    t_list = F.split(t, axis=1)
    loss = 0
    for i, (e_t, t_t) in enumerate(zip(e_list, t_list)):
        if dropout:
            h1 = l1(F.dropout(e_t, dropout_rate), w_init, b_init)
            h2 = l2(F.dropout(h1, dropout_rate), w_init, b_init)
            y = PF.affine(F.dropout(h2, dropout_rate),
                          num_words, name=pred_name)
        else:
            h1 = l1(e_t, w_init, b_init)
            h2 = l2(h1, w_init, b_init)
            y = PF.affine(h2, num_words, name=pred_name)
        t_t = F.reshape(t_t, [batch_size, 1])
        loss += F.mean(F.softmax_cross_entropy(y, t_t))
    loss /= float(i + 1)
    return loss
def cnn_model_003(ctx, x, act=F.relu, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 32 -> 16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 16 -> 8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)

        # Convblock 3
        h = F.average_pooling(h, (6, 6))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        h = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))
        return h
def cnn_model_003(ctx, x, act=F.relu, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 32 -> 16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 16 -> 8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
        return h
def construct_networks(args, images, model, num_class, test):
    try:
        pooled = model(images, force_global_pooling=1,
                       use_up_to="pool", training=not test)
    except Exception:  # some base models do not accept force_global_pooling
        pooled = model(images, use_up_to="pool", training=not test)

    with nn.parameter_scope("finetuning"):
        if args.model == "VGG":
            pooled = F.relu(pooled)
            with nn.parameter_scope("additional_fc_1"):
                pooled = PF.affine(pooled, 4096)
            pooled = F.relu(pooled)
            if not test:
                pooled = F.dropout(pooled, 0.5)
            with nn.parameter_scope("additional_fc_2"):
                pooled = PF.affine(pooled, 4096)
            pooled = F.relu(pooled)
            if not test:
                pooled = F.dropout(pooled, 0.5)
        with nn.parameter_scope("last_fc"):
            pred = PF.affine(pooled, num_class)
    return pred
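# A minimal usage sketch, not from the original: construct_networks seems to
# expect one of nnabla's pretrained ImageNet models and an argparse-style
# namespace carrying a `model` name. SimpleNamespace stands in for argparse
# here; ResNet18 downloads pretrained weights on first use.
from types import SimpleNamespace
import nnabla as nn
from nnabla.models.imagenet import ResNet18

args = SimpleNamespace(model="ResNet18")  # hypothetical stand-in for argparse
model = ResNet18()
images = nn.Variable((4,) + model.input_shape)
pred = construct_networks(args, images, model, num_class=10, test=False)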
def cnn_model_003(ctx, x, act=F.relu, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 28 -> 14
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 14 -> 7
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 7 -> 5
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)

        # Convblock 3
        h = F.average_pooling(h, (5, 5))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        h = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))
        return h
def cnn_ae_model_000(ctx, x, act=F.relu, test=False):
    with nn.parameter_scope("ae"):
        with nn.context_scope(ctx):
            # Convblock0
            h = conv_unit(x, "conv00", 32, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv01", 32, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv02", 32, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv03", 32, k=4, s=2, p=1, act=act, test=test)  # 32 -> 16
            if not test:
                h = F.dropout(h)

            # Convblock 1
            h = conv_unit(h, "conv10", 64, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv11", 64, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv12", 64, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv13", 64, k=4, s=2, p=1, act=act, test=test)  # 16 -> 8
            if not test:
                h = F.dropout(h)

            # Deconvblock0
            h = deconv_unit(h, "deconv00", 64, k=4, s=2, p=1, act=act, test=test)  # 8 -> 16
            h = deconv_unit(h, "deconv01", 64, k=3, s=1, p=1, act=act, test=test)
            h = deconv_unit(h, "deconv02", 64, k=3, s=1, p=1, act=act, test=test)
            h = deconv_unit(h, "deconv03", 64, k=3, s=1, p=1, act=act, test=test)

            # Deconvblock 1
            h = deconv_unit(h, "deconv10", 32, k=4, s=2, p=1, act=act, test=test)  # 16 -> 32
            h = deconv_unit(h, "deconv11", 32, k=3, s=1, p=1, act=act, test=test)
            h = deconv_unit(h, "deconv12", 32, k=3, s=1, p=1, act=act, test=test)
            h = deconv_unit(h, "deconv13", 3, k=3, s=1, p=1, act=None, test=test)
    return h
def cnn_model_003(ctx, x, act=F.elu, do=True, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 32 -> 16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 16 -> 8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        h_branch = h

        # Convblock 3
        h = conv_unit(h_branch, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
        h = F.average_pooling(h, (6, 6))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        pred = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))

        # Uncertainty
        u0 = conv_unit(h_branch, "u0", 10, k=1, s=1, p=0, act=act, test=test)
        u0 = F.average_pooling(u0, (6, 6))
        with nn.parameter_scope("u0bn"):
            u0 = PF.batch_normalization(u0, batch_stat=not test)
        log_var = F.reshape(u0, (u0.shape[0], np.prod(u0.shape[1:])))

        # Uncertainty for uncertainty
        u1 = conv_unit(h_branch, "u1", 10, k=1, s=1, p=0, act=act, test=test)
        u1 = F.average_pooling(u1, (6, 6))
        with nn.parameter_scope("u1bn"):
            u1 = PF.batch_normalization(u1, batch_stat=not test)
        log_s = F.reshape(u1, (u1.shape[0], np.prod(u1.shape[1:])))

        return pred, log_var, log_s
def discriminator(x, maxh=256, test=False, output_hidden=False):
    """
    Building discriminator network which maps a (B, 1, 56, 56) input to
    a (B, 1) output. (The docstring originally said 28x28; the shape
    comments below indicate a 56x56 input.)
    """
    # Define shortcut functions
    def bn(xx):
        # Batch normalization
        return PF.batch_normalization(xx, batch_stat=not test)

    def downsample2(xx, c):
        return PF.convolution(xx, c, (3, 3), pad=(1, 1), stride=(2, 2),
                              with_bias=False)

    # Integer division so map counts stay ints under Python 3.
    assert maxh // 8 > 0
    with nn.parameter_scope("dis"):
        # (1, 56, 56) --> (32, 28, 28)
        with nn.parameter_scope("conv0"):
            c0 = F.elu(bn(downsample2(x, maxh // 8)))
            if not test:
                c0 = F.dropout(c0, 0.2)
        # (32, 28, 28) --> (32, 16, 16)
        with nn.parameter_scope("conv1"):
            c1 = F.elu(bn(PF.convolution(c0, maxh // 8, (3, 3), pad=(3, 3),
                                         stride=(2, 2), with_bias=False)))
            if not test:
                c1 = F.dropout(c1, 0.2)
        # (32, 16, 16) --> (64, 8, 8)
        with nn.parameter_scope("conv2"):
            c2 = F.elu(bn(downsample2(c1, maxh // 4)))
        # (64, 8, 8) --> (128, 4, 4)
        with nn.parameter_scope("conv3"):
            c3 = F.elu(bn(downsample2(c2, maxh // 2)))
        # (128, 4, 4) --> (256, 4, 4)
        with nn.parameter_scope("conv4"):
            c4 = bn(PF.convolution(c3, maxh, (3, 3), pad=(1, 1),
                                   with_bias=False))
        # (256, 4, 4) --> (1,)
        with nn.parameter_scope("fc1"):
            f = PF.affine(c4, 1)
    if output_hidden:
        return f, [c1, c2, c3, c4]
    return f
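# A minimal usage sketch, not from the original: feed a batch of 56x56
# single-channel images, matching the shape comments above.
import numpy as np
import nnabla as nn

x = nn.Variable((16, 1, 56, 56))
f = discriminator(x, maxh=256, test=True)
x.d = np.random.randn(*x.shape)
f.forward()
print(f.d.shape)  # (16, 1)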
def cnn_model_003(ctx, h, act=F.elu, do=True, test=False):
    with nn.context_scope(ctx):
        if not test:
            # Unpack without rebinding the same name twice (original used
            # `b, c, s, s = h.shape`, which only works for square inputs).
            b, c, sh, sw = h.shape
            h = F.image_augmentation(h, (c, sh, sw),
                                     min_scale=1.0, max_scale=1.5,
                                     angle=0.5, aspect_ratio=1.3,
                                     distortion=0.2, flip_lr=True)

        # Convblock0
        h = conv_unit(h, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 32 -> 16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 16 -> 8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        u = h

        # Convblock 3
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
        h = F.average_pooling(h, (6, 6))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        pred = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))

        # Uncertainty
        u = conv_unit(u, "u0", 10, k=1, s=1, p=0, act=act, test=test)
        u = F.average_pooling(u, (6, 6))
        with nn.parameter_scope("u0bn"):
            u = PF.batch_normalization(u, batch_stat=not test)
        log_var = F.reshape(u, (u.shape[0], np.prod(u.shape[1:])))

        return pred, log_var
def network(x, y_index, test=False):
    # Input -> 3,64,64
    # Convolution -> 16,31,31
    with nn.parameter_scope('Convolution'):
        h = PF.convolution(x, 16, (3, 3), (0, 0), (2, 2))
    # Tanh
    h = F.tanh(h)
    # MaxPooling -> 16,16,11
    h = F.max_pooling(h, (2, 3), (2, 3))
    # Dropout
    if not test:
        h = F.dropout(h)
    # Convolution_2 -> 32,6,5
    with nn.parameter_scope('Convolution_2'):
        h = PF.convolution(h, 32, (5, 3), (0, 0), (2, 2))
    # ReLU_4
    h = F.relu(h, True)
    # MaxPooling_2 -> 32,3,3
    h = F.max_pooling(h, (2, 2), (2, 2))
    # Dropout_2
    if not test:
        h = F.dropout(h)
    # Convolution_3 -> 64,1,1
    with nn.parameter_scope('Convolution_3'):
        h = PF.convolution(h, 64, (3, 3), (0, 0), (2, 2))
    # Tanh_2
    h = F.tanh(h)
    # Dropout_3
    if not test:
        h = F.dropout(h)
    # Affine -> 50
    with nn.parameter_scope('Affine'):
        h = PF.affine(h, (50,))
    # ReLU_2
    h = F.relu(h, True)
    # Dropout_4
    if not test:
        h = F.dropout(h)
    # Affine_2 -> 5
    with nn.parameter_scope('Affine_2'):
        h = PF.affine(h, (5,))
    # ELU
    h = F.elu(h)
    # Affine_3 -> 1
    with nn.parameter_scope('Affine_3'):
        h = PF.affine(h, (1,))
    # SquaredError
    # h = F.squared_error(h, y_index)
    return h
def cnn(batch_size, vocab_size, text_len, classes, features=128, train=True):
    text = nn.Variable([batch_size, text_len])
    with nn.parameter_scope("text_embed"):
        embed = PF.embed(text, n_inputs=vocab_size, n_features=features)
    print("embed", embed.shape)
    embed = F.reshape(embed, (batch_size, 1, text_len, features))
    print("embed", embed.shape)

    combined = None
    for n in range(2, 6):  # 2 - 5 gram
        with nn.parameter_scope(str(n) + "_gram"):
            with nn.parameter_scope("conv"):
                conv = PF.convolution(embed, 128, kernel=(n, features))
                conv = F.relu(conv)
            with nn.parameter_scope("pool"):
                pool = F.max_pooling(conv, kernel=(conv.shape[2], 1))
                # `if not combined` is unreliable on an nn.Variable;
                # test against None explicitly.
                if combined is None:
                    combined = F.identity(pool)
                else:
                    combined = F.concatenate(combined, pool)

    if train:
        combined = F.dropout(combined, 0.5)

    with nn.parameter_scope("output"):
        y = PF.affine(combined, classes)

    t = nn.Variable([batch_size, 1])
    _loss = F.softmax_cross_entropy(y, t)
    loss = F.reduce_mean(_loss)
    return text, y, loss, t
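# A minimal usage sketch with illustrative sizes, not from the original:
# build the n-gram text CNN, then run one forward pass over random word
# indices and labels.
import numpy as np
import nnabla as nn

text, y, loss, t = cnn(batch_size=32, vocab_size=10000, text_len=50,
                       classes=4, train=True)
text.d = np.random.randint(0, 10000, size=text.shape)
t.d = np.random.randint(0, 4, size=t.shape)
loss.forward()
print(loss.d)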
def __call__(self, x):
    # First conv
    h = self.conv_bn_relu6(x, int(self.init_maps * self.depth_mul),
                           stride=(2, 2), name="first-conv")

    # Inverted residual blocks
    for i, elm in enumerate(self.settings):
        t, c, n, s = elm
        # TODO: where to multiply depth_mul
        c = round(c * self.depth_mul)
        mbconv_s = partial(self.inverted_residual,
                           maps=c, stride=(s, s), ef=t)
        mbconv_1 = partial(self.inverted_residual,
                           maps=c, stride=(1, 1), ef=t)
        for j in range(n):
            name = "mbconv-{:02d}-{:02d}".format(i, j)
            h = mbconv_s(h, name=name) if j == 0 else mbconv_1(h, name=name)

    # Last conv
    h = self.conv_bn_relu6(h, int(1280 * self.depth_mul),
                           kernel=(1, 1), name="last-conv")

    # Classifier
    if not self.test:
        h = F.dropout(h, 0.2)
    pool_shape = get_spatial_shape(x.shape, self.channel_last)
    h = F.average_pooling(h, pool_shape, channel_last=self.channel_last)
    h = PF.affine(h, self.num_classes,
                  w_init=I.NormalInitializer(0.01), name="linear")
    return h, {}
def __call__(self, x):
    # First conv
    h = self.conv_bn_act(x, int(self.maps0 * self.depth_mul),
                         stride=(2, 2), act="hswish", name="first-conv")

    # Inverted residual blocks
    for i, elm in enumerate(self.settings):
        maps, kernel, stride, ef, act, se = elm
        maps = round(maps * self.depth_mul)
        name = "mbconv-{:03d}".format(i)
        h = self.inverted_residual(h, maps, kernel, stride, ef, act, se,
                                   name=name)

    # Conv -> Avepool -> Conv
    h = self.conv_bn_act(h, int(self.maps1 * self.depth_mul), (1, 1),
                         act="hswish", name="last-conv-1")
    pool_shape = get_spatial_shape(x.shape, self.channel_last)
    h = F.average_pooling(h, pool_shape, channel_last=self.channel_last)
    h = self.conv_act(h, int(self.maps2 * self.depth_mul), (1, 1),
                      act="hswish", name="last-conv-2")

    # Classifier
    if not self.test:
        h = F.dropout(h, 0.2)
    h = PF.affine(h, self.num_classes,
                  w_init=I.NormalInitializer(0.01), name="linear")
    return h, {}
def wrapper(x, *args, **kwargs):
    residual = x
    h = layer_normalization(x)
    h = layer(h, *args, **kwargs)
    if kwargs['train']:
        h = F.dropout(h, p=kwargs['dropout_ratio'])
    return residual + h
def transformer(train=True, dropout_ratio=0.1):  # fixed "droput_ratio" typo
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    with nn.parameter_scope('embedding_layer'):
        # h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
        h = token_embedding(x, vocab_size, embedding_size)
    h = position_encoding(h)

    if train:
        h = F.dropout(h, p=dropout_ratio)

    for i in range(hopping_num):
        with nn.parameter_scope(f'encoder_hopping_{i}'):
            h = residual_normalization_wrapper(multihead_self_attention)(
                h, head_num, mask=mask, train=train,
                dropout_ratio=dropout_ratio)
            h = residual_normalization_wrapper(positionwise_feed_forward)(
                h, train=train, dropout_ratio=dropout_ratio)

    with nn.parameter_scope('output_layer'):
        y = F.sigmoid(PF.affine(h[:, 0, :], 1))

    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t))
    return x, y, t, accuracy, loss
def csc(x, scope_name, dn=False, test=False, p=0.5):
    # `test` and `p` were free variables in the original snippet; they are
    # taken as parameters here so the function is self-contained.
    C = x.shape[1]
    h = x
    with nn.parameter_scope(scope_name):
        with nn.parameter_scope("conv1"):
            h = PF.batch_normalization(h, batch_stat=not test)
            h = F.relu(h, True)
            h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0),
                               with_bias=False)
        with nn.parameter_scope("shift"):  # no meaning but semantics
            h = shift(h)
        with nn.parameter_scope("conv2"):
            h = PF.batch_normalization(h, batch_stat=not test)
            h = F.relu(h, True)
            stride = (2, 2) if dn else (1, 1)
            if p > 0:
                h = F.dropout(h, p=p) if not test else h
            h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0),
                               stride=stride, with_bias=False)
    s = F.average_pooling(x, (2, 2)) if dn else x
    return h + s
def bert_embed(input_ids, token_type_ids=None, position_ids=None,
               vocab_size=30522, embed_dim=768, num_pos_ids=512,
               dropout_prob=0.1, test=True):
    """Construct the embeddings from word, position and token type."""
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    if position_ids is None:
        position_ids = F.arange(0, seq_len)
        position_ids = F.broadcast(
            F.reshape(position_ids, (1,) + position_ids.shape),
            (batch_size,) + position_ids.shape)
    if token_type_ids is None:
        token_type_ids = F.constant(val=0, shape=(batch_size, seq_len))
    embeddings = PF.embed(input_ids, vocab_size, embed_dim,
                          name='word_embeddings')
    position_embeddings = PF.embed(position_ids, num_pos_ids, embed_dim,
                                   name='position_embeddings')
    token_type_embeddings = PF.embed(token_type_ids, 2, embed_dim,
                                     name='token_type_embeddings')

    embeddings += position_embeddings
    embeddings += token_type_embeddings
    embeddings = PF.layer_normalization(embeddings, batch_axis=(0, 1),
                                        eps=1e-12, name='embed')

    if dropout_prob > 0.0 and not test:
        embeddings = F.dropout(embeddings, dropout_prob)

    return embeddings
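# A minimal usage sketch with illustrative shapes, not from the original:
# embed a batch of token-id sequences; the word/position/token-type tables
# are created under the current parameter scope on first call.
import numpy as np
import nnabla as nn

input_ids = nn.Variable((2, 16))  # (batch, seq_len)
emb = bert_embed(input_ids, test=True)
input_ids.d = np.random.randint(0, 30522, size=input_ids.shape)
emb.forward()
print(emb.d.shape)  # (2, 16, 768)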
def res_unit(x, scope_name, act=F.relu, dn=False, test=False): C = x.shape[1] with nn.parameter_scope(scope_name): # Conv -> BN -> Relu with nn.parameter_scope("conv1"): h = PF.convolution(x, C/2, kernel=(1, 1), pad=(0, 0), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = act(h) # Conv -> BN -> Relu with nn.parameter_scope("conv2"): h = PF.convolution(h, C/2, kernel=(3, 3), pad=(1, 1), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = act(h) # Conv -> BN with nn.parameter_scope("conv3"): h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) # Residual -> Relu if not test: h = F.dropout(h) with nn.parameter_scope(scope_name): h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = F.add2(h, x) h = act(h) # Maxpooling if dn: h = F.max_pooling(h, kernel=(2, 2), stride=(2, 2)) return h
def build_self_attention_model(train=True):
    x = nn.Variable((batch_size, max_len))
    t = nn.Variable((batch_size, 1))
    mask = get_mask(x)
    attention_mask = (F.constant(1, shape=mask.shape) - mask) \
        * F.constant(np.finfo(np.float32).min, shape=mask.shape)
    with nn.parameter_scope('embedding'):
        h = time_distributed(PF.embed)(x, vocab_size, embedding_size) * mask
    with nn.parameter_scope('forward'):
        h_f = lstm(h, hidden_size, mask=mask,
                   return_sequences=True, return_state=False)
    with nn.parameter_scope('backward'):
        h_b = lstm(h[:, ::-1, ], hidden_size, mask=mask,
                   return_sequences=True, return_state=False)[:, ::-1, ]
    h = F.concatenate(h_f, h_b, axis=2)
    if train:
        h = F.dropout(h, p=dropout_ratio)
    with nn.parameter_scope('da'):
        a = F.tanh(time_distributed(PF.affine)(h, da))
        if train:
            a = F.dropout(a, p=dropout_ratio)
    with nn.parameter_scope('r'):
        a = time_distributed(PF.affine)(a, r)
        if train:
            a = F.dropout(a, p=dropout_ratio)
    a = F.softmax(a + attention_mask, axis=1)
    m = F.batch_matmul(a, h, transpose_a=True)
    with nn.parameter_scope('output_mlp'):
        output = F.relu(PF.affine(m, output_mlp_size))
        if train:
            output = F.dropout(output, p=dropout_ratio)
    with nn.parameter_scope('output'):
        y = F.sigmoid(PF.affine(output, 1))
    accuracy = F.mean(F.equal(F.round(y), t))
    loss = F.mean(F.binary_cross_entropy(y, t)) \
        + attention_penalty_coef * frobenius(
            F.batch_matmul(a, a, transpose_a=True) - batch_eye(batch_size, r))
    return x, t, accuracy, loss
def test_dropout_forward_backward(p, seed, ctx, func_name):
    from nbla_test_utils import cap_ignore_region
    # Note: each backward execution requires a forward execution in NNabla.
    with nn.context_scope(ctx):
        # Create inputs
        rng = np.random.RandomState(seed)
        inputs = [
            cap_ignore_region(
                rng.randn(2, 3, 4).astype(np.float32) * 2,
                (-1e-3, 1e-3))]  # Ensure there is no zero.
        x = nn.Variable(inputs[0].shape, need_grad=True)
        x.d = inputs[0]
        init_dx = rng.randn(*x.shape).astype(x.data.dtype)
        init_dy = rng.randn(*x.shape).astype(x.data.dtype)

        # Construct graph
        y = F.dropout(x, p)

        # Reference parameter
        scale = 1. / (1. - p)

        # Test forward
        y.forward(clear_buffer=True)
        mask = (y.d != 0)
        ref_y = x.d * mask * scale
        assert_allclose(y.d, ref_y)
        assert y.parent.name == func_name

        # Test backward
        x.g[...] = init_dx
        y.backward(init_dy, clear_buffer=True)
        ref_dx = init_dy * mask * scale
        assert_allclose(x.g, init_dx + ref_dx)

        # Test accumulation
        y.forward(clear_no_need_grad=True)
        mask = (y.d != 0)
        x.g[...] = 1
        y.g = init_dy
        y.parent.backward([x], [y], [False])
        ref_dx = init_dy * mask * scale
        assert_allclose(x.g, ref_dx)

        # Test accum=False with NaN gradient
        y.forward(clear_no_need_grad=True)
        x.g = np.float32('nan')
        y.parent.backward([x], [y], [False])
        assert not np.any(np.isnan(x.g))

        # Test need_grad
        y.forward(clear_no_need_grad=True)
        x.g[...] = 0
        x.need_grad = False
        y.backward(init_dy)
        assert np.all(x.g == 0)
def network_LSTM(x, D, C, InputShape, HiddenSize, test=False):
    # Input_2:x -> 687
    # Delya_in:D -> 100
    # Cell_in:C -> 100

    # Concatenate -> 787
    h = F.concatenate(D, x, axis=1)
    # Affine -> 100
    h1 = PF.affine(h, HiddenSize, name='Affine')
    # InputGate -> 100
    h2 = PF.affine(h, HiddenSize, name='InputGate')
    # OutputGate -> 100
    h3 = PF.affine(h, HiddenSize, name='OutputGate')
    # ForgetGate -> 100
    h4 = PF.affine(h, HiddenSize, name='ForgetGate')
    # Sigmoid
    h1 = F.sigmoid(h1)
    # Sigmoid_2
    h2 = F.sigmoid(h2)
    # Sigmoid_3
    h3 = F.sigmoid(h3)
    # Sigmoid_4
    h4 = F.sigmoid(h4)
    # Mul2 -> 100
    h1 = F.mul2(h1, h2)
    # Mul2_3 -> 100
    h4 = F.mul2(h4, C)
    # Add2 -> 100
    h1 = F.add2(h1, h4, True)
    # Tanh
    h5 = F.tanh(h1)
    # Cell_out
    h6 = F.identity(h1)
    # Mul2_2 -> 100
    h5 = F.mul2(h5, h3)
    # Dropout
    if not test:
        h5 = F.dropout(h5)
    # Output
    h5 = F.identity(h5)
    # Concatenate_2 -> 200
    h5 = F.concatenate(h5, h6, axis=1)
    return h5
def cnn_model_003(ctx, x, act=F.relu, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        # Learned attention multiplication
        h = one_by_one_conv(h, "attend0")
        h = F.max_pooling(h, (2, 2))  # 32 -> 16
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        # Learned attention multiplication
        h = one_by_one_conv(h, "attend1")
        h = F.max_pooling(h, (2, 2))  # 16 -> 8
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
        # Learned attention multiplication
        h = one_by_one_conv(h, "attend2")

        # Convblock 3
        h = F.average_pooling(h, (6, 6))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        h = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))
        return h
def cnn_model_003(ctx, x, act=F.elu, do=True, test=False):
    with nn.context_scope(ctx):
        # Convblock0
        h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 28 -> 14
        with nn.parameter_scope("bn0"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 1
        h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
        h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
        h = F.max_pooling(h, (2, 2))  # 14 -> 7
        with nn.parameter_scope("bn1"):
            h = PF.batch_normalization(h, batch_stat=not test)
        if not test and do:
            h = F.dropout(h)

        # Convblock 2
        h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 7 -> 5
        h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
        h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
        u = h

        # Convblock 3
        h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
        h = F.average_pooling(h, (5, 5))
        with nn.parameter_scope("bn2"):
            h = PF.batch_normalization(h, batch_stat=not test)
        pred = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))

        # Uncertainty
        u = conv_unit(u, "u0", 10, k=1, s=1, p=0, act=act, test=test)
        u = F.average_pooling(u, (5, 5))
        with nn.parameter_scope("u0bn"):
            u = PF.batch_normalization(u, batch_stat=not test)
        log_var = F.reshape(u, (u.shape[0], np.prod(u.shape[1:])))

        return pred, log_var
def call(self, inputs):
    r"""Encoder layer.

    Args:
        inputs (nn.Variable): An input variable of shape (B, T) indicating
            indices of character embeddings.

    Returns:
        nn.Variable: Output variable of shape (T, B, C).
    """
    hp = self._hparams
    with nn.parameter_scope('embeddings'):
        val = np.sqrt(6.0 / (len(hp.vocab) + hp.symbols_embedding_dim))
        inputs = PF.embed(
            inputs, n_inputs=len(hp.vocab),
            n_features=hp.symbols_embedding_dim,
            initializer=UniformInitializer(lim=(-val, val)))  # (B, T, C=512)

    with nn.parameter_scope('ngrams'):
        out = inputs
        for i in range(hp.encoder_n_convolutions):
            with nn.parameter_scope(f'filter_{i}'):
                out = conv_norm(out, out_channels=hp.encoder_embedding_dim,
                                kernel_size=hp.encoder_kernel_size,
                                padding=(hp.encoder_kernel_size - 1) // 2,
                                bias=False, stride=1, dilation=1,
                                w_init_gain='relu', scope='conv_norm',
                                channel_last=True)
                out = PF.batch_normalization(out, batch_stat=self.training,
                                             axes=[2])
                out = F.relu(out)
                if self.training:
                    out = F.dropout(out, 0.5)

    with nn.parameter_scope('lstm_encoder'):
        # (B, T, C=512) --> (T, B, C=512)
        out = F.transpose(out, (1, 0, 2))
        h = F.constant(shape=(2, 2, hp.batch_size,
                              hp.encoder_embedding_dim // 2))
        c = F.constant(shape=(2, 2, hp.batch_size,
                              hp.encoder_embedding_dim // 2))
        out, _, _ = PF.lstm(out, h, c, training=self.training,
                            bidirectional=True)

    return out  # (T, B, C=512)
def positionwise_feed_forward(x, train: bool = True,
                              dropout_ratio: float = 0.1):
    batch_size, length, dim = x.shape
    with nn.parameter_scope('pff'):
        with nn.parameter_scope('w1'):
            h = F.relu(time_distributed(PF.affine)(x, dim * 4))
        if train:
            h = F.dropout(h, p=dropout_ratio)
        with nn.parameter_scope('w2'):
            h = time_distributed(PF.affine)(h, dim)
    return h
def clf_resnet50(layer, n_classes=1, train=True):
    """
    This function uses ResNet-50 pretrained on ImageNet as the base
    architecture and replaces the linear layer in ResNet with two linear
    layers with a hidden layer of size 2,048. Dropout and ReLU are applied
    between these layers.
    """
    layer_1 = F.relu(PF.affine(layer, 2048, name='classifier_1'))
    if train:
        layer_1 = F.dropout(layer_1, 0.5)
    out = PF.affine(layer_1, n_classes, name='classifier_2')
    return out
def _scaled_dot_product_attention(q, k, v, attn_mask, dropout):
    B, Nt, E = q.shape
    q *= float(E) ** -0.5
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = F.batch_matmul(q, k, transpose_b=True)
    if attn_mask is not None:
        attn += attn_mask
    attn_output_weights = F.softmax(attn, axis=len(attn.shape) - 1)
    if dropout > 0.0:
        # The original applied dropout to the pre-softmax logits and then
        # discarded the result; dropout belongs on the softmaxed weights.
        attn_output_weights = F.dropout(attn_output_weights, p=dropout)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    attn_output = F.batch_matmul(attn_output_weights, v)
    return attn_output, attn_output_weights
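# A minimal usage sketch with illustrative shapes, not from the original:
# attend a batch of 4 queries over 6 keys/values of width 8, with no mask
# and no dropout.
import numpy as np
import nnabla as nn
import nnabla.functions as F

q = nn.Variable.from_numpy_array(np.random.randn(2, 4, 8).astype(np.float32))
k = nn.Variable.from_numpy_array(np.random.randn(2, 6, 8).astype(np.float32))
v = nn.Variable.from_numpy_array(np.random.randn(2, 6, 8).astype(np.float32))
out, w = _scaled_dot_product_attention(q, k, v, attn_mask=None, dropout=0.0)
F.sink(out, w).forward()
print(out.shape, w.shape)  # (2, 4, 8) (2, 4, 6)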
def out_layers(self, h, emb):
    if self.scale_shift_norm:
        scale, shift = chunk(emb, num_chunk=2, axis=1)
        h = normalize(h, name="norm_out") * (scale + 1) + shift
    else:
        h += emb
        h = normalize(h, name="norm_out")
    h = nonlinearity(h)
    if self.dropout > 0:
        h = F.dropout(h, p=self.dropout)
    h = conv(h, self.out_channels, name="conv_out", zeroing_w=True)
    return h
def __call__(self, x):
    depth_coef = self.net_setting["depth_coef"]
    width_coef = self.net_setting["width_coef"]
    resolution = self.net_setting["resolution"]
    p = self.net_setting["p"]
    assert get_spatial_shape(x.shape, self.channel_last) == [resolution, resolution], \
        "(x.shape = {}, resolution = {})".format(x.shape, resolution)

    # First conv
    maps = self.round_filters(32, width_coef)
    h = self.conv_bn(x, maps, stride=(2, 2), name="first-conv")

    # Inverted residual blocks
    for i, elm in enumerate(self.mbc_settings):
        t, c, k, n, s = elm
        c = self.round_filters(c, width_coef)
        n = int(np.ceil(n * depth_coef))
        mbconv_s = partial(self.inverted_residual,
                           maps=c, kernel=(k, k), stride=(s, s), ef=t)
        mbconv_1 = partial(self.inverted_residual,
                           maps=c, kernel=(k, k), stride=(1, 1), ef=t)
        for j in range(n):
            name = "mbconv-{:02d}-{:02d}".format(i, j)
            h = mbconv_s(h, name=name) if j == 0 else mbconv_1(h, name=name)

    # Last conv
    maps = self.round_filters(1280, width_coef)
    h = self.conv_bn_swish(h, maps, kernel=(1, 1), name="last-conv")

    # Classifier
    if not self.test:
        h = F.dropout(h, p)
    pool_shape = get_spatial_shape(x.shape, self.channel_last)
    h = F.average_pooling(h, pool_shape, channel_last=self.channel_last)
    h = PF.affine(h, self.num_classes,
                  w_init=I.NormalInitializer(0.01), name="linear")
    return h, {}
def res_unit_default(x, scope, bn_idx, test):
    # BatchNorm is independent from parameter sharing
    C = x.shape[1]
    with nn.parameter_scope(scope):
        with nn.parameter_scope('conv1'):
            with nn.parameter_scope('bn_{}-a'.format(bn_idx)):
                h = PF.batch_normalization(x, batch_stat=not test)
            h = F.relu(h)
            h = PF.convolution(h, C, (3, 3), pad=(1, 1), with_bias=False)
        with nn.parameter_scope('bn_{}-b'.format(bn_idx)):
            h = PF.batch_normalization(h, batch_stat=not test)
        h = F.relu(h)
        if not test:
            h = F.dropout(h, 0.25)
        with nn.parameter_scope('conv2'):
            h = PF.convolution(h, C, (3, 3), pad=(1, 1), with_bias=False)
    return x + h
def res_unit(x, scope_name, act=F.relu, dn=False, test=False): C = x.shape[1] with nn.parameter_scope(scope_name): # Conv -> BN -> Relu with nn.parameter_scope("conv1"): h = PF.convolution(x, C / 2, kernel=(1, 1), pad=(0, 0), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = act(h) # Conv -> BN -> Relu with nn.parameter_scope("conv2"): h = PF.convolution(h, C / 2, kernel=(3, 3), pad=(1, 1), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = act(h) # Conv -> BN with nn.parameter_scope("conv3"): h = PF.convolution(h, C, kernel=(1, 1), pad=(0, 0), with_bias=False) h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) # Residual -> Relu if not test: h = F.dropout(h) with nn.parameter_scope(scope_name): h = PF.batch_normalization(h, decay_rate=0.9, batch_stat=not test) h = F.add2(h, x) h = act(h) # Maxpooling if dn: h = F.max_pooling(h, kernel=(2, 2), stride=(2, 2)) return h
def test_dropout_forward_backward(p, seed, ctx, func_name):
    from nbla_test_utils import cap_ignore_region, function_tester
    rng = np.random.RandomState(seed)
    inputs = [
        cap_ignore_region(
            rng.randn(2, 3, 4).astype(np.float32) * 2,
            (-1e-3, 1e-3))]  # Ensure there is no zero.
    i = nn.Variable(inputs[0].shape, need_grad=True)
    i.d = inputs[0]
    # NNabla forward
    with nn.context_scope(ctx), nn.auto_forward():
        o = F.dropout(i, p)
    scale = 1. / (1. - p)
    mask = o.d != 0
    assert np.allclose(o.d, i.d * mask * scale)
    assert o.parent.name == func_name

    # NNabla backward
    orig_grad = rng.randn(*i.shape).astype(i.data.dtype)
    i.g[...] = orig_grad
    o_grad = rng.randn(*i.shape).astype(i.data.dtype)
    o.backward(o_grad)
    ref_grad = o_grad * mask * scale

    # Verify
    assert np.allclose(i.g, orig_grad + ref_grad)

    # Check if accum option works
    i.g[...] = 1
    o.g = o_grad
    o.parent.backward([i], [o], [False])
    assert np.allclose(i.g, ref_grad)

    # Check accum=False with NaN gradient
    i.g = np.float32('nan')
    o.parent.backward([i], [o], [False])
    assert not np.any(np.isnan(i.g))

    # Check if need_grad works
    i.g[...] = 0
    i.need_grad = False
    o.backward(o_grad)
    assert np.all(i.g == 0)
def test_clearing_without_recompute_flag(self):
    x0 = nn.Variable((1, 128, 128), need_grad=True)
    x1 = F.sin(x0).apply(recompute=True)
    x2 = F.dropout(x1)
    x3 = F.sin(x2).apply(recompute=True)
    x4 = F.sin(x3).apply(recompute=True)
    y = F.identity(x4)

    # Skip this code temporarily since it causes random crashes when
    # performing CI testing on Windows 10 with nnabla-cuda-ext.
    pytest.skip('Skipped for randomly crash when perform CI testing on '
                'windows 10 with nnabla-cuda-ext')

    y.forward(clear_no_need_grad=True)
    x2.data.clear()
    with pytest.raises(RuntimeError, match="Failed `called_setup_recompute_`"):
        # x2.data cannot be recomputed correctly since `setup_recompute` is
        # not called during forward propagation. Backward should raise when
        # some intermediate variables are cleared by the user.
        y.backward()
def test_dropout_forward_backward(p, seed, ctx, func_name):
    from nbla_test_utils import cap_ignore_region, function_tester
    rng = np.random.RandomState(seed)
    inputs = [
        cap_ignore_region(
            rng.randn(2, 3, 4).astype(np.float32) * 2,
            (-1e-3, 1e-3))]  # Ensure there is no zero.
    i = nn.Variable(inputs[0].shape, need_grad=True)
    i.d = inputs[0]
    # NNabla forward
    with nn.context_scope(ctx), nn.auto_forward():
        o = F.dropout(i, p)
    scale = 1. / (1. - p)
    mask = o.d != 0
    assert_allclose(o.d, i.d * mask * scale)
    assert o.parent.name == func_name

    # NNabla backward
    orig_grad = rng.randn(*i.shape).astype(i.data.dtype)
    i.g[...] = orig_grad
    o_grad = rng.randn(*i.shape).astype(i.data.dtype)
    o.backward(o_grad)
    ref_grad = o_grad * mask * scale

    # Verify
    assert_allclose(i.g, orig_grad + ref_grad)

    # Check if accum option works
    i.g[...] = 1
    o.g = o_grad
    o.parent.backward([i], [o], [False])
    assert_allclose(i.g, ref_grad)

    # Check accum=False with NaN gradient
    i.g = np.float32('nan')
    o.parent.backward([i], [o], [False])
    assert not np.any(np.isnan(i.g))

    # Check if need_grad works
    i.g[...] = 0
    i.need_grad = False
    o.backward(o_grad)
    assert np.all(i.g == 0)
def bn_dropout(h, scope_name, test=False):
    with nn.parameter_scope(scope_name):
        h = PF.batch_normalization(h, batch_stat=not test)
    if not test:
        h = F.dropout(h)
    return h
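# A minimal usage sketch, not from the original: apply the BN + dropout block
# to a feature map at training time; BN parameters are created under the
# given scope name on first call.
import numpy as np
import nnabla as nn

h = nn.Variable((8, 64, 16, 16))
h.d = np.random.randn(*h.shape)
out = bn_dropout(h, "bn_do0", test=False)
out.forward()
print(out.d.shape)  # (8, 64, 16, 16)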
def cnn_model_003_with_cross_attention(ctx, x_list, act=F.relu, test=False):
    """With attention before pooling
    """
    with nn.context_scope(ctx):
        # Convblock0
        h0_list = []
        for x in x_list:
            h = conv_unit(x, "conv00", 128, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv01", 128, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv02", 128, k=3, s=1, p=1, act=act, test=test)
            h0_list.append(h)

        # Cross attention
        ca0 = attention(h0_list[0], h0_list[1], h0_list[1],
                        div_dim=True, softmax=True)
        ca1 = attention(h0_list[1], h0_list[0], h0_list[0],
                        div_dim=True, softmax=True)

        # Max pooling, Batchnorm, Dropout
        h0_list = []
        for h in [ca0, ca1]:
            h = F.max_pooling(h, (2, 2))  # 32 -> 16
            with nn.parameter_scope("bn0"):
                h = PF.batch_normalization(h, batch_stat=not test)
            if not test:
                h = F.dropout(h)
            h0_list.append(h)

        # Convblock 1
        h1_list = []
        for h in h0_list:
            h = conv_unit(h, "conv10", 256, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv11", 256, k=3, s=1, p=1, act=act, test=test)
            h = conv_unit(h, "conv12", 256, k=3, s=1, p=1, act=act, test=test)
            h1_list.append(h)

        # Cross attention
        ca0 = attention(h1_list[0], h1_list[1], h1_list[1],
                        div_dim=True, softmax=True)
        ca1 = attention(h1_list[1], h1_list[0], h1_list[0],
                        div_dim=True, softmax=True)

        # Max pooling, Batchnorm, Dropout
        h1_list = []
        for h in [ca0, ca1]:
            h = F.max_pooling(h, (2, 2))  # 16 -> 8
            with nn.parameter_scope("bn1"):
                h = PF.batch_normalization(h, batch_stat=not test)
            if not test:
                h = F.dropout(h)
            h1_list.append(h)

        # Convblock 2
        h2_list = []
        for h in h1_list:
            h = conv_unit(h, "conv20", 512, k=3, s=1, p=0, act=act, test=test)  # 8 -> 6
            h = conv_unit(h, "conv21", 256, k=1, s=1, p=0, act=act, test=test)
            h = conv_unit(h, "conv22", 128, k=1, s=1, p=0, act=act, test=test)
            h = conv_unit(h, "conv23", 10, k=1, s=1, p=0, act=act, test=test)
            h2_list.append(h)

        # Cross attention
        ca0 = attention(h2_list[0], h2_list[1], h2_list[1],
                        div_dim=True, softmax=True)
        ca1 = attention(h2_list[1], h2_list[0], h2_list[0],
                        div_dim=True, softmax=True)

        # Convblock 3
        h3_list = []
        for h in [ca0, ca1]:
            h = F.average_pooling(h, (6, 6))
            with nn.parameter_scope("bn2"):
                h = PF.batch_normalization(h, batch_stat=not test)
            h = F.reshape(h, (h.shape[0], np.prod(h.shape[1:])))
            h3_list.append(h)
        return h3_list