Example #1
def downblock(x, out_features, norm=False, kernel_size=4, pool=False, sn=False, test=False):
    out = x

    if sn:
        def apply_w(w): return PF.spectral_norm(w, dim=0, test=test)
    else:
        apply_w = None

    inmaps, outmaps = out.shape[1], out_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    out = PF.convolution(out, out_features,
                         kernel=(kernel_size, kernel_size), pad=(0, 0),
                         stride=(1, 1), w_init=w_init, b_init=b_init,
                         apply_w=apply_w)

    if norm:
        out = PF.instance_normalization(out)

    out = F.leaky_relu(out, 0.2, inplace=True)

    if pool:
        out = F.average_pooling(out, kernel=(2, 2))

    return out
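
A minimal usage sketch for the downblock above (not part of the original snippet); it assumes the standard nnabla imports (np, nn, F, PF, I) used throughout these examples, and the input shape is hypothetical.

import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((8, 3, 64, 64))  # hypothetical NCHW input
with nn.parameter_scope("downblock_0"):
    y = downblock(x, out_features=64, norm=True, pool=True)
# 4x4 valid convolution (64 -> 61), then 2x2 average pooling,
# so y.shape should be (8, 64, 30, 30).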
Example #2
def detect_keypoint(x, block_expansion, num_kp, num_channels, max_features,
                    num_blocks, temperature, estimate_jacobian=False, scale_factor=1,
                    single_jacobian_map=False, pad=0,
                    test=False, comm=None):

    if scale_factor != 1:
        x = anti_alias_interpolate(x, num_channels, scale_factor)

    with nn.parameter_scope("hourglass"):
        feature_map = hourglass(x, block_expansion, num_blocks=num_blocks,
                                max_features=max_features, test=test, comm=comm)

    with nn.parameter_scope("keypoint_detector"):
        inmaps, outmaps = feature_map.shape[1], num_kp
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction = PF.convolution(feature_map, outmaps=num_kp,
                                    kernel=(7, 7), pad=(pad, pad),
                                    w_init=w_init, b_init=b_init)

    final_shape = prediction.shape

    heatmap = F.reshape(prediction, (final_shape[0], final_shape[1], -1))
    heatmap = F.softmax(heatmap / temperature, axis=2)
    heatmap = F.reshape(heatmap, final_shape, inplace=False)

    out = gaussian2kp(heatmap)  # {"value": value}, keypoint positions.

    if estimate_jacobian:
        if single_jacobian_map:
            num_jacobian_maps = 1
        else:
            num_jacobian_maps = num_kp

        with nn.parameter_scope("jacobian_estimator"):
            jacobian_map = PF.convolution(feature_map,
                                          outmaps=4*num_jacobian_maps,
                                          kernel=(7, 7), pad=(pad, pad),
                                          w_init=I.ConstantInitializer(0),
                                          b_init=np.array([1, 0, 0, 1]*num_jacobian_maps))

        jacobian_map = F.reshape(
            jacobian_map, (final_shape[0], num_jacobian_maps, 4, final_shape[2], final_shape[3]))
        heatmap = F.reshape(
            heatmap, heatmap.shape[:2] + (1,) + heatmap.shape[2:], inplace=False)

        jacobian = heatmap * jacobian_map
        jacobian = F.sum(jacobian, axis=(3, 4))
        jacobian = F.reshape(
            jacobian, (jacobian.shape[0], jacobian.shape[1], 2, 2), inplace=False)
        out['jacobian'] = jacobian  # jacobian near each keypoint.

    # out is a dictionary containing {"value": value, "jacobian": jacobian}

    return out
Example #3
def resblock(x,
             in_features: int,
             kernel_size: int,
             padding: int,
             test: bool = False,
             comm=None):
    if comm:
        batchnorm = functools.partial(PF.sync_batch_normalization,
                                      comm=comm,
                                      group='world',
                                      axes=[1],
                                      decay_rate=0.9,
                                      eps=1e-05,
                                      batch_stat=not test)
    else:
        # 1 GPU
        batchnorm = functools.partial(PF.batch_normalization,
                                      axes=[1],
                                      decay_rate=0.9,
                                      eps=1e-05,
                                      batch_stat=not test)

    inmaps, outmaps = x.shape[1], in_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    with nn.parameter_scope("convblock_0"):
        out = batchnorm(x)
        out = F.relu(out, inplace=True)
        out = PF.convolution(out,
                             outmaps=in_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding),
                             w_init=w_init,
                             b_init=b_init)

    with nn.parameter_scope("convblock_2"):
        out = batchnorm(out)
        out = F.relu(out, inplace=True)
        out = PF.convolution(out,
                             outmaps=in_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding),
                             w_init=w_init,
                             b_init=b_init)
    out = F.add2(out, x, inplace=True)
    return out
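
A minimal single-GPU usage sketch for resblock (comm=None, so plain batch normalization is used); the imports, scope name, and shapes below are assumptions, not part of the original snippet.

import functools
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((4, 256, 64, 64))  # hypothetical feature map
with nn.parameter_scope("resblock_0"):
    y = resblock(x, in_features=256, kernel_size=3, padding=1)
# The residual add (F.add2) requires in_features == x.shape[1],
# so the output keeps the input shape: (4, 256, 64, 64).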
Example #4
def inspecs_params():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))
    inspecs.append([Inspec((64, 1000), u)])
    inspecs.append([Inspec((64, 32, 224, 224), u)])
    inspecs.append([Inspec((64, 128, 56, 56), u)])
    return inspecs
Example #5
File: test_pad.py, Project: aswifi/nnabla
def pad_params():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))
    inspecs.append([Inspec((2, 2, 2, 2), u)])
    inspecs.append([Inspec((2, 3, 2, 3), u)])
    inspecs.append([Inspec((2, 20, 200, 200), u)])
    return inspecs
Example #6
File: test_logical.py, Project: sony/nnabla
def pairwise_inspecs_params():
    inspecs = []
    u = I.UniformInitializer((0, 2))
    inspecs.append(
        [Inspec((64, 32, 224, 224), u),
         Inspec((64, 32, 224, 224), u)])
    return inspecs
Example #7
def nin(x, c, name, zeroing_w=False):
    lim = np.sqrt(x.shape[1])**-1
    w_init = I.UniformInitializer(lim=(-lim, lim))  # same as pytorch's default
    b_init = I.UniformInitializer(lim=(-lim, lim))  # same as pytorch's default

    if zeroing_w:
        w_init = I.ConstantInitializer(0)
        b_init = I.ConstantInitializer(0)

    return PF.convolution(x,
                          c,
                          kernel=(1, 1),
                          pad=(0, 0),
                          stride=(1, 1),
                          name=name,
                          w_init=w_init,
                          b_init=b_init)
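
A minimal usage sketch for nin, a 1x1 convolution with PyTorch-style uniform initialization; the shapes and scope name below are hypothetical.

import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((4, 256, 8, 8))
y = nin(x, 128, name="nin_proj")  # channel projection: (4, 256, 8, 8) -> (4, 128, 8, 8)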
Example #8
def discriminator(x, kp=None, num_channels=3, block_expansion=64,
                  num_blocks=4, max_features=512, sn=False, use_kp=False,
                  num_kp=10, kp_variance=0.01, test=False, **kwargs):

    down_blocks = []
    for i in range(num_blocks):
        down_blocks.append(
                functools.partial(downblock,
                                  out_features=min(
                                      max_features, block_expansion * (2 ** (i + 1))),
                                  norm=(i != 0), kernel_size=4,
                                  pool=(i != num_blocks - 1), sn=sn,
                                  test=test))

    feature_maps = []
    out = x

    if use_kp:
        heatmap = kp2gaussian(kp, x.shape[2:], kp_variance)
        out = F.concatenate(out, heatmap, axis=1)

    for i, down_block in enumerate(down_blocks):
        with nn.parameter_scope(f"downblock_{i}"):
            feature_maps.append(down_block(out))
            out = feature_maps[-1]

    if sn:
        def apply_w(w): return PF.spectral_norm(w, dim=0, test=test)
    else:
        apply_w = None

    with nn.parameter_scope("prediction"):
        inmaps, outmaps = out.shape[1], 1
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(1, 1)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction_map = PF.convolution(out, 1, kernel=(1, 1), pad=(0, 0),
                                        stride=(1, 1),
                                        w_init=w_init,
                                        b_init=b_init,
                                        apply_w=apply_w)
    return feature_maps, prediction_map
Example #9
def downblock(x,
              out_features,
              kernel_size=3,
              padding=1,
              groups=1,
              test=False,
              comm=None):
    if comm:
        batchnorm = functools.partial(PF.sync_batch_normalization,
                                      comm=comm,
                                      group='world',
                                      axes=[1],
                                      decay_rate=0.9,
                                      eps=1e-05,
                                      batch_stat=not test)
    else:
        # 1 GPU
        batchnorm = functools.partial(PF.batch_normalization,
                                      axes=[1],
                                      decay_rate=0.9,
                                      eps=1e-05,
                                      batch_stat=not test)

    inmaps, outmaps = x.shape[1], out_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    with nn.parameter_scope("downblock"):
        out = PF.convolution(x,
                             outmaps=out_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding),
                             group=groups,
                             w_init=w_init,
                             b_init=b_init)
        out = batchnorm(out)
    out = F.relu(out, inplace=True)
    out = F.average_pooling(out, kernel=(2, 2))
    return out
Example #10
    def __init__(self, embedding_dim, num_embedding, commitment_cost, rng,
                 scope_name='vector_quantizer'):
        self.embedding_dim = embedding_dim
        self.num_embedding = num_embedding
        self.commitment_cost = commitment_cost
        self.rng = rng
        self.scope_name = scope_name

        with nn.parameter_scope(scope_name):
            self.embedding_weight = nn.parameter.get_parameter_or_create('W', shape=(self.num_embedding, self.embedding_dim),
                                                                         initializer=I.UniformInitializer((-1./self.num_embedding, 1./self.num_embedding), rng=self.rng), need_grad=True)
Example #11
def pf_affine(r, num_classes=1000, channel_last=False):
    # Initializer assumes this is the final classification layer
    fan_in = int(np.prod(r.shape[1:]))
    k = 1 / np.sqrt(fan_in)
    init = I.UniformInitializer((-k, k), rng=RNG)
    r = PF.convolution(r,
                       num_classes, (1, 1),
                       channel_last=channel_last,
                       w_init=init,
                       b_init=init,
                       name='fc')
    return F.reshape(r, (r.shape[0], -1), inplace=False)
Example #12
def embedding(x, input_dim, output_dim, init=None, mask_zero=False):
    if init is None:
        init = I.UniformInitializer((-0.1, 0.1))
    initialized = "embed/W" in nn.get_parameters()
    result = PF.embed(x, input_dim, output_dim)
    if not initialized:
        nn.get_parameters()["embed/W"].d = init(
            nn.get_parameters()["embed/W"].shape)

    if mask_zero:
        return result, 1 - F.equal_scalar(x, 0)
    else:
        return result
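
A minimal usage sketch for embedding above; the vocabulary size and sequence shape are hypothetical. With mask_zero=True it also returns a 0/1 mask that is 0 wherever the token id is 0 (padding).

import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.initializer as I

tokens = nn.Variable((16, 20))  # batch of token-id sequences
emb, mask = embedding(tokens, input_dim=5000, output_dim=128, mask_zero=True)
# emb: (16, 20, 128), mask: (16, 20)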
Example #13
def conv(x,
         c,
         name,
         kernel=(3, 3),
         pad=(1, 1),
         stride=(1, 1),
         zeroing_w=False):
    # init weight and bias with uniform, which is the same as pytorch
    lim = I.calc_normal_std_he_forward(x.shape[1] * 2, c, tuple(kernel))
    w_init = I.UniformInitializer(lim=(-lim, lim), rng=None)
    b_init = I.UniformInitializer(lim=(-lim, lim), rng=None)

    if zeroing_w:
        w_init = I.ConstantInitializer(0)
        b_init = I.ConstantInitializer(0)

    return PF.convolution(x,
                          c,
                          kernel,
                          pad=pad,
                          stride=stride,
                          name=name,
                          w_init=w_init,
                          b_init=b_init)
Example #14
def dense(x,
          output_dim,
          base_axis=1,
          w_init=None,
          b_init=I.ConstantInitializer(0),
          activation=F.tanh):
    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(np.prod(x.shape[1:]), output_dim))
    return activation(
        PF.affine(x,
                  output_dim,
                  base_axis=base_axis,
                  w_init=w_init,
                  b_init=b_init))
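
A minimal usage sketch for dense, which wraps PF.affine with Glorot-uniform weight initialization and a tanh activation by default; the imports, scope name, and shapes are hypothetical.

import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((32, 128))
with nn.parameter_scope("dense_0"):
    h = dense(x, 64)  # h.shape == (32, 64)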
Example #15
def resnet50_inspecs_params_without_broadcast():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))

    inspecs.append([Inspec((5, 2048, 7, 7), u), Inspec((5, 2048, 7, 7), u)])
    inspecs.append(
        [Inspec((5, 1024, 14, 14), u),
         Inspec((5, 1024, 14, 14), u)])
    inspecs.append([Inspec((5, 512, 28, 28), u), Inspec((5, 512, 28, 28), u)])
    inspecs.append([Inspec((5, 256, 56, 56), u), Inspec((5, 256, 56, 56), u)])
    inspecs.append([Inspec((5, 56, 56, 256), u), Inspec((5, 56, 56, 256), u)])
    inspecs.append([Inspec((5, 28, 28, 512), u), Inspec((5, 28, 28, 512), u)])
    inspecs.append(
        [Inspec((5, 14, 14, 1024), u),
         Inspec((5, 14, 14, 1024), u)])
    inspecs.append([Inspec((5, 7, 7, 2048), u), Inspec((5, 7, 7, 2048), u)])

    return inspecs
Example #16
def conv_initializer(f_in, n_out, base_axis, kernel, mode):
    '''
    Conv initializer function
        This function returns various types of initialization for weights and bias parameters in convolution layer.

        Args:
            f_in (~nnabla.Variable): input variable.
            n_out (int) : number of output neurons per data.
            base_axis (int): dimensions up to base_axis are treated as the sample dimensions.
            kernel (tuple of int) : convolution kernel size.
            mode (str) : type of initialization to use.
        Returns:
            w (~nnabla.initializer.BaseInitializer): weight parameters
            b (~nnabla.initializer.BaseInitializer): bias parameters
    '''
    if mode == 'nnabla':
        # https://github.com/sony/nnabla/blob/master/python/src/nnabla/parametric_functions.py, lines 415, 417
        # https://github.com/sony/nnabla/blob/master/python/src/nnabla/initializer.py, lines 224, 121
        # uniform_lim_glorot = uniform(sqrt(6/(fin+fout)))
        n_input_plane = f_in.shape[base_axis]
        s = np.sqrt(6.0 / (n_input_plane * np.prod(kernel) + n_out))
        w = I.UniformInitializer([-s, s])
        b = I.ConstantInitializer(0)
        return w, b
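
A minimal usage sketch for conv_initializer in 'nnabla' mode; the input shape and layer name below are hypothetical.

import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((16, 3, 32, 32))
w_init, b_init = conv_initializer(x, n_out=64, base_axis=1, kernel=(3, 3), mode='nnabla')
y = PF.convolution(x, 64, (3, 3), pad=(1, 1), w_init=w_init, b_init=b_init, name='conv1')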
Example #17
def main():
    """
        Start architecture search.
    """
    args = get_args()
    print(args)

    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)
    ext = nn.ext_utils.import_extension_module(args.context)

    ops = {
        0: dil_conv_3x3,
        1: dil_conv_5x5,
        2: sep_conv_3x3,
        3: sep_conv_5x5,
        4: max_pool_3x3,
        5: avg_pool_3x3,
        6: identity,
        7: zero
    }

    initializer = I.UniformInitializer((-0.1, 0.1))
    num_of_nodes = args.num_nodes

    alphas_dict = dict()
    w_shape = (len(ops), ) + (1, 1, 1, 1)

    # prepare architecture parameters in advance
    for i in range(num_of_nodes):
        for j in range(i + 1, num_of_nodes - 1):
            if j < 2:
                continue  # no connection exists between 1st and 2nd nodes.
            else:
                w_name_normal = "alpha_normal_{}_{}".format(i, j)
                w_name_reduction = "alpha_reduction_{}_{}".format(i, j)
                alphas_dict[w_name_normal] = \
                    nn.parameter.get_parameter_or_create(w_name_normal,
                                                         w_shape, initializer)
                alphas_dict[w_name_reduction] = \
                    nn.parameter.get_parameter_or_create(w_name_reduction,
                                                         w_shape, initializer)

    # run architecture search
    alphas_dict = CNN_run(args, ops, alphas_dict)
    for k in nn.get_parameters(grad_only=False).keys():
        if "alpha_" not in k:
            nn.parameter.pop_parameter(k)  # delete unnecessary parameters.

    print("Architecture Search is finished. The saved architecture is,")
    alpha_normal, alpha_reduction = arrange_weights(args, ops)
    arch_normal = parse_weights(args, alpha_normal)
    arch_reduction = parse_weights(args, alpha_reduction)
    show_derived_cell(args, ops, arch_normal, "normal")
    show_derived_cell(args, ops, arch_reduction, "reduction")

    arch_data = {"arch_normal": arch_normal, "arch_reduction": arch_reduction}
    print("Saving the architecture parameter: {}/{}".format(
        args.monitor_path, args.model_arch_name))
    model_path = args.model_arch_name
    with open(model_path, 'w') as f:
        json.dump(arch_data, f)

    print("when you want to train the network from scratch\n\
    type 'python darts_train.py <OPTION> \
    --monitor-path {} --model-arch-name {}".format(args.monitor_path,
                                                   args.model_arch_name))

    return
Example #18
            F.exp(-distance(u, x)) for x in F.split(negative_samples, axis=2)
        ])))


u = nn.Variable((batch_size, ))
v = nn.Variable((batch_size, ))
negative_samples = nn.Variable((batch_size, negative_sample_size))

_u = PF.embed(u, vocab_size, embedding_size)
_v = PF.embed(v, vocab_size, embedding_size)
_neg = PF.embed(negative_samples, vocab_size, embedding_size)
_neg = F.transpose(_neg, axes=(0, 2, 1))

loss = loss_function(_u, _v, _neg)

nn.get_parameters()["embed/W"].d = I.UniformInitializer(
    [-0.01, 0.01])(shape=(vocab_size, embedding_size))

solver = RiemannianSgd(lr=0.1)
solver.set_parameters(nn.get_parameters())

trainer = Trainer(inputs=[u, v, negative_samples], loss=loss, solver=solver)
trainer.run(train_data_iter, None, epochs=max_epoch)

line_points = [['mustang.n.01', 'odd-toed_ungulate.n.01'],
               ['elk.n.01', 'even-toed_ungulate.n.01'],
               ['even-toed_ungulate.n.01', 'ungulate.n.01'],
               ['squirrel.n.01', 'rodent.n.01'], ['beagle.n.01', 'dog.n.01'],
               ['dog.n.01', 'canine.n.02'], ['liger.n.01', 'carnivore.n.01'],
               ['bison.n.01', 'even-toed_ungulate.n.01'],
               ['collie.n.01', 'dog.n.01'],
               ['odd-toed_ungulate.n.01', 'ungulate.n.01'],
Example #19
def lstm(x,
         mask,
         state_size,
         w_init=None,
         inner_w_init=None,
         forget_bias_init=I.ConstantInitializer(1),
         b_init=I.ConstantInitializer(0),
         initial_state=None,
         dropout=0,
         train=True,
         rng=np.random):
    """
    x: (batch_size, length, input_size)
    mask: (batch_size, length)
    """
    batch_size, length, input_size = x.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            z_w = F.dropout(z_w, p=retain_prob)
            z_u = F.dropout(z_u, p=retain_prob)
        z_w *= retain_prob
        z_u *= retain_prob

    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    with nn.parameter_scope("lstm"):
        # (batch_size, length, state_size)
        xi = PF.affine(xi,
                       state_size,
                       base_axis=2,
                       w_init=w_init,
                       b_init=b_init,
                       name="Wi")
        xf = PF.affine(xf,
                       state_size,
                       base_axis=2,
                       w_init=w_init,
                       b_init=forget_bias_init,
                       name="Wf")
        xc = PF.affine(xc,
                       state_size,
                       base_axis=2,
                       w_init=w_init,
                       b_init=b_init,
                       name="Wc")
        xo = PF.affine(xo,
                       state_size,
                       base_axis=2,
                       w_init=w_init,
                       b_init=b_init,
                       name="Wo")

    if initial_state is None:
        h = nn.Variable((batch_size, state_size), need_grad=False)
        h.data.zero()
    else:
        h = initial_state
    c = nn.Variable((batch_size, state_size), need_grad=False)
    c.data.zero()

    # (batch_size, state_size)
    xi = split(xi, axis=1)
    xf = split(xf, axis=1)
    xc = split(xc, axis=1)
    xo = split(xo, axis=1)
    mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
    mask = F.broadcast(mask, [batch_size, length, state_size])
    # (batch_size, state_size)
    mask = split(mask, axis=1)

    hs = []
    cs = []
    with nn.parameter_scope("lstm"):
        for i, f, c2, o, m in zip(xi, xf, xc, xo, mask):
            i_t = PF.affine(z_u[0] * h,
                            state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False,
                            name="Ui")
            i_t = F.sigmoid(i + i_t)
            f_t = PF.affine(z_u[1] * h,
                            state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False,
                            name="Uf")
            f_t = F.sigmoid(f + f_t)
            c_t = PF.affine(z_u[2] * h,
                            state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False,
                            name="Uc")
            c_t = f_t * c + i_t * F.tanh(c2 + c_t)
            o_t = PF.affine(z_u[3] * h,
                            state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False,
                            name="Uo")
            o_t = F.sigmoid(o + o_t)
            h_t = o_t * F.tanh(c_t)

            h_t = (1 - m) * h + m * h_t
            c_t = (1 - m) * c + m * c_t
            h = h_t
            c = c_t
            h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
            c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
            hs.append(h_t)
            cs.append(c_t)
    return concatenate(*hs, axis=1), concatenate(*cs, axis=1)
Example #20
    def last_affine(self, x, dims, name):
        c = x.shape[1]
        l, u = I.calc_uniform_lim_glorot(c, 1)
        w_init = I.UniformInitializer((l, u))
        return PF.affine(x, 1, w_init=w_init, name=name)
Example #21
File: test_loss.py, Project: sony/nnabla
        Inspec((64, 128, 56, 56)),
        Inspec((64, 128, 56, 56), label_init, False)
    ])
    return inspecs


@pytest.mark.parametrize('inspecs', pairwise_inspecs_params())
@pytest.mark.parametrize('loss',
                         ['sigmoid_cross_entropy', 'binary_cross_entropy'])
def test_binary_classification_loss(inspecs, loss, nnabla_opts):
    func = getattr(F, loss)
    fb = FunctionBenchmark(func, inspecs, [], {}, nnabla_opts.ext,
                           nnabla_opts.ext_kwargs)
    fb.benchmark()
    fb.write(writer=nnabla_opts.function_benchmark_writer)


@pytest.mark.parametrize('inspecs',
                         pairwise_inspecs_params(I.UniformInitializer((0, 1))))
@pytest.mark.parametrize('loss',
                         ['squared_error', 'huber_loss', 'kl_multinomial'])
def test_pairwise_loss(inspecs, loss, nnabla_opts):
    func = getattr(F, loss)
    fb = FunctionBenchmark(func, inspecs, [], {}, nnabla_opts.ext,
                           nnabla_opts.ext_kwargs)
    fb.benchmark()
    fb.write(writer=nnabla_opts.function_benchmark_writer)


# ============================================================================
Example #22
def predict_dense_motion(source_image,
                         kp_driving,
                         kp_source,
                         block_expansion,
                         num_blocks,
                         max_features,
                         num_kp,
                         num_channels,
                         estimate_occlusion_map=False,
                         scale_factor=1,
                         kp_variance=0.01,
                         test=False,
                         comm=None):
    if scale_factor != 1:
        source_image = anti_alias_interpolate(source_image, num_channels,
                                              scale_factor)

    bs, _, h, w = source_image.shape

    out_dict = dict()
    heatmap_representation = create_heatmap_representations(
        source_image, kp_driving, kp_source, kp_variance)
    sparse_motion = create_sparse_motions(source_image, kp_driving, kp_source,
                                          num_kp)
    deformed_source = create_deformed_source_image(source_image, sparse_motion,
                                                   num_kp)
    out_dict['sparse_deformed'] = deformed_source

    input = F.concatenate(heatmap_representation, deformed_source, axis=2)
    input = F.reshape(input, (bs, -1, h, w))

    with nn.parameter_scope("hourglass"):
        prediction = hourglass(input,
                               block_expansion=block_expansion,
                               num_blocks=num_blocks,
                               max_features=max_features,
                               test=test,
                               comm=comm)

    with nn.parameter_scope("mask"):
        inmaps, outmaps = prediction.shape[1], num_kp + 1
        k_w = I.calc_normal_std_he_forward(inmaps, outmaps,
                                           kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        mask = PF.convolution(prediction,
                              outmaps=num_kp + 1,
                              kernel=(7, 7),
                              pad=(3, 3),
                              w_init=w_init,
                              b_init=b_init)

    mask = F.softmax(mask, axis=1)
    out_dict['mask'] = mask
    reshaped_mask = F.reshape(mask,
                              mask.shape[:2] + (1, ) + mask.shape[2:],
                              inplace=False)
    sparse_motion = F.transpose(sparse_motion, (0, 1, 4, 2, 3))
    deformation = F.sum(sparse_motion * reshaped_mask, axis=1)
    deformation = F.transpose(deformation, (0, 2, 3, 1))

    out_dict['deformation'] = deformation

    if estimate_occlusion_map:
        with nn.parameter_scope("occlusion_map"):
            occlusion_map = F.sigmoid(
                PF.convolution(prediction,
                               outmaps=1,
                               kernel=(7, 7),
                               pad=(3, 3),
                               w_init=w_init,
                               b_init=b_init))
        out_dict['occlusion_map'] = occlusion_map
    else:
        occlusion_map = None

    return out_dict
Example #23
def occlusion_aware_generator(source_image, kp_driving, kp_source,
                              num_channels, num_kp, block_expansion, max_features,
                              num_down_blocks, num_bottleneck_blocks,
                              estimate_occlusion_map=False, dense_motion_params=None,
                              estimate_jacobian=False, test=False, comm=None):

    # pre-downsampling
    out = sameblock(source_image, out_features=block_expansion,
                    kernel_size=7, padding=3, test=test, comm=comm)

    # downsampling
    for i in range(num_down_blocks):
        with nn.parameter_scope(f"downblock_{i}"):
            out_features = min(max_features, block_expansion * (2 ** (i + 1)))
            out = downblock(out, out_features=out_features,
                            kernel_size=3, padding=1, test=test, comm=comm)

    output_dict = {}
    if dense_motion_params is not None:
        with nn.parameter_scope("dense_motion_prediction"):
            dense_motion = predict_dense_motion(source_image=source_image,
                                                kp_driving=kp_driving, kp_source=kp_source,
                                                num_kp=num_kp, num_channels=num_channels,
                                                estimate_occlusion_map=estimate_occlusion_map,
                                                test=test, comm=comm, **dense_motion_params)
        # dense_motion is a dictionary containing:
        # 'sparse_deformed': <Variable((8, 11, 3, 256, 256))>,
        # 'mask': <Variable((8, 11, 256, 256))>,
        # 'deformation': <Variable((8, 256, 256, 2))>,
        # 'occlusion_map': <Variable((8, 1, 256, 256))>}

        output_dict['mask'] = dense_motion['mask']
        output_dict['sparse_deformed'] = dense_motion['sparse_deformed']

        # Transform feature representation by deformation (+ occlusion)
        if 'occlusion_map' in dense_motion:
            occlusion_map = dense_motion['occlusion_map']
            output_dict['occlusion_map'] = occlusion_map
        else:
            occlusion_map = None
        deformation = dense_motion['deformation']
        out = deform_input(out, deformation)

        if occlusion_map is not None:
            if out.shape[2] != occlusion_map.shape[2] or out.shape[3] != occlusion_map.shape[3]:
                resized_occlusion_map = F.interpolate(occlusion_map,
                                                      output_size=out.shape[2:], mode="linear",
                                                      align_corners=False, half_pixel=True)
            else:
                resized_occlusion_map = F.identity(occlusion_map)
            out = out * resized_occlusion_map

        if test:
            output_dict["deformed"] = deform_input(source_image, deformation)

    # intermediate residual blocks
    in_features = min(max_features, block_expansion * (2 ** num_down_blocks))
    for i in range(num_bottleneck_blocks):
        with nn.parameter_scope(f"residual_block_{i}"):
            out = resblock(out, in_features=in_features,
                           kernel_size=3, padding=1, test=test, comm=comm)

    # upsampling
    for i in range(num_down_blocks):
        with nn.parameter_scope(f"upblock_{i}"):
            out_features = min(max_features, block_expansion *
                               (2 ** (num_down_blocks - i - 1)))
            out = upblock(out, out_features=out_features,
                          kernel_size=3, padding=1, test=test, comm=comm)

    with nn.parameter_scope("final_conv"):
        inmaps, outmaps = out.shape[1], num_channels
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        out = PF.convolution(out, outmaps=num_channels, kernel=(7, 7),
                             pad=(3, 3), w_init=w_init, b_init=b_init)
    out = F.sigmoid(out)
    output_dict["prediction"] = out

    return output_dict
Example #24
def sample_from_controller(args):
    """
        A 2-layer RNN (LSTM) based controller that outputs a CNN architecture
        represented as a sequence of integers (collected into a list).
        Given the number of layers, it performs two kinds of computation for each layer:
        one samples the operation used at that layer,
        the other samples the skip-connection pattern.
    """

    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)
    skip_penaltys = nn.Variable([1, 1], need_grad=True)

    entropys.d = log_probs.d = skip_penaltys.d = 0.0  # initialize them all

    num_layers = args.num_layers
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    skip_target = args.skip_prob
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    num_branch = args.num_ops

    arc_seq = []
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [
        nn.Variable([1, lstm_size], need_grad=True)
        for _ in range(lstm_num_layers)
    ]
    prev_c = [
        nn.Variable([1, lstm_size], need_grad=True)
        for _ in range(lstm_num_layers)
    ]

    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize variables in lstm layers.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    skip_targets = nn.Variable([1, 2])
    skip_targets.d = np.array([[1.0 - skip_target, skip_target]])

    for layer_id in range(num_layers):
        # One-step stacked LSTM.
        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
        prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

        # Compute for operation.
        with nn.parameter_scope("ops"):
            logit = PF.affine(next_h[-1],
                              num_branch,
                              w_init=initializer,
                              with_bias=False)

        if temperature is not None:
            logit = F.mul_scalar(logit, (1 / temperature))

        if tanh_constant is not None:
            logit = F.mul_scalar(F.tanh(logit),
                                 tanh_constant)  # (1, num_branch)

        # normalizing logits.
        normed_logit = np.e**logit.d
        normed_logit = normed_logit / np.sum(normed_logit)

        # Sampling operation id from multinomial distribution.
        ops_id = np.random.multinomial(1, normed_logit[0], 1).nonzero()[1]
        ops_id = nn.Variable.from_numpy_array(ops_id)  # (1, )
        arc_seq.append(ops_id.d)

        # log policy for operation.
        log_prob = F.softmax_cross_entropy(logit,
                                           F.reshape(ops_id,
                                                     shape=(1, 1)))  # (1, )
        # accumulate log policy as log probs
        log_probs = F.add2(log_probs, log_prob)

        entropy = log_prob * F.exp(-log_prob)
        entropys = F.add2(entropys, entropy)  # accumulate entropy as entropys.

        w_emb = nn.parameter.get_parameter_or_create("w_emb",
                                                     [num_branch, lstm_size],
                                                     initializer,
                                                     need_grad=False)

        inputs = F.reshape(w_emb[int(ops_id.d)],
                           (1, w_emb.shape[1]))  # (1, lstm_size)

        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
        prev_h, prev_c = next_h, next_c  # (1, lstm_size)

        with nn.parameter_scope("skip_affine_3"):
            adding_w_1 = PF.affine(next_h[-1],
                                   lstm_size,
                                   w_init=initializer,
                                   with_bias=False)  # (1, lstm_size)

        if layer_id == 0:
            inputs = g_emb  # (1, lstm_size)
            anchors = next_h[-1]  # (1, lstm_size)
            anchors_w_1 = adding_w_1  # then goes back to the entry point of the loop

        else:
            # anchors_w_1 has shape (layer_id, lstm_size) at this point.
            query = anchors_w_1

            with nn.parameter_scope("skip_affine_1"):
                query = F.tanh(
                    F.add2(
                        query,
                        PF.affine(next_h[-1],
                                  lstm_size,
                                  w_init=initializer,
                                  with_bias=False)))
                #              (layer_id, lstm_size)   +   (1, lstm_size)
                # broadcast occurs here. resulting shape is; (layer_id, lstm_size)

            with nn.parameter_scope("skip_affine_2"):
                query = PF.affine(query,
                                  1,
                                  w_init=initializer,
                                  with_bias=False)  # (layer_id, 1)
            # note that each weight for skip_affine_X is shared across all steps of LSTM.

            # redefine the logits; their shape is now (layer_id, 2)
            logit = F.concatenate(-query, query, axis=1)

            if temperature is not None:
                logit = F.mul_scalar(logit, (1 / temperature))

            if tanh_constant is not None:
                logit = F.mul_scalar(F.tanh(logit), tanh_constant)

            skip_prob_unnormalized = F.exp(logit)  # (layer_id, 2)

            # normalizing skip_prob_unnormalized.
            summed = F.sum(skip_prob_unnormalized, axis=1,
                           keepdims=True).apply(need_grad=False)
            summed = F.concatenate(summed, summed, axis=1)

            skip_prob_normalized = F.div2(skip_prob_unnormalized,
                                          summed)  # (layer_id, 2)

            # Sampling skip_pattern from multinomial distribution.
            skip_pattern = np.random.multinomial(
                1, skip_prob_normalized.d[0],
                layer_id).nonzero()[1]  # (layer_id, 1)
            arc_seq.append(skip_pattern)
            skip = nn.Variable.from_numpy_array(skip_pattern)

            # compute skip penalty.
            # (layer_id, 2) broadcast occurs here too
            kl = F.mul2(skip_prob_normalized,
                        F.log(F.div2(skip_prob_normalized, skip_targets)))
            kl = F.sum(kl, keepdims=True)
            # get the mean value here in advance.
            kl = kl * (1.0 / (num_layers - 1))

            # accumulate kl divergence as skip penalty.
            skip_penaltys = F.add2(skip_penaltys, kl)

            # log policy for connection.
            log_prob = F.softmax_cross_entropy(
                logit, F.reshape(skip, shape=(skip.shape[0], 1)))
            log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

            entropy = F.sum(log_prob * F.exp(-log_prob), keepdims=True)
            # accumulate entropy as entropys.
            entropys = F.add2(entropys, entropy)

            skip = F.reshape(skip, (1, layer_id))

            inputs = F.affine(skip,
                              anchors).apply(need_grad=False)  # (1, lstm_size)
            inputs = F.mul_scalar(inputs, (1.0 / (1.0 + (np.sum(skip.d)))))

            # add new row for the next computation
            # (layer_id + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (layer_id + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

    return arc_seq, log_probs, entropys, skip_penaltys
Example #25
def main():
    """
        Start architecture search and save the architecture found by the controller during the search.
    """
    args = get_macro_args()
    arguments_assertion(args)

    ctx = get_extension_context(args.context,
                                device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)
    ext = nn.ext_utils.import_extension_module(args.context)

    if args.sampling_only:
        sample_from_pretrained_controller(args)
        return

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)
    mean_val_train, std_val_train, channel, img_height, img_width, num_class = get_data_stats(
        tdata)
    mean_val_valid, std_val_valid, _, _, _, _ = get_data_stats(vdata)

    data_dict = {
        "train_data": (tdata, mean_val_train, std_val_train),
        "valid_data": (vdata, mean_val_valid, std_val_valid),
        "basic_info": (channel, img_height, img_width, num_class)
    }

    initializer = I.UniformInitializer((-0.1, 0.1))

    # Prepare all the weights in advance
    controller_weights_and_shape = {
        'controller_lstm/0/lstm/affine/W':
        (2 * args.lstm_size, 4, args.lstm_size),
        'controller_lstm/0/lstm/affine/b': (4, args.lstm_size),
        'controller_lstm/1/lstm/affine/W':
        (2 * args.lstm_size, 4, args.lstm_size),
        'controller_lstm/1/lstm/affine/b': (4, args.lstm_size),
        'ops/affine/W': (args.lstm_size, args.num_ops),
        'skip_affine_1/affine/W': (args.lstm_size, args.lstm_size),
        'skip_affine_2/affine/W': (args.lstm_size, 1),
        'skip_affine_3/affine/W': (args.lstm_size, args.lstm_size)
    }
    for w_name, w_shape in controller_weights_and_shape.items():
        nn.parameter.get_parameter_or_create(w_name,
                                             w_shape,
                                             initializer=initializer,
                                             need_grad=True)

    # create dictionary of controller's weights
    controller_weights_dict = {
        w_name: nn.get_parameters()[w_name]
        for w_name in controller_weights_and_shape.keys()
    }

    arch_change, best_arch = search_architecture(args, data_dict,
                                                 controller_weights_dict)

    if args.select_strategy == "best":
        print(
            "saving the model which achieved the best validation accuracy as {}."
            .format(args.recommended_arch))
        check_arch = best_arch
    else:
        # Use the latest architecture; it is not necessarily the best one.
        print("saving the latest model recommended by the controller as {}.".
              format(args.recommended_arch))
        check_arch = arch_change[-1]
    np.save(args.recommended_arch, np.array(check_arch))

    print("The saved architecture is;")
    show_arch(check_arch)
    print("when you want to train the network from scratch,\n\
    type 'python macro_retrain.py <OPTION> --recommended-arch {}'".format(
        args.recommended_arch))

    # save the controller's weights so that other architectures can be generated later.
    all_params = nn.get_parameters(grad_only=False)
    controller_weights = list(controller_weights_and_shape.keys()) + ["w_emb"]
    for param_name in all_params.keys():
        if param_name not in controller_weights:
            nn.parameter.pop_parameter(param_name)
    nn.save_parameters(
        os.path.join(args.model_save_path, 'controller_params.h5'))

    # If you want to train the model recommended by the controller from scratch
    # right after architecture search, uncomment the lines below
    # nn.clear_parameters()
    # ext.clear_memory_cache()  # clear all the Variables
    # val_acc = CNN_run(args, check_arch, data_dict, with_train=True, after_search=True)
    return
Example #26
def main():

    args = get_args()
    state_size = args.state_size
    batch_size = args.batch_size
    num_steps = args.num_steps
    num_layers = args.num_layers
    max_epoch = args.max_epoch
    max_norm = args.gradient_clipping_max_norm
    num_words = 10000
    lr = args.learning_rate

    train_data, val_data, test_data = get_data()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    from nnabla.monitor import Monitor, MonitorSeries
    monitor = Monitor(args.work_dir)
    monitor_perplexity = MonitorSeries(
        "Training perplexity", monitor, interval=10)
    monitor_vperplexity = MonitorSeries("Validation perplexity", monitor, interval=(
        len(val_data)//(num_steps*batch_size)))
    monitor_tperplexity = MonitorSeries(
        "Test perplexity", monitor, interval=(len(test_data)//(num_steps*1)))

    l1 = LSTMWrapper(batch_size, state_size)
    l2 = LSTMWrapper(batch_size, state_size)

    # train graph

    x = nn.Variable((batch_size, num_steps))
    t = nn.Variable((batch_size, num_steps))
    w = I.UniformInitializer((-0.1, 0.1))
    b = I.ConstantInitializer(1)
    loss = get_loss(l1, l2, x, t, w, b, num_words,
                    batch_size, state_size, True)
    l1.share_data()
    l2.share_data()

    # validation graph

    vx = nn.Variable((batch_size, num_steps))
    vt = nn.Variable((batch_size, num_steps))
    vloss = get_loss(l1, l2, vx, vt, w, b, num_words, batch_size, state_size)
    solver = S.Sgd(lr)
    solver.set_parameters(nn.get_parameters())

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    best_val = 10000
    for epoch in range(max_epoch):
        l1.reset_state()
        l2.reset_state()
        for i in range(len(train_data)//(num_steps*batch_size)):
            x.d, t.d = get_batch(train_data, i*num_steps,
                                 batch_size, num_steps)
            solver.zero_grad()
            loss.forward()
            loss.backward(clear_buffer=True)
            solver.weight_decay(1e-5)
            gradient_clipping(nn.get_parameters().values(), max_norm)
            solver.update()
            perp = perplexity(loss.d.copy())
            monitor_perplexity.add(
                (len(train_data)//(num_steps*batch_size))*(epoch)+i, perp)
        l1.reset_state()
        l2.reset_state()
        vloss_avg = 0
        for i in range(len(val_data)//(num_steps * batch_size)):
            vx.d, vt.d = get_batch(val_data, i*num_steps,
                                   batch_size, num_steps)
            vloss.forward()
            vloss_avg += vloss.d.copy()
        vloss_avg /= float((len(val_data)//(num_steps*batch_size)))
        vper = perplexity(vloss_avg)

        if vper < best_val:
            best_val = vper
            if vper < 200:
                save_name = "params_epoch_{:02d}.h5".format(epoch)
                nn.save_parameters(os.path.join(args.save_dir, save_name))
        else:
            solver.set_learning_rate(solver.learning_rate()*0.25)
            logger.info("Decreased learning rate to {:05f}".format(
                solver.learning_rate()))
        monitor_vperplexity.add(
            (len(val_data)//(num_steps*batch_size))*(epoch)+i, vper)

    # for final test split
    t_batch_size = 1
    tl1 = LSTMWrapper(t_batch_size, state_size)
    tl2 = LSTMWrapper(t_batch_size, state_size)
    tloss_avg = 0
    tx = nn.Variable((t_batch_size, num_steps))
    tt = nn.Variable((t_batch_size, num_steps))
    tloss = get_loss(tl1, tl2, tx, tt, w, b, num_words, 1, state_size)

    tl1.share_data()
    tl2.share_data()

    for i in range(len(test_data)//(num_steps * t_batch_size)):
        tx.d, tt.d = get_batch(test_data, i*num_steps, 1, num_steps)
        tloss.forward()
        tloss_avg += tloss.d.copy()
    tloss_avg /= float((len(test_data)//(num_steps*t_batch_size)))
    tper = perplexity(tloss_avg)
    monitor_tperplexity.add(
        (len(test_data)//(num_steps*t_batch_size))*(epoch)+i, tper)
Example #27
def pytorch_conv_init(inmaps, kernel):
    scale = 1 / np.sqrt(inmaps * np.prod(kernel))

    return I.UniformInitializer(lim=(-scale, scale))
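
A minimal usage sketch for pytorch_conv_init, which reproduces PyTorch's default Conv2d weight range of 1/sqrt(fan_in); the shapes and layer name below are hypothetical.

import numpy as np
import nnabla as nn
import nnabla.parametric_functions as PF
import nnabla.initializer as I

x = nn.Variable((8, 3, 64, 64))
w_init = pytorch_conv_init(inmaps=3, kernel=(3, 3))
y = PF.convolution(x, 16, (3, 3), pad=(1, 1), w_init=w_init, name='conv_pt_init')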
Example #28
def cond_att_lstm(x,
                  parent_index,
                  mask,
                  context,
                  context_mask,
                  state_size,
                  att_hidden_size,
                  initial_state=None,
                  initial_cell=None,
                  hist=None,
                  dropout=0,
                  train=True,
                  w_init=None,
                  inner_w_init=None,
                  b_init=I.ConstantInitializer(0),
                  forget_bias_init=I.ConstantInitializer(1)):
    """
    x: (batch_size, length, input_size)
    parent_index: (batch_size, length)
    mask: (batch_size, length)
    context: (batch_size, context_length, context_size)
    context_mask: (batch_size, context_length)
    hist: (batch_size, l, state_size)
    """
    batch_size, length, input_size = x.shape
    _, context_length, context_size = context.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            z_w = F.dropout(z_w, p=retain_prob)
            z_u = F.dropout(z_u, p=retain_prob)
        z_w *= retain_prob
        z_u *= retain_prob

    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    with nn.parameter_scope("cond_att_lstm"):
        # (batch_size, length, state_size)
        with nn.parameter_scope("lstm"):
            xi = PF.affine(
                xi,
                state_size,
                base_axis=2,
                w_init=w_init,
                b_init=b_init,
                name="Wi")
            xf = PF.affine(
                xf,
                state_size,
                base_axis=2,
                w_init=w_init,
                b_init=forget_bias_init,
                name="Wf")
            xc = PF.affine(
                xc,
                state_size,
                base_axis=2,
                w_init=w_init,
                b_init=b_init,
                name="Wc")
            xo = PF.affine(
                xo,
                state_size,
                base_axis=2,
                w_init=w_init,
                b_init=b_init,
                name="Wo")

        with nn.parameter_scope("context"):
            # context_att_trans: (batch_size, context_size, att_hidden_size)
            context_att_trans = PF.affine(
                context,
                att_hidden_size,
                base_axis=2,
                w_init=w_init,
                b_init=b_init,
                name="layer1_c")

    if initial_state is None:
        h = nn.Variable((batch_size, state_size), need_grad=False)
        h.data.zero()
    else:
        h = initial_state

    if initial_cell is None:
        c = nn.Variable((batch_size, state_size), need_grad=False)
        c.data.zero()
    else:
        c = initial_cell

    if hist is None:
        hist = nn.Variable((batch_size, 1, state_size), need_grad=False)
        hist.data.zero()

    # (batch_size, state_size)
    xi = split(xi, axis=1)
    xf = split(xf, axis=1)
    xc = split(xc, axis=1)
    xo = split(xo, axis=1)
    mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
    mask = F.broadcast(mask, [batch_size, length, state_size])
    # (batch_size, state_size)
    mask = split(mask, axis=1)
    # (batch_size, max_action_length)
    parent_index = parent_index + 1  # index == 0 means that parent is root
    # (batch_size)
    parent_index = split(parent_index, axis=1)

    hs = []
    cs = []
    ctx = []

    for i, f, c2, o, m, p in zip(xi, xf, xc, xo, mask, parent_index):
        h_num = hist.shape[1]
        with nn.parameter_scope("context"):
            h_att_trans = PF.affine(
                h,
                att_hidden_size,
                with_bias=False,
                w_init=w_init,
                name="layer1_h")  # (batch_size, att_hidden_size)
            h_att_trans = F.reshape(h_att_trans,
                                    (batch_size, 1, att_hidden_size))
            h_att_trans = F.broadcast(
                h_att_trans, (batch_size, context_length, att_hidden_size))
            att_hidden = F.tanh(context_att_trans + h_att_trans)
            att_raw = PF.affine(
                att_hidden, 1, base_axis=2, w_init=w_init,
                b_init=b_init)  # (batch_size, context_length, 1)
            att_raw = F.reshape(att_raw, (batch_size, context_length))
            ctx_att = F.exp(att_raw - F.max(att_raw, axis=1, keepdims=True))
            ctx_att = ctx_att * context_mask
            ctx_att = ctx_att / F.sum(ctx_att, axis=1, keepdims=True)
            ctx_att = F.reshape(ctx_att, (batch_size, context_length, 1))
            ctx_att = F.broadcast(ctx_att,
                                  (batch_size, context_length, context_size))
            ctx_vec = F.sum(
                context * ctx_att, axis=1)  # (batch_size, context_size)

        # parent_history
        p = F.reshape(p, (batch_size, 1))
        p = F.one_hot(p, (h_num, ))
        p = F.reshape(p, (batch_size, 1, h_num))
        par_h = F.batch_matmul(p, hist)  # [batch_size, 1, state_size]
        par_h = F.reshape(par_h, (batch_size, state_size))

        with nn.parameter_scope("lstm"):
            i_t = PF.affine(
                z_u[0] * h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Ui")
            i_t += PF.affine(
                ctx_vec,
                state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False,
                name="Ci")
            i_t += PF.affine(
                par_h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Pi")
            i_t = F.sigmoid(i + i_t)
            f_t = PF.affine(
                z_u[1] * h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Uf")
            f_t += PF.affine(
                ctx_vec,
                state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False,
                name="Cf")
            f_t += PF.affine(
                par_h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Pf")
            f_t = F.sigmoid(f + f_t)
            c_t = PF.affine(
                z_u[2] * h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Uc")
            c_t += PF.affine(
                ctx_vec,
                state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False,
                name="Cc")
            c_t += PF.affine(
                par_h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Pc")
            c_t = f_t * c + i_t * F.tanh(c2 + c_t)
            o_t = PF.affine(
                z_u[3] * h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Uo")
            o_t += PF.affine(
                ctx_vec,
                state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False,
                name="Co")
            o_t += PF.affine(
                par_h,
                state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False,
                name="Po")
            o_t = F.sigmoid(o + o_t)
            h_t = o_t * F.tanh(c_t)

            # Apply the step mask m: where m == 0, carry over the previous state.
            h_t = (1 - m) * h + m * h_t
            c_t = (1 - m) * c + m * c_t
            h = h_t
            c = c_t
            h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
            c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
            ctx_vec = F.reshape(
                ctx_vec, (batch_size, 1, context_size), inplace=False)
            hs.append(h_t)
            cs.append(c_t)
            ctx.append(ctx_vec)

            hist = F.concatenate(
                hist, h_t, axis=1)  # (batch_size, h_num + 1, state_size)

    # Stack the per-step hidden states, cell states and context vectors
    # along the time axis.
    return (concatenate(*hs, axis=1), concatenate(*cs, axis=1),
            concatenate(*ctx, axis=1), hist)
Example #29
0
def sample_from_controller(args):
    """
        2-layer RNN(LSTM) based controller which outputs an architecture of CNN, 
        represented as a sequence of integers and its list.
        Given the number of layers, for each layer, 
        it executes 2 types of computation, one for sampling the operation at that layer,
        another for sampling the skip connection patterns.
    """

    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)

    entropys.d = log_probs.d = 0.0  # initialize both accumulators to zero

    num_cells = args.num_cells
    num_nodes = args.num_nodes
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    op_tanh_reduce = args.op_tanh_reduce
    num_branch = args.num_ops

    # both_archs[0] will hold the conv cell, both_archs[1] the reduction cell.
    both_archs = [list(), list()]
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [
        nn.Variable([1, lstm_size], need_grad=True)
        for _ in range(lstm_num_layers)
    ]
    prev_c = [
        nn.Variable([1, lstm_size], need_grad=True)
        for _ in range(lstm_num_layers)
    ]

    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    for ind in range(2):
        # First build the conv cell, then the reduction cell.
        idx_seq = list()
        ops_seq = list()
        for node_id in range(num_nodes):
            if node_id == 0:
                anchors = nn.parameter.get_parameter_or_create("anchors",
                                                               [2, lstm_size],
                                                               initializer,
                                                               need_grad=False)
                anchors_w_1 = nn.parameter.get_parameter_or_create(
                    "anchors_w_1", [2, lstm_size],
                    initializer,
                    need_grad=False)
            else:
                assert anchors.shape[0] == node_id + 2, \
                    "Something wrong with anchors."
                assert anchors_w_1.shape[0] == node_id + 2, \
                    "Something wrong with anchors_w_1."

            # for each node, get the index used as inputs
            for i in range(2):
                # One-step stacked LSTM.
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c,
                                                state_size)
                prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)
                query = anchors_w_1

                with nn.parameter_scope("skip_affine_1"):
                    query = F.tanh(
                        F.add2(
                            query,
                            PF.affine(next_h[-1],
                                      lstm_size,
                                      w_init=initializer,
                                      with_bias=False)))
                    #   (node_id + 2, lstm_size) + (1, lstm_size)
                    # broadcasting occurs here; the resulting shape is (node_id + 2, lstm_size)

                with nn.parameter_scope("skip_affine_2"):
                    # (node_id + 2, 1)
                    logit = PF.affine(query,
                                      1,
                                      w_init=initializer,
                                      with_bias=False)

                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))
                if tanh_constant is not None:
                    logit = F.mul_scalar(F.tanh(logit), tanh_constant)

                index = F.exp(logit)
                index = F.mul_scalar(index, (1 / index.d.sum()))

                # Sampling input indices from multinomial distribution.
                index = np.random.multinomial(
                    1,
                    np.reshape(index.d, (1, index.d.size))[0], 1)
                idx_seq.append(index.nonzero()[1])

                label = nn.Variable.from_numpy_array(
                    index.transpose())  # (node_id + 2, 1)
                log_prob = F.softmax_cross_entropy(logit, label)
                log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)
                taking_ind = int(index.nonzero()[1][0])

                # (1, lstm_size)
                inputs = F.reshape(anchors[taking_ind], (1, anchors.shape[1]))

            # Sample the two operations applied to the selected inputs.
            for j in range(2):
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c,
                                                state_size)
                prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

                # Compute for operation.
                with nn.parameter_scope("ops"):
                    logit = PF.affine(next_h[-1],
                                      num_branch,
                                      w_init=initializer,
                                      with_bias=False)

                # shape of logit : (1, num_branch)
                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))

                if tanh_constant is not None:
                    op_tanh = tanh_constant / op_tanh_reduce
                    logit = F.mul_scalar(F.tanh(logit), op_tanh)

                # Convert logits to probabilities (softmax) on the numpy side.
                normed_logit = np.e**logit.d
                normed_logit = normed_logit / np.sum(normed_logit)

                # Sampling operation id from multinomial distribution.
                branch_id = np.random.multinomial(1, normed_logit[0],
                                                  1).nonzero()[1]
                branch_id = nn.Variable.from_numpy_array(branch_id)
                ops_seq.append(branch_id.d)

                # log policy for operation.
                log_prob = F.softmax_cross_entropy(
                    logit, F.reshape(branch_id, shape=(1, 1)))
                # accumulate log policy as log probs
                log_probs = F.add2(log_probs, log_prob)

                logit = F.transpose(logit, axes=(1, 0))
                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)

                w_emb = nn.parameter.get_parameter_or_create(
                    "w_emb", [num_branch, lstm_size],
                    initializer,
                    need_grad=False)
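                # Feed the embedding of the sampled operation as the next LSTM input.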
                # (1, lstm_size)
                inputs = F.reshape(w_emb[int(branch_id.d)],
                                   (1, w_emb.shape[1]))

                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c,
                                                lstm_size)
                prev_h, prev_c = next_h, next_c

                with nn.parameter_scope("skip_affine_3"):
                    adding_w_1 = PF.affine(next_h[-1],
                                           lstm_size,
                                           w_init=initializer,
                                           with_bias=False)

            # (node_id + 2 + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (node_id + 2 + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)
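            # The new node's hidden state becomes a candidate input for later nodes.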

        for idx, ops in zip(idx_seq, ops_seq):
            both_archs[ind].extend([int(idx), int(ops)])

    return both_archs, log_probs, entropys
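
A note on usage (not part of the original example): the returned both_archs describes the sampled conv and reduction cells, while log_probs and entropys are accumulated over all sampling decisions and would typically feed a REINFORCE-style update of the controller. The sketch below is a minimal, hypothetical illustration of such an update; reward, entropy_weight, and the solver settings are placeholders, not values from the original code. Since log_probs is built from softmax_cross_entropy, it already holds the negative log-likelihood of the sampled decisions, so scaling it by the reward gives a loss to minimize.

import nnabla as nn
import nnabla.functions as F
import nnabla.solvers as S

both_archs, log_probs, entropys = sample_from_controller(args)

reward = 0.5            # e.g. validation accuracy of the sampled child network (placeholder)
entropy_weight = 1e-4   # hypothetical coefficient encouraging exploration

# REINFORCE loss: -reward * log pi, minus an entropy bonus.
loss = F.mul_scalar(log_probs, reward)
loss = F.sub2(loss, F.mul_scalar(entropys, entropy_weight))

solver = S.Adam(alpha=3.5e-4)
# In practice one would restrict this to the controller's parameter scope.
solver.set_parameters(nn.get_parameters())
solver.zero_grad()
loss.forward()
loss.backward()
solver.update()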