Example #1
def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
    """
     Stack the weight matrices for all the gates
     for much cleaner code and slightly faster dot products
    """
    # input weights
    W = numpy.concatenate([
        norm_weight(nin, dim),
        norm_weight(nin, dim),
        norm_weight(nin, dim),
        norm_weight(nin, dim)
    ],
                          axis=1)
    params[_p(prefix, 'W')] = W
    # for the previous hidden activation
    U = numpy.concatenate([
        ortho_weight(dim),
        ortho_weight(dim),
        ortho_weight(dim),
        ortho_weight(dim)
    ],
                          axis=1)
    params[_p(prefix, 'U')] = U
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim, )).astype('float32')

    return params
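These snippets rely on a few helpers that are not shown here (_p, ortho_weight, norm_weight). The following is a minimal sketch consistent with how they are called above; the exact definitions in the original codebase may differ.

import numpy

def _p(pp, name):
    # join a prefix and a parameter name, e.g. _p('lstm', 'W') -> 'lstm_W'
    return '%s_%s' % (pp, name)

def ortho_weight(ndim):
    # square matrix with orthonormal columns
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')

def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    # scaled Gaussian weights; orthogonal init for square matrices when ortho=True
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        W = ortho_weight(nin)
    else:
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')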
Example #2
def param_init_fflayer(options,
                       params,
                       prefix='ff',
                       nin=None,
                       nout=None,
                       ortho=True,
                       flag=False):

    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    if flag:
        #params[_p(prefix, 'b')] = np.full(nout,-1).astype('float32')
        import gzip
        import pickle
        with gzip.open('mnist.pkl.gz', 'rb') as f:
            train_set, _, _ = pickle.load(f)
            train_x, train_y = train_set
            marginals = np.clip(train_x.mean(axis=0), 1e-7, 1 - 1e-7)
            initial_biases = np.log(marginals / (1 - marginals))
            params[_p(prefix, 'b')] = initial_biases.astype('float32')

    else:
        params[_p(prefix, 'b')] = np.zeros((nout, )).astype('float32')

    return params
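The log-odds initialization in the flag branch makes a sigmoid output unit reproduce the data marginals while the weights are still near zero. A tiny numpy check with made-up marginals:

import numpy as np

p = np.array([0.1, 0.5, 0.9])                      # clipped pixel marginals
b = np.log(p / (1 - p))                            # initial biases, as above
assert np.allclose(1.0 / (1.0 + np.exp(-b)), p)    # sigmoid(b) recovers the marginals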
Example #3
def fflayer(tparams,
            state_below,
            options,
            prefix='rconv',
            activ='lambda x: tensor.tanh(x)',
            **kwargs):
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                       tparams[_p(prefix, 'b')])
Example #4
def param_init_attention(options, params, prefix='attention'):
    dim_word = options['dim_word']
    params[_p(prefix, 'Wm')] = norm_weight(dim_word)
    params[_p(prefix, 'b')] = numpy.zeros((dim_word, ), dtype='float32')
    params[_p(prefix, 'W_att')] = norm_weight(dim_word)
    params[_p(prefix, 'U_att')] = norm_weight(dim_word, 1)
    params[_p(prefix, 'c_att')] = numpy.zeros((1, ), dtype='float32')
    return params
Example #5
def lstm_layer(tparams,
               state_below,
               options,
               prefix='lstm',
               mask=None,
               **kwargs):
    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix, 'U')].shape[0]

    # if we are dealing with a mini-batch
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
        init_state = tensor.alloc(0., n_samples, dim)
        init_memory = tensor.alloc(0., n_samples, dim)
    # during sampling
    else:
        n_samples = 1
        init_state = tensor.alloc(0., dim)
        init_memory = tensor.alloc(0., dim)

    # if we have no mask, we assume all the inputs are valid
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        elif _x.ndim == 2:
            return _x[:, n * dim:(n + 1) * dim]
        return _x[n * dim:(n + 1) * dim]

    # one time step of the lstm
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact

    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(
        _step,
        sequences=[mask, state_below],
        outputs_info=[init_state, init_memory, None, None, None, None],
        name=_p(prefix, '_layers'),
        n_steps=nsteps,
        profile=False)
    return rval
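Before a layer like this can be used, the numpy parameter dict has to be wrapped in Theano shared variables. A minimal sketch of the usual pattern; the helper name init_tparams is an assumption and does not appear in these examples:

from collections import OrderedDict
import theano

def init_tparams(params):
    # turn each numpy array into a named Theano shared variable
    tparams = OrderedDict()
    for kk, pp in params.items():
        tparams[kk] = theano.shared(pp, name=kk)
    return tparams

# hypothetical usage, with x a (n_timesteps, n_samples, nin) tensor:
# params = param_init_lstm(options, OrderedDict(), prefix='lstm')
# tparams = init_tparams(params)
# h = lstm_layer(tparams, x, options, prefix='lstm')[0]   # hidden states over time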
Example #6
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')

    return params
Example #7
def param_init_fflayer(options, params, prefix='ff', prefix_bnorm='bnorm',
                       nin=None, nout=None, ortho=True, batch_norm=False):

    if prefix in params:
        print 'this layer is already present'
    else:
        params[_p(prefix, 'W')] = norm_weight(nin, nout)
        params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
Example #8
def bnorm_layer_init(input_shape, params, prefix):
    c1_b = BatchNormalization()
    c1_b.build(input_shape)
    qw = c1_b.get_weights()
    params[_p(prefix, 'gamma')] = qw[0]
    params[_p(prefix, 'beta')] = qw[1]
    params[_p(prefix, 'run_mean')] = qw[2]
    params[_p(prefix, 'run_std')] = qw[3]
    return params
Example #9
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout, )).astype('float32')

    return params
Example #10
def param_init_fflayer(options,
                       params,
                       prefix='ff',
                       nin=None,
                       nout=None,
                       ortho=True,
                       flag=False):
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[_p(prefix, 'b')] = np.zeros((nout, )).astype('float32')
    return params
Example #11
def mlp_attention_layer(tparams, state_below, options, prefix='attention'):
    mean_emb = state_below.mean(1)
    attention_vec = tensor.dot(state_below, tparams[_p(
        prefix, 'W_att')]) + tparams[_p(prefix, 'b')]
    attention_vec += tensor.dot(mean_emb, tparams[_p(prefix, 'Wm')])[:,
                                                                     None, :]
    attention_vec = tanh(attention_vec)
    alpha = tensor.dot(attention_vec, tparams[_p(
        prefix, 'U_att')]) + tparams[_p(prefix, 'c_att')]
    alpha_shp = alpha.shape
    alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0], alpha_shp[1]]))
    output = (state_below * alpha[:, :, None]).sum(1)
    return output
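For reference, state_below here is (n_samples, n_timesteps, dim_word), alpha is a softmax over timesteps, and the output is a weighted average of the timestep embeddings. A small numpy sketch of the final pooling step, with assumed shapes:

import numpy as np

n_samples, n_timesteps, dim_word = 2, 5, 4
state_below = np.random.randn(n_samples, n_timesteps, dim_word)
alpha = np.random.rand(n_samples, n_timesteps)
alpha = alpha / alpha.sum(axis=1, keepdims=True)    # attention weights over timesteps
output = (state_below * alpha[:, :, None]).sum(1)   # (n_samples, dim_word)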
Example #12
def mlp_layer(tparams, state_below, options, prefix='predictor'):
    layer_num = len(options['dims'])
    for i in range(layer_num - 1):
        if i == 0:
            output = tensor.dot(state_below, tparams[_p(prefix, i)])
            output = tanh(output)
        elif i == layer_num - 2:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = rectifier(output)
        else:
            output = tensor.dot(output, tparams[_p(prefix, i)])
            output = tanh(output)
    return output
Example #13
def param_init_convlayer(options,
                         params,
                         prefix='ff',
                         nin=None,
                         nout=None,
                         kernel_len=5,
                         ortho=True,
                         batch_norm=False):
    params[_p(prefix,
              'W')] = 0.01 * np_rng.normal(size=(nout, nin, kernel_len,
                                                 kernel_len)).astype('float32')
    params[_p(prefix, 'b')] = np.zeros(shape=(nout, )).astype('float32')
    return params
Example #14
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')]) + tensor.dot(ct_, tparams[_p(prefix, 'Wct_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
        ctx_ = (context * alpha[:,:,None]).sum(1) # current context
        alpha_sample = alpha # you can return something else reasonable here to debug

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval
Example #15
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None, **kwargs):
    nsteps = state_below.shape[0]
    dim = tparams[_p(prefix,'U')].shape[0]

    # if we are dealing with a mini-batch
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
        init_state = tensor.alloc(0., n_samples, dim)
        init_memory = tensor.alloc(0., n_samples, dim)
    # during sampling
    else:
        n_samples = 1
        init_state = tensor.alloc(0., dim)
        init_memory = tensor.alloc(0., dim)

    # if we have no mask, we assume all the inputs are valid
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # use the slice to calculate all the different gates
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        elif _x.ndim == 2:
            return _x[:, n*dim:(n+1)*dim]
        return _x[n*dim:(n+1)*dim]

    # one time step of the lstm
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact

    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[init_state, init_memory, None, None, None, None],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps, profile=False)
    return rval
Example #16
def fflayer(tparams,
            state_below,
            options,
            index,
            prefix='rconv',
            prefix_bnorm='bnorm',
            activ='lambda x: tensor.tanh(x)',
            batch_norm=False,
            **kwargs):
    preactivation = T.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    if batch_norm:
        preactivation = (preactivation - preactivation.mean(axis=0)) / (0.0001 + preactivation.std(axis=0))
        preactivation = (tparams[_p(prefix_bnorm, 'newmu')][index] + preactivation* tparams[_p(prefix_bnorm, 'newsigma')][index])

    return preactivation
Example #17
def param_init_mlp(options, params, prefix='predictor'):
    dims = options['dims']
    layer_num = len(dims)
    assert layer_num >= 3
    for i in range(layer_num - 1):
        W = norm_weight(dims[i], dims[i + 1])
        params[_p(prefix, i)] = W
    return params
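This initializer pairs with the mlp_layer in Example #12: options['dims'] lists the layer widths and one weight matrix is created per consecutive pair (no biases). A hedged usage sketch; tanh and rectifier in Example #12 are assumed to wrap tensor.tanh and a ReLU:

from collections import OrderedDict

# hypothetical widths: 256 -> 512 -> 512 -> 1 (layer_num = 4, so the assert holds)
options = {'dims': [256, 512, 512, 1]}
params = param_init_mlp(options, OrderedDict(), prefix='predictor')
# tparams = init_tparams(params)                            # see the sketch after Example #5
# y = mlp_layer(tparams, x, options, prefix='predictor')    # x: (n_samples, 256) -> y: (n_samples, 1)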
Example #18
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact
Example #19
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        f = tensor.nnet.sigmoid(_slice(preact, 1, dim))
        o = tensor.nnet.sigmoid(_slice(preact, 2, dim))
        c = tensor.tanh(_slice(preact, 3, dim))

        c = f * c_ + i * c
        h = o * tensor.tanh(c)

        return h, c, i, f, o, preact
Example #20
def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']
    """
     Stack the weight matrices for all the gates
     for much cleaner code and slightly faster dot products
    """
    # input weights
    W = numpy.concatenate([norm_weight(nin,dim),
                           norm_weight(nin,dim),
                           norm_weight(nin,dim),
                           norm_weight(nin,dim)], axis=1)
    params[_p(prefix,'W')] = W
    # for the previous hidden activation
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix,'U')] = U
    params[_p(prefix,'b')] = numpy.zeros((4 * dim,)).astype('float32')

    return params
Example #21
def init_params(options):
    params = OrderedDict()
    if use_conv:
        bn = True
        params = ConvLayer(3, 64, 5, 2, params=params, prefix='conv_1', bn=bn)
        params = ConvLayer(64,
                           128,
                           5,
                           2,
                           params=params,
                           prefix='conv_2',
                           bn=bn)
        params = ConvLayer(128,
                           256,
                           5,
                           2,
                           params=params,
                           prefix='conv_3',
                           bn=bn)
        '''
        params = get_layer('ff')[0](options, params, prefix='layer_1',nin=4*4*256, nout=2048,ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_2',nin=2048, nout=2048,ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_3',nin=2048, nout=2048,ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_4',nin=2048, nout=2048,ortho=False)
        params = get_layer('ff')[0](options, params, prefix='layer_5',nin=2048, nout=4*4*256,ortho=False)
        '''
        '''

        params = param_init_convlayer(options, params, prefix='conv_1', nin=3, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_1', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')
        params[_p('conv_1', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_2', nin=64, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_2', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')
        params[_p('conv_2', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_3', nin=128, nout=256, kernel_len=5, batch_norm=bn)
        params[_p('conv_3', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 256)).astype('float32')
        params[_p('conv_3', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 256)).astype('float32')
        '''

        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_1',
                                    prefix_bnorm='layer_1_step_0',
                                    nin=4 * 4 * 256,
                                    nout=2048,
                                    ortho=False,
                                    batch_norm=True)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_2',
                                    prefix_bnorm='layer_2_step_0',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False,
                                    batch_norm=True)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_3',
                                    prefix_bnorm='layer_3_step_0',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False,
                                    batch_norm=True)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_4',
                                    prefix_bnorm='layer_4_step_0',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False,
                                    batch_norm=True)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_5',
                                    prefix_bnorm='layer_5_step_0',
                                    nin=2048,
                                    nout=4 * 4 * 256,
                                    ortho=False,
                                    batch_norm=True)

        params[_p('layer1_bnorm',
                  'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps,
                                              2048)).astype('float32')
        params[_p('layer1_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')

        params[_p('layer2_bnorm',
                  'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps,
                                              2048)).astype('float32')
        params[_p('layer2_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')

        params[_p('layer3_bnorm',
                  'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps,
                                              2048)).astype('float32')
        params[_p('layer3_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')

        params[_p('layer4_bnorm',
                  'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps,
                                              2048)).astype('float32')
        params[_p('layer4_bnorm', 'newsigma')] = np.ones(
            shape=(args.num_steps * args.meta_steps, 2048)).astype('float32')

        params[_p('layer5_bnorm',
                  'newmu')] = np.zeros(shape=(args.num_steps * args.meta_steps,
                                              4 * 4 * 256)).astype('float32')
        params[_p('layer5_bnorm',
                  'newsigma')] = np.ones(shape=(args.num_steps *
                                                args.meta_steps,
                                                4 * 4 * 256)).astype('float32')

        params = ConvLayer(256,
                           128,
                           5,
                           -2,
                           params=params,
                           prefix='conv_4_mu',
                           bn=bn)
        params = ConvLayer(128,
                           64,
                           5,
                           -2,
                           params=params,
                           prefix='conv_5_mu',
                           bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_mu')

        params = ConvLayer(256,
                           128,
                           5,
                           -2,
                           params=params,
                           prefix='conv_4_s',
                           bn=bn)
        params = ConvLayer(128,
                           64,
                           5,
                           -2,
                           params=params,
                           prefix='conv_5_s',
                           bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_s')
        '''
        params = param_init_convlayer(options, params, prefix='conv_4_mu', nin=256, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_4_mu', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')
        params[_p('conv_4_mu', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_5_mu', nin=128, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_5_mu', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')
        params[_p('conv_5_mu', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_6_mu', nin=64, nout=3, kernel_len=5, batch_norm =False)

        params = param_init_convlayer(options, params, prefix='conv_4_s', nin=256, nout=128, kernel_len=5, batch_norm=bn)
        params[_p('conv_4_s', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')
        params[_p('conv_4_s', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 128)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_5_s', nin=128, nout=64, kernel_len=5, batch_norm=bn)
        params[_p('conv_5_s', 'newmu')] = np.zeros(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')
        params[_p('conv_5_s', 'newsigma')] = np.ones(shape=(args.num_steps *args.meta_steps, 64)).astype('float32')

        params = param_init_convlayer(options, params, prefix='conv_6_s', nin=64, nout=3, kernel_len=5, batch_norm = False)
        '''

    return params
Example #22
def lstm_cond_layer(tparams, state_below, options, prefix='lstm',
                    mask=None, context=None, one_step=False,
                    init_memory=None, init_state=None,
                    trng=None, use_noise=None, sampling=True,
                    argmax=False, **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix,'Wc_att')]) + tparams[_p(prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(prefix,'W_att_%d'%lidx)])+tparams[_p(prefix, 'b_att_%d'%lidx)]
            # note to self: this used to be options['n_layers_att'] - 1, so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training (TODO change to notation of paper)
    # this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature), name='temperature_c')
        h_sampling_mask = trng.binomial((1,), p=semi_sampling_p, n=1, dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            ctx_ = (context * alpha[:,:,None]).sum(1) # current context
            alpha_sample = alpha # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(temperature_c*alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(tensor.eq(tensor.arange(alpha_shp[1])[None,:],
                                               tensor.argmax(alpha,axis=1,keepdims=True)), theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:,:,None]).sum(1) # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')])+tparams[_p(prefix,'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:,None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], 3*dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(use_noise,
                                    trng.binomial((dp_shape[0], dp_shape[1], 3*dim),
                                                  p=0.5, n=1, dtype=state_below.dtype),
                                    tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3*dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: _step(m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state, init_memory, None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory, None, None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [init_state,
                        init_memory,
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, pctx_.shape[1]),
                        tensor.alloc(0., n_samples, context.shape[2])]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None,
                         None,
                         None,
                         None,
                         None,
                         None,
                         None] + [None] # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps, profile=False)
        return rval, updates
Example #23
def param_init_lstm_cond(options,
                         params,
                         prefix='lstm_cond',
                         nin=None,
                         dim=None,
                         dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    # input to LSTM; similar to the above, we stack the matrices for compactness, do one
    # dot product, and use the slice function below to get the activations for each "gate"
    W = numpy.concatenate([
        norm_weight(nin, dim),
        norm_weight(nin, dim),
        norm_weight(nin, dim),
        norm_weight(nin, dim)
    ],
                          axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([
        ortho_weight(dim),
        ortho_weight(dim),
        ortho_weight(dim),
        ortho_weight(dim)
    ],
                          axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim, )).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx, )).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix, 'W_att_%d' % lidx)] = ortho_weight(dimctx)
            params[_p(prefix, 'b_att_%d' % lidx)] = numpy.zeros(
                (dimctx, )).astype('float32')

    # attention:
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1, )).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = numpy.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params
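The parameters created above are exactly what lstm_cond_layer (Example #22) looks up under the same prefix. A hedged wiring sketch, with all shapes and names assumed:

# context: (n_samples, n_annotations, dimctx); emb: (n_timesteps, n_samples, nin)
# params = param_init_lstm_cond(options, params, prefix='decoder', nin=512, dim=1024, dimctx=512)
# tparams = init_tparams(params)
# rvals, updates = lstm_cond_layer(tparams, emb, options, prefix='decoder',
#                                  mask=mask, context=context, one_step=False,
#                                  trng=trng, use_noise=use_noise)
# h_seq, c_seq, alphas = rvals[0], rvals[1], rvals[2]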
Example #24
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix,'Wd_att')])
        pctx_ = pctx_ + pstate_[:,None,:]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix,'U_att')])+tparams[_p(prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            ctx_ = (context * alpha[:,:,None]).sum(1) # current context
            alpha_sample = alpha # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(temperature_c*alpha.reshape([alpha_shp[0],alpha_shp[1]])) # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(tensor.eq(tensor.arange(alpha_shp[1])[None,:],
                                               tensor.argmax(alpha,axis=1,keepdims=True)), theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:,:,None]).sum(1) # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(tensor.dot(h_, tparams[_p(prefix, 'W_sel')])+tparams[_p(prefix,'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:,None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:,None] * c + (1. - m_)[:,None] * c_

        h = o * tensor.tanh(c)
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre]+pctx_list
        return rval
Example #25
def lstm_cond_layer(tparams,
                    state_below,
                    options,
                    prefix='lstm',
                    mask=None,
                    context=None,
                    one_step=False,
                    init_memory=None,
                    init_state=None,
                    trng=None,
                    use_noise=None,
                    sampling=True,
                    argmax=False,
                    **kwargs):

    assert context, 'Context must be provided'

    if one_step:
        assert init_memory, 'previous memory must be provided'
        assert init_state, 'previous state must be provided'

    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    # mask
    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # infer lstm dimension
    dim = tparams[_p(prefix, 'U')].shape[0]

    # initial/previous state
    if init_state is None:
        init_state = tensor.alloc(0., n_samples, dim)
    # initial/previous memory
    if init_memory is None:
        init_memory = tensor.alloc(0., n_samples, dim)

    # projected context
    pctx_ = tensor.dot(context, tparams[_p(prefix, 'Wc_att')]) + tparams[_p(
        prefix, 'b_att')]
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            pctx_ = tensor.dot(pctx_, tparams[_p(
                prefix, 'W_att_%d' % lidx)]) + tparams[_p(
                    prefix, 'b_att_%d' % lidx)]
            # note to self: this used to be options['n_layers_att'] - 1, so no extra non-linearity if n_layers_att < 3
            if lidx < options['n_layers_att']:
                pctx_ = tanh(pctx_)

    # projected x
    # state_below is timesteps*num samples by d in training (TODO change to notation of paper)
    # this is n * d during sampling
    state_below = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # additional parameters for stochastic hard attention
    if options['attn_type'] == 'stochastic':
        # temperature for softmax
        temperature = options.get("temperature", 1)
        # [see (Section 4.1): Stochastic "Hard" Attention]
        semi_sampling_p = options.get("semi_sampling_p", 0.5)
        temperature_c = theano.shared(numpy.float32(temperature),
                                      name='temperature_c')
        h_sampling_mask = trng.binomial((1, ),
                                        p=semi_sampling_p,
                                        n=1,
                                        dtype=theano.config.floatX).sum()

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(
            prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c *
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(
                            tensor.arange(alpha_shp[1])[None, :],
                            tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(
                1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval

    if options['use_dropout_lstm']:
        if options['selector']:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, sel_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        else:
            _step0 = lambda m_, x_, dp_, h_, c_, a_, as_, ct_, pctx_: \
                            _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_)
        dp_shape = state_below.shape
        if one_step:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], 3 * dim),
                              p=0.5,
                              n=1,
                              dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], 3 * dim))
        else:
            dp_mask = tensor.switch(
                use_noise,
                trng.binomial((dp_shape[0], dp_shape[1], 3 * dim),
                              p=0.5,
                              n=1,
                              dtype=state_below.dtype),
                tensor.alloc(0.5, dp_shape[0], dp_shape[1], 3 * dim))
    else:
        if options['selector']:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, sel_, pctx_: _step(
                m_, x_, h_, c_, a_, as_, ct_, pctx_)
        else:
            _step0 = lambda m_, x_, h_, c_, a_, as_, ct_, pctx_: _step(
                m_, x_, h_, c_, a_, as_, ct_, pctx_)

    if one_step:
        if options['use_dropout_lstm']:
            if options['selector']:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, dp_mask, init_state,
                              init_memory, None, None, None, pctx_)
        else:
            if options['selector']:
                rval = _step0(mask, state_below, init_state, init_memory, None,
                              None, None, None, pctx_)
            else:
                rval = _step0(mask, state_below, init_state, init_memory, None,
                              None, None, pctx_)
        return rval
    else:
        seqs = [mask, state_below]
        if options['use_dropout_lstm']:
            seqs += [dp_mask]
        outputs_info = [
            init_state, init_memory,
            tensor.alloc(0., n_samples, pctx_.shape[1]),
            tensor.alloc(0., n_samples, pctx_.shape[1]),
            tensor.alloc(0., n_samples, context.shape[2])
        ]
        if options['selector']:
            outputs_info += [tensor.alloc(0., n_samples)]
        outputs_info += [None, None, None, None, None, None, None
                         ] + [None]  # *options['n_layers_att']
        rval, updates = theano.scan(_step0,
                                    sequences=seqs,
                                    outputs_info=outputs_info,
                                    non_sequences=[pctx_],
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=False)
        return rval, updates
Example #26
def init_params(options):

    params = OrderedDict()

    if not use_conv:

        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_1',
                                    nin=INPUT_SIZE,
                                    nout=args.dims[0],
                                    ortho=False)

        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_2',
                                    nin=args.dims[0],
                                    nout=args.dims[0],
                                    ortho=False)
    dilated_conv = False
    if dilated_conv:
        bn = True
        filter_size = 5

        c1 = AtrousConvolution2D(128,
                                 filter_size,
                                 filter_size,
                                 atrous_rate=(1, 1),
                                 border_mode='same')
        c1.build((100, 3, 32, 32))
        qw = c1.get_weights()
        params[_p('c1', 'w')] = qw[0]
        params[_p('c1', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c1_bn')

        c2 = AtrousConvolution2D(128,
                                 filter_size,
                                 filter_size,
                                 atrous_rate=(2, 2),
                                 border_mode='same')
        c2.build((100, 3, 32, 128))
        qw = c2.get_weights()
        params[_p('c2', 'w')] = qw[0]
        params[_p('c2', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c2_bn')

        c3 = AtrousConvolution2D(128,
                                 filter_size,
                                 filter_size,
                                 atrous_rate=(4, 4),
                                 border_mode='same')
        c3.build((100, 3, 32, 128))
        qw = c3.get_weights()
        params[_p('c3', 'w')] = qw[0]
        params[_p('c3', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c3_bn')

        c4_mu = AtrousConvolution2D(128,
                                    filter_size,
                                    filter_size,
                                    atrous_rate=(4, 4),
                                    border_mode='same')
        c4_mu.build((100, 3, 32, 128))
        qw = c4_mu.get_weights()
        params[_p('c4_mu', 'w')] = qw[0]
        params[_p('c4_mu', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c4_mu_bn')

        c5_mu = AtrousConvolution2D(128,
                                    filter_size,
                                    filter_size,
                                    atrous_rate=(2, 2),
                                    border_mode='same')
        c5_mu.build((100, 3, 32, 128))
        qw = c5_mu.get_weights()
        params[_p('c5_mu', 'w')] = qw[0]
        params[_p('c5_mu', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c5_mu_bn')

        c6_mu = AtrousConvolution2D(32,
                                    filter_size,
                                    filter_size,
                                    atrous_rate=(1, 1),
                                    border_mode='same')
        c6_mu.build((100, 3, 32, 128))
        qw = c6_mu.get_weights()
        params[_p('c6_mu', 'w')] = qw[0]
        params[_p('c6_mu', 'b')] = qw[1]

        c4_s = AtrousConvolution2D(128,
                                   filter_size,
                                   filter_size,
                                   atrous_rate=(4, 4),
                                   border_mode='same')
        c4_s.build((100, 3, 32, 128))
        qw = c4_s.get_weights()
        params[_p('c4_s', 'w')] = qw[0]
        params[_p('c4_s', 'b')] = qw[1]
        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c4_s_bn')

        c5_s = AtrousConvolution2D(128,
                                   filter_size,
                                   filter_size,
                                   atrous_rate=(2, 2),
                                   border_mode='same')
        c5_s.build((100, 3, 32, 128))
        qw = c5_s.get_weights()
        params[_p('c5_s', 'w')] = qw[0]
        params[_p('c5_s', 'b')] = qw[1]

        if bn:
            params = bnorm_layer_init((100, 3, 32, 128), params, 'c5_s_bn')

        c6_s = AtrousConvolution2D(32,
                                   filter_size,
                                   filter_size,
                                   atrous_rate=(1, 1),
                                   border_mode='same')
        c6_s.build((100, 3, 32, 128))
        qw = c6_s.get_weights()
        params[_p('c6_s', 'w')] = qw[0]
        params[_p('c6_s', 'b')] = qw[1]

    if use_conv:

        bn = True
        params = ConvLayer(3, 64, 5, 2, params=params, prefix='conv_1', bn=bn)
        params = ConvLayer(64,
                           128,
                           5,
                           2,
                           params=params,
                           prefix='conv_2',
                           bn=bn)
        params = ConvLayer(128,
                           256,
                           5,
                           2,
                           params=params,
                           prefix='conv_3',
                           bn=bn)

        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_1',
                                    nin=4 * 4 * 256,
                                    nout=2048,
                                    ortho=False)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_2',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_3',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_4',
                                    nin=2048,
                                    nout=2048,
                                    ortho=False)
        params = get_layer('ff')[0](options,
                                    params,
                                    prefix='layer_5',
                                    nin=2048,
                                    nout=4 * 4 * 256,
                                    ortho=False)

        params = ConvLayer(256,
                           128,
                           5,
                           -2,
                           params=params,
                           prefix='conv_4_mu',
                           bn=bn)
        params = ConvLayer(128,
                           64,
                           5,
                           -2,
                           params=params,
                           prefix='conv_5_mu',
                           bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_mu')

        params = ConvLayer(256,
                           128,
                           5,
                           -2,
                           params=params,
                           prefix='conv_4_s',
                           bn=bn)
        params = ConvLayer(128,
                           64,
                           5,
                           -2,
                           params=params,
                           prefix='conv_5_s',
                           bn=bn)
        params = ConvLayer(64, 3, 5, -2, params=params, prefix='conv_6_s')

    else:
        #TODO: Ideally, only in the output layer, flag=True should be set.
        if len(args.dims) == 1:
            params = get_layer('ff')[0](options,
                                        params,
                                        prefix='mu_0',
                                        nin=args.dims[0],
                                        nout=INPUT_SIZE,
                                        ortho=False,
                                        flag=True)
            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options,
                                            params,
                                            prefix='sigma_0',
                                            nin=args.dims[0],
                                            nout=INPUT_SIZE,
                                            ortho=False)

        for i in range(len(args.dims) - 1):
            params = get_layer('ff')[0](options,
                                        params,
                                        prefix='mu_' + str(i),
                                        nin=args.dims[i],
                                        nout=args.dims[i + 1],
                                        ortho=False)
            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options,
                                            params,
                                            prefix='sigma_' + str(i),
                                            nin=args.dims[i],
                                            nout=args.dims[i + 1],
                                            ortho=False,
                                            flag=True)

        if len(args.dims) > 1:
            params = get_layer('ff')[0](options,
                                        params,
                                        prefix='mu_' + str(i + 1),
                                        nin=args.dims[i + 1],
                                        nout=INPUT_SIZE,
                                        ortho=False,
                                        flag=True)

            if args.noise == 'gaussian':
                params = get_layer('ff')[0](options,
                                            params,
                                            prefix='sigma_' + str(i + 1),
                                            nin=args.dims[i + 1],
                                            nout=INPUT_SIZE,
                                            ortho=False)
    return params
Example #27
def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs):
    return eval(activ)(tensor.dot(state_below, tparams[_p(prefix,'W')])+tparams[_p(prefix,'b')])
Example #28
    def _step(m_, x_, h_, c_, a_, as_, ct_, pctx_, dp_=None, dp_att_=None):
        """ Each variable is one time slice of the LSTM
        m_ - (mask), x_- (previous word), h_- (hidden state), c_- (lstm memory),
        a_ - (alpha distribution [eq (5)]), as_- (sample from alpha dist), ct_- (context),
        pctx_ (projected context), dp_/dp_att_ (dropout masks)
        """
        # attention computation
        # [described in equations (4), (5), (6) in
        # section "3.1.2 Decoder: Long Short Term Memory Network"]
        pstate_ = tensor.dot(h_, tparams[_p(prefix, 'Wd_att')])
        pctx_ = pctx_ + pstate_[:, None, :]
        pctx_list = []
        pctx_list.append(pctx_)
        pctx_ = tanh(pctx_)
        alpha = tensor.dot(pctx_, tparams[_p(prefix, 'U_att')]) + tparams[_p(
            prefix, 'c_tt')]
        alpha_pre = alpha
        alpha_shp = alpha.shape

        if options['attn_type'] == 'deterministic':
            alpha = tensor.nnet.softmax(
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            ctx_ = (context * alpha[:, :, None]).sum(1)  # current context
            alpha_sample = alpha  # you can return something else reasonable here to debug
        else:
            alpha = tensor.nnet.softmax(
                temperature_c *
                alpha.reshape([alpha_shp[0], alpha_shp[1]]))  # softmax
            # TODO return alpha_sample
            if sampling:
                alpha_sample = h_sampling_mask * trng.multinomial(pvals=alpha,dtype=theano.config.floatX)\
                               + (1.-h_sampling_mask) * alpha
            else:
                if argmax:
                    alpha_sample = tensor.cast(
                        tensor.eq(
                            tensor.arange(alpha_shp[1])[None, :],
                            tensor.argmax(alpha, axis=1, keepdims=True)),
                        theano.config.floatX)
                else:
                    alpha_sample = alpha
            ctx_ = (context * alpha_sample[:, :, None]).sum(
                1)  # current context

        if options['selector']:
            sel_ = tensor.nnet.sigmoid(
                tensor.dot(h_, tparams[_p(prefix, 'W_sel')]) +
                tparams[_p(prefix, 'b_sel')])
            sel_ = sel_.reshape([sel_.shape[0]])
            ctx_ = sel_[:, None] * ctx_

        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tensor.dot(ctx_, tparams[_p(prefix, 'Wc')])

        # Recover the activations to the lstm gates
        # [equation (1)]
        i = _slice(preact, 0, dim)
        f = _slice(preact, 1, dim)
        o = _slice(preact, 2, dim)
        if options['use_dropout_lstm']:
            i = i * _slice(dp_, 0, dim)
            f = f * _slice(dp_, 1, dim)
            o = o * _slice(dp_, 2, dim)
        i = tensor.nnet.sigmoid(i)
        f = tensor.nnet.sigmoid(f)
        o = tensor.nnet.sigmoid(o)
        c = tensor.tanh(_slice(preact, 3, dim))

        # compute the new memory/hidden state
        # if the mask is 0, just copy the previous state
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        rval = [h, c, alpha, alpha_sample, ctx_]
        if options['selector']:
            rval += [sel_]
        rval += [pstate_, pctx_, i, f, o, preact, alpha_pre] + pctx_list
        return rval
Example #29
def param_init_lstm_cond(options, params, prefix='lstm_cond', nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    # input to LSTM; similar to the above, we stack the matrices for compactness, do one
    # dot product, and use the slice function below to get the activations for each "gate"
    W = numpy.concatenate([norm_weight(nin,dim),
                           norm_weight(nin,dim),
                           norm_weight(nin,dim),
                           norm_weight(nin,dim)], axis=1)
    params[_p(prefix,'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix,'U')] = U

    # bias to LSTM
    params[_p(prefix,'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx,dim*4)
    params[_p(prefix,'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix,'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim,dimctx)
    params[_p(prefix,'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix,'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix,'W_att_%d'%lidx)] = ortho_weight(dimctx)
            params[_p(prefix,'b_att_%d'%lidx)] = numpy.zeros((dimctx,)).astype('float32')

    # attention:
    U_att = norm_weight(dimctx,1)
    params[_p(prefix,'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = numpy.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params