Example #1
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : theano shared variable
        If set, add noise (dropout) during sampling
    trng : theano random number generator
    sampling : boolean
        [If true, and stochastic attention is used, follow the learning
        rule described in Section 4, bottom left of page 5]
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx, options,
                                       prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams,
                                       ctx[::-1, :],
                                       options,
                                       prefix='encoder_rev')[0][::-1, :]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams,
                                      ctx_mean,
                                      options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)
    init_state = [
        get_layer('ff')[1](tparams,
                           ctx_mean,
                           options,
                           prefix='ff_state',
                           activ='tanh')
    ]
    init_memory = [
        get_layer('ff')[1](tparams,
                           ctx_mean,
                           options,
                           prefix='ff_memory',
                           activ='tanh')
    ]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(
                get_layer('ff')[1](tparams,
                                   ctx_mean,
                                   options,
                                   prefix='ff_state_%d' % lidx,
                                   activ='tanh'))
            init_memory.append(
                get_layer('ff')[1](tparams,
                                   ctx_mean,
                                   options,
                                   prefix='ff_memory_%d' % lidx,
                                   activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx] + init_state + init_memory,
                             name='f_init',
                             profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams,
                                     emb,
                                     options,
                                     prefix='decoder',
                                     mask=None,
                                     context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)

    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams,
                                             proj_h,
                                             options,
                                             prefix='decoder_%d' % lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs[-1],
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx] + init_state + init_memory,
                             [next_probs, next_sample] + next_state +
                             next_memory,
                             name='f_next',
                             profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
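
A minimal sketch of how the two compiled functions above might drive greedy
decoding, assuming options['n_layers_lstm'] == 1 so each state/memory list
holds a single matrix; greedy_sample, annotations, maxlen, and the <eos>
index 0 are illustrative assumptions, not part of the code above.

import numpy

def greedy_sample(f_init, f_next, annotations, maxlen=30):
    # f_init maps the raw annotations to the (possibly encoded) context
    # plus the initial LSTM state and memory
    rval = f_init(annotations)
    ctx0, state, memory = rval[0], rval[1], rval[2]
    sample = []
    # the first word is coded as -1 so that its embedding is all zeros
    word = -1 * numpy.ones((1,)).astype('int64')
    for _ in range(maxlen):
        probs, word, state, memory = f_next(word, ctx0, state, memory)
        if word[0] == 0:  # assuming index 0 is <eos>
            break
        sample.append(word[0])
    return sample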
Example #2
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described in Section 3.1.2 onwards.
    The convolutional features are precomputed; some extra features
    which were not used in the paper are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If true, and stochastic attention is used, follow the learning
        rule described in Section 4, bottom left of page 5]
    Returns
    -------
    trng: theano random number generator
        Used for dropout, stochastic attention, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs : dict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape(
        [n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams,
                                       ctx.dimshuffle(1, 0, 2),
                                       options,
                                       prefix='encoder')[0].dimshuffle(
                                           1, 0, 2)
        ctx_rev = get_layer('lstm')[1](
            tparams,
            ctx.dimshuffle(1, 0, 2)[:, ::-1, :],
            options,
            prefix='encoder_rev')[0][:, ::-1, :].dimshuffle(1, 0, 2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams,
                                      ctx_mean,
                                      options,
                                      prefix='ff_init_%d' % lidx,
                                      activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams,
                                    ctx_mean,
                                    options,
                                    prefix='ff_state',
                                    activ='tanh')
    init_memory = get_layer('ff')[1](tparams,
                                     ctx_mean,
                                     options,
                                     prefix='ff_memory',
                                     activ='tanh')
    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams,
                                              emb,
                                              options,
                                              prefix='decoder',
                                              mask=mask,
                                              context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]
    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams,
                                            ctx_mean,
                                            options,
                                            prefix='ff_state_%d' % lidx,
                                            activ='tanh')
            init_memory = get_layer('ff')[1](tparams,
                                             ctx_mean,
                                             options,
                                             prefix='ff_memory_%d' % lidx,
                                             activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams,
                                                      proj_h,
                                                      options,
                                                      prefix='decoder_%d' %
                                                      lidx,
                                                      mask=mask,
                                                      context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs,
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] +
                              x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost  # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates  # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, \
        opt_outs
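
The flat-indexing trick used above for the negative log likelihood can be
checked in plain numpy; the numbers below are made up for illustration.

import numpy

probs = numpy.array([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1]])  # (#words*#samples) x vocab
x_flat = numpy.array([0, 1])            # gold word index per row
p_flat = probs.flatten()
# stepping through the flattened probs by the row stride picks out
# probs[i, x_flat[i]] for every position, exactly as in the graph above
nll = -numpy.log(p_flat[numpy.arange(2) * probs.shape[1] + x_flat] + 1e-8)
# nll is approximately [-log 0.7, -log 0.8]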
Example #3
def build_sampler(tparams, options, use_noise, trng, sampling=True):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : theano shared variable
        If set, add noise (dropout) during sampling
    trng : theano random number generator
    sampling : boolean
        [If true, and stochastic attention is used, follow the learning
        rule described in Section 4, bottom left of page 5]
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx,
                                       options, prefix='encoder')[0]
        ctx_rev = get_layer('lstm')[1](tparams, ctx[::-1,:],
                                       options, prefix='encoder_rev')[0][::-1,:]
        ctx = tensor.concatenate((ctx_fwd, ctx_rev), axis=1)

    # initial state/cell
    ctx_mean = ctx.mean(0)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d'%lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)
    init_state = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')]
    init_memory = [get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state_%d'%lidx, activ='tanh'))
            init_memory.append(get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory_%d'%lidx, activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx], [ctx]+init_state+init_memory, name='f_init', profile=False, allow_input_downcast=True)
    print 'Done'

    # build f_next
    ctx = tensor.matrix('ctx_sampler', dtype='float32')
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = [tensor.matrix('init_state', dtype='float32')]
    init_memory = [tensor.matrix('init_memory', dtype='float32')]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state.append(tensor.matrix('init_state', dtype='float32'))
            init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:,None] < 0, tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    proj = get_layer('lstm_cond')[1](tparams, emb, options,
                                     prefix='decoder',
                                     mask=None, context=ctx,
                                     one_step=True,
                                     init_state=init_state[0],
                                     init_memory=init_memory[0],
                                     trng=trng,
                                     use_noise=use_noise,
                                     sampling=sampling)

    next_state, next_memory, ctxs = [proj[0]], [proj[1]], [proj[4]]
    proj_h = proj[0]
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            proj = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                             prefix='decoder_%d'%lidx,
                                             context=ctx,
                                             one_step=True,
                                             init_state=init_state[lidx],
                                             init_memory=init_memory[lidx],
                                             trng=trng,
                                             use_noise=use_noise,
                                             sampling=sampling)
            next_state.append(proj[0])
            next_memory.append(proj[1])
            ctxs.append(proj[4])
            proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)
    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs[-1], options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit_h%d'%lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx]+init_state+init_memory, [next_probs, next_sample]+next_state+next_memory, name='f_next', profile=False, allow_input_downcast=True)

    return f_init, f_next
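
The tensor.switch pattern that zeroes out the -1 start token can be seen in
isolation below; Wemb here is a stand-in shared variable, not the trained
embedding matrix.

import numpy
import theano
import theano.tensor as tensor

x = tensor.vector('x', dtype='int64')
Wemb = theano.shared(numpy.random.randn(5, 4).astype('float32'))
# rows of Wemb for non-negative indices, an all-zero row for -1
emb = tensor.switch(x[:, None] < 0,
                    tensor.alloc(0., 1, Wemb.shape[1]),
                    Wemb[x])
f = theano.function([x], emb, allow_input_downcast=True)
# f([-1]) -> one all-zero row; f([2]) -> row 2 of Wemb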
Example #4
def build_model(tparams, options, sampling=True):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds the model described in Section 3.1.2 onwards.
    The convolutional features are precomputed; some extra features
    which were not used in the paper are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    sampling : boolean
        [If true, and stochastic attention is used, follow the learning
        rule described in Section 4, bottom left of page 5]
    Returns
    -------
    trng: theano random number generator
        Used for dropout, stochastic attention, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx]: theano variables
        Represent the captions, binary mask, and annotations
        for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights
    alpha_sample: theano variable
        Sampled attention weights used in REINFORCE for stochastic
        attention: [see the learning rule in eq (12)]
    cost: theano variable
        negative log likelihood
    opt_outs : dict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #annotations x dim
    ctx = tensor.tensor3('ctx', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # index into the word embedding matrix, shift it forward in time
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps, n_samples, options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    if options['lstm_encoder']:
        # encoder
        ctx_fwd = get_layer('lstm')[1](tparams, ctx.dimshuffle(1,0,2),
                                       options, prefix='encoder')[0].dimshuffle(1,0,2)
        ctx_rev = get_layer('lstm')[1](tparams, ctx.dimshuffle(1,0,2)[:,::-1,:],
                                       options, prefix='encoder_rev')[0][:,::-1,:].dimshuffle(1,0,2)
        ctx0 = tensor.concatenate((ctx_fwd, ctx_rev), axis=2)
    else:
        ctx0 = ctx

    # initial state/cell [top right on page 4]
    ctx_mean = ctx0.mean(1)
    for lidx in xrange(1, options['n_layers_init']):
        ctx_mean = get_layer('ff')[1](tparams, ctx_mean, options,
                                      prefix='ff_init_%d'%lidx, activ='rectifier')
        if options['use_dropout']:
            ctx_mean = dropout_layer(ctx_mean, use_noise, trng)

    init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state', activ='tanh')
    init_memory = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory', activ='tanh')
    # lstm decoder
    # [equation (1), (2), (3) in section 3.1.2]
    attn_updates = []
    proj, updates = get_layer('lstm_cond')[1](tparams, emb, options,
                                              prefix='decoder',
                                              mask=mask, context=ctx0,
                                              one_step=False,
                                              init_state=init_state,
                                              init_memory=init_memory,
                                              trng=trng,
                                              use_noise=use_noise,
                                              sampling=sampling)
    attn_updates += updates
    proj_h = proj[0]
    # optional deep attention
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            init_state = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_state_%d'%lidx, activ='tanh')
            init_memory = get_layer('ff')[1](tparams, ctx_mean, options, prefix='ff_memory_%d'%lidx, activ='tanh')
            proj, updates = get_layer('lstm_cond')[1](tparams, proj_h, options,
                                                      prefix='decoder_%d'%lidx,
                                                      mask=mask, context=ctx0,
                                                      one_step=False,
                                                      init_state=init_state,
                                                      init_memory=init_memory,
                                                      trng=trng,
                                                      use_noise=use_noise,
                                                      sampling=sampling)
            attn_updates += updates
            proj_h = proj[0]

    alphas = proj[2]
    alpha_sample = proj[3]
    ctxs = proj[4]

    # [beta value explained in note 4.2.1 "doubly stochastic attention"]
    if options['selector']:
        sels = proj[5]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams, proj_h, options, prefix='ff_logit_lstm', activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams, ctxs, options, prefix='ff_logit_ctx', activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit_h%d'%lidx, activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit', activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0])*probs.shape[1]+x_flat]+1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    cost = (masked_cost).sum(0)

    # optional outputs
    opt_outs = dict()
    if options['selector']:
        opt_outs['selector'] = sels
    if options['attn_type'] == 'stochastic':
        opt_outs['masked_cost'] = masked_cost # need this for reinforce later
        opt_outs['attn_updates'] = attn_updates # this is to update the rng

    return trng, use_noise, [x, mask, ctx], alphas, alpha_sample, cost, opt_outs
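
One way the returned graph might be wired into a training step; the plain
SGD rule and lrate value below are placeholders, not the optimizer actually
used with this code.

trng, use_noise, inps, alphas, alpha_sample, cost, opt_outs = \
    build_model(tparams, options)
grads = tensor.grad(cost.mean(), wrt=list(tparams.values()))
lrate = 0.01  # placeholder learning rate
updates = [(p, p - lrate * g) for p, g in zip(tparams.values(), grads)]
if options['attn_type'] == 'stochastic':
    # keep the attention sampler's rng state advancing during training
    updates += opt_outs['attn_updates']
f_train = theano.function(inps, cost.mean(), updates=updates,
                          allow_input_downcast=True)
use_noise.set_value(1.)  # turn dropout noise on for training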
def build_sampler(tparams, options, use_noise, trng):
    """ Builds a sampler used for generating from the model
    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    use_noise : theano shared variable
        If set, add noise (dropout) during sampling
    trng : theano random number generator
    Returns
    -------
    f_init : theano function
        Input: annotation, Output: initial lstm state and memory
        (also performs transformation on ctx0 if using lstm_encoder)
    f_next: theano function
        Takes the previous word/state/memory + ctx0 and runs one
        step through the lstm (used for beam search)
    """
    # context: #annotations x dim (with_glove) or an int32 vector of
    # visual-word indices
    if options['with_glove']:
        ctx = tensor.matrix('ctx_sampler', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.vector('ctx_sampler', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    if options['lstm_encoder']:
        ctx0, _ = get_layer('lstm_cond_nox')[1](tparams,
                                                options,
                                                prefix='encoder',
                                                context=new_ctx)
    else:
        ctx0 = new_ctx
    # initial state/cell
    cnn_features = tensor.vector('x_feats', dtype='float32')
    init_state, init_memory = [], []
    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        init_state.append(
            get_layer('ff')[1](tparams,
                               cnn_features,
                               options,
                               prefix=init_state_prefix,
                               activ='tanh'))
        init_memory.append(
            get_layer('ff')[1](tparams,
                               cnn_features,
                               options,
                               prefix=init_memory_prefix,
                               activ='tanh'))

    print 'Building f_init...',
    f_init = theano.function([ctx, cnn_features],
                             [ctx0] + init_state + init_memory,
                             name='f_init',
                             profile=False,
                             allow_input_downcast=True)
    print 'Done'

    # build f_next
    x = tensor.vector('x_sampler', dtype='int64')
    init_state = []
    init_memory = []
    for lidx in range(options['n_layers_lstm']):
        init_state.append(tensor.matrix('init_state', dtype='float32'))
        init_memory.append(tensor.matrix('init_memory', dtype='float32'))

    # for the first word (which is coded with -1), emb should be all zero
    emb = tensor.switch(x[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][x])

    next_state, next_memory, ctxs = [], [], []
    for lidx in range(options['n_layers_lstm']):
        decoder_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        inps = proj_h if lidx > 0 else emb
        proj = get_layer('lstm_cond')[1](tparams,
                                         inps,
                                         options,
                                         prefix=decoder_prefix,
                                         context=ctx0,
                                         one_step=True,
                                         init_state=init_state[lidx],
                                         init_memory=init_memory[lidx],
                                         trng=trng,
                                         use_noise=use_noise)
        next_state.append(proj[0])
        next_memory.append(proj[1])
        ctxs.append(proj[4])
        next_alpha = proj[2]
        proj_h = proj[0]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs[-1],
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    f_next = theano.function([x, ctx0] + init_state + init_memory,
                             [next_probs, next_sample, next_alpha] +
                             next_state + next_memory,
                             name='f_next',
                             profile=False,
                             allow_input_downcast=True)

    return f_init, f_next
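
Driving this variant differs from the first sampler in two ways: f_init
also takes the fc7 features, and f_next additionally returns the attention
weights. A one-layer greedy sketch; visual_words, fc7_feats, and maxlen are
illustrative inputs, and index 0 is assumed to be <eos>.

rval = f_init(visual_words, fc7_feats)
ctx0, state, memory = rval[0], rval[1], rval[2]
word = -1 * numpy.ones((1,)).astype('int64')
caption, att_weights = [], []
for _ in range(maxlen):
    # assuming options['n_layers_lstm'] == 1: five outputs per step
    probs, word, alpha, state, memory = f_next(word, ctx0, state, memory)
    if word[0] == 0:
        break
    caption.append(word[0])
    att_weights.append(alpha)  # one attention map per emitted word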
def build_model(tparams, options):
    """ Builds the entire computational graph used for training

    Basically does a forward pass through the data and calculates the cost function

    [This function builds a model described in Section 3.1.2 onwards
    as the convolutional feature are precomputed, some extra features
    which were not used are also implemented here.]

    Parameters
    ----------
    tparams : OrderedDict
        maps names of variables to theano shared variables
    options : dict
        big dictionary with all the settings and hyperparameters
    Returns
    -------
    trng: theano random number generator
        Used for dropout, etc
    use_noise: theano shared variable
        flag that toggles noise on and off
    [x, mask, ctx, cnn_features]: theano variables
        Represent the captions, binary mask, annotations, and fc7
        features for a single batch (see dimensions below)
    alphas: theano variables
        Attention weights ([alphas, encoder_alphas] when lstm_encoder
        is set, [alphas] otherwise)
    cost: theano variable
        negative log likelihood
    opt_outs : dict
        extra outputs required depending on configuration in options
    """
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples,
    x = tensor.matrix('x', dtype='int64')
    # mask: #words x #samples
    mask = tensor.matrix('mask', dtype='float32')
    # context: #samples x #visual_words x dim
    if options['with_glove']:
        ctx = tensor.tensor3('ctx', dtype='float32')
        new_ctx = ctx
    else:
        ctx = tensor.matrix('ctx', dtype='int32')
        new_ctx = tparams['VCemb'][ctx]
    # fc7 features: #samples x dim
    cnn_features = tensor.matrix('cnn_feats', dtype='float32')

    # index into the word embedding matrix, shift it forward in time, the first element is zero
    # Time step x S x D
    emb = tparams['Wemb'][x.flatten()].reshape(
        [x.shape[0], x.shape[1], options['dim_word']])
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted

    # forward-backward lstm encoder
    if options['lstm_encoder']:
        rval, encoder_alphas = get_layer('lstm_cond_nox')[1](tparams,
                                                             options,
                                                             prefix='encoder',
                                                             context=new_ctx)
        ctx0 = rval.dimshuffle(1, 0, 2)
    else:
        ctx0 = new_ctx

    for lidx in range(options['n_layers_lstm']):
        init_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        init_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        lstm_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        lstm_inps = proj_h if lidx > 0 else emb
        init_state = get_layer('ff')[1](tparams,
                                        cnn_features,
                                        options,
                                        prefix=init_state_prefix,
                                        activ='tanh')
        init_memory = get_layer('ff')[1](tparams,
                                         cnn_features,
                                         options,
                                         prefix=init_memory_prefix,
                                         activ='tanh')
        attn_updates = []
        proj, updates = get_layer('lstm_cond')[1](tparams,
                                                  lstm_inps,
                                                  options,
                                                  prefix=lstm_prefix,
                                                  mask=mask,
                                                  context=ctx0,
                                                  one_step=False,
                                                  init_state=init_state,
                                                  init_memory=init_memory,
                                                  trng=trng,
                                                  use_noise=use_noise)
        attn_updates += updates
        proj_h = proj[0]

    alphas = proj[2]
    ctxs = proj[4]

    if options['use_dropout']:
        proj_h = dropout_layer(proj_h, use_noise, trng)

    # compute word probabilities
    # [equation (7)]
    logit = get_layer('ff')[1](tparams,
                               proj_h,
                               options,
                               prefix='ff_logit_lstm',
                               activ='linear')
    if options['prev2out']:
        logit += emb
    if options['ctx2out']:
        logit += get_layer('ff')[1](tparams,
                                    ctxs,
                                    options,
                                    prefix='ff_logit_ctx',
                                    activ='linear')
    logit = tanh(logit)
    if options['use_dropout']:
        logit = dropout_layer(logit, use_noise, trng)
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            logit = get_layer('ff')[1](tparams,
                                       logit,
                                       options,
                                       prefix='ff_logit_h%d' % lidx,
                                       activ='rectifier')
            if options['use_dropout']:
                logit = dropout_layer(logit, use_noise, trng)

    # compute softmax
    logit = get_layer('ff')[1](tparams,
                               logit,
                               options,
                               prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0] * logit_shp[1], logit_shp[2]]))

    # Index into the computed probability to give the log likelihood
    x_flat = x.flatten()
    p_flat = probs.flatten()
    cost = -tensor.log(p_flat[tensor.arange(x_flat.shape[0]) * probs.shape[1] +
                              x_flat] + 1e-8)
    cost = cost.reshape([x.shape[0], x.shape[1]])
    masked_cost = cost * mask
    #align_cost = (-standard_aligns*alphas).sum(2)

    cost = masked_cost.sum(0)

    # optional outputs
    opt_outs = dict()
    if options['lstm_encoder']:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas, encoder_alphas], cost, opt_outs
    else:
        return trng, use_noise, [x, mask, ctx, cnn_features], \
            [alphas], cost, opt_outs
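
Since this last variant returns the attention weights themselves, they can
be compiled into their own function for visualization; a sketch assuming
lstm_encoder is off, with f_alpha as an illustrative name.

trng, use_noise, inps, alpha_list, cost, opt_outs = \
    build_model(tparams, options)
f_alpha = theano.function(inps, alpha_list[0],
                          name='f_alpha',
                          allow_input_downcast=True)
# alpha_list[0] stacks one distribution over the visual words for each
# generated position, which is what gets plotted as an attention map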