Example #1
File: cnnrnn.py  Project: ittailup/argus
def prep_model(model, N, s0pad, s1pad, c):
    (sc, Nc) = cnn_input(model, N, s0pad, l2reg=c['l2reg'],
                         cnninit=c['cnninit'], cnnact=c['cnnact'],
                         cdim=c['cdim'], cfiltlen=c['cfiltlen'])

    if c['maxpool_len'] > 1:
        model.add_shared_node(name='pool', inputs=['e0c', 'e1c'], outputs=['e0g', 'e1g'],
                              layer=MaxPooling1D(pool_length=c['maxpool_len']))
        sc /= c['maxpool_len']
        cnn_outputs = ['e0g', 'e1g']
    else:
        cnn_outputs = ['e0c', 'e1c']
    model.add_node(name='e0c_', input=cnn_outputs[0], layer=Dropout(c['dropout']))
    model.add_node(name='e1c_', input=cnn_outputs[1], layer=Dropout(c['dropout']))

    B.rnn_input(model, Nc, sc, inputs=['e0c_', 'e1c_'],
                dropout=c['dropout'], sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N*c['sdim']), output_dim=int(N*c['pdim']),
                                          W_regularizer=l2(c['l2reg'])))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
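
All of the variants in this listing read their hyperparameters from a single config dict c. Below is a minimal sketch of the dict Example #1 expects: the keys are the ones the code actually reads, while the concrete values (and the GRU import path) are illustrative assumptions, not defaults taken from the source projects.

# Hypothetical config dict for Example #1; keys mirror what prep_model reads
# from c, values are placeholders chosen only for illustration.
from keras.layers.recurrent import GRU  # Keras 0.x-era import path (assumed)

c = {
    'l2reg': 1e-4,                    # L2 weight regularization strength
    'cnninit': 'glorot_uniform',      # CNN weight init
    'cnnact': 'tanh',                 # CNN activation
    'cdim': 2, 'cfiltlen': 3,         # CNN width multiplier and filter length
    'maxpool_len': 2,                 # >1 enables the shared MaxPooling1D node
    'dropout': 0.5,
    'sdim': 2,                        # RNN width multiplier (relative to N)
    'rnn': GRU, 'rnnact': 'tanh', 'rnninit': 'glorot_uniform',
    'rnnbidi': True, 'rnnbidi_mode': 'sum', 'rnnlevels': 1,
    'project': True, 'pdim': 1,       # final shared Dense projection to N*pdim
}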
Example #2
def prep_model(model, N, s0pad, s1pad, c):
    B.rnn_input(model,
                N,
                s0pad,
                dropout=c['dropout'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'],
                rnn=c['rnn'],
                rnnact=c['rnnact'],
                rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'],
                rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj',
                              inputs=['e0s_', 'e1s_'],
                              outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']),
                                          output_dim=int(N * c['pdim']),
                                          W_regularizer=l2(c['l2reg']),
                                          activation=c['pact']))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
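
Variants like Example #2 only build the middle of the network; the surrounding scaffolding (embedding the token inputs, scoring the sentence pair, declaring the output) lives in the caller, which Example #8 below spells out in full. Here is a condensed sketch of that wrapping, following Example #8's structure; the scorer arguments, loss, and optimizer are assumptions for illustration.

# Condensed sketch of how a prep_model variant is typically wrapped, modeled on
# Example #8 below. B.embedding and B.mlp_ptscorer come from the dataset-sts
# blocks module; the Ddim, loss, and optimizer choices here are assumptions.
from keras.models import Graph
from keras.layers.core import Activation

model = Graph()
N = B.embedding(model, glove, vocab, s0pad, s1pad, 0.5)    # adds the e0/e1 inputs
final_outputs = prep_model(model, N, s0pad, s1pad, c)      # e.g. ('e0s_', 'e1s_')
model.add_node(name='scoreS',
               input=B.mlp_ptscorer(model, list(final_outputs), 2, N, c['l2reg']),
               layer=Activation('sigmoid'))
model.add_output(name='score', input='scoreS')
model.compile(optimizer='adam', loss={'score': 'binary_crossentropy'})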
Example #3
def prep_model(model, N, s0pad, s1pad, c):
    (sc, Nc) = cnn_input(model,
                         N,
                         s0pad,
                         l2reg=c['l2reg'],
                         cnninit=c['cnninit'],
                         cnnact=c['cnnact'],
                         cdim=c['cdim'],
                         cfiltlen=c['cfiltlen'])

    if c['maxpool_len'] > 1:
        model.add_shared_node(name='pool',
                              inputs=['e0c', 'e1c'],
                              outputs=['e0g', 'e1g'],
                              layer=MaxPooling1D(pool_length=c['maxpool_len']))
        sc /= c['maxpool_len']
        cnn_outputs = ['e0g', 'e1g']
    else:
        cnn_outputs = ['e0c', 'e1c']
    model.add_node(name='e0c_',
                   input=cnn_outputs[0],
                   layer=Dropout(c['dropout']))
    model.add_node(name='e1c_',
                   input=cnn_outputs[1],
                   layer=Dropout(c['dropout']))

    B.rnn_input(model,
                Nc,
                sc,
                inputs=['e0c_', 'e1c_'],
                dropout=c['dropout'],
                dropoutfix_inp=c['dropoutfix_inp'],
                dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'],
                rnn=c['rnn'],
                rnnact=c['rnnact'],
                rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'],
                rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj',
                              inputs=['e0s_', 'e1s_'],
                              outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']),
                                          output_dim=int(N * c['pdim']),
                                          W_regularizer=l2(c['l2reg'])))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
Example #4
def prep_model(inputs, N, s0pad, s1pad, c):
    outputs = B.rnn_input(inputs,
                          N,
                          s0pad,
                          dropout=c['dropout'],
                          dropoutfix_inp=c['dropoutfix_inp'],
                          dropoutfix_rec=c['dropoutfix_rec'],
                          sdim=c['sdim'],
                          rnnbidi=c['rnnbidi'],
                          rnn=c['rnn'],
                          rnnact=c['rnnact'],
                          rnninit=c['rnninit'],
                          rnnbidi_mode=c['rnnbidi_mode'],
                          rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        proj = Dense(int(N * c['pdim']),
                     activation=c['pact'],
                     kernel_regularizer=l2(c['l2reg']),
                     name='proj')
        e0p = proj(outputs[0])
        e1p = proj(outputs[1])
        N = N * c['pdim']
        return [e0p, e1p], N
    else:
        return [outputs[0], outputs[1]], N
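
Example #4 is the newer functional-API rewrite: instead of add_shared_node, weight sharing is expressed by applying one Dense instance to both branches. A self-contained sketch of just that pattern follows; the sizes and hyperparameters are made up for illustration.

# Standalone illustration of the shared-projection pattern from Example #4:
# a single Dense layer object applied to two tensors, so both sentence
# embeddings are projected with identical weights.
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model
from keras.regularizers import l2

N, pdim = 300, 1
in0, in1 = Input(shape=(N,)), Input(shape=(N,))
proj = Dense(int(N * pdim), activation='tanh',
             kernel_regularizer=l2(1e-4), name='proj')  # one layer, shared
e0p, e1p = proj(in0), proj(in1)
m = Model(inputs=[in0, in1], outputs=[e0p, e1p])
# Both outputs reuse the same kernel, mirroring the old add_shared_node call.
print(m.predict([np.zeros((1, N)), np.zeros((1, N))]))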
Example #5
def prep_model(model, N, s0pad, s1pad, c):
    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last elements of the returned sequence from e0s
    B.rnn_input(model, N, s0pad, return_sequences=(c['pool_layer'] is not None),
                dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'], dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'], rnnlevels=c['rnnlevels'],
                rnnbidi=c['rnnbidi'], rnnbidi_mode=c['rnnbidi_mode'],
                rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'])

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model, 'e0s_', 'e0', N, s0pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    if c['project']:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N*c['adim'])
        model.add_node(name='e0a', input=e0_aggreg,
                       layer=Dense(input_dim=gwidth, output_dim=awidth, W_regularizer=l2(c['l2reg'])))
        e0_aggreg_attn = 'e0a'

        model.add_node(name='e1sa_', input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N*c['sdim']), output_dim=awidth, W_regularizer=l2(c['l2reg'])))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        awidth = int(N*c['sdim'])
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    focus(model, N, e0_aggreg_attn, e1_attn, 'e1s_', 'e1a', 'e1sm', s1pad, c['sdim'], awidth,
          c['attn_mode'], c['focus_act'], c['l2reg'])

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model, 'e1sm', 'e1', N, s1pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    return (e0_aggreg, e1_aggreg)
Example #6
File: rnn.py  Project: brmson/dataset-sts
def prep_model(model, N, s0pad, s1pad, c):
    B.rnn_input(model, N, s0pad,
                dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'], dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N*c['sdim']), output_dim=int(N*c['pdim']), init=c['pinit'],
                                          W_regularizer=l2(c['l2reg']), activation=c['pact']))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
Example #7
def prep_model(model, N, s0pad, s1pad, c):
    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last elements of the returned sequence from e0s
    B.rnn_input(model,
                N,
                s0pad,
                return_sequences=(c['pool_layer'] is not None),
                dropout=c['dropout'],
                dropoutfix_inp=c['dropoutfix_inp'],
                dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'],
                rnnlevels=c['rnnlevels'],
                rnnbidi=c['rnnbidi'],
                rnnbidi_mode=c['rnnbidi_mode'],
                rnn=c['rnn'],
                rnnact=c['rnnact'],
                rnninit=c['rnninit'])

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model,
                                  'e0s_',
                                  'e0',
                                  N,
                                  s0pad,
                                  c['pool_layer'],
                                  dropout=c['dropout'],
                                  l2reg=c['l2reg'],
                                  sdim=c['sdim'],
                                  cnnact=c['cnnact'],
                                  cdim=c['cdim'],
                                  cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    if c['project']:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * c['adim'])
        model.add_node(name='e0a',
                       input=e0_aggreg,
                       layer=Dense(input_dim=gwidth,
                                   output_dim=awidth,
                                   W_regularizer=l2(c['l2reg'])))
        e0_aggreg_attn = 'e0a'

        model.add_node(name='e1sa_',
                       input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * c['sdim']),
                                                  output_dim=awidth,
                                                  W_regularizer=l2(
                                                      c['l2reg'])))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        awidth = int(N * c['sdim'])
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    focus(model, N, e0_aggreg_attn, e1_attn, 'e1s_', 'e1a', 'e1sm', s1pad,
          c['sdim'], awidth, c['attn_mode'], c['focus_act'], c['l2reg'])

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model,
                                  'e1sm',
                                  'e1',
                                  N,
                                  s1pad,
                                  c['pool_layer'],
                                  dropout=c['dropout'],
                                  l2reg=c['l2reg'],
                                  sdim=c['sdim'],
                                  cnnact=c['cnnact'],
                                  cdim=c['cdim'],
                                  cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    return (e0_aggreg, e1_aggreg)
Example #8
def prep_model(glove, vocab, dropout=3/4, dropout_in=None, l2reg=1e-4,
               rnnbidi=True, rnn=GRU, rnnbidi_mode='sum', rnnact='tanh', rnninit='glorot_uniform',
               sdim=2, rnnlevels=1,
               pool_layer=MaxPooling1D, cnnact='tanh', cnninit='glorot_uniform', cdim=2, cfiltlen=3,
               project=True, adim=1/2, attn_mode='sum', fact='softmax',
               ptscorer=B.mlp_ptscorer, mlpsum='sum', Ddim=2,
               oact='sigmoid'):
    model = Graph()
    N = B.embedding(model, glove, vocab, s0pad, s1pad, dropout, dropout_w=.5) # fix

    if dropout_in is None:
        dropout_in = dropout

    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last elements of the returned sequence from e0s
    B.rnn_input(model, N, s0pad, return_sequences=(pool_layer is not None),
                rnnlevels=rnnlevels, dropout=dropout_in, sdim=sdim,
                rnnbidi=rnnbidi, rnnbidi_mode=rnnbidi_mode,
                rnn=rnn, rnnact=rnnact, rnninit=rnninit)

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model, 'e0s_', 'e0', N, s0pad, pool_layer,
                                  dropout=dropout_in, l2reg=l2reg, sdim=sdim,
                                  cnnact=cnnact, cdim=cdim, cfiltlen=cfiltlen,
                                  project=project)

    if project:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N*adim)
        model.add_node(name='e0a', input=e0_aggreg,
                       layer=Dense(input_dim=gwidth, output_dim=awidth, W_regularizer=l2(l2reg)))
        e0_aggreg_attn = 'e0a'

        model.add_node(name='e1sa_', input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N*sdim), output_dim=awidth, W_regularizer=l2(l2reg)))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    model.add_node(name='e0sa', input=e0_aggreg_attn,
                   layer=RepeatVector(s1pad))
    if attn_mode == 'dot' or attn_mode == 'cos':
        # model attention by dot-product, i.e. similarity measure of question
        # aggregate and answer token in attention space
        model.add_node(name='e1a[1]',
                       layer=B.dot_time_distributed_merge(model, ['e0sa', e1_attn], cos_norm=(attn_mode == 'cos')))
    else:
        # traditional attention model from Hermann et al., 2015 and Tan et al., 2015
        # we want to model attention as w*tanh(e0a + e1sa[i])
        model.add_node(name='e1a[0]', inputs=['e0sa', e1_attn], merge_mode='sum',
                       layer=Activation('tanh'))
        model.add_node(name='e1a[1]', input='e1a[0]',
                       layer=TimeDistributedDense(input_dim=awidth, output_dim=1, W_regularizer=l2(l2reg)))
    model.add_node(name='e1a[2]', input='e1a[1]',
                   layer=Flatten(input_shape=(s1pad, 1)))

    # *Focus* e1 by softmaxing (by default) attention and multiplying tokens
    # by their attention.
    model.add_node(name='e1a[3]', input='e1a[2]',
                   layer=Activation(fact))
    model.add_node(name='e1a[4]', input='e1a[3]',
                   layer=RepeatVector(int(N*sdim)))
    model.add_node(name='e1a', input='e1a[4]',
                   layer=Permute((2,1)))
    model.add_node(name='e1sm', inputs=['e1s_', 'e1a'], merge_mode='mul',
                   layer=Activation('linear'))

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model, 'e1sm', 'e1', N, s1pad, pool_layer,
                                  dropout=dropout_in, l2reg=l2reg, sdim=sdim,
                                  cnnact=cnnact, cdim=cdim, cfiltlen=cfiltlen,
                                  project=project)

    if ptscorer == '1':
        # special scoring mode just based on the answer
        # (assuming that the question match is carried by the attention)
        ptscorer = B.cat_ptscorer
        final_outputs = [e1_aggreg]
    else:
        final_outputs = [e0_aggreg, e1_aggreg]

    # Measurement
    kwargs = dict()
    if ptscorer == B.mlp_ptscorer:
        kwargs['sum_mode'] = mlpsum
    model.add_node(name='scoreS', input=ptscorer(model, final_outputs, Ddim, N, l2reg, **kwargs),
                   layer=Activation(oact))
    model.add_output(name='score', input='scoreS')
    return model
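
The e1a[2] through e1sm chain in Example #8 is mostly shape bookkeeping. A NumPy sketch of the same computation may make the RepeatVector/Permute/mul sequence easier to follow; the dimensions below are illustrative only.

# NumPy sketch of the "focus" step above: per-token attention scores (s1pad,)
# are softmaxed, broadcast across the hidden dimension (RepeatVector + Permute),
# and used to rescale the e1s_ token sequence elementwise.
import numpy as np

s1pad, width = 60, 600                              # width plays the role of int(N * sdim)
scores = np.random.randn(s1pad)                     # e1a[2]: one scalar per e1 token
attn = np.exp(scores) / np.exp(scores).sum()        # e1a[3]: Activation('softmax')
attn_mat = np.repeat(attn[None, :], width, axis=0)  # e1a[4]: RepeatVector(width) -> (width, s1pad)
attn_mat = attn_mat.T                               # e1a: Permute((2, 1)) -> (s1pad, width)
e1s = np.random.randn(s1pad, width)                 # RNN output sequence e1s_
e1sm = e1s * attn_mat                               # merge_mode='mul': focused sequence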
Example #9
def prep_model(glove,
               vocab,
               dropout=3 / 4,
               dropout_in=None,
               l2reg=1e-4,
               rnnbidi=True,
               rnn=GRU,
               rnnbidi_mode='sum',
               rnnact='tanh',
               rnninit='glorot_uniform',
               sdim=2,
               rnnlevels=1,
               pool_layer=MaxPooling1D,
               cnnact='tanh',
               cnninit='glorot_uniform',
               cdim=2,
               cfiltlen=3,
               project=True,
               adim=1 / 2,
               attn_mode='sum',
               fact='softmax',
               ptscorer=B.mlp_ptscorer,
               mlpsum='sum',
               Ddim=2,
               oact='sigmoid'):
    model = Graph()
    N = B.embedding(model, glove, vocab, s0pad, s1pad, dropout,
                    dropout_w=.5)  # fix

    if dropout_in is None:
        dropout_in = dropout

    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last elements of the returned sequence from e0s
    B.rnn_input(model,
                N,
                s0pad,
                return_sequences=(pool_layer is not None),
                rnnlevels=rnnlevels,
                dropout=dropout_in,
                sdim=sdim,
                rnnbidi=rnnbidi,
                rnnbidi_mode=rnnbidi_mode,
                rnn=rnn,
                rnnact=rnnact,
                rnninit=rnninit)

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model,
                                  'e0s_',
                                  'e0',
                                  N,
                                  s0pad,
                                  pool_layer,
                                  dropout=dropout_in,
                                  l2reg=l2reg,
                                  sdim=sdim,
                                  cnnact=cnnact,
                                  cdim=cdim,
                                  cfiltlen=cfiltlen,
                                  project=project)

    if project:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * adim)
        model.add_node(name='e0a',
                       input=e0_aggreg,
                       layer=Dense(input_dim=gwidth,
                                   output_dim=awidth,
                                   W_regularizer=l2(l2reg)))
        e0_aggreg_attn = 'e0a'

        model.add_node(name='e1sa_',
                       input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * sdim),
                                                  output_dim=awidth,
                                                  W_regularizer=l2(l2reg)))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    model.add_node(name='e0sa',
                   input=e0_aggreg_attn,
                   layer=RepeatVector(s1pad))
    if attn_mode == 'dot' or attn_mode == 'cos':
        # model attention by dot-product, i.e. similarity measure of question
        # aggregate and answer token in attention space
        model.add_node(name='e1a[1]',
                       layer=B.dot_time_distributed_merge(
                           model, ['e0sa', e1_attn],
                           cos_norm=(attn_mode == 'cos')))
    else:
        # traditional attention model from Hermann et al., 2015 and Tan et al., 2015
        # we want to model attention as w*tanh(e0a + e1sa[i])
        model.add_node(name='e1a[0]',
                       inputs=['e0sa', e1_attn],
                       merge_mode='sum',
                       layer=Activation('tanh'))
        model.add_node(name='e1a[1]',
                       input='e1a[0]',
                       layer=TimeDistributedDense(input_dim=awidth,
                                                  output_dim=1,
                                                  W_regularizer=l2(l2reg)))
    model.add_node(name='e1a[2]',
                   input='e1a[1]',
                   layer=Flatten(input_shape=(s1pad, 1)))

    # *Focus* e1 by softmaxing (by default) attention and multiplying tokens
    # by their attention.
    model.add_node(name='e1a[3]', input='e1a[2]', layer=Activation(fact))
    model.add_node(name='e1a[4]',
                   input='e1a[3]',
                   layer=RepeatVector(int(N * sdim)))
    model.add_node(name='e1a', input='e1a[4]', layer=Permute((2, 1)))
    model.add_node(name='e1sm',
                   inputs=['e1s_', 'e1a'],
                   merge_mode='mul',
                   layer=Activation('linear'))

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model,
                                  'e1sm',
                                  'e1',
                                  N,
                                  s1pad,
                                  pool_layer,
                                  dropout=dropout_in,
                                  l2reg=l2reg,
                                  sdim=sdim,
                                  cnnact=cnnact,
                                  cdim=cdim,
                                  cfiltlen=cfiltlen,
                                  project=project)

    if ptscorer == '1':
        # special scoring mode just based on the answer
        # (assuming that the question match is carried by the attention)
        ptscorer = B.cat_ptscorer
        final_outputs = [e1_aggreg]
    else:
        final_outputs = [e0_aggreg, e1_aggreg]

    # Measurement
    kwargs = dict()
    if ptscorer == B.mlp_ptscorer:
        kwargs['sum_mode'] = mlpsum
    model.add_node(name='scoreS',
                   input=ptscorer(model, final_outputs, Ddim, N, l2reg,
                                  **kwargs),
                   layer=Activation(oact))
    model.add_output(name='score', input='scoreS')
    return model
Example #10
def prep_model(model, N, s0pad, s1pad, c):
    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last elements of the returned sequence from e0s
    B.rnn_input(model,
                N,
                s0pad,
                return_sequences=(c['pool_layer'] is not None),
                rnnlevels=c['rnnlevels'],
                dropout=c['dropout'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'],
                rnnbidi_mode=c['rnnbidi_mode'],
                rnn=c['rnn'],
                rnnact=c['rnnact'],
                rnninit=c['rnninit'])

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model,
                                  'e0s_',
                                  'e0',
                                  N,
                                  s0pad,
                                  c['pool_layer'],
                                  dropout=c['dropout'],
                                  l2reg=c['l2reg'],
                                  sdim=c['sdim'],
                                  cnnact=c['cnnact'],
                                  cdim=c['cdim'],
                                  cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    if c['project']:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * c['adim'])
        model.add_node(name='e0a',
                       input=e0_aggreg,
                       layer=Dense(input_dim=gwidth,
                                   output_dim=awidth,
                                   W_regularizer=l2(c['l2reg'])))
        e0_aggreg_attn = 'e0a'

        model.add_node(name='e1sa_',
                       input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * c['sdim']),
                                                  output_dim=awidth,
                                                  W_regularizer=l2(
                                                      c['l2reg'])))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    model.add_node(name='e0sa',
                   input=e0_aggreg_attn,
                   layer=RepeatVector(s1pad))
    if c['attn_mode'] == 'dot' or c['attn_mode'] == 'cos':
        # model attention by dot-product, i.e. similarity measure of question
        # aggregate and answer token in attention space
        model.add_node(name='e1a[1]',
                       layer=B.dot_time_distributed_merge(
                           model, ['e0sa', e1_attn],
                           cos_norm=(c['attn_mode'] == 'cos')))
    else:
        # traditional attention model from Hermann et al., 2015 and Tan et al., 2015
        # we want to model attention as w*tanh(e0a + e1sa[i])
        model.add_node(name='e1a[0]',
                       inputs=['e0sa', e1_attn],
                       merge_mode='sum',
                       layer=Activation('tanh'))
        model.add_node(name='e1a[1]',
                       input='e1a[0]',
                       layer=TimeDistributedDense(input_dim=awidth,
                                                  output_dim=1,
                                                  W_regularizer=l2(
                                                      c['l2reg'])))
    model.add_node(name='e1a[2]',
                   input='e1a[1]',
                   layer=Flatten(input_shape=(s1pad, 1)))

    # *Focus* e1 by softmaxing (by default) attention and multiplying tokens
    # by their attention.
    model.add_node(name='e1a[3]',
                   input='e1a[2]',
                   layer=Activation(c['focus_act']))
    model.add_node(name='e1a[4]',
                   input='e1a[3]',
                   layer=RepeatVector(int(N * c['sdim'])))
    model.add_node(name='e1a', input='e1a[4]', layer=Permute((2, 1)))
    model.add_node(name='e1sm',
                   inputs=['e1s_', 'e1a'],
                   merge_mode='mul',
                   layer=Activation('linear'))

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model,
                                  'e1sm',
                                  'e1',
                                  N,
                                  s1pad,
                                  c['pool_layer'],
                                  dropout=c['dropout'],
                                  l2reg=c['l2reg'],
                                  sdim=c['sdim'],
                                  cnnact=c['cnnact'],
                                  cdim=c['cdim'],
                                  cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    return (e0_aggreg, e1_aggreg)