def prep_model(model, N, s0pad, s1pad, c):
    (sc, Nc) = cnn_input(model, N, s0pad, l2reg=c['l2reg'],
                         cnninit=c['cnninit'], cnnact=c['cnnact'],
                         cdim=c['cdim'], cfiltlen=c['cfiltlen'])

    if c['maxpool_len'] > 1:
        model.add_shared_node(name='pool', inputs=['e0c', 'e1c'], outputs=['e0g', 'e1g'],
                              layer=MaxPooling1D(pool_length=c['maxpool_len']))
        sc //= c['maxpool_len']  # integer division keeps the sequence length an int
        cnn_outputs = ['e0g', 'e1g']
    else:
        cnn_outputs = ['e0c', 'e1c']
    model.add_node(name='e0c_', input=cnn_outputs[0], layer=Dropout(c['dropout']))
    model.add_node(name='e1c_', input=cnn_outputs[1], layer=Dropout(c['dropout']))

    B.rnn_input(model, Nc, sc, inputs=['e0c_', 'e1c_'],
                dropout=c['dropout'], sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']), output_dim=int(N * c['pdim']),
                                          W_regularizer=l2(c['l2reg'])))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
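# Illustrative width bookkeeping for the CNN front-end above (an assumption:
# cnn_input is taken to return the convolved sequence length sc and per-token
# width Nc, with a 'valid' convolution of length cfiltlen):
s0pad, cfiltlen, maxpool_len = 60, 3, 2
sc = s0pad - cfiltlen + 1  # 58 positions left after a length-3 valid convolution
sc //= maxpool_len         # 29 positions after max-pooling, mirroring sc //= above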
def prep_model(model, N, s0pad, s1pad, c):
    B.rnn_input(model, N, s0pad,
                dropout=c['dropout'], sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']), output_dim=int(N * c['pdim']),
                                          W_regularizer=l2(c['l2reg']), activation=c['pact']))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
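# A minimal numpy sketch (illustrative, not part of the repo) of what the
# shared 'proj' node above computes: a single weight matrix re-embeds both
# sentence vectors, keeping the two towers weight-tied (siamese).
import numpy as np

N, sdim, pdim = 300, 2, 2
W = np.random.randn(int(N * sdim), int(N * pdim)) * 0.01  # one shared projection matrix
e0s = np.random.randn(int(N * sdim))
e1s = np.random.randn(int(N * sdim))
e0p, e1p = e0s @ W, e1s @ W  # the same W transforms both sentence embeddings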
def prep_model(model, N, s0pad, s1pad, c):
    (sc, Nc) = cnn_input(model, N, s0pad, l2reg=c['l2reg'],
                         cnninit=c['cnninit'], cnnact=c['cnnact'],
                         cdim=c['cdim'], cfiltlen=c['cfiltlen'])

    if c['maxpool_len'] > 1:
        model.add_shared_node(name='pool', inputs=['e0c', 'e1c'], outputs=['e0g', 'e1g'],
                              layer=MaxPooling1D(pool_length=c['maxpool_len']))
        sc //= c['maxpool_len']  # integer division keeps the sequence length an int
        cnn_outputs = ['e0g', 'e1g']
    else:
        cnn_outputs = ['e0c', 'e1c']
    model.add_node(name='e0c_', input=cnn_outputs[0], layer=Dropout(c['dropout']))
    model.add_node(name='e1c_', input=cnn_outputs[1], layer=Dropout(c['dropout']))

    B.rnn_input(model, Nc, sc, inputs=['e0c_', 'e1c_'],
                dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'], dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']), output_dim=int(N * c['pdim']),
                                          W_regularizer=l2(c['l2reg'])))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
def prep_model(inputs, N, s0pad, s1pad, c):
    outputs = B.rnn_input(inputs, N, s0pad,
                          dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'],
                          dropoutfix_rec=c['dropoutfix_rec'],
                          sdim=c['sdim'],
                          rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                          rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        proj = Dense(int(N * c['pdim']), activation=c['pact'],
                     kernel_regularizer=l2(c['l2reg']), name='proj')
        e0p = proj(outputs[0])
        e1p = proj(outputs[1])
        N = N * c['pdim']
        return [e0p, e1p], N
    else:
        return [outputs[0], outputs[1]], N
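# The variant above uses the Keras functional API. A standalone sketch of its
# shared-projection step (hypothetical sizes; the Input tensors stand in for
# the RNN sentence embeddings that B.rnn_input would return):
from keras.layers import Input, Dense
from keras.models import Model
from keras.regularizers import l2

N, pdim, l2reg = 300, 2, 1e-4
e0s = Input(shape=(N,))
e1s = Input(shape=(N,))
proj = Dense(int(N * pdim), activation='tanh',
             kernel_regularizer=l2(l2reg), name='proj')  # weights shared by layer reuse
model = Model(inputs=[e0s, e1s], outputs=[proj(e0s), proj(e1s)])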
def prep_model(model, N, s0pad, s1pad, c):
    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last element of the returned sequence from e0s
    B.rnn_input(model, N, s0pad, return_sequences=(c['pool_layer'] is not None),
                dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'], dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'], rnnlevels=c['rnnlevels'],
                rnnbidi=c['rnnbidi'], rnnbidi_mode=c['rnnbidi_mode'],
                rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'])

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model, 'e0s_', 'e0', N, s0pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    if c['project']:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * c['adim'])
        model.add_node(name='e0a', input=e0_aggreg,
                       layer=Dense(input_dim=gwidth, output_dim=awidth, W_regularizer=l2(c['l2reg'])))
        e0_aggreg_attn = 'e0a'
        model.add_node(name='e1sa_', input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * c['sdim']), output_dim=awidth,
                                                  W_regularizer=l2(c['l2reg'])))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        awidth = int(N * c['sdim'])
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    focus(model, N, e0_aggreg_attn, e1_attn, 'e1s_', 'e1a', 'e1sm', s1pad,
          c['sdim'], awidth, c['attn_mode'], c['focus_act'], c['l2reg'])

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model, 'e1sm', 'e1', N, s1pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    return (e0_aggreg, e1_aggreg)
def prep_model(model, N, s0pad, s1pad, c):
    B.rnn_input(model, N, s0pad,
                dropout=c['dropout'], dropoutfix_inp=c['dropoutfix_inp'], dropoutfix_rec=c['dropoutfix_rec'],
                sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'],
                rnnbidi_mode=c['rnnbidi_mode'], rnnlevels=c['rnnlevels'])

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=['e0s_', 'e1s_'], outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=int(N * c['sdim']), output_dim=int(N * c['pdim']),
                                          init=c['pinit'], W_regularizer=l2(c['l2reg']),
                                          activation=c['pact']))
        # This dropout is controversial; it might be harmful to apply,
        # or at least isn't a clear win.
        # model.add_shared_node(name='projdrop', inputs=['e0p', 'e1p'], outputs=['e0p_', 'e1p_'],
        #                       layer=Dropout(c['dropout'], input_shape=(N,)))
        # return ('e0p_', 'e1p_')
        return ('e0p', 'e1p')
    else:
        return ('e0s_', 'e1s_')
def prep_model(glove, vocab, dropout=3/4, dropout_in=None, l2reg=1e-4,
               rnnbidi=True, rnn=GRU, rnnbidi_mode='sum', rnnact='tanh', rnninit='glorot_uniform',
               sdim=2, rnnlevels=1,
               pool_layer=MaxPooling1D,
               cnnact='tanh', cnninit='glorot_uniform', cdim=2, cfiltlen=3,
               project=True, adim=1/2,
               attn_mode='sum', fact='softmax',
               ptscorer=B.mlp_ptscorer, mlpsum='sum', Ddim=2,
               oact='sigmoid'):
    model = Graph()
    N = B.embedding(model, glove, vocab, s0pad, s1pad, dropout, dropout_w=.5)  # fix

    if dropout_in is None:
        dropout_in = dropout

    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last element of the returned sequence from e0s
    B.rnn_input(model, N, s0pad, return_sequences=(pool_layer is not None),
                rnnlevels=rnnlevels, dropout=dropout_in, sdim=sdim,
                rnnbidi=rnnbidi, rnnbidi_mode=rnnbidi_mode,
                rnn=rnn, rnnact=rnnact, rnninit=rnninit)

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model, 'e0s_', 'e0', N, s0pad, pool_layer,
                                  dropout=dropout_in, l2reg=l2reg, sdim=sdim,
                                  cnnact=cnnact, cdim=cdim, cfiltlen=cfiltlen,
                                  project=project)

    if project:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * adim)
        model.add_node(name='e0a', input=e0_aggreg,
                       layer=Dense(input_dim=gwidth, output_dim=awidth, W_regularizer=l2(l2reg)))
        e0_aggreg_attn = 'e0a'
        model.add_node(name='e1sa_', input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * sdim), output_dim=awidth,
                                                  W_regularizer=l2(l2reg)))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        awidth = int(N * sdim)  # awidth must be defined in this branch too
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    model.add_node(name='e0sa', input=e0_aggreg_attn, layer=RepeatVector(s1pad))
    if attn_mode == 'dot' or attn_mode == 'cos':
        # model attention by dot-product, i.e. similarity measure of question
        # aggregate and answer token in attention space
        model.add_node(name='e1a[1]',
                       layer=B.dot_time_distributed_merge(model, ['e0sa', e1_attn],
                                                          cos_norm=(attn_mode == 'cos')))
    else:
        # traditional attention model from Hermann et al., 2015 and Tan et al., 2015
        # we want to model attention as w*tanh(e0a + e1sa[i])
        model.add_node(name='e1a[0]', inputs=['e0sa', e1_attn], merge_mode='sum',
                       layer=Activation('tanh'))
        model.add_node(name='e1a[1]', input='e1a[0]',
                       layer=TimeDistributedDense(input_dim=awidth, output_dim=1, W_regularizer=l2(l2reg)))
    model.add_node(name='e1a[2]', input='e1a[1]',
                   layer=Flatten(input_shape=(s1pad, 1)))

    # *Focus* e1 by softmaxing (by default) attention and multiplying tokens
    # by their attention.
    model.add_node(name='e1a[3]', input='e1a[2]', layer=Activation(fact))
    model.add_node(name='e1a[4]', input='e1a[3]', layer=RepeatVector(int(N * sdim)))
    model.add_node(name='e1a', input='e1a[4]', layer=Permute((2, 1)))
    model.add_node(name='e1sm', inputs=['e1s_', 'e1a'], merge_mode='mul',
                   layer=Activation('linear'))

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model, 'e1sm', 'e1', N, s1pad, pool_layer,
                                  dropout=dropout_in, l2reg=l2reg, sdim=sdim,
                                  cnnact=cnnact, cdim=cdim, cfiltlen=cfiltlen,
                                  project=project)

    if ptscorer == '1':
        # special scoring mode just based on the answer
        # (assuming that the question match is carried by the attention)
        ptscorer = B.cat_ptscorer
        final_outputs = [e1_aggreg]
    else:
        final_outputs = [e0_aggreg, e1_aggreg]

    # Measurement
    kwargs = dict()
    if ptscorer == B.mlp_ptscorer:
        kwargs['sum_mode'] = mlpsum
    model.add_node(name='scoreS', input=ptscorer(model, final_outputs, Ddim, N, l2reg, **kwargs),
                   layer=Activation(oact))
    model.add_output(name='score', input='scoreS')

    return model
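# A hypothetical usage sketch for the builder above (Keras 0.x Graph API;
# the input names 'si0'/'si1' are an assumption about what B.embedding
# registers, and glove, vocab, X0, X1, y are assumed to be loaded elsewhere):
model = prep_model(glove, vocab)
model.compile(optimizer='adam', loss={'score': 'binary_crossentropy'})
model.fit({'si0': X0, 'si1': X1, 'score': y}, batch_size=160, nb_epoch=16)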
def prep_model(model, N, s0pad, s1pad, c):
    # FIXME: pool_layer=None is in fact not supported, since this RNN
    # would return a scalar for e1s too; instead, we'll need to manually
    # pick the first & last element of the returned sequence from e0s
    B.rnn_input(model, N, s0pad, return_sequences=(c['pool_layer'] is not None),
                rnnlevels=c['rnnlevels'], dropout=c['dropout'], sdim=c['sdim'],
                rnnbidi=c['rnnbidi'], rnnbidi_mode=c['rnnbidi_mode'],
                rnn=c['rnn'], rnnact=c['rnnact'], rnninit=c['rnninit'])

    # Generate e0s aggregate embedding
    e0_aggreg, gwidth = aggregate(model, 'e0s_', 'e0', N, s0pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    if c['project']:
        # ...and re-embed e0, e1 in attention space
        awidth = int(N * c['adim'])
        model.add_node(name='e0a', input=e0_aggreg,
                       layer=Dense(input_dim=gwidth, output_dim=awidth, W_regularizer=l2(c['l2reg'])))
        e0_aggreg_attn = 'e0a'
        model.add_node(name='e1sa_', input='e1s',
                       layer=TimeDistributedDense(input_dim=int(N * c['sdim']), output_dim=awidth,
                                                  W_regularizer=l2(c['l2reg'])))
        # XXX: this dummy works around a mysterious theano error
        model.add_node(name='e1sa', input='e1sa_', layer=Activation('linear'))
        e1_attn = 'e1sa'
    else:
        awidth = int(N * c['sdim'])  # awidth must be defined in this branch too
        e1_attn = 'e1s'
        e0_aggreg_attn = e0_aggreg

    # Now, build an attention function f(e0a, e1sa) -> e1a, producing an
    # (s1pad,) vector of scalars denoting the attention for each e1 token
    model.add_node(name='e0sa', input=e0_aggreg_attn, layer=RepeatVector(s1pad))
    if c['attn_mode'] == 'dot' or c['attn_mode'] == 'cos':
        # model attention by dot-product, i.e. similarity measure of question
        # aggregate and answer token in attention space
        model.add_node(name='e1a[1]',
                       layer=B.dot_time_distributed_merge(model, ['e0sa', e1_attn],
                                                          cos_norm=(c['attn_mode'] == 'cos')))
    else:
        # traditional attention model from Hermann et al., 2015 and Tan et al., 2015
        # we want to model attention as w*tanh(e0a + e1sa[i])
        model.add_node(name='e1a[0]', inputs=['e0sa', e1_attn], merge_mode='sum',
                       layer=Activation('tanh'))
        model.add_node(name='e1a[1]', input='e1a[0]',
                       layer=TimeDistributedDense(input_dim=awidth, output_dim=1,
                                                  W_regularizer=l2(c['l2reg'])))
    model.add_node(name='e1a[2]', input='e1a[1]',
                   layer=Flatten(input_shape=(s1pad, 1)))

    # *Focus* e1 by softmaxing (by default) attention and multiplying tokens
    # by their attention.
    model.add_node(name='e1a[3]', input='e1a[2]', layer=Activation(c['focus_act']))
    model.add_node(name='e1a[4]', input='e1a[3]', layer=RepeatVector(int(N * c['sdim'])))
    model.add_node(name='e1a', input='e1a[4]', layer=Permute((2, 1)))
    model.add_node(name='e1sm', inputs=['e1s_', 'e1a'], merge_mode='mul',
                   layer=Activation('linear'))

    # Generate e1sm aggregate embedding
    e1_aggreg, gwidth = aggregate(model, 'e1sm', 'e1', N, s1pad, c['pool_layer'],
                                  dropout=c['dropout'], l2reg=c['l2reg'], sdim=c['sdim'],
                                  cnnact=c['cnnact'], cdim=c['cdim'], cfiltlen=c['cfiltlen'],
                                  project=c['project'])

    return (e0_aggreg, e1_aggreg)
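# A minimal numpy sketch (illustrative only) of the *focus* step built above:
# raw per-token attention scores are softmaxed over the s1pad tokens
# (c['focus_act'] defaults to 'softmax'), broadcast across the embedding
# width, and used to rescale the e1 token sequence, which is what the
# RepeatVector / Permute / mul-merge chain implements.
import numpy as np

def focus_sketch(e1s, scores):
    """e1s: (s1pad, width) token embeddings; scores: (s1pad,) raw attention."""
    a = np.exp(scores - scores.max())
    a /= a.sum()             # softmax over the answer tokens
    return e1s * a[:, None]  # per-token scaling of the sequence

e1s = np.random.randn(60, 600)  # s1pad=60, width=N*sdim=600 (hypothetical sizes)
e1sm = focus_sketch(e1s, np.random.randn(60))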