def get_rnn_unit(l_in, mask, rev_mask, state, rev_state, n_units, prefix,
                 grad_clip=0, context=None, attention=False):
    """Build a GRU unit, optionally paired with a backwards GRU.

    A forward ``GRULayer`` is always created and stored under
    ``prefix + 'gru'``. When both ``rev_mask`` and ``rev_state`` are given, a
    second ``GRULayer`` with ``backwards=True`` (built from the same gate
    specifications) is stored under ``prefix + 'gru_rev'`` and the
    element-wise sum of the two layers is stored under ``'context'``.

    Args:
        l_in: Layer fed to both recurrent layers.
        mask: Passed as ``mask_input`` of the forward layer.
        rev_mask: Passed as ``mask_input`` of the backward layer, or ``None``.
        state: Passed as ``hid_init`` of the forward layer.
        rev_state: Passed as ``hid_init`` of the backward layer, or ``None``.
        n_units: Number of units of each recurrent layer.
        prefix: String prepended to the dictionary keys of the GRU layers.
        grad_clip: Forwarded as ``grad_clipping``.
        context: Forwarded as ``context_input``.
        attention: Forwarded as ``use_attention`` (forward layer only).

    Returns:
        An ``OrderedDict`` mapping names to the created layers.
    """
    net = OrderedDict()

    # `input` and `inner` are resolved from the enclosing scope; presumably
    # they are weight initializers defined at module level -- TODO confirm.
    rg = Gate(W_in=input, W_hid=inner, W_cell=None)
    ug = Gate(W_in=input, W_hid=inner, W_cell=None)
    hg = Gate(W_in=input, W_hid=inner, W_cell=None, nonlinearity=tanh)

    forward = GRULayer(l_in, num_units=n_units, resetgate=rg, updategate=ug,
                       hidden_update=hg, mask_input=mask, hid_init=state,
                       learn_init=False, only_return_final=False,
                       grad_clipping=grad_clip, context_input=context,
                       use_attention=attention, name='gru')
    net[prefix + 'gru'] = forward

    if rev_mask is not None and rev_state is not None:
        backward = GRULayer(l_in, num_units=n_units, resetgate=rg,
                            updategate=ug, hidden_update=hg,
                            mask_input=rev_mask, hid_init=rev_state,
                            only_return_final=False, learn_init=False,
                            grad_clipping=grad_clip, context_input=context,
                            backwards=True, name='gru_rev')
        net[prefix + 'gru_rev'] = backward
        # Sum the two layers explicitly: slicing ``net.values()`` raises a
        # TypeError on Python 3, where dict views are not subscriptable.
        net['context'] = ElemwiseSumLayer([forward, backward], name='context')

    return net
def get_unidirectional_layer(self, input_layer, mask_layer, n_hidden,
                             true_input_size, only_return_final,
                             backwards=False):
    """Create one recurrent layer of the type named by ``self.layer_type``.

    When ``true_input_size`` is given, the ``*OHEInput`` layer variants are
    used (presumably for one-hot-encoded input -- confirm against those
    classes) and the gate/activation settings stored on ``self`` are passed
    along. Otherwise the stock Lasagne recurrent layers are used.

    Args:
        input_layer: Layer feeding the recurrent layer.
        mask_layer: Passed as ``mask_input``.
        n_hidden: Number of hidden units.
        true_input_size: Extra positional size argument for the ``OHEInput``
            variants, or ``None`` to use the stock Lasagne layers.
        only_return_final: Forwarded to the layer.
        backwards: Forwarded to the layer.

    Returns:
        The constructed recurrent layer.

    Raises:
        ValueError: If ``self.layer_type`` is not "LSTM", "GRU" or "Vanilla".
    """
    kind = self.layer_type
    ohe_input = true_input_size is not None

    # The conditional expressions keep the class lookups lazy: only the
    # class that is actually selected gets resolved.
    if kind == "LSTM":
        layer_cls = LSTMLayerOHEInput if ohe_input else lasagne.layers.LSTMLayer
    elif kind == "GRU":
        layer_cls = GRULayerOHEInput if ohe_input else lasagne.layers.GRULayer
    elif kind == "Vanilla":
        layer_cls = (VanillaLayerOHEInput if ohe_input
                     else lasagne.layers.RecurrentLayer)
    else:
        raise ValueError('Unknown layer type')

    # Keyword arguments shared by both families of layers.
    recurrent_kwargs = dict(
        mask_input=mask_layer,
        grad_clipping=self.grad_clip,
        learn_init=True,
        only_return_final=only_return_final,
        backwards=backwards,
    )

    if not ohe_input:
        return layer_cls(input_layer, n_hidden, **recurrent_kwargs)

    return layer_cls(
        input_layer, n_hidden, true_input_size,
        ingate=Gate(nonlinearity=self.act_f_input),
        forgetgate=Gate(nonlinearity=self.act_f_forget),
        outgate=Gate(nonlinearity=self.act_f_output),
        cell=Gate(W_cell=None, nonlinearity=self.act_f_cell),
        nonlinearity=self.act_f_hidden,
        **recurrent_kwargs)
def __init__(self, n_in, n_out,
             resetgate=Gate(W_cell=None),
             updategate=Gate(W_cell=None),
             hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             grad_clipping=100):
    """Sample GRU parameters and store them as stacked shared variables.

    For every gate the input weights ``(n_in, n_out)``, hidden weights
    ``(n_out, n_out)`` and bias ``(n_out,)`` are drawn from the gate's
    initializers. The per-gate arrays are then concatenated in the order
    reset, update, hidden-update and wrapped with ``shared``.

    Args:
        n_in: Input dimensionality.
        n_out: Number of hidden units.
        resetgate: Gate specification of the reset gate.
        updategate: Gate specification of the update gate.
        hidden_update: Gate specification of the hidden update.
        grad_clipping: Stored on the instance as ``grad_clipping``.
    """
    self.n_in = n_in
    self.n_out = n_out
    self.grad_clipping = grad_clipping
    self.params = []

    def sample_gate(gate):
        """Draw (W_in, W_hid, b) for one gate, in that order."""
        w_in = gate.W_in.sample((n_in, n_out))
        w_hid = gate.W_hid.sample((n_out, n_out))
        bias = gate.b.sample((n_out, ))
        return w_in, w_hid, bias

    # Sampling order is update, reset, hidden update; it is kept so that the
    # random initializers are consumed in the same sequence.
    upd_w_in, upd_w_hid, upd_b = sample_gate(updategate)
    self.nonlinearity_updategate = updategate.nonlinearity

    rst_w_in, rst_w_hid, rst_b = sample_gate(resetgate)
    self.nonlinearity_resetgate = resetgate.nonlinearity

    cand_w_in, cand_w_hid, cand_b = sample_gate(hidden_update)
    self.nonlinearity_hid = hidden_update.nonlinearity

    # Stacking order is reset, update, hidden update.
    self.W_in_stacked = shared(
        np.concatenate([rst_w_in, upd_w_in, cand_w_in], axis=1), 'W_in')
    self.W_hid_stacked = shared(
        np.concatenate([rst_w_hid, upd_w_hid, cand_w_hid], axis=1), 'W_hid')
    self.b_stacked = shared(
        np.concatenate([rst_b, upd_b, cand_b], axis=0), name='b')

    # Parameters applied to the input vs. to the recurrent state.
    self.in_params = [self.W_in_stacked, self.b_stacked]
    self.rec_params = [self.W_hid_stacked]
def __init__(self, incoming, num_units,
             ingate=Gate(), forgetgate=Gate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True,
             gradient_steps=-1, grad_clipping=0, unroll_scan=False,
             precompute_input=True, mask_input=None,
             only_return_final=False,
             W_dt=init.GlorotUniform(), b_dt=init.Constant(0.),
             nonlinearity_dt=nonlinearities.rectify, num_dt_layers=1,
             **kwargs):
    """Initialize the parent LSTM layer and add the extra ``dt`` parameters.

    All standard LSTM arguments are handed to the parent constructor
    unchanged. On top of that, ``num_dt_layers`` weight matrices of shape
    ``(num_units, num_units)`` and as many biases of shape ``(1, num_units)``
    are registered.

    Args:
        W_dt: Initializer for each ``W_dt`` matrix.
        b_dt: Initializer for each ``b_dt`` bias (not regularizable).
        nonlinearity_dt: Nonlinearity stored as ``self.nonlinearity_dt``;
            ``None`` selects the identity.
        num_dt_layers: Number of ``(W_dt, b_dt)`` pairs to create.
        **kwargs: Forwarded to the parent constructor.

    The remaining arguments are passed positionally to the parent
    constructor.
    """
    super(LSTMDTLayer, self).__init__(
        incoming, num_units, ingate, forgetgate, cell, outgate,
        nonlinearity, cell_init, hid_init, backwards, learn_init,
        peepholes, gradient_steps, grad_clipping, unroll_scan,
        precompute_input, mask_input, only_return_final, **kwargs)

    if nonlinearity_dt is None:
        self.nonlinearity_dt = nonlinearities.identity
    else:
        self.nonlinearity_dt = nonlinearity_dt

    self.num_dt_layers = num_dt_layers

    # All weight matrices are registered before any bias.
    dt_weights = []
    for _ in range(self.num_dt_layers):
        dt_weights.append(
            self.add_param(W_dt, (num_units, num_units), name="W_dt"))
    self.W_dt = dt_weights

    dt_biases = []
    for _ in range(self.num_dt_layers):
        dt_biases.append(
            self.add_param(b_dt, (1, num_units), name="b_dt",
                           regularizable=False))
    self.b_dt = dt_biases
def _get_l_out(self, input_vars):
    """Build the listener network: embedding, one recurrent layer, two dense.

    Args:
        input_vars: Sequence whose first element is the input variable of
            the description-token input layer.

    Returns:
        A tuple ``(l_out, [l_in])`` holding the 3-unit softmax output layer
        and the list of input layers.
    """
    listener.check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    def with_dropout(layer, suffix):
        """Wrap `layer` in dropout when the configured rate is positive."""
        if opts.listener_dropout > 0.0:
            return DropoutLayer(layer, p=opts.listener_dropout,
                                name=id_tag + suffix)
        return layer

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=input_vars[0],
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(l_in,
                                input_size=len(self.seq_vec.tokens),
                                output_size=opts.listener_cell_size,
                                name=id_tag + 'desc_embed')

    cell = CELLS[opts.listener_cell]
    cell_kwargs = {
        'grad_clipping': opts.listener_grad_clipping,
        'num_units': opts.listener_cell_size,
    }
    # Only the LSTM cell receives a forget-gate bias; the GRU cell receives
    # no `nonlinearity` argument.
    if opts.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.listener_nonlinearity]

    l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
    l_rec1_drop = with_dropout(l_rec1, 'rec1_drop')

    l_hidden = DenseLayer(
        l_rec1_drop,
        num_units=opts.listener_cell_size,
        nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
        name=id_tag + 'hidden')
    l_hidden_drop = with_dropout(l_hidden, 'hidden_drop')

    l_out = DenseLayer(l_hidden_drop, num_units=3, nonlinearity=softmax,
                       name=id_tag + 'scores')
    return l_out, [l_in]
def _get_l_out(self, input_vars):
    """Build the context listener network.

    The description tokens go through an embedding and two stacked recurrent
    layers (the second returning only its final state). The result is
    concatenated with the context representation obtained from
    ``self.color_vec.get_input_layer``, passed through
    ``listener_hidden_color_layers`` NIN layers and scored with a softmax
    over ``self.context_len`` units.

    Args:
        input_vars: Sequence whose first element is the description input
            variable; the remaining elements are the context variables.

    Returns:
        A tuple ``(l_scores, input_layers)`` where ``input_layers`` is the
        description input layer followed by the context input layers.
    """
    check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    def with_dropout(layer, suffix):
        """Wrap `layer` in dropout when the configured rate is positive."""
        if opts.listener_dropout > 0.0:
            return DropoutLayer(layer, p=opts.listener_dropout,
                                name=id_tag + suffix)
        return layer

    input_var = input_vars[0]
    context_vars = input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=input_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(l_in,
                                input_size=len(self.seq_vec.tokens),
                                output_size=opts.listener_cell_size,
                                name=id_tag + 'desc_embed')

    cell = CELLS[opts.listener_cell]
    cell_kwargs = {
        'grad_clipping': opts.listener_grad_clipping,
        'num_units': opts.listener_cell_size,
    }
    if opts.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.listener_nonlinearity]

    l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
    l_rec1_drop = with_dropout(l_rec1, 'rec1_drop')

    # To drop the second recurrent layer, give l_rec1 only_return_final=True
    # and use l_rec1_drop in place of l_rec2_drop below.
    l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2',
                  only_return_final=True, **cell_kwargs)
    l_rec2_drop = with_dropout(l_rec2, 'rec2_drop')

    # Context repr has shape (batch_size, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        cell_size=opts.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    l_concat = ConcatLayer([l_context_repr, l_rec2_drop], axis=1,
                           name=id_tag + 'concat_context_rec2')

    l_hidden_drop = l_concat
    for i in range(1, opts.listener_hidden_color_layers + 1):
        l_hidden = NINLayer(
            l_hidden_drop,
            num_units=opts.listener_cell_size,
            nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
            name=id_tag + 'hidden_combined%d' % i)
        # Every iteration uses the same dropout layer name (no index).
        l_hidden_drop = with_dropout(l_hidden, 'hidden_drop')

    l_scores = DenseLayer(l_hidden_drop, num_units=self.context_len,
                          nonlinearity=softmax, name=id_tag + 'scores')
    return l_scores, [l_in] + context_inputs
def __init__(self, incoming, num_units,
             resetgate=Gate(W_cell=None),
             updategate=Gate(W_cell=None),
             hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             hid_init=init.Constant(0.), cov_init=init.Constant(0.),
             backwards=False, learn_init=False, gradient_steps=-1,
             grad_clipping=0, unroll_scan=False, precompute_input=True,
             mask_input=None, only_return_final=False, **kwargs):
    """Set up a GRU layer with an additional ``cov_init`` initial state.

    The layer input is always the first incoming. A mask layer, an initial
    hidden-state layer and an initial ``cov`` layer are appended as further
    incomings when supplied; the position of each is recorded in the
    matching ``*_incoming_index`` attribute (``-1`` when absent).

    Args:
        incoming: Layer feeding this layer.
        num_units: Number of hidden units.
        resetgate, updategate, hidden_update: Gate specifications.
        hid_init: Initializer, or a ``Layer``, for the initial hidden state.
        cov_init: Initializer, or a ``Layer``, for the initial ``cov`` state
            (presumably a coverage vector -- TODO confirm).
        backwards, learn_init, gradient_steps, grad_clipping, unroll_scan,
            precompute_input, only_return_final: Stored on the instance.
        mask_input: Optional mask layer.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``unroll_scan`` is set together with a
            ``gradient_steps`` other than -1, or with an input whose
            sequence length is ``None``.
    """
    incomings = [incoming]

    def register_incoming(layer, present):
        """Append `layer` when present; return its index or -1."""
        if not present:
            return -1
        incomings.append(layer)
        return len(incomings) - 1

    self.mask_incoming_index = register_incoming(
        mask_input, mask_input is not None)
    self.hid_init_incoming_index = register_incoming(
        hid_init, isinstance(hid_init, Layer))
    self.cov_init_incoming_index = register_incoming(
        cov_init, isinstance(cov_init, Layer))

    # Initialize parent layer
    super(GRULayer, self).__init__(incomings, **kwargs)

    self.learn_init = learn_init
    self.num_units = num_units
    self.grad_clipping = grad_clipping
    self.backwards = backwards
    self.gradient_steps = gradient_steps
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Shape of the main input; everything past the first two axes is
    # flattened into the per-step input size.
    input_shape = self.input_shapes[0]
    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")
    num_inputs = np.prod(input_shape[2:])

    def gate_params(gate, label):
        """Register W_in, W_hid and b (in that order) for one gate."""
        w_in = self.add_param(gate.W_in, (num_inputs, num_units),
                              name="W_in_to_{}".format(label))
        w_hid = self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(label))
        bias = self.add_param(gate.b, (num_units,),
                              name="b_{}".format(label),
                              regularizable=False)
        return w_in, w_hid, bias

    # Registration order: update gate, reset gate, hidden update.
    (self.W_in_to_updategate, self.W_hid_to_updategate,
     self.b_updategate) = gate_params(updategate, 'updategate')
    self.nonlinearity_updategate = updategate.nonlinearity

    (self.W_in_to_resetgate, self.W_hid_to_resetgate,
     self.b_resetgate) = gate_params(resetgate, 'resetgate')
    self.nonlinearity_resetgate = resetgate.nonlinearity

    (self.W_in_to_hidden_update, self.W_hid_to_hidden_update,
     self.b_hidden_update) = gate_params(hidden_update, 'hidden_update')
    self.nonlinearity_hid = hidden_update.nonlinearity

    # Initial states: keep a supplied layer as is, otherwise register a
    # (1, num_units) parameter that is trainable only when learn_init is set.
    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)

    if isinstance(cov_init, Layer):
        self.cov_init = cov_init
    else:
        self.cov_init = self.add_param(
            cov_init, (1, self.num_units), name="cov_init",
            trainable=learn_init, regularizable=False)
def get_rnn(input_var, mask_var, time_var, arch_size, GRAD_CLIP=100, bn=False,
            model_type='plstm'):
    """Build a recurrent classifier using either a PLSTM or an LSTM layer.

    Args:
        input_var: Variable for the input layer of shape (None, None, 1),
            i.e. (batch size, max sequence length, number of features).
        mask_var: Variable for the mask layer of shape (None, None).
        time_var: Variable for the time layer of shape (None, None).
        arch_size: Indexable; element 1 is the number of recurrent units and
            element 2 the number of units of the dense layer.
        GRAD_CLIP: Passed as ``grad_clipping`` to the recurrent layer.
        bn: Passed as ``bn`` to the recurrent layer.
        model_type: ``'plstm'`` selects ``PLSTMLayer``; any other value
            selects ``LSTMWBNLayer`` fed with the input concatenated with
            the time values.

    Returns:
        The softmax output layer.
    """
    layers = lasagne.layers
    nonlin = lasagne.nonlinearities

    # (batch size, max sequence length, number of features)
    l_in = layers.InputLayer(shape=(None, None, 1), input_var=input_var)
    # Mask and time are matrices of dimensionality (N_BATCH, MAX_LENGTH)
    l_mask = layers.InputLayer(shape=(None, None), input_var=mask_var)
    l_t = layers.InputLayer(shape=(None, None), input_var=time_var)

    # Symbolic sizes, so that arbitrary batch sizes and lengths are allowed
    batch_size, seq_len, _ = input_var.shape

    def lstm_kwargs():
        """Keyword arguments common to both recurrent layer types."""
        return dict(
            num_units=arch_size[1],
            mask_input=l_mask,
            ingate=Gate(b=lasagne.init.Constant(-0.1)),
            forgetgate=Gate(b=lasagne.init.Constant(0),
                            nonlinearity=nonlin.sigmoid),
            cell=Gate(W_cell=None, nonlinearity=nonlin.tanh),
            outgate=Gate(),
            nonlinearity=nonlin.tanh,
            grad_clipping=GRAD_CLIP,
            bn=bn,
        )

    if model_type == 'plstm':
        print('Using PLSTM.')
        l_forward = PLSTMLayer(
            l_in,
            time_input=l_t,
            learn_time_params=[True, True, True],
            timegate=PLSTMTimeGate(
                Period=ExponentialUniformInit((1, 3)),
                Shift=lasagne.init.Uniform((0., 100)),
                On_End=lasagne.init.Constant(0.05)),
            **lstm_kwargs())
    else:
        print('Using LSTM, with BN: {}'.format(bn))
        # The plain LSTM receives the time value as an extra input feature.
        l_time_feature = layers.ReshapeLayer(l_t, [batch_size, seq_len, 1])
        l_with_time = layers.ConcatLayer([l_in, l_time_feature], axis=2)
        l_forward = LSTMWBNLayer(l_with_time, **lstm_kwargs())

    # Keep only the last time step of the recurrent output
    l_slice = layers.SliceLayer(l_forward, -1, axis=1)

    # Dense layer with leaky rectifier, followed by a softmax nonlinearity
    l_dense = layers.DenseLayer(l_slice, num_units=arch_size[2],
                                nonlinearity=nonlin.leaky_rectify)
    l_out = layers.NonlinearityLayer(l_dense, nonlinearity=nonlin.softmax)
    return l_out
def _get_l_out(self, input_vars):
    """Build the speaker network that predicts a token distribution per step.

    The color representation is passed through optional NIN layers,
    concatenated with the embedded previous tokens along axis 2 and fed to a
    stack of recurrent layers. The recurrent output is flattened, passed
    through optional dense layers and a softmax over the vocabulary, then
    reshaped to ``(-1, max_len - 1, vocabulary size)``.

    Args:
        input_vars: Sequence of variables; the last two are the previous
            output tokens and the mask, everything before them is handed to
            ``self.color_vec.get_input_layer``.

    Returns:
        A tuple ``(l_out, input_layers)`` where ``input_layers`` is the
        color input layers followed by the previous-token and mask layers.
    """
    check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    prev_output_var, mask_var = input_vars[-2:]
    color_input_vars = input_vars[:-2]
    steps = self.seq_vec.max_len - 1

    if hasattr(self, 'context_len'):
        context_len = self.context_len
    else:
        context_len = 1

    l_color_repr, color_inputs = self.color_vec.get_input_layer(
        color_input_vars,
        recurrent_length=steps,
        cell_size=opts.speaker_cell_size,
        context_len=context_len,
        id=self.id)

    # Swap the last two axes for the NIN layers, then swap them back.
    l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
    for i in range(1, opts.speaker_hidden_color_layers + 1):
        l_hidden_color = NINLayer(
            l_hidden_color,
            num_units=opts.speaker_cell_size,
            nonlinearity=NONLINEARITIES[opts.speaker_nonlinearity],
            name=id_tag + 'hidden_color%d' % i)
    l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

    l_prev_out = InputLayer(shape=(None, steps),
                            input_var=prev_output_var,
                            name=id_tag + 'prev_input')
    l_prev_embed = EmbeddingLayer(l_prev_out,
                                  input_size=len(self.seq_vec.tokens),
                                  output_size=opts.speaker_cell_size,
                                  name=id_tag + 'prev_embed')
    l_in = ConcatLayer([l_hidden_color, l_prev_embed], axis=2,
                       name=id_tag + 'color_prev')
    l_mask_in = InputLayer(shape=(None, steps), input_var=mask_var,
                           name=id_tag + 'mask_input')

    cell = CELLS[opts.speaker_cell]
    if opts.speaker_no_mask:
        mask_layer = None
    else:
        mask_layer = l_mask_in
    cell_kwargs = {
        'mask_input': mask_layer,
        'grad_clipping': opts.speaker_grad_clipping,
        'num_units': opts.speaker_cell_size,
    }
    if opts.speaker_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.speaker_forget_bias))
    if opts.speaker_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.speaker_nonlinearity]

    # All recurrent layers but the last may be followed by dropout.
    l_rec_drop = l_in
    for i in range(1, opts.speaker_recurrent_layers):
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
        if opts.speaker_dropout > 0.0:
            l_rec_drop = DropoutLayer(l_rec, p=opts.speaker_dropout,
                                      name=id_tag + 'rec%d_drop' % i)
        else:
            l_rec_drop = l_rec
    l_rec = cell(l_rec_drop,
                 name=id_tag + 'rec%d' % opts.speaker_recurrent_layers,
                 **cell_kwargs)

    # Flatten to one row per (example, step) for the dense layers.
    l_hidden_out = ReshapeLayer(l_rec, (-1, opts.speaker_cell_size),
                                name=id_tag + 'reshape')
    for i in range(1, opts.speaker_hidden_out_layers + 1):
        l_hidden_out = DenseLayer(
            l_hidden_out,
            num_units=opts.speaker_cell_size,
            nonlinearity=NONLINEARITIES[opts.speaker_nonlinearity],
            name=id_tag + 'hidden_out%d' % i)

    vocab_size = len(self.seq_vec.tokens)
    l_softmax = DenseLayer(l_hidden_out, num_units=vocab_size,
                           nonlinearity=softmax, name=id_tag + 'softmax')
    l_out = ReshapeLayer(
        l_softmax, (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
        name=id_tag + 'out')

    return l_out, color_inputs + [l_prev_out, l_mask_in]
def _get_l_out(self, input_vars):
    """Build the speaker network with a recurrent context encoder.

    The color representation is reshaped to
    ``(num_contexts, num_colors, color_vec.output_size)`` and summarized by
    a recurrent cell that returns only its final state. That summary is
    repeated ``max_len - 1`` times, concatenated with the embedded previous
    tokens and fed to the recurrent decoder stack, followed by optional
    dense layers and a softmax over the vocabulary.

    Args:
        input_vars: Sequence of variables; the last three are the color
            mask, the previous output tokens and the token mask. Everything
            before them is handed to ``self.color_vec.get_input_layer``.

    Returns:
        A tuple ``(l_out, input_layers)`` where ``input_layers`` is the
        color input layers followed by the color-mask, previous-token and
        token-mask layers.
    """
    check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    color_mask_var, prev_output_var, mask_var = input_vars[-3:]
    color_input_vars = input_vars[:-3]
    steps = self.seq_vec.max_len - 1

    # Sizes are taken from the color mask variable.
    num_contexts = color_mask_var.shape[0]
    num_colors = color_mask_var.shape[1]

    l_color_repr, color_inputs = self.color_vec.get_input_layer(
        color_input_vars,
        recurrent_length=0,
        cell_size=opts.speaker_cell_size,
        context_len=None,
        id=self.id)
    l_color_reshaped = ReshapeLayer(
        l_color_repr,
        (num_contexts, num_colors, self.color_vec.output_size),
        name=id_tag + 'color_reshaped')
    l_color_mask_in = InputLayer(shape=(None, None),
                                 input_var=color_mask_var,
                                 name=id_tag + 'color_mask')

    cell = CELLS[opts.speaker_cell]
    if opts.speaker_no_mask:
        context_mask = None
    else:
        context_mask = l_color_mask_in
    cell_kwargs = {
        'mask_input': context_mask,
        'grad_clipping': opts.speaker_grad_clipping,
        'num_units': opts.speaker_cell_size,
    }
    if opts.speaker_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.speaker_forget_bias))
    if opts.speaker_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.speaker_nonlinearity]

    # Context encoder: only the final state is kept, then tiled.
    l_context_out = cell(l_color_reshaped, name=id_tag + 'reccontext',
                         only_return_final=True, **cell_kwargs)
    l_context_tiled = RepeatLayer(l_context_out, steps,
                                  name=id_tag + 'reccontext_tiled')

    l_prev_out = InputLayer(shape=(None, steps),
                            input_var=prev_output_var,
                            name=id_tag + 'prev_input')
    l_prev_embed = EmbeddingLayer(l_prev_out,
                                  input_size=len(self.seq_vec.tokens),
                                  output_size=opts.speaker_cell_size,
                                  name=id_tag + 'prev_embed')
    l_in = ConcatLayer([l_context_tiled, l_prev_embed], axis=2,
                       name=id_tag + 'color_prev')
    l_mask_in = InputLayer(shape=(None, steps), input_var=mask_var,
                           name=id_tag + 'mask_input')

    # The decoder cells use the token mask instead of the color mask.
    if opts.speaker_no_mask:
        cell_kwargs['mask_input'] = None
    else:
        cell_kwargs['mask_input'] = l_mask_in

    # All recurrent layers but the last may be followed by dropout.
    l_rec_drop = l_in
    for i in range(1, opts.speaker_recurrent_layers):
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
        if opts.speaker_dropout > 0.0:
            l_rec_drop = DropoutLayer(l_rec, p=opts.speaker_dropout,
                                      name=id_tag + 'rec%d_drop' % i)
        else:
            l_rec_drop = l_rec
    l_rec = cell(l_rec_drop,
                 name=id_tag + 'rec%d' % opts.speaker_recurrent_layers,
                 **cell_kwargs)

    # Flatten to one row per (example, step) for the dense layers.
    l_hidden_out = ReshapeLayer(l_rec, (-1, opts.speaker_cell_size),
                                name=id_tag + 'reshape')
    for i in range(1, opts.speaker_hidden_out_layers + 1):
        l_hidden_out = DenseLayer(
            l_hidden_out,
            num_units=opts.speaker_cell_size,
            nonlinearity=NONLINEARITIES[opts.speaker_nonlinearity],
            name=id_tag + 'hidden_out%d' % i)

    l_softmax = DenseLayer(l_hidden_out,
                           num_units=len(self.seq_vec.tokens),
                           nonlinearity=softmax, name=id_tag + 'softmax')
    l_out = ReshapeLayer(
        l_softmax, (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
        name=id_tag + 'out')

    return l_out, color_inputs + [l_color_mask_in, l_prev_out, l_mask_in]
def __init__(self, incoming, num_units, time_input, duration_input,
             ingate=Gate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=OutGate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             hid1_init=init.Constant(0.), hid2_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True,
             gradient_steps=-1, grad_clipping=0, unroll_scan=False,
             precompute_input=True, mask_input=None,
             only_return_final=False, bn=False,
             tgate1=TimeGate(W_t=init.Uniform((-1, 0))),
             boundary=-0.00001, dgate2=DurationGate(), wgate=WGate(),
             **kwargs):
    """Register the incomings and parameters of the layer.

    Besides the usual input gate, cell and output gate, the layer owns a
    time gate (``tgate1``), a duration gate (``dgate2``) and an extra weight
    gate (``wgate``).

    Args:
        incoming: Input layer; per the original notes its output is
            (batch size, SEQ_LENGTH, num_features).
        num_units: Number of hidden units.
        time_input: Layer providing the time values, noted as
            (batch size, SEQ_LENGTH). Always an incoming.
        duration_input: Layer providing the durations, noted as
            (batch size, SEQ_LENGTH). Always an incoming.
        ingate, cell, outgate: Gate specifications; ``outgate`` additionally
            provides ``W_to``.
        nonlinearity: Output nonlinearity; ``None`` selects the identity.
        cell_init, hid_init, hid1_init, hid2_init: Initializer, or a
            ``Layer``, for each initial state.
        mask_input: Optional mask layer marking the valid steps, noted as
            (batch size, SEQ_LENGTH).
        bn: When true, a ``BatchNormLayer`` is created and its parameters
            are merged into this layer's parameters.
        tgate1: Time gate specification.
        boundary: Stored as ``self.boundary``; its purpose is not documented
            in the original code beyond the remark "constraint ceil".
        dgate2: Duration gate specification.
        wgate: Extra weight gate specification.
        backwards, learn_init, peepholes, gradient_steps, grad_clipping,
            unroll_scan, precompute_input, only_return_final: Stored on the
            instance.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``unroll_scan`` is set together with a
            ``gradient_steps`` other than -1, or with an input whose
            sequence length is ``None``.
    """
    # The input, the time and the duration are mandatory incomings, in that
    # order.
    incomings = [incoming, time_input, duration_input]
    self.time_incoming_index = 1
    self.duration_incoming_index = 2

    def register_incoming(layer, present):
        """Append `layer` when present; return its index or -1."""
        if not present:
            return -1
        incomings.append(layer)
        return len(incomings) - 1

    # The parent is a MergeLayer, so optional inputs are simply appended.
    # NOTE(review): hid1_init / hid2_init are never added to the incomings,
    # even when they are layers.
    self.mask_incoming_index = register_incoming(
        mask_input, mask_input is not None)
    self.hid_init_incoming_index = register_incoming(
        hid_init, isinstance(hid_init, Layer))
    self.cell_init_incoming_index = register_incoming(
        cell_init, isinstance(cell_init, Layer))

    # Initialize parent layer
    super(VDTLSTMEMLayer, self).__init__(incomings, **kwargs)

    # If the provided nonlinearity is None, make it linear
    self.nonlinearity = (nonlinearities.identity if nonlinearity is None
                         else nonlinearity)

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final
    self.boundary = boundary

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Validate the shape of the main input.
    input_shape = self.input_shapes[0]
    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    # Product of all input dimensions after the first two axes.
    num_inputs = np.prod(input_shape[2:])

    add = self.add_param

    def recurrent_gate(gate, label):
        """Register W_in, W_hid and b (in that order) for a plain gate."""
        w_in = add(gate.W_in, (num_inputs, num_units),
                   name="W_in_to_{}".format(label))
        w_hid = add(gate.W_hid, (num_units, num_units),
                    name="W_hid_to_{}".format(label))
        bias = add(gate.b, (num_units, ),
                   name="b_{}".format(label), regularizable=False)
        return w_in, w_hid, bias

    # Input gate
    (self.W_in_to_ingate, self.W_hid_to_ingate,
     self.b_ingate) = recurrent_gate(ingate, 'ingate')
    self.nonlinearity_ingate = ingate.nonlinearity

    # Cell
    (self.W_in_to_cell, self.W_hid_to_cell,
     self.b_cell) = recurrent_gate(cell, 'cell')
    self.nonlinearity_cell = cell.nonlinearity

    # Output gate: like a plain gate plus a (1, num_units) W_to weight.
    self.W_in_to_outgate = add(outgate.W_in, (num_inputs, num_units),
                               name="W_in_to_outgate")
    self.W_hid_to_outgate = add(outgate.W_hid, (num_units, num_units),
                                name="W_hid_to_outgate")
    self.W_to_to_outgate = add(outgate.W_to, (1, num_units),
                               name="W_to_to_outgate")
    self.b_outgate = add(outgate.b, (num_units, ), name="b_outgate",
                         regularizable=False)
    self.nonlinearity_outgate = outgate.nonlinearity

    # Time gate
    self.W_t1_to_tg1 = add(tgate1.W_t, (1, num_units),
                           name="W_t_to_tgate1")
    self.W_x1_to_tg1 = add(tgate1.W_x, (num_inputs, num_units),
                           name="W_x_to_tgate1")
    self.b1_tg1 = add(tgate1.b, (num_units, ), name="b_tgate1")
    self.nonlinearity_inside_tg1 = tgate1.nonlinearity_inside
    self.nonlinearity_outside_tg1 = tgate1.nonlinearity_outside

    # Duration gate
    self.W_d2_to_dg2 = add(dgate2.W_d, (1, num_units),
                           name="W_d_to_dgate2")
    self.W_x2_to_dg2 = add(dgate2.W_x, (num_inputs, num_units),
                           name="W_x_to_dgate2")
    self.b2_dg2 = add(dgate2.b, (num_units, ), name="b_dgate2")
    self.nonlinearity_dg2 = dgate2.nonlinearity
    self.nonlinearity_outside_dg2 = dgate2.nonlinearity_outside

    # Extra weight gate
    self.W_x_wg = add(wgate.W_x, (num_units, num_units), name="W_x_wgate")
    self.b_wg = add(wgate.b, (num_units, ), name="W_b_wgate",
                    regularizable=False)
    self.nonlinearity_wg = wgate.nonlinearity

    # Peephole (cell to gate) weights for the input and output gates.
    if self.peepholes:
        self.W_cell_to_ingate = add(ingate.W_cell, (num_units, ),
                                    name="W_cell_to_ingate")
        self.W_cell_to_outgate = add(outgate.W_cell, (num_units, ),
                                     name="W_cell_to_outgate")

    # Initial states: keep a supplied layer as is, otherwise register a
    # (1, num_units) parameter that is trainable only when learn_init is set.
    initial_states = (
        ("cell_init", cell_init),
        ("hid_init", hid_init),
        ("hid1_init", hid1_init),
        ("hid2_init", hid2_init),
    )
    for attr, state_init in initial_states:
        if isinstance(state_init, Layer):
            setattr(self, attr, state_init)
        else:
            setattr(self, attr,
                    add(state_init, (1, self.num_units), name=attr,
                        trainable=learn_init, regularizable=False))

    # With bn enabled, create a batch normalization layer for the input
    # shape and merge its parameters into this layer's parameters.
    if bn:
        self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
        self.params.update(self.bn.params)
    else:
        self.bn = False
def __init__(self, x, hid_previous, num_units,
             resetgate=Gate(W_cell=None),
             updategate=Gate(W_cell=None),
             hidden_update=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             hid_init=init.Constant(0.), learn_init=False, grad_clipping=0,
             unroll_scan=False, **kwargs):
    """Set up a single-step GRU cell over ``x`` and ``hid_previous``.

    Args:
        x: Layer providing the input of the current step.
        hid_previous: Layer providing the previous hidden state; its last
            output dimension must equal ``num_units``.
        num_units: Number of hidden units.
        resetgate, updategate, hidden_update: Gate specifications.
        hid_init: Initializer for the ``hid_init`` parameter.
        learn_init: Whether ``hid_init`` is trainable.
        grad_clipping: Stored on the instance.
        unroll_scan: Stored on the instance. The original body read this
            name without declaring it, which raised a NameError; it is now
            an explicit keyword argument defaulting to ``False``.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If the last dimension of ``hid_previous`` differs from
            ``num_units``, or if the first dimensions of ``x`` and
            ``hid_previous`` differ.
    """
    if hid_previous.output_shape[-1] != num_units:
        raise ValueError('Number of hid_previous inputs should be the '
                         'same as num_units_gru')
    if x.output_shape[0] != hid_previous.output_shape[0]:
        raise ValueError('first dimension output of x and hid_previous '
                         'should be equal')

    # Initialize parent layer
    super(GRUCell, self).__init__([x, hid_previous], **kwargs)

    self.learn_init = learn_init
    self.num_units = num_units
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan

    # Input dimensionality: product of all dimensions of x after the first.
    input_shape_x = self.input_shapes[0]
    num_inputs_x = np.prod(input_shape_x[1:])

    def add_gate_params(gate, gate_name):
        """Register W_in, W_hid and b for a gate; also return its
        nonlinearity."""
        return (self.add_param(gate.W_in, (num_inputs_x, num_units),
                               name="W_in_to_{}".format(gate_name)),
                self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(gate_name)),
                self.add_param(gate.b, (num_units, ),
                               name="b_{}".format(gate_name),
                               regularizable=False),
                gate.nonlinearity)

    # Add in all parameters from gates
    (self.W_in_to_updategate, self.W_hid_to_updategate, self.b_updategate,
     self.nonlinearity_updategate) = add_gate_params(
         updategate, 'updategate')
    (self.W_in_to_resetgate, self.W_hid_to_resetgate, self.b_resetgate,
     self.nonlinearity_resetgate) = add_gate_params(
         resetgate, 'resetgate')
    (self.W_in_to_hidden_update, self.W_hid_to_hidden_update,
     self.b_hidden_update, self.nonlinearity_hid) = add_gate_params(
         hidden_update, 'hidden_update')

    # Initialize hidden state
    self.hid_init = self.add_param(hid_init, (1, self.num_units),
                                   name="hid_init", trainable=learn_init,
                                   regularizable=False)

    # Stack input weight matrices into a (num_inputs, 3*num_units_gru)
    # matrix, which speeds up computation
    self.W_in_stacked = T.concatenate([
        self.W_in_to_resetgate, self.W_in_to_updategate,
        self.W_in_to_hidden_update
    ], axis=1)

    # Same for hidden weight matrices
    self.W_hid_stacked = T.concatenate([
        self.W_hid_to_resetgate, self.W_hid_to_updategate,
        self.W_hid_to_hidden_update
    ], axis=1)

    # Stack gate biases into a (3*num_units_gru) vector
    self.b_stacked = T.concatenate(
        [self.b_resetgate, self.b_updategate, self.b_hidden_update],
        axis=0)
def __init__(self, incoming, time_input, num_units,
             ingate=Gate(b=lasagne.init.Constant(0)),
             forgetgate=Gate(b=lasagne.init.Constant(2),
                             nonlinearity=nonlinearities.sigmoid),
             timegate=PLSTMTimeGate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True,
             gradient_steps=-1, grad_clipping=0, unroll_scan=False,
             precompute_input=True, mask_input=None,
             only_return_final=False, bn=False,
             learn_time_params=[True, True, False],
             off_alpha=1e-3, **kwargs):
    """Register the incomings and parameters of the phased LSTM layer.

    Args:
        incoming: Layer feeding this layer.
        time_input: Layer providing the time values; always the second
            incoming.
        num_units: Number of hidden units.
        ingate, forgetgate, cell, outgate: Gate specifications.
        timegate: Time gate specification providing ``Period``, ``Shift``
            and ``On_End``; ``None`` falls back to ``PLSTMTimeGate()``.
        nonlinearity: Output nonlinearity; ``None`` selects the identity.
        cell_init, hid_init: Initializer, or a ``Layer``, for the initial
            cell and hidden state.
        mask_input: Optional mask layer.
        bn: When true, a ``BatchNormLayer`` is created and its parameters
            are merged into this layer's parameters.
        learn_time_params: Three flags saying whether ``Period``, ``Shift``
            and ``On_End`` are trainable. Only read, never modified.
        off_alpha: Stored as ``self.off_alpha``.
        backwards, learn_init, peepholes, gradient_steps, grad_clipping,
            unroll_scan, precompute_input, only_return_final: Stored on the
            instance.
        **kwargs: Forwarded to the parent constructor.

    Raises:
        ValueError: If ``unroll_scan`` is set together with a
            ``gradient_steps`` other than -1, or with an input whose
            sequence length is ``None``.
    """
    # The layer input and the time input are mandatory; a mask, an initial
    # hidden state layer and an initial cell state layer are appended only
    # when provided.
    incomings = [incoming]
    incomings.append(time_input)
    self.time_incoming_index = len(incomings) - 1

    # -2 marks "not provided" for the optional incomings.
    self.mask_incoming_index = -2
    self.hid_init_incoming_index = -2
    self.cell_init_incoming_index = -2

    if mask_input is not None:
        incomings.append(mask_input)
        self.mask_incoming_index = len(incomings) - 1
    if isinstance(hid_init, Layer):
        incomings.append(hid_init)
        self.hid_init_incoming_index = len(incomings) - 1
    if isinstance(cell_init, Layer):
        incomings.append(cell_init)
        self.cell_init_incoming_index = len(incomings) - 1

    # Initialize parent layer
    super(PLSTMLayer, self).__init__(incomings, **kwargs)

    # If the provided nonlinearity is None, make it linear
    if nonlinearity is None:
        self.nonlinearity = nonlinearities.identity
    else:
        self.nonlinearity = nonlinearity

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Retrieve the dimensionality of the incoming layer
    input_shape = self.input_shapes[0]

    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    # Product of all input dimensions after the first two axes.
    num_inputs = np.prod(input_shape[2:])

    def add_gate_params(gate, gate_name):
        """Register W_in, W_hid and b for a gate; also return its
        nonlinearity."""
        return (self.add_param(gate.W_in, (num_inputs, num_units),
                               name="W_in_to_{}".format(gate_name)),
                self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(gate_name)),
                self.add_param(gate.b, (num_units,),
                               name="b_{}".format(gate_name),
                               regularizable=False),
                gate.nonlinearity)

    # PHASED LSTM: Initialize params for the time gate
    self.off_alpha = off_alpha
    if timegate is None:
        # Fall back to the same gate type as the signature default; it is
        # the one that carries the Period / Shift / On_End initializers
        # read below.
        timegate = PLSTMTimeGate()

    def add_timegate_params(gate, gate_name):
        """Register Period, Shift and On_End, each of shape (num_units,)."""
        return (self.add_param(gate.Period, (num_units, ),
                               name="Period_{}".format(gate_name),
                               trainable=learn_time_params[0]),
                self.add_param(gate.Shift, (num_units, ),
                               name="Shift_{}".format(gate_name),
                               trainable=learn_time_params[1]),
                self.add_param(gate.On_End, (num_units, ),
                               name="On_End_{}".format(gate_name),
                               trainable=learn_time_params[2]))

    print('Learnableness: {}'.format(learn_time_params))
    (self.period_timegate, self.shift_timegate,
     self.on_end_timegate) = add_timegate_params(timegate, 'timegate')

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate,
     self.b_forgetgate, self.nonlinearity_forgetgate) = add_gate_params(
         forgetgate, 'forgetgate')

    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')

    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    # If peephole (cell to gate) connections were enabled, initialize
    # peephole connections. These are elementwise products with the cell
    # state, so they are represented as vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Setup initial values for the cell and the hidden units
    if isinstance(cell_init, Layer):
        self.cell_init = cell_init
    else:
        self.cell_init = self.add_param(
            cell_init, (1, num_units), name="cell_init",
            trainable=learn_init, regularizable=False)

    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)

    if bn:
        # Create a BN layer for the input shape and make its parameters
        # part of this layer's parameters.
        self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
        self.params.update(self.bn.params)
    else:
        self.bn = False
def __init__(self, x, cell_previous, hid_previous, num_units,
             ingate=Gate(), forgetgate=Gate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             learn_init=False, peepholes=True, grad_clipping=0,
             unroll_scan=False, **kwargs):
    """Register the parameters of a single-step LSTM cell.

    The layer merges three incoming layers, in this order: the step input
    ``x``, the previous cell state and the previous hidden state.

    Parameters
    ----------
    x : Layer
        Step input.  Everything after its first dimension is flattened
        into the input feature count.
    cell_previous, hid_previous : Layer
        Previous cell / hidden state.  Their last dimension must equal
        ``num_units`` and their first dimension must match that of ``x``.
    num_units : int
        Number of hidden units.
    ingate, forgetgate, cell, outgate : Gate
        Supply the initializers and nonlinearity for each gate.
    nonlinearity : callable or None
        Output nonlinearity; ``None`` selects the identity.
    cell_init, hid_init
        Initializers registered as ``(1, num_units)`` parameters.
    learn_init : bool
        Whether ``cell_init`` / ``hid_init`` are trainable.
    peepholes : bool
        If true, cell-to-gate peephole vectors are registered.
    grad_clipping : number
        Stored on the instance as ``self.grad_clipping``.
    unroll_scan : bool
        Stored on the instance as ``self.unroll_scan``.  The attribute was
        previously assigned from an undefined name; it is now an explicit
        keyword that defaults to ``False``.
    **kwargs
        Forwarded to the parent layer constructor.

    Raises
    ------
    ValueError
        If the shapes of the three incoming layers are inconsistent.
    """
    if hid_previous.output_shape[-1] != num_units:
        raise ValueError('Number of hid_previous inputs should be the '
                         'same as num_units_lstm')
    if cell_previous.output_shape[-1] != num_units:
        raise ValueError('Number of cell_previous inputs should be the '
                         'same as num_units_lstm')
    if x.output_shape[0] != cell_previous.output_shape[0]:
        # The message used to name hid_previous although this check is
        # about cell_previous.
        raise ValueError('first dimension output of x and cell_previous '
                         'should be equal')
    if x.output_shape[0] != hid_previous.output_shape[0]:
        raise ValueError('first dimension output of x and hid_previous '
                         'should be equal')

    # Initialize parent layer
    super(LSTMCell, self).__init__([x, cell_previous, hid_previous],
                                   **kwargs)

    # If the provided nonlinearity is None, make it linear
    if nonlinearity is None:
        self.nonlinearity = nonlinearities.identity
    else:
        self.nonlinearity = nonlinearity

    self.learn_init = learn_init
    self.num_units = num_units
    self.peepholes = peepholes
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan

    # Input dimensionality is the flattened output of the input layer,
    # excluding its first dimension.
    input_shape_x = self.input_shapes[0]
    num_inputs_x = np.prod(input_shape_x[1:])

    def add_gate_params(gate, gate_name):
        """Register W_in, W_hid and b for one Gate; return them with its
        nonlinearity."""
        return (self.add_param(gate.W_in, (num_inputs_x, num_units),
                               name="W_in_to_{}".format(gate_name)),
                self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(gate_name)),
                self.add_param(gate.b, (num_units, ),
                               name="b_{}".format(gate_name),
                               regularizable=False),
                gate.nonlinearity)

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate,
     self.b_forgetgate, self.nonlinearity_forgetgate) = add_gate_params(
         forgetgate, 'forgetgate')

    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')

    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    # If peephole (cell to gate) connections were enabled, initialize
    # peephole connections.  These are elementwise products with the cell
    # state, so they are represented as vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Setup initial values for the cell and the hidden units
    self.cell_init = self.add_param(
        cell_init, (1, num_units), name="cell_init",
        trainable=learn_init, regularizable=False)
    self.hid_init = self.add_param(
        hid_init, (1, self.num_units), name="hid_init",
        trainable=learn_init, regularizable=False)

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    self.W_in_stacked = T.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    self.W_hid_stacked = T.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    self.b_stacked = T.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate], axis=0)
def __init__(
        self,
        incoming,
        time_input,
        num_units,
        ingate=Gate(),
        forgetgate=Gate(),
        tgate1=TimeGate(W_t=init.Uniform((-1, 0))),
        tgate2=TimeGate(),
        cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=OutGate(),
        nonlinearity=nonlinearities.tanh,
        cell_init=init.Constant(0.),
        hid_init=init.Constant(0.),
        backwards=False,
        learn_init=False,
        peepholes=True,
        gradient_steps=-1,
        grad_clipping=0,
        unroll_scan=False,
        precompute_input=True,
        mask_input=None,
        only_return_final=False,
        bn=False,
        boundary=-0.00001,  # constraint ceil
        **kwargs):
    """Register the inputs and parameters of the time-gated LSTM layer.

    The incoming list always starts with ``incoming`` and ``time_input``.
    ``mask_input`` (when given) and ``hid_init`` / ``cell_init`` (when they
    are layers) are appended after them, and the position of each is kept
    in the matching ``*_incoming_index`` attribute (``-1`` when absent).

    Parameters are registered in a fixed order: ingate, forgetgate, cell,
    outgate, tgate1, tgate2, the optional peephole vectors, ``cell_init``,
    ``hid_init`` and finally the optional batch-norm parameters.

    Raises
    ------
    ValueError
        If ``unroll_scan`` is set together with ``gradient_steps != -1``,
        or with an input whose sequence length is ``None``.
    """
    # Mandatory inputs first; the time input always sits at index 1.
    incomings = [incoming, time_input]
    self.time_incoming_index = 1

    # Optional inputs default to "absent" (-1) and take the next free
    # slot when supplied.
    self.mask_incoming_index = -1
    self.hid_init_incoming_index = -1
    self.cell_init_incoming_index = -1
    if mask_input is not None:
        self.mask_incoming_index = len(incomings)
        incomings.append(mask_input)
    if isinstance(hid_init, Layer):
        self.hid_init_incoming_index = len(incomings)
        incomings.append(hid_init)
    if isinstance(cell_init, Layer):
        self.cell_init_incoming_index = len(incomings)
        incomings.append(cell_init)

    super(TLSTM2Layer, self).__init__(incomings, **kwargs)

    # A missing output nonlinearity means "linear".
    self.nonlinearity = (nonlinearities.identity
                         if nonlinearity is None else nonlinearity)

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final
    self.boundary = boundary

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Shape of the main input; everything past the first two dimensions
    # is flattened into the feature count.
    input_shape = self.input_shapes[0]

    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    num_inputs = np.prod(input_shape[2:])

    def _register_gate(gate, label):
        """Register W_in, W_hid and b of a standard gate."""
        w_in = self.add_param(gate.W_in, (num_inputs, num_units),
                              name="W_in_to_{}".format(label))
        w_hid = self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(label))
        bias = self.add_param(gate.b, (num_units, ),
                              name="b_{}".format(label),
                              regularizable=False)
        return w_in, w_hid, bias, gate.nonlinearity

    def _register_outgate(gate, label):
        """Like ``_register_gate`` plus the (1, num_units) ``W_to``."""
        w_in = self.add_param(gate.W_in, (num_inputs, num_units),
                              name="W_in_to_{}".format(label))
        w_hid = self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(label))
        w_to = self.add_param(gate.W_to, (1, num_units),
                              name="W_to_to_{}".format(label))
        bias = self.add_param(gate.b, (num_units, ),
                              name="b_{}".format(label),
                              regularizable=False)
        return w_in, w_hid, w_to, bias, gate.nonlinearity

    def _register_timegate(gate, label):
        """Register W_t, W_x and b of a time gate."""
        w_t = self.add_param(gate.W_t, (1, num_units),
                             name="W_t_to_{}".format(label))
        w_x = self.add_param(gate.W_x, (num_inputs, num_units),
                             name="W_x_to_{}".format(label))
        bias = self.add_param(gate.b, (num_units, ),
                              name="b_{}".format(label))
        return (w_t, w_x, bias,
                gate.nonlinearity_inside, gate.nonlinearity_outside)

    # Standard gates.
    (self.W_in_to_ingate,
     self.W_hid_to_ingate,
     self.b_ingate,
     self.nonlinearity_ingate) = _register_gate(ingate, 'ingate')

    (self.W_in_to_forgetgate,
     self.W_hid_to_forgetgate,
     self.b_forgetgate,
     self.nonlinearity_forgetgate) = _register_gate(forgetgate,
                                                    'forgetgate')

    (self.W_in_to_cell,
     self.W_hid_to_cell,
     self.b_cell,
     self.nonlinearity_cell) = _register_gate(cell, 'cell')

    # Output gate with its extra W_to weight.
    (self.W_in_to_outgate,
     self.W_hid_to_outgate,
     self.W_to_to_outgate,
     self.b_outgate,
     self.nonlinearity_outgate) = _register_outgate(outgate, 'outgate')

    # The two time gates.
    (self.W_t1_to_tg1,
     self.W_x1_to_tg1,
     self.b1_tg1,
     self.nonlinearity_inside_tg1,
     self.nonlinearity_outside_tg1) = _register_timegate(tgate1, 'tgate1')

    (self.W_t2_to_tg2,
     self.W_x2_to_tg2,
     self.b2_tg2,
     self.nonlinearity_inside_tg2,
     self.nonlinearity_outside_tg2) = _register_timegate(tgate2, 'tgate2')

    # Peephole (cell to gate) connections are elementwise products with
    # the cell state, hence plain vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Initial cell / hidden state: either an incoming layer or a
    # (1, num_units) parameter.
    if isinstance(cell_init, Layer):
        self.cell_init = cell_init
    else:
        self.cell_init = self.add_param(
            cell_init, (1, num_units), name="cell_init",
            trainable=learn_init, regularizable=False)

    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)

    # Optional batch normalisation over the first two axes of the input;
    # its parameters are adopted as parameters of this layer.
    if bn:
        self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
        self.params.update(self.bn.params)
    else:
        self.bn = False
def get_rnn(event_var, feature_idx, feature_value, mask_var, time_var,
            arch_size, num_attention=0, embed_size=40, init_period=(1, 3),
            seq_len=1000, GRAD_CLIP=100, bn=False, model_type='LSTM',
            event_vocab_size=3418, feature_vocab_size=649):
    """Build the event-sequence classifier network.

    Parameters
    ----------
    event_var, feature_idx, feature_value, mask_var, time_var
        Theano variables bound to the five input layers.  The event, mask
        and time inputs are declared as ``(None, seq_len)``; the two
        feature inputs as ``(None, seq_len, 3)``.
    arch_size : sequence
        ``arch_size[1]`` is the recurrent layer width and ``arch_size[2]``
        the width of the final dense layer.  Index 0 is not read here.
    num_attention : int
        Forwarded to ``HELSTMLayer``.
    embed_size : int
        Output size of the event and feature-index embeddings.
    init_period : tuple
        Argument of ``ExponentialUniformInit`` for the time-gate period.
    seq_len : int
        Sequence length used in every input shape.
    GRAD_CLIP : number
        Forwarded to ``HELSTMLayer`` as ``grad_clipping``.
    bn : bool
        Forwarded to ``HELSTMLayer``.
    model_type : str
        Forwarded to ``HELSTMLayer`` as ``model``.  For ``"LSTM"`` the time
        input is additionally concatenated onto the merged embedding.
    event_vocab_size : int
        Number of rows of the event embedding (previously hard-coded).
    feature_vocab_size : int
        Number of rows of the three feature embeddings (previously
        hard-coded).

    Returns
    -------
    tuple
        ``(l_out, gate_params, embed_params)``.  ``gate_params`` is empty
        for ``model_type == 'LSTM'`` and ``l_forward.get_gate_params()``
        otherwise; ``embed_params`` lists the four embedding matrices.
    """
    # Input layers
    l_in_event = lasagne.layers.InputLayer(
        shape=(None, seq_len), input_var=event_var)
    l_in_feature_idx = lasagne.layers.InputLayer(
        shape=(None, seq_len, 3), input_var=feature_idx)
    l_in_feature_value = lasagne.layers.InputLayer(
        shape=(None, seq_len, 3), input_var=feature_value)
    l_mask = lasagne.layers.InputLayer(
        shape=(None, seq_len), input_var=mask_var)
    l_t = lasagne.layers.InputLayer(
        shape=(None, seq_len), input_var=time_var)

    # Event embedding
    embed_event = lasagne.layers.EmbeddingLayer(
        l_in_event, input_size=event_vocab_size, output_size=embed_size)
    # Feature-index embedding
    embed_feature_idx = lasagne.layers.EmbeddingLayer(
        l_in_feature_idx, input_size=feature_vocab_size,
        output_size=embed_size)
    # Per-feature scalar bias for the feature value
    embed_feature_b = lasagne.layers.EmbeddingLayer(
        l_in_feature_idx, input_size=feature_vocab_size, output_size=1)
    # Per-feature scalar transform for the feature value
    embed_feature_trans = lasagne.layers.EmbeddingLayer(
        l_in_feature_idx, input_size=feature_vocab_size, output_size=1)

    embed_params = [
        embed_event.W,
        embed_feature_idx.W,
        embed_feature_b.W,
        embed_feature_trans.W,
    ]

    # Combine the embeddings and raw feature values into one input
    l_in_merge = MergeEmbeddingLayer(embed_event, embed_feature_idx,
                                     embed_feature_b, embed_feature_trans,
                                     l_in_feature_value)

    if model_type == "LSTM":
        # The plain LSTM variant receives the time stamp as one extra
        # input feature.
        l_in_merge = lasagne.layers.ConcatLayer(
            [l_in_merge,
             lasagne.layers.ReshapeLayer(l_t, [-1, seq_len, 1])],
            axis=2)

    l_forward = HELSTMLayer(
        incoming=l_in_merge,
        time_input=l_t,
        event_input=embed_event,
        num_units=arch_size[1],
        num_attention=num_attention,
        model=model_type,
        mask_input=l_mask,
        ingate=Gate(),
        forgetgate=Gate(),
        cell=Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
        outgate=Gate(),
        nonlinearity=lasagne.nonlinearities.tanh,
        grad_clipping=GRAD_CLIP,
        bn=bn,
        only_return_final=True,
        timegate=HELSTMGate(
            Period=ExponentialUniformInit(init_period),
            Shift=lasagne.init.Uniform((0., 1000)),
            On_End=lasagne.init.Constant(0.05)))

    gate_params = []
    if model_type != 'LSTM':
        gate_params = l_forward.get_gate_params()

    # Dense layer followed by a softmax over its outputs
    l_dense = lasagne.layers.DenseLayer(
        l_forward, num_units=arch_size[2],
        nonlinearity=lasagne.nonlinearities.leaky_rectify)
    l_out = lasagne.layers.NonlinearityLayer(
        l_dense, nonlinearity=lasagne.nonlinearities.softmax)
    return l_out, gate_params, embed_params
def _get_l_out(self, input_vars, multi_utt=None):
    """Build the Gaussian-scoring listener network.

    ``input_vars[0]`` feeds the description input layer and the remaining
    variables are handed to ``get_embedding_layer``.  When ``multi_utt``
    is given, the input carries an extra utterance axis of that size,
    which is flattened for the recurrent layers and restored afterwards.

    As side effects, two compiled Theano functions are stored on the
    instance: ``self.gaussian_fn`` and ``self.repr_fn``.

    Returns
    -------
    tuple
        ``(l_scores, input_layers)`` where ``input_layers`` is the
        description input layer followed by the context input layers.
    """
    check_options(self.options)
    prefix = (self.id + '/') if self.id else ''

    desc_var, extra_vars = input_vars[0], input_vars[1:]
    has_utt_axis = multi_utt is not None

    if has_utt_axis:
        l_in = InputLayer(
            shape=(None, multi_utt, self.seq_vec.max_len),
            input_var=desc_var, name=prefix + 'desc_input')
        l_tokens = reshape(l_in, (-1, self.seq_vec.max_len),
                           name=prefix + 'input_flattened')
    else:
        l_in = InputLayer(
            shape=(None, self.seq_vec.max_len),
            input_var=desc_var, name=prefix + 'desc_input')
        l_tokens = l_in

    l_embedded, context_vars = self.get_embedding_layer(l_tokens,
                                                        extra_vars)

    # Recurrent cell class and the keyword arguments shared by both
    # directions.
    cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = \
            NONLINEARITIES[self.options.listener_nonlinearity]

    l_encoded = cell(l_embedded, name=prefix + 'rec1',
                     only_return_final=True, **cell_kwargs)
    if self.options.listener_bidi:
        l_encoded_back = cell(l_embedded, name=prefix + 'rec1_back',
                              backwards=True, only_return_final=True,
                              **cell_kwargs)
        l_encoded = ConcatLayer([l_encoded, l_encoded_back], axis=1,
                                name=prefix + 'rec1_bidi_concat')

    l_repr = l_encoded
    if self.options.listener_dropout > 0.0:
        l_repr = DropoutLayer(l_encoded,
                              p=self.options.listener_dropout,
                              name=prefix + 'rec1_drop')

    # (batch_size [ * multi_utt], repr_size)
    l_pred_mean = DenseLayer(l_repr,
                             num_units=self.color_vec.output_size,
                             nonlinearity=None,
                             name=prefix + 'pred_mean')
    # (batch_size [ * multi_utt], repr_size * repr_size)
    # The bias starts as a flattened identity matrix.
    identity_bias = np.eye(self.color_vec.output_size,
                           dtype=theano.config.floatX).ravel()
    l_pred_covar_vec = DenseLayer(
        l_repr,
        num_units=self.color_vec.output_size ** 2,
        b=identity_bias,
        nonlinearity=None,
        name=prefix + 'pred_covar_vec')
    # (batch_size [ * multi_utt], repr_size, repr_size)
    l_pred_covar = reshape(
        l_pred_covar_vec,
        ([0], self.color_vec.output_size, self.color_vec.output_size),
        name=prefix + 'pred_covar')

    if has_utt_axis:
        # Restore the utterance axis on both Gaussian parameters.
        l_pred_mean = reshape(
            l_pred_mean,
            (-1, multi_utt, self.color_vec.output_size),
            name=prefix + 'pred_mean_reshape')
        l_pred_covar = reshape(
            l_pred_covar,
            (-1, multi_utt,
             self.color_vec.output_size, self.color_vec.output_size),
            name=prefix + 'pred_covar_reshape')

    # Context repr has shape (batch_size, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id
    )
    l_context_points = reshape(
        l_context_repr,
        ([0], self.context_len, self.color_vec.output_size))

    # (batch_size, [multi_utt,] context_len)
    l_unnorm_scores = GaussianScoreLayer(l_context_points, l_pred_mean,
                                         l_pred_covar,
                                         name=prefix + 'gaussian_score')

    if has_utt_axis:
        l_unnorm_scores = reshape(l_unnorm_scores, (-1, self.context_len),
                                  name=prefix + 'gaussian_score_reshape')

    # (batch_size [ * multi_utt], context_len)
    # XXX: returning probs for normal models, log probs for AC model!
    # This is really surprising and definitely not the best solution.
    # We should be using log probs everywhere for stability...
    if has_utt_axis:
        final_softmax = logit_softmax_nd(axis=2)
    else:
        final_softmax = softmax
    l_scores = NonlinearityLayer(l_unnorm_scores,
                                 nonlinearity=final_softmax,
                                 name=prefix + 'scores')

    if has_utt_axis:
        # NOTE(review): this reshapes the *unnormalised* scores, so the
        # NonlinearityLayer built just above is not part of the returned
        # network in the multi-utterance case.  Confirm this is intended.
        l_scores = reshape(l_unnorm_scores,
                           (-1, multi_utt, self.context_len),
                           name=prefix + 'scores_reshape')

    self.gaussian_fn = theano.function(
        input_vars,
        [get_output(l_pred_mean, deterministic=True),
         get_output(l_pred_covar, deterministic=True),
         get_output(l_context_points, deterministic=True),
         get_output(l_unnorm_scores, deterministic=True)],
        name=prefix + 'gaussian', on_unused_input='ignore')

    self.repr_fn = theano.function(
        input_vars,
        get_output(l_repr, deterministic=True),
        name=prefix + 'repr', on_unused_input='ignore')

    return l_scores, [l_in] + context_inputs
def _get_l_out(self, input_vars, multi_utt='ignored'):
    """Build the listener network that feeds the context at every step.

    The context representation is passed through a stack of ``NINLayer``s,
    concatenated with the embedded description along axis 2, and run
    through two recurrent layers and a dense hidden layer.  The final
    layer is a softmax with ``self.context_len`` units.

    ``multi_utt`` is accepted but never read.

    Returns
    -------
    tuple
        ``(l_scores, input_layers)`` where ``input_layers`` is the
        description input layer followed by the context input layers.
    """
    check_options(self.options)
    prefix = (self.id + '/') if self.id else ''

    def _maybe_dropout(layer, label):
        """Wrap ``layer`` in dropout when the dropout option is positive."""
        if self.options.listener_dropout > 0.0:
            return DropoutLayer(layer, p=self.options.listener_dropout,
                                name=prefix + label)
        return layer

    desc_var, extra_vars = input_vars[0], input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=desc_var,
                      name=prefix + 'desc_input')
    l_embedded, context_vars = self.get_embedding_layer(l_in, extra_vars)

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=self.options.listener_cell_size,
        context_len=self.context_len,
        id=self.id
    )

    # Swap the last two axes for the NIN stack, then swap them back.
    l_context = dimshuffle(l_context_repr, (0, 2, 1))
    for depth in range(self.options.listener_hidden_color_layers):
        l_context = NINLayer(
            l_context,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[
                self.options.listener_nonlinearity],
            name=prefix + 'hidden_context%d' % (depth + 1))
    l_context = dimshuffle(l_context, (0, 2, 1))

    l_joined = ConcatLayer([l_context, l_embedded], axis=2,
                           name=prefix + 'concat_inp_context')

    # Recurrent cell class and its shared keyword arguments.
    cell = CELLS[self.options.listener_cell]
    cell_kwargs = {
        'grad_clipping': self.options.listener_grad_clipping,
        'num_units': self.options.listener_cell_size,
    }
    if self.options.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(self.options.listener_forget_bias))
    if self.options.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = \
            NONLINEARITIES[self.options.listener_nonlinearity]

    l_rec1 = cell(l_joined, name=prefix + 'rec1', **cell_kwargs)
    l_rec1_out = _maybe_dropout(l_rec1, 'rec1_drop')

    l_rec2 = cell(l_rec1_out, name=prefix + 'rec2', **cell_kwargs)
    l_rec2_out = _maybe_dropout(l_rec2, 'rec2_drop')

    l_hidden = DenseLayer(
        l_rec2_out,
        num_units=self.options.listener_cell_size,
        nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
        name=prefix + 'hidden')
    l_hidden_out = _maybe_dropout(l_hidden, 'hidden_drop')

    l_scores = DenseLayer(l_hidden_out, num_units=self.context_len,
                          nonlinearity=softmax, name=prefix + 'scores')

    return l_scores, [l_in] + context_inputs
def __init__(self, incoming, num_units,
             ingate=Gate(),
             forgetgate=Gate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(),
             nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.),
             hid_init=init.Constant(0.),
             backwards=False,
             learn_init=False,
             peepholes=True,
             gradient_steps=-1,
             grad_clipping=0,
             unroll_scan=False,
             precompute_input=True,
             mask_input=None,
             only_return_final=False,
             batch_norm=True,
             **kwargs):
    """Register the inputs and parameters of the batch-normalised LSTM.

    The incoming list starts with ``incoming``; ``mask_input`` (when
    given) and ``hid_init`` / ``cell_init`` (when they are layers) are
    appended, and the position of each is kept in the matching
    ``*_incoming_index`` attribute (``-1`` when absent).

    With ``batch_norm`` enabled no gate biases are created; instead a
    ``SequenceBatchNorm`` layer is built and its parameters are adopted by
    this layer.  With it disabled, one bias per gate is registered and no
    ``self.bn`` attribute is set.

    Raises
    ------
    ValueError
        If ``unroll_scan`` is set together with ``gradient_steps != -1``,
        or with an input whose sequence length is ``None``.
    """
    incomings = [incoming]

    # Optional inputs default to "absent" (-1) and take the next free
    # slot when supplied.
    self.mask_incoming_index = -1
    self.hid_init_incoming_index = -1
    self.cell_init_incoming_index = -1
    if mask_input is not None:
        self.mask_incoming_index = len(incomings)
        incomings.append(mask_input)
    if isinstance(hid_init, Layer):
        self.hid_init_incoming_index = len(incomings)
        incomings.append(hid_init)
    if isinstance(cell_init, Layer):
        self.cell_init_incoming_index = len(incomings)
        incomings.append(cell_init)

    super(LSTMLayer, self).__init__(incomings, **kwargs)

    # A missing output nonlinearity means "linear".
    self.nonlinearity = (nonlinearities.identity
                         if nonlinearity is None else nonlinearity)

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final
    self.batch_norm = batch_norm

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Shape of the main input; everything past the first two dimensions
    # is flattened into the feature count.
    input_shape = self.input_shapes[0]
    self.batch_size = input_shape[0]

    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    num_inputs = np.prod(input_shape[2:])

    def _register_weights(gate, label):
        """Register W_in and W_hid of a gate (the bias is separate)."""
        w_in = self.add_param(gate.W_in, (num_inputs, num_units),
                              name="W_in_to_{}".format(label))
        w_hid = self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(label))
        return w_in, w_hid, gate.nonlinearity

    def _register_bias(gate, label):
        """Register the bias vector of a gate."""
        return self.add_param(gate.b, (num_units,),
                              name="b_{}".format(label),
                              regularizable=False)

    # Weight matrices for all four gates come first.
    (self.W_in_to_ingate,
     self.W_hid_to_ingate,
     self.nonlinearity_ingate) = _register_weights(ingate, 'ingate')

    (self.W_in_to_forgetgate,
     self.W_hid_to_forgetgate,
     self.nonlinearity_forgetgate) = _register_weights(forgetgate,
                                                       'forgetgate')

    (self.W_in_to_cell,
     self.W_hid_to_cell,
     self.nonlinearity_cell) = _register_weights(cell, 'cell')

    (self.W_in_to_outgate,
     self.W_hid_to_outgate,
     self.nonlinearity_outgate) = _register_weights(outgate, 'outgate')

    if self.batch_norm:
        # One normalisation layer covers the stacked i, f, c and o
        # pre-activations, hence the 4 * num_units last dimension.
        n_time_step = input_shape[1]
        bn_shape = (n_time_step, self.batch_size, 4*num_units)
        self.bn = SequenceBatchNorm(bn_shape, axes=(0, 1))
        # Adopt the normalisation parameters as parameters of this layer.
        self.params.update(self.bn.params)
    else:
        # Without batch norm each gate gets its own bias.
        self.b_ingate = _register_bias(ingate, 'ingate')
        self.b_forgetgate = _register_bias(forgetgate, 'forgetgate')
        self.b_cell = _register_bias(cell, 'cell')
        self.b_outgate = _register_bias(outgate, 'outgate')

    # Peephole (cell to gate) connections are elementwise products with
    # the cell state, hence plain vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Initial cell / hidden state: either an incoming layer or a
    # (1, num_units) parameter.
    if isinstance(cell_init, Layer):
        self.cell_init = cell_init
    else:
        self.cell_init = self.add_param(
            cell_init, (1, num_units), name="cell_init",
            trainable=learn_init, regularizable=False)

    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)
def main(num_epochs=NUM_EPOCHS):
    """Build, train and evaluate the self-attentive sentence model on SNLI.

    A shared bidirectional-LSTM encoder with an attention layer is applied
    to the hypothesis and the premise; the two embeddings go through a
    gated encoder and an MLP ending in a 3-way softmax.  If a checkpoint
    file exists under ``params/`` it is loaded before training, and a
    checkpoint is written during training.

    Parameters
    ----------
    num_epochs : int
        Number of passes over the training batches.

    Notes
    -----
    The checkpoint path is built from the module-level name ``filename``.
    A ``KeyboardInterrupt`` during training drops into ``pdb``.
    """
    print("Loading data ...")
    snli = SNLI(batch_size=BATCH_SIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 20), 'int32'),
         numpy.zeros((50, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                     input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((50, 20), dtype='int32'),
         numpy.zeros((50, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_mask)

    # output shape (BATCH_SIZE, None, WE_DIM)
    # TODO: how to set the embedding to be non-trainable?
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTM_HIDDEN,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GRAD_CLIP, backwards=True)

    # output dim: (BATCH_SIZE, None, 2*LSTM_HIDDEN)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # Attention mechanism to get sentence embedding
    # output dim: (BATCH_SIZE, None, ATTENTION_HIDDEN)
    l_ws1 = DenseLayer3DInput(l_concat, num_units=ATTENTION_HIDDEN)
    # output dim: (BATCH_SIZE, None, N_ROWS)
    l_ws2 = DenseLayer3DInput(l_ws1, num_units=N_ROWS, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 18), 'int32'),
         numpy.zeros((50, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((50, 18), dtype='int32'),
         numpy.zeros((50, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (50, 16), 'int32'),
         numpy.zeros((50, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((50, 16), dtype='int32'),
         numpy.zeros((50, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BATCH_SIZE, None),
                                         input_var=input_mask_p)

    # Run the shared encoder twice, once per sentence, by substituting
    # its input layers.
    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var})
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var})

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, N_ROWS, 2 * LSTM_HIDDEN),
        input_var=hypothesis_embedding)
    l_pre_embed = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, N_ROWS, 2 * LSTM_HIDDEN),
        input_var=premise_embedding)

    # output dim: (BATCH_SIZE, 2*LSTM_HIDDEN, N_ROWS)
    l_factors = GatedEncoder3D([l_hypo_embed, l_pre_embed],
                               num_hfactors=2 * LSTM_HIDDEN)

    # Dropout:
    l_factors_noise = lasagne.layers.DropoutLayer(l_factors, p=GAEREG,
                                                  rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_noise, num_units=OUT_HIDDEN,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Dropout:
    l_outhid_noise = lasagne.layers.DropoutLayer(l_outhid, p=GAEREG,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_noise, num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1, ] * 50, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)

    # Penalty term: mean squared difference between A.A^T and the identity
    # for both annotation tensors.  A cpu_contiguous wrapper around the
    # dimshuffled operand was tried here and left disabled.
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)
    ) + T.mean(
        (T.batched_dot(premise_annotation,
                       premise_annotation.dimshuffle(0, 2, 1)) -
         T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2))

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values) +
        ATTENTION_PENALTY * attention_penalty)
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean,
                                        target_values) +
        ATTENTION_PENALTY * attention_penalty)

    # Retrieve all parameters from the network.  The classifier is rooted
    # at fresh InputLayers, so the encoder parameters are added separately.
    all_params = lasagne.layers.get_all_params(l_output) + \
        lasagne.layers.get_all_params(l_sentence_embedding)
    numparams = sum(numpy.prod(p.shape.eval()) for p in all_params)
    print("Number of params: {}".format(numparams))

    # If a checkpoint exists, resume from it.
    look_for = 'params' + os.sep + 'params_' + filename + '.pkl'
    if os.path.isfile(look_for):
        print("Resuming from file: " + look_for)
        # NOTE: unpickling executes arbitrary code; only load checkpoint
        # files from a trusted source.
        with open(look_for, 'rb') as param_file:
            all_param_values = cPickle.load(param_file)
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # withoutwe_params = all_params + [l_word_embed.W]

    # Compute updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_prediction = T.argmax(network_output, axis=1)
    error_rate = T.mean(T.neq(network_prediction, target_values))
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    error_rate_clean = T.mean(T.neq(network_prediction_clean,
                                    target_values))

    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, error_rate], updates=updates)
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, error_rate_clean])

    def evaluate(mode):
        """Return the running-mean (cost, error) over the chosen split."""
        if mode == 'dev':
            data = dev_batches
        elif mode == 'test':
            data = test_batches
        else:
            # Previously an unknown mode left ``data`` unbound.
            raise ValueError("mode must be 'dev' or 'test', got %r"
                             % (mode,))

        set_cost = 0.
        set_error_rate = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _error = compute_cost(hypo, hm, premise, pm, truth)
            # Incremental mean over the batches seen so far.
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                1.0 / batches_seen * _cost
            set_error_rate = (1.0 - 1.0 / batches_seen) * set_error_rate + \
                1.0 / batches_seen * _error

        return set_cost, set_error_rate

    dev_set_cost, dev_set_error = evaluate('dev')
    print("BEFORE TRAINING: dev cost %f, error %f"
          % (dev_set_cost, dev_set_error))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_error = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _error = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                    1.0 / batches_seen * _cost
                train_set_error = (1.0 - 1.0 / batches_seen) * train_set_error + \
                    1.0 / batches_seen * _error

                if batches_seen % 100 == 0:
                    end = time.time()
                    # Argument order follows the format string: elapsed
                    # seconds first, then the learning rate.
                    print("Sample %d %.2fs, lr %.4f, train cost %f, error %f"
                          % (batches_seen * BATCH_SIZE,
                             end - start,
                             LEARNING_RATE,
                             train_set_cost,
                             train_set_error))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_error = evaluate('dev')
                    test_set_cost, test_set_error = evaluate('test')
                    print("***dev cost %f, error %f"
                          % (dev_set_cost, dev_set_error))
                    print("***test cost %f, error %f"
                          % (test_set_cost, test_set_error))

            # Save parameters.
            # NOTE(review): the checkpoint is written once per epoch here;
            # confirm this matches the intended schedule.
            all_param_values = [p.get_value() for p in all_params]
            with open('params' + os.sep + 'params_' + filename + '.pkl',
                      'wb') as param_file:
                cPickle.dump(all_param_values, param_file)

            dev_set_cost, dev_set_error = evaluate('dev')
            test_set_cost, test_set_error = evaluate('test')

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  "         error train %f dev %f test %f"
                  % (epoch, train_set_cost, dev_set_cost, test_set_cost,
                     train_set_error, dev_set_error, test_set_error))
    except KeyboardInterrupt:
        # Drop into the debugger so the interrupted state can be inspected.
        pdb.set_trace()
def __init__(
        self,
        incoming,  # the x input, (None, features)
        time_input,  # time-point information for each batch, int
        mask_input=None,
        cell_init=init.Constant(0.),  # cell state initialisation
        hid_init=init.Constant(0.),  # hidden state initialisation
        num_units=None,  # number of units; required (see docstring)
        ingate=Gate(b=lasagne.init.Constant(0)),  # input gate
        forgetgate=Gate(b=lasagne.init.Constant(2),
                        nonlinearity=nonlinearities.sigmoid),  # forget gate
        timegate=PLSTMTimeGate(),  # time gate
        cell=Gate(W_cell=None,
                  nonlinearity=nonlinearities.tanh),  # cell update gate
        outgate=Gate(),  # output gate
        nonlinearity=nonlinearities.tanh,  # output nonlinearity of the layer
        backwards=False,
        learn_init=False,
        peepholes=True,  # whether to use peephole connections
        gradient_steps=-1,
        grad_clipping=0,
        unroll_scan=False,
        precompute_input=True,
        only_return_final=False,
        bn=False,  # whether to use the BN-LSTM variant
        learn_time_params=[True, True, False],  # trainable flags, time gate
        off_alpha=1e-3,  # leak rate
        **kwargs):
    """Register the inputs and parameters of a phased-LSTM layer.

    The layer always receives ``incoming`` and ``time_input``; a mask layer
    and ``Layer``-typed initial hidden/cell states are appended to the list
    of incomings when supplied, and their positions are recorded in the
    ``*_incoming_index`` attributes (which stay at ``-2`` otherwise).

    ``num_units`` keeps its original position in the signature.  Because it
    follows defaulted parameters it needs a default to be valid Python, so
    ``None`` is used as a "missing" marker and rejected below.

    Parameters are registered in this order: time gate (period, shift,
    on_end), input gate, forget gate, cell, output gate, optional peephole
    vectors, ``cell_init``, ``hid_init``.

    Raises:
        TypeError: if ``num_units`` is not supplied.
        ValueError: if ``unroll_scan`` is set together with
            ``gradient_steps != -1`` or with an unspecified sequence length.
    """
    if num_units is None:
        raise TypeError("__init__() requires the 'num_units' argument")

    # This layer inherits from a MergeLayer, because it can have several
    # inputs - the layer input, the time input, the mask, the initial
    # hidden state and the initial cell state.  Only the layer input and
    # the time input are always provided.
    incomings = [incoming]

    # TIME STUFF
    incomings.append(time_input)
    self.time_incoming_index = len(incomings) - 1

    # Positions of the optional inputs inside `incomings`; they are updated
    # below only when the corresponding input is actually appended.
    self.mask_incoming_index = -2
    self.hid_init_incoming_index = -2
    self.cell_init_incoming_index = -2

    if mask_input is not None:
        incomings.append(mask_input)
        self.mask_incoming_index = len(incomings) - 1
    if isinstance(hid_init, Layer):
        incomings.append(hid_init)
        self.hid_init_incoming_index = len(incomings) - 1
    if isinstance(cell_init, Layer):
        incomings.append(cell_init)
        self.cell_init_incoming_index = len(incomings) - 1

    # Initialize parent layer
    super(PLSTMLayer, self).__init__(incomings, **kwargs)

    # If the provided nonlinearity is None, make it linear
    if nonlinearity is None:
        self.nonlinearity = nonlinearities.identity
    else:
        self.nonlinearity = nonlinearity

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Retrieve the dimensionality of the incoming layer
    input_shape = self.input_shapes[0]

    # Unrolling the scan needs a concrete length on axis 1.
    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    # Everything after the first two axes (batch, time step per the
    # original comment) is flattened into a single feature dimension.
    num_inputs = np.prod(input_shape[2:])

    def add_gate_params(gate, gate_name):
        """
        Convenience function for adding layer parameters from a Gate
        instance.
        """
        return (
            self.add_param(gate.W_in, (num_inputs, num_units),
                           name="W_in_to_{}".format(gate_name)),
            self.add_param(gate.W_hid, (num_units, num_units),
                           name="W_hid_to_{}".format(gate_name)),
            self.add_param(gate.b, (num_units, ),
                           name="b_{}".format(gate_name),
                           regularizable=False),
            gate.nonlinearity)

    # PHASED LSTM: Initialize params for the time gate
    self.off_alpha = off_alpha  # leak rate
    if timegate is None:
        # No time gate supplied: fall back to a default instance.
        timegate = PLSTMTimeGate()

    def add_timegate_params(gate, gate_name):
        """
        Convenience function for adding the time gate's parameters; each
        one is a vector with one entry per unit.
        """
        return (
            self.add_param(gate.Period, (num_units, ),
                           name="Period_{}".format(gate_name),
                           trainable=learn_time_params[0]),
            self.add_param(gate.Shift, (num_units, ),
                           name="Shift_{}".format(gate_name),
                           trainable=learn_time_params[1]),
            self.add_param(gate.On_End, (num_units, ),
                           name="On_End_{}".format(gate_name),
                           trainable=learn_time_params[2]))

    print('Learnableness: {}'.format(learn_time_params))

    # Time gate parameters are registered first.
    (self.period_timegate, self.shift_timegate,
     self.on_end_timegate) = add_timegate_params(timegate, 'timegate')

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
     self.nonlinearity_forgetgate) = add_gate_params(
         forgetgate, 'forgetgate')

    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')

    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    # If peephole (cell to gate) connections were enabled, initialize
    # peephole connections.  These are elementwise products with the cell
    # state, so they are represented as vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Setup initial values for the cell and the hidden units.  A Layer is
    # stored as-is; anything else is registered as a (1, num_units) param.
    if isinstance(cell_init, Layer):
        self.cell_init = cell_init
    else:
        self.cell_init = self.add_param(
            cell_init, (1, num_units), name="cell_init",
            trainable=learn_init, regularizable=False)

    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        # The name previously read "hi d_init" (stray space).
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)

    if bn:
        # create BN layer for correct input shape
        self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
        # make BN params your params
        self.params.update(self.bn.params)
    else:
        self.bn = False
def main(num_epochs=NEPOCH):
    """Build, optionally resume, and train the BiLSTM + max-pool classifier.

    Loads the dataset selected by the module-level ``DSET``, builds a
    bidirectional LSTM encoder followed by max pooling over axis 1 and a
    two-layer classifier with 5 softmax outputs, then trains with Adagrad
    for ``num_epochs`` epochs.  Parameters are restored from, and
    checkpointed to, ``params/params_<filename>.pkl`` (``filename`` is a
    module-level name).  Ctrl-C during training drops into ``pdb``.

    Raises:
        ValueError: if ``DSET`` is neither ``'yelp'`` nor ``'age2'``.
    """
    if DSET == 'yelp':
        print("Loading yelp dataset ...")
        loaded_dataset = YELP(
            batch_size=BSIZE,
            datapath="/home/hantek/datasets/NLC_data/yelp/word2vec_yelp.pkl")
    elif DSET == 'age2':
        print("Loading age2 dataset ...")
        loaded_dataset = AGE2(
            batch_size=BSIZE,
            datapath="/home/hantek/datasets/NLC_data/age2/word2vec_age2.pkl")
    else:
        raise ValueError("DSET was set incorrectly. Check your cmd args.")

    #                      yelp         age2
    # train data        500000        68450
    # dev/test data       2000         4000
    # vocab                 ~1.2e5
    #
    train_batches = list(loaded_dataset.train_minibatch_generator())
    dev_batches = list(loaded_dataset.dev_minibatch_generator())
    test_batches = list(loaded_dataset.test_minibatch_generator())
    W_word_embedding = loaded_dataset.weight  # W shape: (# vocab size, WE_DIM)
    del loaded_dataset

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
         numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 20), dtype='int32'),
         numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        only_return_final=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed, mask_input=l_mask, num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        only_return_final=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)

    # output dim: (BSIZE, 2*LSTMHID)
    l_maxpool = Maxpooling(l_concat, axis=1)
    l_maxpool_dpout = lasagne.layers.DropoutLayer(l_maxpool, p=DPOUT,
                                                  rescale=True)

    l_outhid = lasagne.layers.DenseLayer(
        l_maxpool_dpout, num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(l_outhid, p=DPOUT,
                                                 rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout, num_units=5,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1, ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    # "clean" = deterministic pass, used for evaluation and prediction.
    network_output_clean = lasagne.layers.get_output(l_output,
                                                     deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() +
               (l_forward.W_hid_to_ingate ** 2).sum() +
               (l_forward.W_in_to_forgetgate ** 2).sum() +
               (l_forward.W_hid_to_forgetgate ** 2).sum() +
               (l_forward.W_in_to_cell ** 2).sum() +
               (l_forward.W_hid_to_cell ** 2).sum() +
               (l_forward.W_in_to_outgate ** 2).sum() +
               (l_forward.W_hid_to_outgate ** 2).sum() +
               (l_backward.W_in_to_ingate ** 2).sum() +
               (l_backward.W_hid_to_ingate ** 2).sum() +
               (l_backward.W_in_to_forgetgate ** 2).sum() +
               (l_backward.W_hid_to_forgetgate ** 2).sum() +
               (l_backward.W_in_to_cell ** 2).sum() +
               (l_backward.W_hid_to_cell ** 2).sum() +
               (l_backward.W_in_to_outgate ** 2).sum() +
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_outputhid = (l_outhid.W ** 2).sum()
    L2_softmax = (l_output.W ** 2).sum()
    L2 = L2_lstm + L2_outputhid + L2_softmax

    cost = T.mean(T.nnet.categorical_crossentropy(
        network_output, target_values)) + L2REG * L2
    cost_clean = T.mean(T.nnet.categorical_crossentropy(
        network_output_clean, target_values)) + L2REG * L2

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    numparams = sum(numpy.prod(p.shape.eval()) for p in all_params)
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item in all_params:
        item_shape = item.shape.eval()
        print("{0:24}{1:24}{2}".format(item, item_shape,
                                       numpy.prod(item_shape)))

    # Single checkpoint location, used for both resuming and saving.
    param_file = os.path.join('params', 'params_' + filename + '.pkl')

    def save_params():
        """Pickle the current values of ``all_params`` to ``param_file``."""
        param_dir = os.path.dirname(param_file)
        # Create the directory on demand so the first checkpoint cannot
        # abort a long training run.
        if not os.path.isdir(param_dir):
            os.makedirs(param_dir)
        # `with` guarantees the file is flushed and closed after the dump.
        with open(param_file, 'wb') as handle:
            cPickle.dump([p.get_value() for p in all_params], handle)

    # if exist param file then load params
    if os.path.isfile(param_file):
        print("Resuming from file: " + param_file)
        # NOTE(review): unpickling executes arbitrary code; only resume
        # from checkpoint files you trust.
        with open(param_file, 'rb') as handle:
            all_param_values = cPickle.load(handle)
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in.input_var, l_mask.input_var, target_values],
        [cost, accuracy],
        updates=updates)
    compute_cost = theano.function(
        [l_in.input_var, l_mask.input_var, target_values],
        [cost_clean, accuracy_clean])
    predict = theano.function([l_in.input_var, l_mask.input_var],
                              network_prediction_clean)

    def evaluate(mode, verbose=False):
        """Return (mean cost, mean accuracy) over the 'dev' or 'test' set.

        With ``verbose`` set, additionally prints the kappa index of
        agreement and the confusion matrix for that set.
        """
        if mode == 'dev':
            data = dev_batches
        elif mode == 'test':
            data = test_batches
        else:
            raise ValueError("Unknown evaluation mode: %r" % (mode,))

        set_cost = 0.
        set_accuracy = 0.
        # Running mean over batches: every batch carries equal weight.
        for batches_seen, (hypo, hm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            labels = []
            for sent, mask, th in data:
                predicted.append(predict(sent, mask))
                labels.append(th)
            labels = numpy.concatenate(labels)
            predicted = numpy.concatenate(predicted)
            cm = confusion_matrix(labels, predicted)
            # Observed agreement vs. agreement expected from the marginals.
            pr_a = cm.trace() * 1.0 / labels.size
            pr_e = ((cm.sum(axis=0) * 1.0 / labels.size) *
                    (cm.sum(axis=1) * 1.0 / labels.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    print("BEFORE TRAINING: test cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, truth) in enumerate(train_batches,
                                                             1):
                _cost, _accuracy = train(hypo, hm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                    1.0 / batches_seen * _cost
                train_set_accuracy = \
                    (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                    1.0 / batches_seen * _accuracy

                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    test_set_cost, test_set_accuracy = evaluate('test')
                    print("RECORD: cost: train %f dev %f test %f\n"
                          " accu: train %f dev %f test %f" %
                          (train_set_cost, dev_set_cost, test_set_cost,
                           train_set_accuracy, dev_set_accuracy,
                           test_set_accuracy))

            # save parameters
            save_params()

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
            print("RECORD:epoch %d, cost: train %f dev %f test %f\n"
                  " accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        # Allow interactive inspection of the training state on Ctrl-C.
        pdb.set_trace()
def _get_l_out(self, input_vars):
    """Build the listener network that scores context colors.

    The first entry of ``input_vars`` feeds the description input layer;
    the remaining entries are handed to ``self.color_vec.get_input_layer``.
    A recurrent encoder (optionally bidirectional, optionally followed by
    dropout) predicts a mean vector and a covariance matrix, which a
    ``GaussianScoreLayer`` combines with the context points; the scores
    are passed through a softmax.

    Returns:
        A pair ``(l_scores, input_layers)`` where ``input_layers`` is the
        description input layer followed by the context input layers.
    """
    check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    desc_var, context_vars = input_vars[0], input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=desc_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(l_in,
                                input_size=len(self.seq_vec.tokens),
                                output_size=opts.listener_cell_size,
                                name=id_tag + 'desc_embed')

    # Pick the recurrent cell class and assemble its keyword arguments.
    cell = CELLS[opts.listener_cell]
    cell_kwargs = dict(grad_clipping=opts.listener_grad_clipping,
                       num_units=opts.listener_cell_size)
    if opts.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.listener_nonlinearity]

    l_rec1 = cell(l_in_embed, name=id_tag + 'rec1',
                  only_return_final=True, **cell_kwargs)
    if opts.listener_bidi:
        l_rec1_backwards = cell(l_in_embed, name=id_tag + 'rec1_back',
                                backwards=True, only_return_final=True,
                                **cell_kwargs)
        l_rec1 = ConcatLayer([l_rec1, l_rec1_backwards], axis=1,
                             name=id_tag + 'rec1_bidi_concat')

    l_rec1_drop = l_rec1
    if opts.listener_dropout > 0.0:
        l_rec1_drop = DropoutLayer(l_rec1, p=opts.listener_dropout,
                                   name=id_tag + 'rec1_drop')

    repr_size = self.color_vec.output_size

    # (batch_size, repr_size)
    l_pred_mean = DenseLayer(l_rec1_drop, num_units=repr_size,
                             nonlinearity=None,
                             name=id_tag + 'pred_mean')

    # (batch_size, repr_size * repr_size)
    # The bias is a flattened identity, so the layer initially produces
    # an identity matrix.
    identity_bias = np.eye(repr_size, dtype=theano.config.floatX).ravel()
    l_pred_covar_vec = DenseLayer(l_rec1_drop, num_units=repr_size ** 2,
                                  b=identity_bias, nonlinearity=None,
                                  name=id_tag + 'pred_covar_vec')

    # (batch_size, repr_size, repr_size)
    l_pred_covar = reshape(l_pred_covar_vec,
                           ([0], repr_size, repr_size),
                           name=id_tag + 'pred_covar')

    # Context repr has shape (batch_size, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        cell_size=opts.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    l_context_points = reshape(l_context_repr,
                               ([0], self.context_len, repr_size))

    l_unnorm_scores = GaussianScoreLayer(l_context_points, l_pred_mean,
                                         l_pred_covar,
                                         name=id_tag + 'gaussian_score')
    l_scores = NonlinearityLayer(l_unnorm_scores, nonlinearity=softmax,
                                 name=id_tag + 'scores')

    return l_scores, [l_in] + context_inputs
def __init__(self, incoming, num_units, in_dropout=0.0, hid_dropout=0.0,
             ingate=Gate(), forgetgate=Gate(),
             cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True,
             gradient_steps=-1, grad_clipping=0, unroll_scan=False,
             precompute_input=True, mask_input=None,
             only_return_final=False, **kwargs):
    """Register the inputs and parameters of an LSTM layer with dropout.

    ``in_dropout`` and ``hid_dropout`` are stored as ``p_in`` / ``p_hid``
    next to a freshly seeded random stream.  Gate parameters are
    registered in the order input gate, forget gate, cell, output gate,
    followed by the optional peephole vectors and the initial cell and
    hidden states.

    Raises:
        ValueError: if ``unroll_scan`` is combined with
            ``gradient_steps != -1`` or an unspecified sequence length, or
            if a ``TensorVariable`` initial state is not 2-dimensional.
    """
    # This layer inherits from a MergeLayer because it can have two
    # inputs: the layer input and, when given, the mask.
    if mask_input is None:
        layer_inputs = [incoming]
    else:
        layer_inputs = [incoming, mask_input]

    # Initialize parent layer
    super(LSTMDropoutLayer, self).__init__(layer_inputs, **kwargs)

    # A missing nonlinearity means a linear (identity) output.
    self.nonlinearity = (nonlinearities.identity
                         if nonlinearity is None else nonlinearity)

    self._srng = RandomStreams(get_rng().randint(1, 2147462579))
    self.p_in = in_dropout
    self.p_hid = hid_dropout

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps
    self.grad_clipping = grad_clipping
    self.unroll_scan = unroll_scan
    self.precompute_input = precompute_input
    self.only_return_final = only_return_final

    if unroll_scan and gradient_steps != -1:
        raise ValueError(
            "Gradient steps must be -1 when unroll_scan is true.")

    # Retrieve the dimensionality of the incoming layer
    input_shape = self.input_shapes[0]

    if unroll_scan and input_shape[1] is None:
        raise ValueError("Input sequence length cannot be specified as "
                         "None when unroll_scan is True")

    num_inputs = np.prod(input_shape[2:])
    self.num_inputs = num_inputs

    def add_gate_params(gate, gate_name):
        """Register W_in, W_hid and b for one gate, in that order."""
        w_in = self.add_param(gate.W_in, (num_inputs, num_units),
                              name="W_in_to_{}".format(gate_name))
        w_hid = self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(gate_name))
        bias = self.add_param(gate.b, (num_units, ),
                              name="b_{}".format(gate_name),
                              regularizable=False)
        return w_in, w_hid, bias, gate.nonlinearity

    # Add in parameters from the supplied Gate instances.  The iteration
    # order fixes the parameter registration order.
    gates = (('ingate', ingate), ('forgetgate', forgetgate),
             ('cell', cell), ('outgate', outgate))
    for gate_name, gate in gates:
        w_in, w_hid, bias, gate_fn = add_gate_params(gate, gate_name)
        setattr(self, 'W_in_to_' + gate_name, w_in)
        setattr(self, 'W_hid_to_' + gate_name, w_hid)
        setattr(self, 'b_' + gate_name, bias)
        setattr(self, 'nonlinearity_' + gate_name, gate_fn)

    # Peephole (cell to gate) connections are elementwise products with
    # the cell state, so they are represented as vectors.
    if self.peepholes:
        for gate_name, gate in (('ingate', ingate),
                                ('forgetgate', forgetgate),
                                ('outgate', outgate)):
            attr = 'W_cell_to_' + gate_name
            setattr(self, attr,
                    self.add_param(gate.W_cell, (num_units, ), name=attr))

    def initial_state(state, label):
        """Validate a TensorVariable state or register it as a param."""
        if not isinstance(state, T.TensorVariable):
            return self.add_param(state, (1, num_units), name=label,
                                  trainable=learn_init,
                                  regularizable=False)
        if state.ndim != 2:
            raise ValueError(
                "When {} is provided as a TensorVariable, it should "
                "have 2 dimensions and have shape "
                "(num_batch, num_units)".format(label))
        return state

    # Setup initial values for the cell and the hidden units
    self.cell_init = initial_state(cell_init, "cell_init")
    self.hid_init = initial_state(hid_init, "hid_init")
def _get_l_out(self, input_vars):
    """Build the listener network that scores context items by dot product.

    The first entry of ``input_vars`` feeds the description input layer;
    the rest go to ``self.color_vec.get_input_layer``.  The context
    representation passes through a stack of ``NINLayer``s, is mean-pooled
    over the context axis and concatenated with the word embeddings before
    two recurrent layers.  The final recurrent output is compared, via
    ``broadcast_dot_layer``, with a hidden representation of each context
    item, and the result is passed through the configured nonlinearity and
    a softmax.

    Returns:
        A pair ``(l_scores, input_layers)`` where ``input_layers`` is the
        description input layer followed by the context input layers.
    """
    check_options(self.options)
    opts = self.options

    if self.id:
        id_tag = self.id + '/'
    else:
        id_tag = ''

    def with_dropout(layer, label):
        """Wrap ``layer`` in dropout when a positive rate is configured."""
        if opts.listener_dropout > 0.0:
            return DropoutLayer(layer, p=opts.listener_dropout,
                                name=id_tag + label)
        return layer

    desc_var, context_vars = input_vars[0], input_vars[1:]

    l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                      input_var=desc_var,
                      name=id_tag + 'desc_input')
    l_in_embed = EmbeddingLayer(l_in,
                                input_size=len(self.seq_vec.tokens),
                                output_size=opts.listener_cell_size,
                                name=id_tag + 'desc_embed')

    # Context repr has shape (batch_size, seq_len, context_len * repr_size)
    l_context_repr, context_inputs = self.color_vec.get_input_layer(
        context_vars,
        recurrent_length=self.seq_vec.max_len,
        cell_size=opts.listener_cell_size,
        context_len=self.context_len,
        id=self.id)
    l_context_repr = reshape(
        l_context_repr,
        ([0], [1], self.context_len, self.color_vec.output_size))

    l_hidden_context = dimshuffle(l_context_repr, (0, 3, 1, 2),
                                  name=id_tag + 'shuffle_in')
    layer_no = 1
    while layer_no <= opts.listener_hidden_color_layers:
        l_hidden_context = NINLayer(
            l_hidden_context,
            num_units=opts.listener_cell_size,
            nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
            b=Constant(0.1),
            name=id_tag + 'hidden_context%d' % layer_no)
        layer_no += 1

    l_pool = FeaturePoolLayer(l_hidden_context,
                              pool_size=self.context_len, axis=3,
                              pool_function=T.mean,
                              name=id_tag + 'pool')
    l_pool_squeezed = reshape(l_pool, ([0], [1], [2]),
                              name=id_tag + 'pool_squeezed')
    l_pool_shuffle = dimshuffle(l_pool_squeezed, (0, 2, 1),
                                name=id_tag + 'shuffle_out')
    l_concat = ConcatLayer([l_pool_shuffle, l_in_embed], axis=2,
                           name=id_tag + 'concat_inp_context')

    # Pick the recurrent cell class and assemble its keyword arguments.
    cell = CELLS[opts.listener_cell]
    cell_kwargs = dict(grad_clipping=opts.listener_grad_clipping,
                       num_units=opts.listener_cell_size)
    if opts.listener_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(
            b=Constant(opts.listener_forget_bias))
    if opts.listener_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[
            opts.listener_nonlinearity]

    l_rec1 = cell(l_concat, name=id_tag + 'rec1', **cell_kwargs)
    l_rec1_drop = with_dropout(l_rec1, 'rec1_drop')

    l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2',
                  only_return_final=True, **cell_kwargs)
    l_rec2_dense = NINLayer(with_dropout(l_rec2, 'rec2_drop'),
                            num_units=opts.listener_cell_size,
                            nonlinearity=None,
                            name=id_tag + 'rec2_dense')

    # Context is fed into the RNN as one copy for each time step; just use
    # the first time step for output.
    # Input shape: (batch_size, repr_size, seq_len, context_len)
    # Output shape: (batch_size, repr_size, context_len)
    l_context_nonrec = SliceLayer(l_hidden_context, indices=0, axis=2,
                                  name=id_tag + 'context_nonrec')
    l_pool_nonrec = SliceLayer(l_pool_squeezed, indices=0, axis=2,
                               name=id_tag + 'pool_nonrec')

    # Output shape: (batch_size, repr_size, context_len)
    l_sub = broadcast_sub_layer(l_pool_nonrec, l_context_nonrec,
                                feature_dim=opts.listener_cell_size,
                                id_tag=id_tag)

    # Output shape: (batch_size, repr_size * 2, context_len)
    l_concat_sub = ConcatLayer([l_context_nonrec, l_sub], axis=1,
                               name=id_tag + 'concat_inp_context')

    # Output shape: (batch_size, cell_size, context_len)
    l_hidden = NINLayer(l_concat_sub,
                        num_units=opts.listener_cell_size,
                        nonlinearity=None,
                        name=id_tag + 'hidden')
    l_hidden_drop = with_dropout(l_hidden, 'hidden_drop')

    # No bias layer is applied to the dot product before the nonlinearity.
    l_dot = broadcast_dot_layer(l_rec2_dense, l_hidden_drop,
                                feature_dim=opts.listener_cell_size,
                                id_tag=id_tag)
    l_dot_clipped = NonlinearityLayer(
        l_dot,
        nonlinearity=NONLINEARITIES[opts.listener_nonlinearity],
        name=id_tag + 'dot_clipped')
    l_scores = NonlinearityLayer(l_dot_clipped, nonlinearity=softmax,
                                 name=id_tag + 'scores')

    return l_scores, [l_in] + context_inputs
def __init__(
        self,
        incoming,
        time_input,
        event_input,
        num_units,
        num_attention,
        model='HELSTM',  # model options: LSTM, PLSTM or HELSTM
        mask_input=None,
        ingate=Gate(),
        forgetgate=Gate(),
        cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
        timegate=HELSTMGate(),
        nonlinearity=nonlinearities.tanh,
        cell_init=init.Constant(0.),
        hid_init=init.Constant(0.),
        outgate=Gate(),
        backwards=False,
        learn_init=False,
        peepholes=True,
        grad_clipping=0,
        bn=False,
        only_return_final=False,
        off_alpha=1e-3,
        **kwargs):
    """Register the inputs and parameters of an LSTM / PLSTM / HELSTM layer.

    ``incoming``, ``time_input`` and ``event_input`` are always inputs
    (positions 0, 1 and 2).  A mask layer and ``Layer``-typed initial
    hidden/cell states are appended when supplied, and their positions are
    recorded in the ``*_incoming_index`` attributes (which stay at ``-2``
    otherwise).

    ``model`` selects which time-gate parameters are registered after the
    regular LSTM parameters: none for ``'LSTM'``; period, shift and on_end
    for ``'PLSTM'``; those three plus the event/output attention weights
    for ``'HELSTM'``.

    Raises:
        ValueError: if ``model`` is not ``'LSTM'``, ``'PLSTM'`` or
            ``'HELSTM'``.
    """
    # Validate with an exception rather than `assert`, which is stripped
    # under `python -O` and would let an unknown model fall through to the
    # HELSTM branch.
    if model not in ('LSTM', 'PLSTM', 'HELSTM'):
        raise ValueError(
            "model must be 'LSTM', 'PLSTM' or 'HELSTM', got %r" % (model,))

    incomings = [incoming, time_input, event_input]
    self.time_incoming_idx = 1
    self.event_incoming_idx = 2

    # Positions of the optional inputs inside `incomings`; they are updated
    # below only when the corresponding input is actually appended.
    self.mask_incoming_index = -2
    self.hid_init_incoming_index = -2
    self.cell_init_incoming_index = -2
    if mask_input is not None:
        incomings.append(mask_input)
        self.mask_incoming_index = len(incomings) - 1
    if isinstance(hid_init, Layer):
        incomings.append(hid_init)
        self.hid_init_incoming_index = len(incomings) - 1
    if isinstance(cell_init, Layer):
        incomings.append(cell_init)
        self.cell_init_incoming_index = len(incomings) - 1

    super(HELSTMLayer, self).__init__(incomings, **kwargs)

    # If the provided nonlinearity is None, make it linear (same
    # convention as the other recurrent layers in this file).
    if nonlinearity is None:
        self.nonlinearity = nonlinearities.identity
    else:
        self.nonlinearity = nonlinearity

    self.learn_init = learn_init
    self.num_units = num_units
    self.num_attention = num_attention
    self.peepholes = peepholes
    self.grad_clipping = grad_clipping
    self.backwards = backwards
    self.off_alpha = off_alpha
    self.only_return_final = only_return_final
    self.model = model

    # Single-argument call form prints the same text on Python 2 and 3.
    print('using ' + self.model)

    input_shape = self.input_shapes[0]
    # Everything after the first two axes is flattened into the feature
    # dimension.
    num_inputs = np.prod(input_shape[2:])

    def add_gate_params(gate, gate_name):
        """Register W_in, W_hid and b for one gate, in that order."""
        return (self.add_param(gate.W_in, (num_inputs, num_units),
                               name="W_in_to_{}".format(gate_name)),
                self.add_param(gate.W_hid, (num_units, num_units),
                               name="W_hid_to_{}".format(gate_name)),
                self.add_param(gate.b, (num_units, ),
                               name="b_{}".format(gate_name),
                               regularizable=False),
                gate.nonlinearity)

    # Add in parameters from the supplied Gate instances
    (self.W_in_to_ingate, self.W_hid_to_ingate, self.b_ingate,
     self.nonlinearity_ingate) = add_gate_params(ingate, 'ingate')

    (self.W_in_to_forgetgate, self.W_hid_to_forgetgate, self.b_forgetgate,
     self.nonlinearity_forgetgate) = add_gate_params(
         forgetgate, 'forgetgate')

    (self.W_in_to_cell, self.W_hid_to_cell, self.b_cell,
     self.nonlinearity_cell) = add_gate_params(cell, 'cell')

    (self.W_in_to_outgate, self.W_hid_to_outgate, self.b_outgate,
     self.nonlinearity_outgate) = add_gate_params(outgate, 'outgate')

    # If peephole (cell to gate) connections were enabled, initialize
    # peephole connections.  These are elementwise products with the cell
    # state, so they are represented as vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.add_param(
            ingate.W_cell, (num_units, ), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.add_param(
            forgetgate.W_cell, (num_units, ), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.add_param(
            outgate.W_cell, (num_units, ), name="W_cell_to_outgate")

    # Setup initial values for the cell and the hidden units.  A Layer is
    # stored as-is; anything else is registered as a (1, num_units) param.
    if isinstance(cell_init, Layer):
        self.cell_init = cell_init
    else:
        self.cell_init = self.add_param(
            cell_init, (1, num_units), name="cell_init",
            trainable=learn_init, regularizable=False)

    if isinstance(hid_init, Layer):
        self.hid_init = hid_init
    else:
        self.hid_init = self.add_param(
            hid_init, (1, self.num_units), name="hid_init",
            trainable=learn_init, regularizable=False)

    if bn:
        # create BN layer for correct input shape
        self.bn = lasagne.layers.BatchNormLayer(input_shape, axes=(0, 1))
        # make BN params your params
        self.params.update(self.bn.params)
    else:
        self.bn = False

    def add_timegate_params(gate, gate_name, attention=False):
        """Register the time gate's parameters.

        Always adds period, shift and on_end (one entry per unit); with
        ``attention`` set, also adds the event and output projections.
        """
        params = [
            self.add_param(gate.Period, (num_units, ),
                           name="Period_{}".format(gate_name)),
            self.add_param(gate.Shift, (num_units, ),
                           name="Shift_{}".format(gate_name)),
            self.add_param(gate.On_End, (num_units, ),
                           name="On_End_{}".format(gate_name)),
        ]
        if attention:
            params += [
                # NOTE(review): Event_W is sized with num_inputs, i.e. it
                # presumably expects event_input to share the feature size
                # of `incoming` - confirm against the forward pass.
                self.add_param(gate.Event_W, (num_inputs, num_attention),
                               name="Event_W_{}".format(gate_name)),
                self.add_param(gate.Event_b, (num_attention, ),
                               name="Event_b_{}".format(gate_name)),
                # Previously registered as "out_b_...", which collided
                # with the bias registered right after it.
                self.add_param(gate.out_W, (num_attention, num_units),
                               name="out_W_{}".format(gate_name)),
                self.add_param(gate.out_b, (num_units, ),
                               name="out_b_{}".format(gate_name)),
            ]
        return params

    if model == 'PLSTM':
        (self.period_timegate, self.shift_timegate,
         self.on_end_timegate) = add_timegate_params(timegate, 'timegate')
    elif model == 'HELSTM':
        (self.period_timegate, self.shift_timegate, self.on_end_timegate,
         self.event_w_timegate, self.event_b_timegate,
         self.out_w_timegate, self.out_b_timegate) = add_timegate_params(
             timegate, 'timegate', attention=True)
def main(num_epochs=NEPOCH):
    """Build, train and evaluate the SNLI sentence-pair classifier.

    Loads the SNLI minibatches, builds a bidirectional LSTM encoder with an
    attention layer on top, feeds the hypothesis and premise embeddings
    through a gated encoder and an MLP with a 3-way softmax output, and
    trains the whole network with Adagrad.  If a parameter file for the
    module-level ``filename`` exists under ``params/``, training resumes
    from it; parameters are written back to the same file after each epoch.

    Args:
        num_epochs: Number of passes over the training minibatches.
    """
    print("Loading data ...")
    snli = SNLI(batch_size=BSIZE)
    train_batches = list(snli.train_minibatch_generator())
    dev_batches = list(snli.dev_minibatch_generator())
    test_batches = list(snli.test_minibatch_generator())
    W_word_embedding = snli.weight  # W shape: (# vocab size, WE_DIM)
    del snli

    print("Building network ...")
    ########### sentence embedding encoder ###########
    # sentence vector, with each number standing for a word number
    input_var = T.TensorType('int32', [False, False])('sentence_vector')
    input_var.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 20), 'int32'),
         numpy.zeros((BSIZE, 5)).astype('int32')))
    input_var.tag.test_value[1, 20:22] = (413, 45)
    l_in = lasagne.layers.InputLayer(shape=(BSIZE, None), input_var=input_var)

    input_mask = T.TensorType('int32', [False, False])('sentence_mask')
    input_mask.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 20), dtype='int32'),
         numpy.zeros((BSIZE, 5), dtype='int32')))
    input_mask.tag.test_value[1, 20:22] = 1
    l_mask = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_mask)

    # output shape (BSIZE, None, WEDIM)
    l_word_embed = lasagne.layers.EmbeddingLayer(
        l_in,
        input_size=W_word_embedding.shape[0],
        output_size=W_word_embedding.shape[1],
        W=W_word_embedding)

    # bidirectional LSTM
    l_forward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP)

    l_backward = lasagne.layers.LSTMLayer(
        l_word_embed,
        mask_input=l_mask,
        num_units=LSTMHID,
        ingate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                    W_cell=init.Normal(STD)),
        forgetgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                        W_cell=init.Normal(STD)),
        cell=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                  W_cell=None, nonlinearity=nonlinearities.tanh),
        outgate=Gate(W_in=init.Normal(STD), W_hid=init.Normal(STD),
                     W_cell=init.Normal(STD)),
        nonlinearity=lasagne.nonlinearities.tanh,
        peepholes=False,
        grad_clipping=GCLIP,
        backwards=True)

    # output dim: (BSIZE, None, 2*LSTMHID)
    l_concat = lasagne.layers.ConcatLayer([l_forward, l_backward], axis=2)
    l_concat_dpout = lasagne.layers.DropoutLayer(
        l_concat, p=DPOUT, rescale=True)  # might not need this line

    # Attention mechanism to get sentence embedding
    # output dim: (BSIZE, None, ATTHID)
    l_ws1 = DenseLayer3DInput(l_concat_dpout, num_units=ATTHID)
    l_ws1_dpout = lasagne.layers.DropoutLayer(l_ws1, p=DPOUT, rescale=True)

    # output dim: (BSIZE, None, NROW)
    l_ws2 = DenseLayer3DInput(l_ws1_dpout, num_units=NROW, nonlinearity=None)
    l_annotations = Softmax3D(l_ws2, mask=l_mask)
    # output dim: (BSIZE, 2*LSTMHID, NROW)
    l_sentence_embedding = ApplyAttention([l_annotations, l_concat])

    # beam search? Bi lstm in the sentence embedding layer? etc.

    ########### get embeddings for hypothesis and premise ###########
    # hypothesis
    input_var_h = T.TensorType('int32', [False, False])('hypothesis_vector')
    input_var_h.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 18), 'int32'),
         numpy.zeros((BSIZE, 6)).astype('int32')))
    l_in_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_h)

    input_mask_h = T.TensorType('int32', [False, False])('hypo_mask')
    input_mask_h.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 18), dtype='int32'),
         numpy.zeros((BSIZE, 6), dtype='int32')))
    input_mask_h.tag.test_value[1, 18:22] = 1
    l_mask_h = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_h)

    # premise
    input_var_p = T.TensorType('int32', [False, False])('premise_vector')
    input_var_p.tag.test_value = numpy.hstack(
        (numpy.random.randint(1, 10000, (BSIZE, 16), 'int32'),
         numpy.zeros((BSIZE, 3)).astype('int32')))
    l_in_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                       input_var=input_var_p)

    input_mask_p = T.TensorType('int32', [False, False])('premise_mask')
    input_mask_p.tag.test_value = numpy.hstack(
        (numpy.ones((BSIZE, 16), dtype='int32'),
         numpy.zeros((BSIZE, 3), dtype='int32')))
    input_mask_p.tag.test_value[1, 16:18] = 1
    l_mask_p = lasagne.layers.InputLayer(shape=(BSIZE, None),
                                         input_var=input_mask_p)

    # The shared encoder is evaluated twice (hypothesis / premise) by
    # substituting its input layers; the "_clean" variants are the same
    # graphs built with deterministic=True.
    hypothesis_embedding, hypothesis_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var})
    premise_embedding, premise_annotation = lasagne.layers.get_output(
        [l_sentence_embedding, l_annotations],
        {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var})

    hypothesis_embedding_clean, hypothesis_annotation_clean = \
        lasagne.layers.get_output(
            [l_sentence_embedding, l_annotations],
            {l_in: l_in_h.input_var, l_mask: l_mask_h.input_var},
            deterministic=True)
    premise_embedding_clean, premise_annotation_clean = \
        lasagne.layers.get_output(
            [l_sentence_embedding, l_annotations],
            {l_in: l_in_p.input_var, l_mask: l_mask_p.input_var},
            deterministic=True)

    ########### gated encoder and output MLP ##########
    l_hypo_embed = lasagne.layers.InputLayer(
        shape=(BSIZE, NROW, 2 * LSTMHID), input_var=hypothesis_embedding)
    l_hypo_embed_dpout = lasagne.layers.DropoutLayer(
        l_hypo_embed, p=DPOUT, rescale=True)
    l_pre_embed = lasagne.layers.InputLayer(
        shape=(BSIZE, NROW, 2 * LSTMHID), input_var=premise_embedding)
    l_pre_embed_dpout = lasagne.layers.DropoutLayer(
        l_pre_embed, p=DPOUT, rescale=True)

    # output dim: (BSIZE, NROW, 2*LSTMHID)
    l_factors = GatedEncoder3D([l_hypo_embed_dpout, l_pre_embed_dpout],
                               num_hfactors=2 * LSTMHID)
    l_factors_dpout = lasagne.layers.DropoutLayer(
        l_factors, p=DPOUT, rescale=True)

    # l_hids = DenseLayer3DWeight()

    l_outhid = lasagne.layers.DenseLayer(
        l_factors_dpout,
        num_units=OUTHID,
        nonlinearity=lasagne.nonlinearities.rectify)
    l_outhid_dpout = lasagne.layers.DropoutLayer(
        l_outhid, p=DPOUT, rescale=True)

    l_output = lasagne.layers.DenseLayer(
        l_outhid_dpout,
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax)

    ########### target, cost, validation, etc. ##########
    target_values = T.ivector('target_output')
    target_values.tag.test_value = numpy.asarray([1, ] * BSIZE, dtype='int32')

    network_output = lasagne.layers.get_output(l_output)
    network_prediction = T.argmax(network_output, axis=1)
    accuracy = T.mean(T.eq(network_prediction, target_values))

    network_output_clean = lasagne.layers.get_output(
        l_output,
        {l_hypo_embed: hypothesis_embedding_clean,
         l_pre_embed: premise_embedding_clean},
        deterministic=True)
    network_prediction_clean = T.argmax(network_output_clean, axis=1)
    accuracy_clean = T.mean(T.eq(network_prediction_clean, target_values))

    # penalty term and cost
    attention_penalty = T.mean(
        (T.batched_dot(hypothesis_annotation,
                       hypothesis_annotation.dimshuffle(0, 2, 1)) -
         T.eye(hypothesis_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2)) + T.mean(
        (T.batched_dot(premise_annotation,
                       premise_annotation.dimshuffle(0, 2, 1)) -
         T.eye(premise_annotation.shape[1]).dimshuffle('x', 0, 1))**2,
        axis=(0, 1, 2))

    L2_lstm = ((l_forward.W_in_to_ingate ** 2).sum() +
               (l_forward.W_hid_to_ingate ** 2).sum() +
               (l_forward.W_in_to_forgetgate ** 2).sum() +
               (l_forward.W_hid_to_forgetgate ** 2).sum() +
               (l_forward.W_in_to_cell ** 2).sum() +
               (l_forward.W_hid_to_cell ** 2).sum() +
               (l_forward.W_in_to_outgate ** 2).sum() +
               (l_forward.W_hid_to_outgate ** 2).sum() +
               (l_backward.W_in_to_ingate ** 2).sum() +
               (l_backward.W_hid_to_ingate ** 2).sum() +
               (l_backward.W_in_to_forgetgate ** 2).sum() +
               (l_backward.W_hid_to_forgetgate ** 2).sum() +
               (l_backward.W_in_to_cell ** 2).sum() +
               (l_backward.W_hid_to_cell ** 2).sum() +
               (l_backward.W_in_to_outgate ** 2).sum() +
               (l_backward.W_hid_to_outgate ** 2).sum())
    L2_attention = (l_ws1.W**2).sum() + (l_ws2.W**2).sum()
    L2_gae = (l_factors.Wxf**2).sum() + (l_factors.Wyf**2).sum()
    L2_outputhid = (l_outhid.W**2).sum()
    L2_softmax = (l_output.W**2).sum()
    L2 = L2_lstm + L2_attention + L2_gae + L2_outputhid + L2_softmax

    cost = T.mean(
        T.nnet.categorical_crossentropy(network_output, target_values)) + \
        L2REG * L2
    cost_clean = T.mean(
        T.nnet.categorical_crossentropy(network_output_clean,
                                        target_values)) + \
        L2REG * L2
    if ATTPENALTY != 0.:
        cost = cost + ATTPENALTY * attention_penalty
        cost_clean = cost_clean + ATTPENALTY * attention_penalty

    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(l_output) + \
        lasagne.layers.get_all_params(l_sentence_embedding)
    if not UPDATEWE:
        all_params.remove(l_word_embed.W)

    # Evaluate each symbolic shape once and reuse it for the total and the
    # per-parameter table.
    param_shapes = [p.shape.eval() for p in all_params]
    numparams = sum([numpy.prod(shape) for shape in param_shapes])
    print("Number of params: {}\nName\t\t\tShape\t\t\tSize".format(numparams))
    print("-----------------------------------------------------------------")
    for item, shape in zip(all_params, param_shapes):
        # str() keeps the padded output identical on Python 2 and avoids
        # applying a width spec to a non-string object, which Python 3
        # rejects with TypeError.
        print("{0:24}{1:24}{2}".format(str(item), str(shape),
                                       numpy.prod(shape)))

    # if exist param file then load params
    # Single source of truth for the checkpoint location (used for both
    # resuming and saving).
    param_path = os.path.join('params', 'params_' + filename + '.pkl')
    if os.path.isfile(param_path):
        print("Resuming from file: " + param_path)
        # NOTE: unpickling can execute arbitrary code; only resume from
        # checkpoint files you trust.
        with open(param_path, 'rb') as param_file:
            all_param_values = cPickle.load(param_file)
        for p, v in zip(all_params, all_param_values):
            p.set_value(v)

    # Compute SGD updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LR)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost, accuracy],
        updates=updates)
    compute_cost = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var, target_values],
        [cost_clean, accuracy_clean])
    predict = theano.function(
        [l_in_h.input_var, l_mask_h.input_var,
         l_in_p.input_var, l_mask_p.input_var],
        network_prediction_clean)

    def evaluate(mode, verbose=False):
        """Return (mean cost, mean accuracy) over the 'dev' or 'test' set.

        With ``verbose`` set, additionally prints the kappa index of
        agreement and the confusion matrix of the predictions.

        Raises:
            ValueError: if ``mode`` is neither 'dev' nor 'test'.
        """
        if mode == 'dev':
            data = dev_batches
        elif mode == 'test':
            data = test_batches
        else:
            # Fail early with a clear message instead of an
            # UnboundLocalError on `data` further down.
            raise ValueError(
                "mode must be 'dev' or 'test', got %r" % (mode,))

        # Running (incremental) means over the batches.
        set_cost = 0.
        set_accuracy = 0.
        for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(data, 1):
            _cost, _accuracy = compute_cost(hypo, hm, premise, pm, truth)
            set_cost = (1.0 - 1.0 / batches_seen) * set_cost + \
                1.0 / batches_seen * _cost
            set_accuracy = (1.0 - 1.0 / batches_seen) * set_accuracy + \
                1.0 / batches_seen * _accuracy

        if verbose:
            predicted = []
            labels = []
            for hypo, hm, premise, pm, th in data:
                predicted.append(predict(hypo, hm, premise, pm))
                labels.append(th)
            labels = numpy.concatenate(labels)
            predicted = numpy.concatenate(predicted)
            cm = confusion_matrix(labels, predicted)
            # Cohen's kappa from observed (pr_a) and chance (pr_e) agreement.
            pr_a = cm.trace() * 1.0 / labels.size
            pr_e = ((cm.sum(axis=0) * 1.0 / labels.size) *
                    (cm.sum(axis=1) * 1.0 / labels.size)).sum()
            k = (pr_a - pr_e) / (1 - pr_e)
            print(mode + " set statistics:")
            print("kappa index of agreement: %f" % k)
            print("confusion matrix:")
            print(cm)

        return set_cost, set_accuracy

    print("Done. Evaluating scratch model ...")
    test_set_cost, test_set_accuracy = evaluate('test', verbose=True)
    # These numbers come from the test set, so label them as such.
    print("BEFORE TRAINING: test cost %f, accuracy %f" %
          (test_set_cost, test_set_accuracy))
    print("Training ...")
    try:
        for epoch in range(num_epochs):
            train_set_cost = 0.
            train_set_accuracy = 0.
            start = time.time()

            for batches_seen, (hypo, hm, premise, pm, truth) in enumerate(
                    train_batches, 1):
                _cost, _accuracy = train(hypo, hm, premise, pm, truth)
                train_set_cost = (1.0 - 1.0 / batches_seen) * train_set_cost + \
                    1.0 / batches_seen * _cost
                train_set_accuracy = \
                    (1.0 - 1.0 / batches_seen) * train_set_accuracy + \
                    1.0 / batches_seen * _accuracy
                if batches_seen % 100 == 0:
                    end = time.time()
                    print(
                        "Sample %d %.2fs, lr %.4f, train cost %f, accuracy %f"
                        % (batches_seen * BSIZE, end - start, LR,
                           train_set_cost, train_set_accuracy))
                    start = end

                if batches_seen % 2000 == 0:
                    dev_set_cost, dev_set_accuracy = evaluate('dev')
                    print("***dev cost %f, accuracy %f" %
                          (dev_set_cost, dev_set_accuracy))

            # save parameters
            all_param_values = [p.get_value() for p in all_params]
            # `with` guarantees the checkpoint is flushed and closed even if
            # pickling fails part-way.
            with open(param_path, 'wb') as param_file:
                cPickle.dump(all_param_values, param_file)

            dev_set_cost, dev_set_accuracy = evaluate('dev')
            test_set_cost, test_set_accuracy = evaluate('test', verbose=True)

            print("epoch %d, cost: train %f dev %f test %f;\n"
                  " accu: train %f dev %f test %f" %
                  (epoch, train_set_cost, dev_set_cost, test_set_cost,
                   train_set_accuracy, dev_set_accuracy, test_set_accuracy))
    except KeyboardInterrupt:
        # Ctrl-C drops into the debugger so the live model can be inspected.
        pdb.set_trace()