def _init_params(self):
    """Create the GRU parameters: input projections, recurrent weights,
    gate biases, and (optionally) context projections."""
    in2hid = (self.n_in, self.n_hids)
    hid2hid = (self.n_hids, self.n_hids)
    # Input projections for the update (z), reset (r) and candidate (h) gates.
    self.W_xz = norm_weight(shape=in2hid, name=_p(self.pname, 'W_xz'))
    self.W_xr = norm_weight(shape=in2hid, name=_p(self.pname, 'W_xr'))
    self.W_xh = norm_weight(shape=in2hid, name=_p(self.pname, 'W_xh'))
    # Gate biases.
    self.b_z = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_z'))
    self.b_r = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_r'))
    self.b_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_h'))
    # Recurrent weights use orthogonal initialisation.
    self.W_hz = ortho_weight(shape=hid2hid, name=_p(self.pname, 'W_hz'))
    self.W_hr = ortho_weight(shape=hid2hid, name=_p(self.pname, 'W_hr'))
    self.W_hh = ortho_weight(shape=hid2hid, name=_p(self.pname, 'W_hh'))
    self.params = [self.W_xz, self.W_xr, self.W_xh,
                   self.W_hz, self.W_hr, self.W_hh,
                   self.b_z, self.b_r, self.b_h]
    if self.with_context:
        # Context-to-gate projections plus the projection used to
        # initialise the hidden state from the context.
        ctx2hid = (self.c_hids, self.n_hids)
        self.W_cz = norm_weight(shape=ctx2hid, name=_p(self.pname, 'W_cz'))
        self.W_cr = norm_weight(shape=ctx2hid, name=_p(self.pname, 'W_cr'))
        self.W_ch = norm_weight(shape=ctx2hid, name=_p(self.pname, 'W_ch'))
        self.W_c_init = norm_weight(shape=ctx2hid, name=_p(self.pname, 'W_c_init'))
        self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]
def run_pipeline(self, state_below, mask_below, context=None):
    """Run the recurrent layer and a maxout readout on top of it.

    state_below: input sequence tensor (time-major; axis 0 is steps)
    mask_below:  per-step mask, multiplied into the final output
    context:     optional context, used only when self.with_context

    Returns the masked maxout output (half the readout width, since
    maxout keeps the max of each adjacent channel pair).
    """
    hiddens = self.apply(state_below, mask_below, context=context)
    if self.with_context:
        n_in = self.n_in + self.n_hids + self.c_hids
        n_out = self.n_hids * 2
        n_times = K.shape(state_below)[0]
        # Broadcast the single context vector across all time steps.
        r_context = ReplicateLayer(context, n_times)
        combine = K.concatenate([state_below, hiddens, r_context], axis=2)
    else:
        n_in = self.n_in + self.n_hids
        n_out = self.n_hids * 2  # for maxout
        combine = K.concatenate([state_below, hiddens], axis=2)
    # NOTE(review): readout parameters are (re)created on every call; this
    # presumably relies on run_pipeline being invoked once per graph build.
    self.W_m = norm_weight(shape=(n_in, n_out), name=_p(self.pname, 'W_m'))
    self.b_m = constant_weight(shape=(n_out,), name=_p(self.pname, 'b_m'))
    self.params += [self.W_m, self.b_m]
    # maxout: pair adjacent channels and keep the max of each pair
    merge_out = K.dot(combine, self.W_m) + self.b_m
    merge_out_shape = K.shape(merge_out)
    # BUG FIX: use floor division -- '/' is true division in Python 3 and
    # would produce a float (invalid) dimension for the reshape.
    merge_max_out = K.max(
        K.reshape(merge_out,
                  shape=(merge_out_shape[0], merge_out_shape[1],
                         merge_out_shape[2] // 2, 2)),
        axis=3)
    return merge_max_out * mask_below
def _init_params(self):
    """Set up attention parameters: a hidden-state projection, its bias,
    and the vector mapping attention energies to a scalar score."""
    if self.with_attention:
        hid_sq = (self.n_hids, self.n_hids)
        self.B_hp = norm_weight(shape=hid_sq, name=_p(self.pname, 'B_hp'))
        self.b_tt = constant_weight(shape=(self.n_hids,),
                                    name=_p(self.pname, 'b_tt'))
        self.D_pe = norm_weight(shape=(self.n_hids, 1),
                                name=_p(self.pname, 'D_pe'))
        self.params = [self.B_hp, self.b_tt, self.D_pe]
def __init__(self, rng, n_in, n_out, name='LR'):
    """Logistic-regression layer.

    rng:   random generator used for weight initialisation
    n_in:  input dimensionality
    n_out: output dimensionality
    name:  prefix for the parameter names
    """
    # Randomly-initialised weight matrix of shape (n_in, n_out).
    self.W = norm_weight(rng=rng, shape=(n_in, n_out), name=_p(name, 'W'))
    # Bias vector of n_out zeros.
    self.b = constant_weight(shape=(n_out,), name=_p(name, 'b'))
    # Trainable parameters of this layer.
    self.params = [self.W, self.b]
def _init_params(self):
    """Initialise fused-gate GRU parameters for the MKL-backed GRU op.

    All three gates are packed into single matrices: the input projection
    is (n_in*3, n_hids) and the recurrent projection is (n_hids*3, n_hids).
    """
    shape_xh = (self.n_in*3, self.n_hids)    # fused input projection
    shape_hh = (self.n_hids*3, self.n_hids)  # fused recurrent projection
    self.W_x = norm_weight(shape=shape_xh, name=_p(self.pname, 'W_x'))
    self.b = constant_weight(shape=(self.n_hids*3,), name=_p(self.pname, 'b'))
    self.W_h = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_h'))
    self.params = [self.W_x, self.W_h, self.b]
    # MKL GRU op configured to return the full hidden-state sequence.
    self.GRU_op = mkl_gru.GRU(hid=self.n_hids, return_sequences=True)
    # NOTE(review): initial state is hard-coded to shape (80, 1000) and
    # float64 -- presumably (batch_size, n_hids) for one specific config;
    # confirm against the caller before reusing with other sizes.
    self.h_init_state = numpy.zeros((80, 1000), numpy.float64)
def _init_params(self):
    """Create the feed-forward weight and bias.

    With self.orth set, the weight is orthogonally initialised, which
    requires a square matrix (n_in == n_out); otherwise a normal random
    initialisation is used.
    """
    w_shape = (self.n_in, self.n_out)
    if self.orth:
        if self.n_in != self.n_out:
            raise ValueError('n_in != n_out when require orth in FeedForward')
        self.W = ortho_weight(rng=self.rng, shape=w_shape,
                              name=_p(self.pname, 'W'))
    else:
        self.W = norm_weight(rng=self.rng, shape=w_shape,
                             name=_p(self.pname, 'W'))
    self.b = constant_weight(shape=(self.n_out,), name=_p(self.pname, 'b'))
    self.params = [self.W, self.b]
def _init_params(self):
    """Build fused-gate GRU parameters plus optional context projections,
    structure boundary tags, and attention sub-modules."""
    xh = (self.n_in, self.n_hids)
    xh2 = (self.n_in, 2 * self.n_hids)
    hh = (self.n_hids, self.n_hids)
    hh2 = (self.n_hids, 2 * self.n_hids)
    # Update/reset gates share one fused input projection and bias;
    # the candidate has its own.
    self.W_xzr = norm_weight(rng=self.rng, shape=xh2, name=_p(self.pname, 'W_xzr'))
    self.W_xh = norm_weight(rng=self.rng, shape=xh, name=_p(self.pname, 'W_xh'))
    self.b_zr = constant_weight(shape=(2 * self.n_hids,), name=_p(self.pname, 'b_zr'))
    self.b_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_h'))
    # Recurrent weights: fused orthogonal blocks for z/r, plain ortho for h.
    self.W_hzr = multi_orth(rng=self.rng, size=hh2, name=_p(self.pname, 'W_hzr'))
    self.W_hh = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_hh'))
    self.params += [self.W_xzr, self.W_xh, self.W_hzr, self.W_hh,
                    self.b_zr, self.b_h]
    if self.with_context:
        # Context-to-gate projections and the initial-state projection.
        ch = (self.c_hids, self.n_hids)
        self.W_cz = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_cz'))
        self.W_cr = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_cr'))
        self.W_ch = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_ch'))
        self.W_c_init = norm_weight(rng=self.rng, shape=ch,
                                    name=_p(self.pname, 'W_c_init'))
        self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]
    if self.with_begin_tag:
        # Learned (zero-initialised) embedding marking a structure's start.
        self.struct_begin_tag = constant_weight(
            shape=(self.n_hids,), value=0.,
            name=_p(self.pname, 'struct_begin_tag'))
        self.params += [self.struct_begin_tag]
    if self.with_end_tag:
        # Learned (zero-initialised) embedding marking a structure's end.
        self.struct_end_tag = constant_weight(
            shape=(self.n_in,), value=0.,
            name=_p(self.pname, 'struct_end_tag'))
        self.params += [self.struct_end_tag]
    if self.n_att_ctx:
        # Sub-GRU that merges attention context with the hidden state,
        # plus the attention module itself; both register their params here.
        self.gru_combine_ctx_h = GRU(self.n_att_ctx, self.n_hids, rng=self.rng,
                                     name=_p(self.pname, 'gru_combine_ctx_h'))
        self.params.extend(self.gru_combine_ctx_h.params)
        self.attention = ATTENTION(self.n_hids, self.rng,
                                   name=_p(self.pname, 'att_ctx'))
        self.params.extend(self.attention.params)
def _init_params(self):
    """Create weights for a two-input feed-forward layer: W0 projects the
    first input, W1 the second, and both share one bias."""
    w0_shape = (self.n_in_0, self.n_out)
    w1_shape = (self.n_in_1, self.n_out)
    if self.orth:
        # Orthogonal init needs both projections to be square.
        if self.n_in_0 != self.n_out or self.n_in_1 != self.n_out:
            raise ValueError('n_in != n_out when require orth in FeedForward')
        self.W0 = ortho_weight(rng=self.rng, shape=w0_shape,
                               name=_p(self.pname, 'W0'))
        self.W1 = ortho_weight(rng=self.rng, shape=w1_shape,
                               name=_p(self.pname, 'W1'))
    else:
        self.W0 = norm_weight(rng=self.rng, shape=w0_shape,
                              name=_p(self.pname, 'W0'))
        self.W1 = norm_weight(rng=self.rng, shape=w1_shape,
                              name=_p(self.pname, 'W1'))
    self.b = constant_weight(shape=(self.n_out,), name=_p(self.pname, 'b'))
    self.params = [self.W0, self.W1, self.b]
def __init__(self, input, n_in, n_out, name='LR'):
    """Softmax output layer.

    input:  batch_size * sentence_size * hidden
    n_in:   hidden_size
    n_out:  tags_size
    y_pred: batch_size * sentence_size (argmax over tags)
    """
    self.W = uniform_weight(shape=(n_in, n_out), name=_p(name, 'W'))
    self.b = constant_weight(shape=(n_out, ), name=_p(name, 'b'))
    self.input = input
    energy = T.dot(input, self.W) + self.b
    if energy.ndim == 3:
        # Batched 3-D case: apply a numerically-stabilised softmax along
        # the last axis by hand (max-subtraction before exp).
        energy_exp = T.exp(energy - T.max(energy, 2, keepdims=True))
        self.p_y_given_x = energy_exp / energy_exp.sum(2, keepdims=True)
    else:
        self.p_y_given_x = T.nnet.softmax(energy)
    # Most-probable tag at every position.
    self.y_pred = T.argmax(self.p_y_given_x, axis=-1)
    self.params = [self.W, self.b]
def __init__(self, input, n_in, n_out, W=None, b=None, activation=T.tanh, name='HD'):
    """Hidden (dense) layer.

    input:  batch_size * sentence_size * (embedding_size * windows_size)
    n_in:   embedding_size * windows_size
    n_out:  hidden size
    W, b:   optional pre-existing parameters to share; fresh ones are
            created when omitted.
    output: batch_size * sentence_size * hidden
    """
    self.input = input
    if W is None:
        W = uniform_weight(shape=(n_in, n_out), name=_p(name, "W"))
    if b is None:
        b = constant_weight(shape=(n_out,), name=_p(name, "b"))
    self.W = W
    self.b = b
    pre_activation = T.dot(input, self.W) + self.b
    # Identity output when no activation was requested.
    self.output = pre_activation if activation is None else activation(pre_activation)
    self.params = [self.W, self.b]
def _init_params(self):
    """Initialise fused-gate LSTM parameters plus optional boundary tags,
    attention, and sequence-pyramid sub-modules."""
    shape_xh = (self.n_in, self.n_hids)
    shape_xh4 = (self.n_in, 4*self.n_hids)    # fused i/f/o/c input projection
    shape_hh = (self.n_hids, self.n_hids)
    shape_hh4 = (self.n_hids, 4*self.n_hids)  # fused i/f/o/c recurrent projection
    self.W_pre_x = norm_weight(rng=self.rng, shape=shape_xh4, name=_p(self.pname, 'W_pre_x'))
    self.W_h = multi_orth(rng=self.rng, size=shape_hh4, name=_p(self.pname, 'W_h'))
    # Per-gate biases created as raw arrays (share=False) so they can be
    # concatenated into one fused bias vector below.  The forget-gate bias
    # is initialised to 1.0 (value=1.).
    b_i = constant_weight(share=False, shape=(self.n_hids, ), name=_p(self.pname, 'b_i'))
    b_f = constant_weight(share=False, value=1., shape=(self.n_hids, ), name=_p(self.pname, 'b_f'))
    b_o = constant_weight(share=False, shape=(self.n_hids, ), name=_p(self.pname, 'b_o'))
    b_c = constant_weight(share=False, shape=(self.n_hids, ), name=_p(self.pname, 'b_c'))
    # Fused bias in i/f/o/c order -- presumably matching W_pre_x's column
    # layout; confirm against the step function.
    b_ifoc = numpy.concatenate([b_i, b_f, b_o, b_c], axis=0)
    self.b_pre_x = theano.shared(value=b_ifoc, borrow=True, name=_p(self.pname, 'b_pre_x'))
    self.params += [self.W_pre_x, self.W_h, self.b_pre_x]
    if self.with_context:
        raise NotImplementedError
    if self.with_begin_tag:
        # Learned (zero-initialised) embedding marking a structure's start.
        self.struct_begin_tag = constant_weight(shape=(self.n_hids,), value=0., name=_p(self.pname, 'struct_begin_tag'))
        self.params += [self.struct_begin_tag]
    if self.with_end_tag:
        # Learned (zero-initialised) embedding marking a structure's end.
        self.struct_end_tag = constant_weight(shape=(self.n_in,), value=0., name=_p(self.pname, 'struct_end_tag'))
        self.params += [self.struct_end_tag]
    if self.n_att_ctx:
        # Sub-LSTM combining attention context with the hidden state, plus
        # the attention module; their params are tracked here as well.
        self.lstm_combine_ctx_h = LSTM(self.n_att_ctx, self.n_hids, rng=self.rng, name=_p(self.pname, 'lstm_combine_ctx_h'))
        self.params.extend(self.lstm_combine_ctx_h.params)
        self.attention = ATTENTION(self.n_hids, self.rng, name=_p(self.pname, 'att_ctx'))
        self.params.extend(self.attention.params)
    if self.seq_pyramid:
        # Pyramid LSTM over the sequence plus a projection back to n_hids.
        self.pyramid_on_seq = LSTM(self.n_att_ctx, self.n_att_ctx, rng=self.rng, name=_p(self.pname, 'pyramid_on_seq'))
        self.params.extend(self.pyramid_on_seq.params)
        self.ff_pyramid2ctx = FeedForward(self.n_att_ctx, self.n_hids, name=_p(self.pname, 'ff_pyramid2ctx'))
        self.params.extend(self.ff_pyramid2ctx.params)
def _init_params(self):
    """Initialise GRU parameters, optional per-pre-activation
    layer-normalisation parameters, and optional context projections."""
    shape_xh = (self.n_in, self.n_hids)
    shape_hh = (self.n_hids, self.n_hids)
    # Input projections, gate biases, and orthogonal recurrent weights.
    self.W_xz = norm_weight(rng=self.rng, shape=shape_xh, name=_p(self.pname, 'W_xz'))
    self.W_xr = norm_weight(rng=self.rng, shape=shape_xh, name=_p(self.pname, 'W_xr'))
    self.W_xh = norm_weight(rng=self.rng, shape=shape_xh, name=_p(self.pname, 'W_xh'))
    self.b_z = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_z'))
    self.b_r = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_r'))
    self.b_h = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_h'))
    self.W_hz = ortho_weight(rng=self.rng, shape=shape_hh, name=_p(self.pname, 'W_hz'))
    self.W_hr = ortho_weight(rng=self.rng, shape=shape_hh, name=_p(self.pname, 'W_hr'))
    self.W_hh = ortho_weight(rng=self.rng, shape=shape_hh, name=_p(self.pname, 'W_hh'))
    self.params = [self.W_xz, self.W_xr, self.W_xh, self.W_hz, self.W_hr, self.W_hh,
                   self.b_z, self.b_r, self.b_h]
    if self.with_layernorm:
        # Pairs of *_lnb (init value scale_add) and *_lns (init value
        # scale_mul) per pre-activation -- presumably layer-norm shift and
        # scale; confirm against the apply()/step code.
        # NOTE(review): shape=(self.n_hids) is a bare int, not a 1-tuple;
        # presumably constant_weight accepts a scalar shape -- verify.
        self.W_xz_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'xz_lnb'))
        self.W_xz_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'xz_lns'))
        self.W_xr_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'xr_lnb'))
        self.W_xr_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'xr_lns'))
        self.W_xh_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'xh_lnb'))
        self.W_xh_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'xh_lns'))
        self.W_z_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'z_lnb'))
        self.W_z_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'z_lns'))
        self.W_r_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'r_lnb'))
        self.W_r_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'r_lns'))
        self.W_h_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'h_lnb'))
        self.W_h_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'h_lns'))
        self.params += [self.W_xz_lnb, self.W_xz_lns, self.W_xr_lnb, self.W_xr_lns, self.W_xh_lnb, self.W_xh_lns, \
                        self.W_z_lnb, self.W_z_lns, self.W_r_lnb, self.W_r_lns, self.W_h_lnb, self.W_h_lns]
    if self.with_context:
        # Context projections into each gate plus the initial-state projection.
        shape_ch = (self.c_hids, self.n_hids)
        self.W_cz = norm_weight(rng=self.rng, shape=shape_ch, name=_p(self.pname, 'W_cz'))
        self.W_cr = norm_weight(rng=self.rng, shape=shape_ch, name=_p(self.pname, 'W_cr'))
        self.W_ch = norm_weight(rng=self.rng, shape=shape_ch, name=_p(self.pname, 'W_ch'))
        self.W_c_init = norm_weight(rng=self.rng, shape=shape_ch, name=_p(self.pname, 'W_c_init'))
        self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]
        if self.with_layernorm:
            # Layer-norm parameters for the context pre-activations.
            self.W_cz_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'cz_lnb'))
            self.W_cz_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'cz_lns'))
            self.W_cr_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'cr_lnb'))
            self.W_cr_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'cr_lns'))
            self.W_ch_lnb = constant_weight(shape=(self.n_hids), value=scale_add, name=_p(self.pname, 'ch_lnb'))
            self.W_ch_lns = constant_weight(shape=(self.n_hids), value=scale_mul, name=_p(self.pname, 'ch_lns'))
            self.params += [self.W_cz_lnb, self.W_cz_lns, self.W_cr_lnb, self.W_cr_lns, self.W_ch_lnb, self.W_ch_lns]
def _init_params(self):
    """Initialise decoder GRU parameters.

    Creates the input/recurrent gate weights and biases, the
    source-context projections and the decoder-state initialiser, the
    auxiliary GRU that folds the lastly generated target word into the
    state used by attention, the attention parameters, and the maxout
    readout weights.
    """
    xh = (self.n_in, self.n_hids)
    hh = (self.n_hids, self.n_hids)
    # Input-to-gate projections and biases.
    self.W_xz = norm_weight(rng=self.rng, shape=xh, name=_p(self.pname, 'W_xz'))
    self.W_xr = norm_weight(rng=self.rng, shape=xh, name=_p(self.pname, 'W_xr'))
    self.W_xh = norm_weight(rng=self.rng, shape=xh, name=_p(self.pname, 'W_xh'))
    self.b_z = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_z'))
    self.b_r = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_r'))
    self.b_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_h'))
    # Orthogonally-initialised recurrent gate weights.
    self.W_hz = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_hz'))
    self.W_hr = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_hr'))
    self.W_hh = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_hh'))
    self.params = [self.W_xz, self.W_xr, self.W_xh,
                   self.W_hz, self.W_hr, self.W_hh,
                   self.b_z, self.b_r, self.b_h]
    # Source-context (n_cdim) projections into each gate, plus the
    # projection and bias used to initialise the decoder state.
    ch = (self.n_cdim, self.n_hids)
    self.W_cz = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_cz'))
    self.W_cr = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_cr'))
    self.W_ch = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_ch'))
    self.W_c_init = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'W_c_init'))
    self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]
    self.b_c_init = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_c_init'))
    self.params += [self.b_c_init]
    # Auxiliary GRU: in the paper e_{i,j} = a(s_{i-1}, h_j); here
    # e_{i,j} = a(GRU(s_{i-1}, y_{i-1}), h_j), so the lastly generated
    # target word is taken into account when computing attention.
    self.W_n1_h = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_n1_h'))
    self.W_n1_r = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_n1_r'))
    self.W_n1_z = ortho_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'W_n1_z'))
    self.b_n1_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_h'))
    self.b_n1_r = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_r'))
    self.b_n1_z = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_z'))
    self.params += [self.W_n1_h, self.W_n1_r, self.W_n1_z,
                    self.b_n1_h, self.b_n1_r, self.b_n1_z]
    if self.with_attention:
        # Attention parameters: context/state projections, bias, the
        # energy vector, and a scalar offset.
        self.A_cp = norm_weight(rng=self.rng, shape=ch, name=_p(self.pname, 'A_cp'))
        self.B_hp = norm_weight(rng=self.rng, shape=hh, name=_p(self.pname, 'B_hp'))
        self.b_tt = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_tt'))
        self.D_pe = norm_weight(rng=self.rng, shape=(self.n_hids, 1), name=_p(self.pname, 'D_pe'))
        self.c_tt = constant_weight(shape=(1,), name=_p(self.pname, 'c_tt'))
        self.params += [self.A_cp, self.B_hp, self.b_tt, self.D_pe, self.c_tt]
    # Maxout readout over context, hidden state and input embedding.
    n_out = self.n_in * self.maxout_part
    self.W_o_c = norm_weight(rng=self.rng, shape=(self.n_cdim, n_out), name=_p(self.pname, 'W_out_c'))
    self.W_o_h = norm_weight(rng=self.rng, shape=(self.n_hids, n_out), name=_p(self.pname, 'W_out_h'))
    self.W_o_e = norm_weight(rng=self.rng, shape=(self.n_in, n_out), name=_p(self.pname, 'W_out_e'))
    self.b_o = constant_weight(shape=(n_out,), name=_p(self.pname, 'b_out_o'))
    self.params += [self.W_o_c, self.W_o_h, self.W_o_e, self.b_o]
def _init_params(self):
    """Initialise decoder GRU parameters.

    Creates the gate weights and biases, source-context projections, the
    auxiliary GRU that folds the lastly generated target word into the
    attention query, attention parameters, optional coverage parameters
    (linguistic or neural-network based), an optional context gate, and
    the maxout readout weights.
    """
    shape_xh = (self.n_in, self.n_hids)
    shape_hh = (self.n_hids, self.n_hids)
    # Input-to-gate projections and biases.
    self.W_xz = norm_weight(shape=shape_xh, name=_p(self.pname, 'W_xz'))
    self.W_xr = norm_weight(shape=shape_xh, name=_p(self.pname, 'W_xr'))
    self.W_xh = norm_weight(shape=shape_xh, name=_p(self.pname, 'W_xh'))
    self.b_z = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_z'))
    self.b_r = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_r'))
    self.b_h = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_h'))
    # Orthogonally-initialised recurrent gate weights.
    self.W_hz = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_hz'))
    self.W_hr = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_hr'))
    self.W_hh = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_hh'))
    self.params = [
        self.W_xz, self.W_xr, self.W_xh, self.W_hz, self.W_hr, self.W_hh,
        self.b_z, self.b_r, self.b_h
    ]
    # Source-context (n_cdim) projections into each gate, plus the
    # projection/bias used to initialise the decoder state.
    shape_ch = (self.n_cdim, self.n_hids)
    self.W_cz = norm_weight(shape=shape_ch, name=_p(self.pname, 'W_cz'))
    self.W_cr = norm_weight(shape=shape_ch, name=_p(self.pname, 'W_cr'))
    self.W_ch = norm_weight(shape=shape_ch, name=_p(self.pname, 'W_ch'))
    self.W_c_init = norm_weight(shape=(self.n_cdim, self.n_hids),
                                name=_p(self.pname, 'W_c_init'))
    self.b_c_init = constant_weight(shape=(self.n_hids, ),
                                    name=_p(self.pname, 'b_c_init'))
    self.params += [
        self.W_cz, self.W_cr, self.W_ch, self.W_c_init, self.b_c_init
    ]
    # Auxiliary GRU: in the paper e_{i,j} = a(s_{i-1}, h_j); here
    # e_{i,j} = a(GRU(s_{i-1}, y_{i-1}), h_j), which considers the lastly
    # generated target word.
    self.W_n1_h = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_n1_h'))
    self.W_n1_r = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_n1_r'))
    self.W_n1_z = ortho_weight(shape=shape_hh, name=_p(self.pname, 'W_n1_z'))
    self.b_n1_h = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_n1_h'))
    self.b_n1_r = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_n1_r'))
    self.b_n1_z = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_n1_z'))
    self.params += [
        self.W_n1_h, self.W_n1_r, self.W_n1_z, self.b_n1_h, self.b_n1_r,
        self.b_n1_z
    ]
    if self.with_attention:
        self.A_cp = norm_weight(shape=shape_ch, name=_p(self.pname, 'A_cp'))
        self.B_hp = norm_weight(shape=shape_hh, name=_p(self.pname, 'B_hp'))
        self.b_tt = constant_weight(shape=(self.n_hids, ), name=_p(self.pname, 'b_tt'))
        self.D_pe = norm_weight(shape=(self.n_hids, 1), name=_p(self.pname, 'D_pe'))
        self.params += [self.A_cp, self.B_hp, self.b_tt, self.D_pe]
        # Coverage only works together with the attention model.
        if self.with_coverage:
            shape_covh = (self.coverage_dim, self.n_hids)
            # NOTE(review): C_covp is never appended to self.params, so it
            # is not trained/saved -- confirm this is intentional.
            self.C_covp = norm_weight(shape=shape_covh, name=_p(self.pname, 'Cov_covp'))
            # BUG FIX: compare strings with '==', not 'is' -- identity of
            # str literals is an interning accident (SyntaxWarning on 3.8+).
            if self.coverage_type == 'linguistic':
                # Linguistic coverage: a fertility model is necessary since
                # it yields better translation and alignment quality.
                self.W_cov_fertility = norm_weight(
                    shape=(self.n_cdim, 1),
                    name=_p(self.pname, 'W_cov_fertility'))
                self.b_cov_fertility = constant_weight(
                    shape=(1, ), name=_p(self.pname, 'b_cov_fertility'))
                self.params += [self.W_cov_fertility, self.b_cov_fertility]
            else:
                # Neural-network based coverage: gated (GRU-style) update.
                shape_covcov = (self.coverage_dim, self.coverage_dim)
                self.W_cov_h = ortho_weight(shape=shape_covcov, name=_p(self.pname, 'W_cov_h'))
                self.W_cov_r = ortho_weight(shape=shape_covcov, name=_p(self.pname, 'W_cov_r'))
                self.W_cov_z = ortho_weight(shape=shape_covcov, name=_p(self.pname, 'W_cov_z'))
                self.b_cov_h = constant_weight(shape=(self.coverage_dim, ), name=_p(self.pname, 'b_cov_h'))
                self.b_cov_r = constant_weight(shape=(self.coverage_dim, ), name=_p(self.pname, 'b_cov_r'))
                self.b_cov_z = constant_weight(shape=(self.coverage_dim, ), name=_p(self.pname, 'b_cov_z'))
                self.params += [
                    self.W_cov_h, self.W_cov_r, self.W_cov_z,
                    self.b_cov_h, self.b_cov_r, self.b_cov_z
                ]
                # Coverage inputs: attention probability ...
                self.W_cov_ph = norm_weight(shape=(1, self.coverage_dim), name=_p(self.pname, 'W_cov_ph'))
                self.W_cov_pr = norm_weight(shape=(1, self.coverage_dim), name=_p(self.pname, 'W_cov_pr'))
                self.W_cov_pz = norm_weight(shape=(1, self.coverage_dim), name=_p(self.pname, 'W_cov_pz'))
                # ... source annotations ...
                self.W_cov_ch = norm_weight(shape=(self.n_cdim, self.coverage_dim), name=_p(self.pname, 'W_cov_ch'))
                self.W_cov_cr = norm_weight(shape=(self.n_cdim, self.coverage_dim), name=_p(self.pname, 'W_cov_cr'))
                self.W_cov_cz = norm_weight(shape=(self.n_cdim, self.coverage_dim), name=_p(self.pname, 'W_cov_cz'))
                # ... and previous decoding states.
                self.W_cov_hh = norm_weight(shape=(self.n_hids, self.coverage_dim), name=_p(self.pname, 'W_cov_hh'))
                self.W_cov_hr = norm_weight(shape=(self.n_hids, self.coverage_dim), name=_p(self.pname, 'W_cov_hr'))
                self.W_cov_hz = norm_weight(shape=(self.n_hids, self.coverage_dim), name=_p(self.pname, 'W_cov_hz'))
                self.params += [
                    self.W_cov_ph, self.W_cov_pr, self.W_cov_pz,
                    self.W_cov_ch, self.W_cov_cr, self.W_cov_cz,
                    self.W_cov_hh, self.W_cov_hr, self.W_cov_hz
                ]
    # Context gate; works for both with_attention and with_context modes.
    if self.with_context_gate:
        self.W_ctx_h = norm_weight(shape=(self.n_hids, self.n_hids),
                                   name=_p(self.pname, 'W_ctx_h'))
        self.W_ctx_c = norm_weight(shape=(self.n_cdim, self.n_hids),
                                   name=_p(self.pname, 'W_ctx_c'))
        self.b_ctx = constant_weight(shape=(self.n_hids, ),
                                     name=_p(self.pname, 'b_ctx'))
        # NOTE(review): b_ctx is left out of self.params, so the gate bias
        # is never updated/saved -- confirm this is intended.
        self.params += [self.W_ctx_h, self.W_ctx_c]
    # Maxout readout over context, hidden state and input embedding.
    n_out = self.n_in * self.maxout_part
    self.W_o_c = norm_weight(shape=(self.n_cdim, n_out), name=_p(self.pname, 'W_out_c'))
    self.W_o_h = norm_weight(shape=(self.n_hids, n_out), name=_p(self.pname, 'W_out_h'))
    self.W_o_e = norm_weight(shape=(self.n_in, n_out), name=_p(self.pname, 'W_out_e'))
    self.b_o = constant_weight(shape=(n_out, ), name=_p(self.pname, 'b_out_o'))
    self.params += [self.W_o_c, self.W_o_h, self.W_o_e, self.b_o]
def _init_params(self):
    """Attention scoring parameters: a projection of the combined decoder
    state, the energy vector, and a scalar bias on the energies."""
    sq = (self.n_hids, self.n_hids)
    self.W_comb_att = norm_weight(rng=self.rng, shape=sq,
                                  name=_p(self.pname, 'W_comb_att'))
    self.U_att = norm_weight(rng=self.rng, shape=(self.n_hids, 1),
                             name=_p(self.pname, 'U_att'))
    self.c_att = constant_weight(shape=(1,), name=_p(self.pname, 'c_att'))
    self.params = [self.W_comb_att, self.U_att, self.c_att]
def __init__(self, n_in, n_out, name='LR'):
    """Logistic-regression layer: weight matrix, bias, and the output
    dimensionality (kept on the instance for later use)."""
    self.W = norm_weight(shape=(n_in, n_out), name=_p(name, 'W'))
    self.b = constant_weight(shape=(n_out,), name=_p(name, 'b'))
    self.params = [self.W, self.b]
    self.n_out = n_out
def _init_params(self):
    """Create decoder parameters used for scoring encoder states.

    Parameters shaped (n_cdim, n_hids) could in principle be tied with
    the decoder's weights; the input-side projections combine the lastly
    generated word with the decoder state and therefore cannot be tied.
    """
    xh = (self.n_in, self.n_hids)
    hh = (self.n_hids, self.n_hids)
    # Input projections and biases for the update/reset/candidate gates.
    self.W_xz = norm_weight(shape=xh, name=_p(self.pname, 'W_xz'))
    self.W_xr = norm_weight(shape=xh, name=_p(self.pname, 'W_xr'))
    self.W_xh = norm_weight(shape=xh, name=_p(self.pname, 'W_xh'))
    self.b_z = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_z'))
    self.b_r = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_r'))
    self.b_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_h'))
    # Orthogonally-initialised recurrent gate weights.
    self.W_hz = ortho_weight(shape=hh, name=_p(self.pname, 'W_hz'))
    self.W_hr = ortho_weight(shape=hh, name=_p(self.pname, 'W_hr'))
    self.W_hh = ortho_weight(shape=hh, name=_p(self.pname, 'W_hh'))
    self.params = [self.W_xz, self.W_xr, self.W_xh,
                   self.W_hz, self.W_hr, self.W_hh,
                   self.b_z, self.b_r, self.b_h]
    # Source-context projections and the decoder-state initialiser.
    # (These would be omitted under tied weights, since the decoder's own
    # weights would be reused; here fresh ones are created and tracked.)
    ch = (self.n_cdim, self.n_hids)
    self.W_cz = norm_weight(shape=ch, name=_p(self.pname, 'W_cz'))
    self.W_cr = norm_weight(shape=ch, name=_p(self.pname, 'W_cr'))
    self.W_ch = norm_weight(shape=ch, name=_p(self.pname, 'W_ch'))
    self.W_c_init = norm_weight(shape=ch, name=_p(self.pname, 'W_c_init'))
    self.params += [self.W_cz, self.W_cr, self.W_ch, self.W_c_init]
    self.b_c_init = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_c_init'))
    self.params += [self.b_c_init]
    # Auxiliary GRU: attention scores a(GRU(s_{i-1}, y_{i-1}), h_j) rather
    # than the paper's a(s_{i-1}, h_j), so the lastly generated target
    # word is taken into account.
    self.W_n1_h = ortho_weight(shape=hh, name=_p(self.pname, 'W_n1_h'))
    self.W_n1_r = ortho_weight(shape=hh, name=_p(self.pname, 'W_n1_r'))
    self.W_n1_z = ortho_weight(shape=hh, name=_p(self.pname, 'W_n1_z'))
    self.b_n1_h = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_h'))
    self.b_n1_r = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_r'))
    self.b_n1_z = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_n1_z'))
    self.params += [self.W_n1_h, self.W_n1_r, self.W_n1_z,
                    self.b_n1_h, self.b_n1_r, self.b_n1_z]
    if self.with_attention:
        # Attention parameters (this variant has no scalar c_tt term).
        self.A_cp = norm_weight(shape=ch, name=_p(self.pname, 'A_cp'))
        self.params += [self.A_cp]
        self.B_hp = norm_weight(shape=hh, name=_p(self.pname, 'B_hp'))
        self.b_tt = constant_weight(shape=(self.n_hids,), name=_p(self.pname, 'b_tt'))
        self.D_pe = norm_weight(shape=(self.n_hids, 1), name=_p(self.pname, 'D_pe'))
        self.params += [self.B_hp, self.b_tt, self.D_pe]
    # Maxout readout (kept even though the encoder-state error path does
    # not strictly need output probabilities).
    n_out = self.n_in * self.maxout_part
    self.W_o_c = norm_weight(shape=(self.n_cdim, n_out), name=_p(self.pname, 'W_out_c'))
    self.W_o_h = norm_weight(shape=(self.n_hids, n_out), name=_p(self.pname, 'W_out_h'))
    self.W_o_e = norm_weight(shape=(self.n_in, n_out), name=_p(self.pname, 'W_out_e'))
    self.b_o = constant_weight(shape=(n_out,), name=_p(self.pname, 'b_out_o'))
    self.params += [self.W_o_c, self.W_o_h, self.W_o_e, self.b_o]