def __init__(self, hs, x, mask, seqmask, x_dim, outputs_info, args, suffix=''):
    self.recdrop = args.recdrop
    self.W_concat, self.b_concat = _linear_params(args.rnn_dim * 2, args.rnn_dim, 'concat%s' % suffix)
    self.W_att1, self.b_att1 = _linear_params(args.rnn_dim, args.rnn_dim, 'att1%s' % suffix)
    self.W_att2, self.b_att2 = _linear_params(args.rnn_dim, args.rnn_dim, 'att2%s' % suffix)
    self.hs = hs  # e.g. from encoder
    self.phi_hs = T.tanh(T.dot(self.hs, self.W_att1) + self.b_att1)
    super(GRULayerAttention, self).__init__(x, mask, seqmask, x_dim, outputs_info, args, suffix=suffix)
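# A minimal sketch (assumption, not this repository's code) of how the attention
# parameters set up above could be used inside the recurrent step: score every
# projected encoder state in self.phi_hs against the previous hidden state, form a
# weighted context, and merge it back to rnn_dim with W_concat. The name `h_prev`
# and the dot-product scoring are assumptions; the actual _step is not shown here.
def _attention_sketch(self, h_prev):
    proj_h = T.tanh(T.dot(h_prev, self.W_att2) + self.b_att2)    # (batch, hdim)
    scores = T.sum(self.phi_hs * proj_h[None, :, :], axis=2)     # (src_len, batch)
    alpha = T.nnet.softmax(scores.T).T                           # attention weights over source positions
    ctx = T.sum(self.hs * alpha[:, :, None], axis=0)             # (batch, hdim) weighted context
    # merge previous hidden state and context back down to rnn_dim
    return T.tanh(T.dot(T.concatenate([h_prev, ctx], axis=1), self.W_concat) + self.b_concat)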
def __init__(self, x, mask, seqmask, x_dim, outputs_info, args, suffix=''):
    # NOTE if want to stack should equal hdim
    self.xdim = x_dim
    self.hdim = args.rnn_dim
    self.recdrop = args.recdrop
    self.stocdrop = args.stocdrop
    self.batch_norm = args.batch_norm
    if args.ortho:
        # normal init for the input weights, orthogonal init for the recurrent
        # weights; the four gate blocks are concatenated along the second axis
        W = np.concatenate([norm_init(self.xdim, self.hdim, scale=0.01)] * 4, axis=1)
        U = np.concatenate([ortho_init(self.hdim, self.hdim, scale=0.05)] * 4, axis=1)
        b = np.zeros((4 * self.hdim,)).astype(floatX)
        self.W = theano.shared(W, name='W%s' % suffix)
        self.b = theano.shared(b, name='b%s' % suffix)
        self.U = theano.shared(U, name='U%s' % suffix)
    else:
        self.W, self.b = _linear_params(self.xdim, 4 * self.hdim, 'W%s' % suffix)
        self.U = _linear_params(self.hdim, 4 * self.hdim, 'U%s' % suffix, bias=False)
    self.params = [self.W, self.b, self.U]

    initial_gamma = 0.1
    if self.batch_norm:
        # per-feature scales (gamma) and shift (beta) for recurrent batch normalization
        self.gamma_inputs = theano.shared(initial_gamma * np.ones(4 * self.hdim).astype('float32'))
        self.gamma_hiddens = theano.shared(initial_gamma * np.ones(4 * self.hdim).astype('float32'))
        self.gamma_outputs = theano.shared(initial_gamma * np.ones(self.hdim).astype('float32'))
        self.beta_outputs = theano.shared(np.zeros(self.hdim).astype('float32'))
        self.params += [self.gamma_inputs, self.gamma_hiddens, self.gamma_outputs, self.beta_outputs]

    rval, updates = theano.scan(self._step, sequences=[x, seqmask], outputs_info=outputs_info)
    # out should be of dim (sequence length, batch size, hidden size)
    self.out = rval[0] * mask[:, :, None]
    self.cell = rval[1] * mask[:, :, None]
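# A minimal sketch (assumption) of a plain LSTM _step that is consistent with the
# concatenated 4 * hdim parameters above; it ignores recdrop/stocdrop/batch_norm
# and assumes the gate ordering (input, forget, output, candidate). The real
# self._step used by theano.scan is not shown in this file.
def _lstm_step_sketch(self, x_t, m_t, h_prev, c_prev):
    pre = T.dot(x_t, self.W) + self.b + T.dot(h_prev, self.U)    # (batch, 4 * hdim)
    i = T.nnet.sigmoid(pre[:, 0 * self.hdim:1 * self.hdim])      # input gate
    f = T.nnet.sigmoid(pre[:, 1 * self.hdim:2 * self.hdim])      # forget gate
    o = T.nnet.sigmoid(pre[:, 2 * self.hdim:3 * self.hdim])      # output gate
    g = T.tanh(pre[:, 3 * self.hdim:4 * self.hdim])              # candidate cell update
    c = f * c_prev + i * g
    h = o * T.tanh(c)
    # carry the previous state forward where the per-step mask is zero
    c = m_t[:, None] * c + (1. - m_t[:, None]) * c_prev
    h = m_t[:, None] * h + (1. - m_t[:, None]) * h_prev
    return h, c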
def __init__(self, inp, n_in, n_out):
    # initialize w/ zeros
    self.W, self.b = _linear_params(n_in, n_out, 'sm')
    E = T.dot(inp, self.W) + self.b
    # time, batch, cat (None just keeps dimension)
    # numerically stable softmax over the category axis
    E = T.exp(E - T.max(E, axis=2, keepdims=True))
    pmf = E / T.sum(E, axis=2, keepdims=True)
    self.p_y_given_x = pmf
    # predictions are the argmax over the category axis (axis=2), matching the softmax axis
    self.y_pred = T.argmax(self.p_y_given_x, axis=2)
    self.out = self.p_y_given_x
    # parameters of the model
    self.params = [self.W, self.b]
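# A hedged usage sketch (not part of this file) showing one standard way to turn
# p_y_given_x into a masked negative log-likelihood; `y` (int targets of shape
# (time, batch)) and `y_mask` are assumed names, not this repository's API.
def _nll_sketch(softmax_layer, y, y_mask):
    probs = softmax_layer.p_y_given_x                                          # (time, batch, cat)
    probs_flat = probs.reshape([probs.shape[0] * probs.shape[1], probs.shape[2]])
    y_flat = y.flatten()
    # probability assigned to the correct class at every (time, batch) position
    logp = T.log(probs_flat[T.arange(y_flat.shape[0]), y_flat] + 1e-8)
    return -T.sum(logp * y_mask.flatten()) / T.sum(y_mask)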
def __init__(self, x, dim, suffix=""): # NOTE if want to stack should equal hdim self.W, self.b = _linear_params(dim * 2, dim, "ds%s" % suffix) # x.shape = [seq_len, batch_size, hdim] # x1.shape = [batch_size, seq_len / 2, hdim * 2] x1 = x.dimshuffle([1, 0, 2]).reshape([x.shape[1], x.shape[0] / 2, x.shape[2] * 2]) # x2.shape = [batch_size, seq_len / 2, hdim] x2 = x1.dot(self.W) + self.b # x3.shape = [seq_len / 2, batch_size, hdim] x3 = x2.dimshuffle([1, 0, 2]) self.out = T.tanh(x3) self.params = [self.W, self.b]
def __init__(self, x, mask, seqmask, x_dim, outputs_info, args, suffix="", backwards=False):
    # NOTE if want to stack should equal hdim
    self.xdim = x_dim
    self.hdim = args.rnn_dim
    self.backwards = backwards
    self.recdrop = args.recdrop
    self.stocdrop = args.stocdrop
    # initialize parameters
    # TODO maybe try initialization here:
    # https://github.com/kyunghyuncho/dl4mt-material/blob/master/session1/nmt.py,
    # helps for memorizing long sequences
    # update gate
    self.W_z, self.b_wz = _linear_params(self.xdim, self.hdim, "wz%s" % suffix, act=T.nnet.sigmoid)
    self.U_z, self.b_uz = _linear_params(self.hdim, self.hdim, "uz%s" % suffix, act=T.nnet.sigmoid)
    # reset gate
    self.W_r, self.b_wr = _linear_params(self.xdim, self.hdim, "wr%s" % suffix, act=T.nnet.sigmoid)
    self.U_r, self.b_ur = _linear_params(self.hdim, self.hdim, "ur%s" % suffix, act=T.nnet.sigmoid)
    # candidate hidden state
    self.W_h, self.b_wh = _linear_params(self.xdim, self.hdim, "wh%s" % suffix)
    self.U_h, self.b_uh = _linear_params(self.hdim, self.hdim, "uh%s" % suffix)
    self.setup(x, mask, seqmask, outputs_info)
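# A minimal sketch (assumption) of the textbook GRU update using the parameters
# above; the actual recurrence lives in self.setup / self._step, which is not
# shown in this file, so this is illustrative rather than the repository's code.
def _gru_step_sketch(self, x_t, h_prev):
    z = T.nnet.sigmoid(T.dot(x_t, self.W_z) + self.b_wz + T.dot(h_prev, self.U_z) + self.b_uz)      # update gate
    r = T.nnet.sigmoid(T.dot(x_t, self.W_r) + self.b_wr + T.dot(h_prev, self.U_r) + self.b_ur)      # reset gate
    h_tilde = T.tanh(T.dot(x_t, self.W_h) + self.b_wh + r * (T.dot(h_prev, self.U_h) + self.b_uh))  # candidate state
    return (1. - z) * h_prev + z * h_tilde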