def _init_w_emb(self): """Initialize task-specific word embeddings. """ self.W_EMB = theano.shared(value=HE_UNIFORM((self.w_i, self.ndim)), name="W_EMB") self._params.append(self.W_EMB)
def __init__(self, a_n_x, a_n_y): """Class constructor. Args: a_n_x (int): number of underlying cassifiers a_n_y (int): number of classes to predict """ self.n_x = a_n_x self.n_y = a_n_y # define the network # input matrix self.x = TT.dmatrix(name="x") # mapping from input to output vector self.X2Y = self._init_X2Y() self.y_bias = theano.shared(value=HE_UNIFORM((1, self.n_y)), name="y_bias") # prediction vector self.y_pred = TT.nnet.softmax( TT.tensordot(self.x, self.X2Y, ((1, 0), (2, 1))) + self.y_bias) # predicted label self.y_lbl = TT.argmax(self.y_pred, axis=1)[0] self._predict = theano.function([self.x], [self.y_lbl, self.y_pred], name="predict") # define trainable parameters self._params = [self.X2Y, self.y_bias]
def _init_nn(self): """Initialize neural network. """ self.intm_dim = max(MIN_DIM, self.ndim - (self.ndim - self.n_y) / 2) # indices of word embeddings self.W_INDICES_ARG1 = TT.ivector(name="W_INDICES_ARG1") self.W_INDICES_ARG2 = TT.ivector(name="W_INDICES_ARG2") # connective's index self.CONN_INDEX = TT.iscalar(name="CONN_INDEX") # initialize the matrix of word embeddings self.init_w_emb() # word embeddings of the arguments self.EMB_ARG1 = self.W_EMB[self.W_INDICES_ARG1] self.EMB_ARG2 = self.W_EMB[self.W_INDICES_ARG2] # connective's embedding self._init_conn_emb() self.EMB_CONN = self.CONN_EMB[self.CONN_INDEX] # perform matrix decomposition _, _, self.ARG1 = TT.nlinalg.svd(self.EMB_ARG1, full_matrices=True) _, _, self.ARG2 = TT.nlinalg.svd(self.EMB_ARG2, full_matrices=True) self.ARG_DIFF = self.ARG1 - self.ARG2 # map decomposed matrices to the intermediate level self.ARG_DIFF2I = theano.shared(value=HE_UNIFORM((self.ndim, 1)), name="ARG_DIFF2I") self.arg_diff_bias = theano.shared(value=HE_UNIFORM((1, self.ndim)), name="arg_diff_bias") self._params.extend([self.ARG_DIFF2I, self.arg_diff_bias]) self.ARGS = (TT.dot(self.ARG_DIFF, self.ARG_DIFF2I).T + self.arg_diff_bias).flatten() # define final units self.I = TT.concatenate((self.ARGS, self.EMB_CONN)) self.I2Y = theano.shared(value=HE_UNIFORM( (self.n_y, self.ndim + self.intm_dim)), name="I2Y") self.y_bias = theano.shared(value=HE_UNIFORM((1, self.n_y)), name="y_bias") self._params.extend([self.I2Y, self.y_bias]) self.Y_pred = TT.nnet.softmax(TT.dot(self.I2Y, self.I).T + self.y_bias) # initialize cost and optimization functions self.Y_gold = TT.vector(name="Y_gold") self._cost = TT.sum((self.Y_pred - self.Y_gold)**2) self._dev_cost = TT.sum((self.Y_pred - self.Y_gold)**2) self._pred_class = TT.argmax(self.Y_pred) grads = TT.grad(self._cost, wrt=self._params) self._init_funcs(grads)
def _init_conn_emb(self): """Initialize task-specific connective embeddings. """ self.CONN_EMB = theano.shared(value=HE_UNIFORM( (self.c_i, self.intm_dim)), name="CONN_EMB") self._params.append(self.CONN_EMB)
def _init_nn(self): """Initialize neural network. """ self.intm_dim = max(100, self.ndim - (self.ndim - self.n_y) / 2) # indices of word embeddings self.W_INDICES_ARG1 = TT.ivector(name="W_INDICES_ARG1") self.W_INDICES_ARG2 = TT.ivector(name="W_INDICES_ARG2") # connective's index self.CONN_INDEX = TT.iscalar(name="CONN_INDEX") # initialize the matrix of word embeddings self.init_w_emb() # word embeddings of the arguments self.EMB_ARG1 = self.W_EMB[self.W_INDICES_ARG1] self.EMB_ARG2 = self.W_EMB[self.W_INDICES_ARG2] # connective's embedding self._init_conn_emb() self.EMB_CONN = self.CONN_EMB[self.CONN_INDEX] # initialize forward LSTM unit invars = ((self.EMB_ARG1, False), (self.EMB_ARG2, False)) params, outvars = self._init_lstm(invars) self._params.extend(params) self.F_OUT_ARG1, self.F_OUT_ARG2 = outvars self.F_ARG1 = TT.mean(self.F_OUT_ARG1, axis=0) self.F_ARG2 = TT.mean(self.F_OUT_ARG2, axis=0) # define final units self.I = TT.concatenate((self.F_ARG1, self.F_ARG2, self.EMB_CONN)) self.I2Y = theano.shared(value=HE_UNIFORM( (self.n_y, self.intm_dim * 3)), name="I2Y") self.y_bias = theano.shared(value=HE_UNIFORM((1, self.n_y)), name="y_bias") self._params.extend([self.I2Y, self.y_bias]) self.Y_pred = TT.nnet.softmax(TT.dot(self.I2Y, self.I).T + self.y_bias) # initialize cost and optimization functions self.Y_gold = TT.vector(name="Y_gold") self._cost = TT.sum((self.Y_pred - self.Y_gold)**2) self._dev_cost = TT.sum((self.Y_pred - self.Y_gold)**2) self._pred_class = TT.argmax(self.Y_pred) grads = TT.grad(self._cost, wrt=self._params) self._init_funcs(grads)
def _init_lstm(self, a_invars, a_sfx="-forward"): """Initialize LSTM layer. Args: a_invars (list(theano.shared)): list of input parameters as symbolic theano variable a_sfx (str): suffix to use for function and parameter names Returns: (2-tuple): parameters to be optimized and list of symbolic outputs from the function """ intm_dim = self.intm_dim # initialize transformation matrices and bias term W_dim = (intm_dim, self.ndim) W = np.concatenate([ ORTHOGONAL(W_dim), ORTHOGONAL(W_dim), ORTHOGONAL(W_dim), ORTHOGONAL(W_dim) ], axis=0) W = theano.shared(value=W, name="W" + a_sfx) U_dim = (intm_dim, intm_dim) U = np.concatenate([ ORTHOGONAL(U_dim), ORTHOGONAL(U_dim), ORTHOGONAL(U_dim), ORTHOGONAL(U_dim) ], axis=0) U = theano.shared(value=U, name="U" + a_sfx) V = ORTHOGONAL(U_dim) # V for vendetta V = theano.shared(value=V, name="V" + a_sfx) b_dim = (1, intm_dim * 4) b = theano.shared(value=HE_UNIFORM(b_dim), name="b" + a_sfx) params = [W, U, V, b] # initialize dropout units w_do = theano.shared(value=floatX(np.ones((4 * intm_dim, ))), name="w_do") w_do = self._init_dropout(w_do) u_do = theano.shared(value=floatX(np.ones((4 * intm_dim, ))), name="u_do") u_do = self._init_dropout(u_do) # custom function for splitting up matrix parts def _slice(_x, n, dim): if _x.ndim == 3: return _x[:, :, n * dim:(n + 1) * dim] return _x[:, n * dim:(n + 1) * dim] # define recurrent LSTM unit def _step(x_, h_, c_, W, U, V, b, w_do, u_do): """Recurrent LSTM unit. Note: The general order of function parameters to fn is: sequences (if any), prior result(s) (if needed), non-sequences (if any) Args: x_ (theano.shared): input vector h_ (theano.shared): output vector c_ (theano.shared): memory state W (theano.shared): input transform matrix U (theano.shared): inner-state transform matrix V (theano.shared): output transform matrix b (theano.shared): bias vector w_do (TT.col): dropout unit for the W matrix u_do (TT.col): dropout unit for the U matrix Returns: (2-tuple(h, c)) new hidden and memory states """ # pre-compute common terms: # W \in R^{236 x 100} # x \in R^{1 x 100} # U \in R^{236 x 59} # h \in R^{1 x 59} # b \in R^{1 x 236} # w_do \in R^{236 x 1} # u_do \in R^{236 x 1} # xhb \in R^{1 x 236} xhb = (TT.dot(W * w_do.dimshuffle( (0, 'x')), x_.T) + TT.dot(U * u_do.dimshuffle( (0, 'x')), h_.T)).T + b # i \in R^{1 x 59} i = TT.nnet.sigmoid(_slice(xhb, 0, intm_dim)) # f \in R^{1 x 59} f = TT.nnet.sigmoid(_slice(xhb, 1, intm_dim)) # c \in R^{1 x 59} c = TT.tanh(_slice(xhb, 2, intm_dim)) c = i * c + f * c_ # V \in R^{59 x 59} # o \in R^{1 x 59} o = TT.nnet.sigmoid(_slice(xhb, 3, intm_dim) + TT.dot(V, c.T).T) # h \in R^{1 x 59} h = o * TT.tanh(c) # return current output and memory state return h.flatten(), c.flatten() m = 0 n = intm_dim ov = None outvars = [] for iv, igbw in a_invars: m = iv.shape[0] ret, _ = theano.scan(_step, sequences=[iv], outputs_info=[ floatX(np.zeros((n, ))), floatX(np.zeros((n, ))) ], non_sequences=[W, U, V, b, w_do, u_do], name="LSTM" + str(iv) + a_sfx, n_steps=m, truncate_gradient=TRUNCATE_GRADIENT, go_backwards=igbw) ov = ret[0] outvars.append(ov) return params, outvars