def test_rmsprop_0():
    # input
    x = TT.vector(name='x')
    B = theano.shared(floatX(np.ones((3, 5))), name='B')
    c = theano.shared(floatX(np.ones(3)), name='c')
    params = [B, c]
    # output
    y_pred = TT.nnet.softmax(TT.dot(B, x.T).T + c)
    y_gold = TT.vector(name="y_gold")
    # cost and grads
    cost = TT.sum((y_pred - y_gold) ** 2)
    grads = TT.grad(cost, wrt=params)
    # funcs
    cost_func, update_func, rms_params = rmsprop(params, grads,
                                                 [x], y_gold, cost)
    # check return values
    assert len(rms_params) == 4
    assert isinstance(rms_params[0][0], TT.sharedvar.TensorSharedVariable)
    assert not np.any(rms_params[0][0].get_value())
    # check convergence
    X = [floatX(np.random.rand(5)) for _ in xrange(N)]
    Y = [floatX(np.random.rand(3)) for _ in xrange(N)]
    icost = init_cost = end_cost = 0.
    for i in xrange(MAX_I):
        icost = 0.
        for x, y in zip(X, Y):
            icost += cost_func(x, y)
            update_func()
        if i == 0:
            init_cost = icost
        elif i == MAX_I - 1:
            end_cost = icost
    assert end_cost < init_cost
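# For reference, a minimal rmsprop consistent with the expectations of the
# test above (three return values, the third being four lists of
# zero-initialized shared variables).  This is only a sketch modeled on the
# standard Theano formulation; the module's actual implementation, and its
# decay, learning-rate, and epsilon constants (0.95, 1e-4, etc.), may differ.
def rmsprop_sketch(params, grads, x, y, cost):
    """Compile rmsprop cost and update functions (illustrative sketch)."""
    zipped_grads = [theano.shared(p.get_value() * floatX(0.))
                    for p in params]
    running_grads = [theano.shared(p.get_value() * floatX(0.))
                     for p in params]
    running_grads2 = [theano.shared(p.get_value() * floatX(0.))
                      for p in params]
    # store the current gradients and their running first and second moments
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g)
            for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_cost = theano.function(x + [y], cost, updates=zgup + rgup + rg2up,
                             name="rmsprop_f_cost")
    # scale the stored gradients by the running RMS estimate and apply them
    updir = [theano.shared(p.get_value() * floatX(0.)) for p in params]
    updir_new = [(ud, 0.9 * ud
                  - 1e-4 * zg / TT.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(params, updir_new)]
    f_update = theano.function([], [], updates=updir_new + param_up,
                               name="rmsprop_f_update")
    return (f_cost, f_update,
            [zipped_grads, running_grads, running_grads2, updir])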
def _init_w2emb(self):
    """Compute a mapping from word2vec to task-specific embeddings.

    Note:
      modifies instance variables in place

    """
    # construct two matrices - one with the original word2vec
    # representations and another one with task-specific embeddings
    m = len(self.w2emb_i)
    n = self.ndim
    j = len(self._aux_keys)
    w2v_emb = floatX(np.empty((m, self.w2v.ndim)))
    task_emb = floatX(np.empty((m, n)))
    k = 0
    for w, i in self.w2emb_i.iteritems():
        k = i - j
        w2v_emb[k] = floatX(self.w2v[w])
        task_emb[k] = floatX(self.W_EMB[i])
    print("Computing task-specific transform matrix...", end="",
          file=sys.stderr)
    self.w2emb, res, rank, _ = np.linalg.lstsq(w2v_emb, task_emb)
    self.w2emb = floatX(self.w2emb)
    print(" done (w2v rank: {:d}, residuals: {:f})".format(rank, sum(res)),
          file=sys.stderr)
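# A toy illustration (not part of the module) of the least-squares mapping
# computed above: given paired word2vec rows A and task-specific rows B,
# np.linalg.lstsq(A, B) yields a transform T minimizing ||A.dot(T) - B||, so
# an out-of-vocabulary word's word2vec vector v can later be projected into
# the task space as v.dot(T).
def _w2emb_lstsq_demo():
    a = floatX(np.random.rand(50, 10))       # stand-in word2vec vectors
    t_true = floatX(np.random.rand(10, 4))   # hidden "true" transform
    b = a.dot(t_true)                        # stand-in task embeddings
    t_est, res, rank, _ = np.linalg.lstsq(a, b)
    v = floatX(np.random.rand(10))           # an unseen word2vec vector
    return v.dot(t_est)                      # its task-space projection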
def train(self, a_ts, a_dev_data=None):
    """Method for training the model.

    Args:
      a_ts (list(2-tuple(x, y))): list of training JSON data
      a_dev_data (2-tuple(dict, dict) or None): list of development JSON
        data

    Returns:
      (void)

    """
    # gold vector
    y_gold = TT.dvector(name="y_gold")
    # define cost and optimization function
    cost = TT.sum((self.y_pred - y_gold) ** 2)
    # predict = theano.function([self.x, y_gold], [self.y_pred, cost],
    #                           name="predict")
    gradients = TT.grad(cost, wrt=self._params)
    f_grad_shared, f_update, _ = rmsprop(self._params, gradients,
                                         [self.x], y_gold, cost)
    # perform actual training
    min_cost = INF
    best_params = []
    start_time = end_time = None
    time_delta = prev_icost = icost = 0.
    a_ts = [(floatX(x), floatX(y)) for x, y in a_ts]
    for i in xrange(MAX_ITERS):
        icost = 0.
        np.random.shuffle(a_ts)
        start_time = datetime.utcnow()
        for x_i, y_i in a_ts:
            try:
                icost += f_grad_shared(x_i, y_i)
                f_update()
            except Exception as e:
                raise e
        if icost < min_cost:
            best_params = [p.get_value() for p in self._params]
            min_cost = icost
        end_time = datetime.utcnow()
        time_delta = (end_time - start_time).seconds
        print("Iteration #{:d}: cost = {:f} ({:.2f} sec)".format(
            i, icost, time_delta), file=sys.stderr)
        if abs(prev_icost - icost) < CONV_EPS:
            break
        prev_icost = icost
    # set best values seen during training
    if best_params:
        for p, val in zip(self._params, best_params):
            p.set_value(val)
def _generate_ts(self, a_data, a_get_w_emb_i, a_get_c_emb_i):
    """Generate training set.

    Args:
      a_data (tuple): input data (discourse relations and parses)
      a_get_w_emb_i (method): custom method for retrieving the word
        embedding index
      a_get_c_emb_i (method): custom method for retrieving the connective
        embedding index

    Returns:
      tuple: lists of input features and expected classes

    """
    x, y = [], []
    if a_data is None:
        return (x, y)
    # generate features
    rels, parses = a_data
    # frequency of words in the corpus
    self._compute_w_stat(parses)
    for i, irel in rels:
        x.append((i, self._rel2x(irel, parses,
                                 a_get_w_emb_i, a_get_c_emb_i)))
        y.append(floatX(irel[SENSE]))
    return (x, y)
def test_svd_1(self):
    # compile function that takes preliminary input and outputs the SVD
    get_svd = theano.function([self.svd.EMB_ARG1], self.svd.ARG1,
                              name="get_svd")
    ret = get_svd(floatX(np.random.randn(20, 30)))
    print(ret.size, ret.shape, file=sys.stderr)
    # the result should have orthonormal rows
    a = ret.dot(ret.T)
    assert np.allclose(a.diagonal(), np.ones(30))
    a -= np.eye(30)
    assert a.max() < 1.e-6
    assert a.min() > -1.e-5
def _init_w2v_emb(self):
    """Initialize word2vec embedding matrix.

    """
    w_emb = np.empty((self.w_i, self.ndim))
    w_emb[self.unk_w_i, :] = 1e-2   # prevent zeros in this row
    for w, i in self.w2emb_i.iteritems():
        if i == self.unk_w_i:
            continue
        w_emb[i] = self.w2v[w]
    self.W_EMB = theano.shared(value=floatX(w_emb), name="W_EMB")
    # we unload the word2vec embeddings before training to free more
    # memory; feel free to comment out the line below if you have plenty
    # of RAM
    self.w2v.unload()
def __init__(self, a_w2v=False, a_lstsq=False, a_max_iters=MAX_ITERS):
    """Class constructor.

    Args:
      a_w2v (bool): use pre-trained word2vec instance
      a_lstsq (bool): pre-train task-specific word embeddings, but use the
        least-squares method to generate embeddings for unknown words from
        generic word2vec vectors
      a_max_iters (int): maximum number of iterations

    """
    # access to the original word2vec resource
    if a_lstsq:
        a_w2v = True
    if a_w2v:
        self.w2v = Word2Vec        # singleton object
    else:
        self.w2v = None
    self.lstsq = a_lstsq
    self._plain_w2v = self.w2v and not self.lstsq
    self.max_iters = a_max_iters
    # matrix mapping word2vec to task-specific embeddings
    self.w2emb = None
    self.ndim = -1                 # vector dimensionality will be
    self.intm_dim = -1             # initialized later
    # mapping from word to its embedding index
    self.unk_w_i = 0
    self._aux_keys = set((0,))
    self.w_i = 1
    self.w2emb_i = dict()
    # mapping from connective to its embedding index
    self.unk_c_i = 0
    self.c_i = 1
    self.c2emb_i = dict()
    # variables needed for training
    self._trained = False
    self._params = []
    self._w_stat = self._pred_class = None
    self.use_dropout = theano.shared(floatX(0.))
    self.W_EMB = self.CONN_EMB = self._cost = self._dev_cost = None
    # initialize theano functions to None
    self._reset_funcs()
    # set up functions for obtaining word embeddings at train and test
    # times
    self._init_wemb_funcs()
def _init_dropout(self, a_input):
    """Create a dropout layer.

    Args:
      a_input (theano.vector): input layer

    Returns:
      theano.vector: dropout layer

    """
    # the dropout layer itself
    output = TT.switch(self.use_dropout,
                       a_input * TRNG.binomial(a_input.shape, p=0.5, n=1,
                                               dtype=a_input.dtype),
                       a_input * floatX(0.5))
    return output
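# A minimal numpy analogue (not from the module) of the switch above: at
# training time each unit is kept with probability 0.5 via a Bernoulli mask;
# at test time activations are instead scaled by 0.5, so their expected
# magnitude matches the training regime.
def _dropout_demo():
    rng = np.random.RandomState(42)
    a = floatX(np.ones(6))
    train_out = a * rng.binomial(n=1, p=0.5, size=a.shape)  # random mask
    test_out = a * floatX(0.5)                              # rescaling
    return train_out, test_out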
def _init_X2Y(self):
    """Initialize tensor for mapping input matrix to output vector.

    Args:
      (void)

    Returns:
      theano.shared: shared theano tensor

    """
    tens = np.zeros((self.n_y, self.n_x, self.n_y))
    tens -= EPS
    for i in xrange(self.n_y):
        tens[i, :, i] = 1.
    return theano.shared(value=floatX(tens), name="X2Y")
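# A small numpy sketch (not part of the module) of the X2Y layout built
# above, shown for n_y = 2 and n_x = 3: slice i holds ones in output
# column i and the small negative constant -EPS everywhere else, so
# contracting an input matrix with the tensor (one plausible use) yields
# one score per class.
def _x2y_demo():
    eps = 1e-5                         # hypothetical stand-in for EPS
    n_y, n_x = 2, 3
    tens = np.zeros((n_y, n_x, n_y)) - eps
    for i in xrange(n_y):
        tens[i, :, i] = 1.
    x = floatX(np.ones((n_y, n_x)))
    # sum over both input axes, leaving an (n_y,)-vector of class scores
    return np.tensordot(x, tens, axes=([0, 1], [0, 1]))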
def _get_test_w2v_lstsq_emb_i(self, a_word):
    """Obtain embedding for the given word.

    Args:
      a_word (str): word whose embedding should be retrieved

    Returns:
      np.array: embedding of the input word

    """
    a_word = _norm_word(a_word)
    emb_i = self.w2emb_i.get(a_word)
    if emb_i is None:
        if a_word in self.w2v:
            # project the generic word2vec vector into the task-specific
            # embedding space via the least-squares transform
            return floatX(np.dot(self.w2v[a_word], self.w2emb))
        return self.W_EMB[self.unk_w_i]
    return self.W_EMB[emb_i]
def test_floatX_0():
    scalar = floatX(0)
    assert scalar.dtype == config.floatX
    assert isinstance(scalar, np.ndarray)
def test_floatX_1():
    arr = floatX(range(5))
    assert arr.dtype == config.floatX
    assert isinstance(arr, np.ndarray)
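# The two tests above assume a floatX helper along these lines (a sketch,
# not necessarily the module's actual definition): it casts arbitrary
# numeric input to a numpy array with Theano's configured float type.
def floatX_sketch(a_data):
    """Return a_data as a numpy array of dtype theano.config.floatX."""
    return np.asarray(a_data, dtype=config.floatX)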
def _init_lstm(self, a_invars, a_sfx="-forward"):
    """Initialize LSTM layer.

    Args:
      a_invars (list(2-tuple)): list of 2-tuples (symbolic input variable,
        flag indicating whether to process the sequence backwards)
      a_sfx (str): suffix to use for function and parameter names

    Returns:
      2-tuple: parameters to be optimized and list of symbolic outputs
        from the function

    """
    intm_dim = self.intm_dim
    # initialize transformation matrices and bias term
    W_dim = (intm_dim, self.ndim)
    W = np.concatenate([ORTHOGONAL(W_dim), ORTHOGONAL(W_dim),
                        ORTHOGONAL(W_dim), ORTHOGONAL(W_dim)], axis=0)
    W = theano.shared(value=W, name="W" + a_sfx)
    U_dim = (intm_dim, intm_dim)
    U = np.concatenate([ORTHOGONAL(U_dim), ORTHOGONAL(U_dim),
                        ORTHOGONAL(U_dim), ORTHOGONAL(U_dim)], axis=0)
    U = theano.shared(value=U, name="U" + a_sfx)
    V = ORTHOGONAL(U_dim)       # V for vendetta
    V = theano.shared(value=V, name="V" + a_sfx)
    b_dim = (1, intm_dim * 4)
    b = theano.shared(value=HE_UNIFORM(b_dim), name="b" + a_sfx)
    params = [W, U, V, b]
    # initialize dropout units
    w_do = theano.shared(value=floatX(np.ones((4 * intm_dim,))),
                         name="w_do")
    w_do = self._init_dropout(w_do)
    u_do = theano.shared(value=floatX(np.ones((4 * intm_dim,))),
                         name="u_do")
    u_do = self._init_dropout(u_do)

    # custom function for splitting up matrix parts
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    # define recurrent LSTM unit
    def _step(x_, h_, c_, W, U, V, b, w_do, u_do):
        """Recurrent LSTM unit.

        Note:
          The general order of function parameters to fn is: sequences
          (if any), prior result(s) (if needed), non-sequences (if any)

        Args:
          x_ (theano.shared): input vector
          h_ (theano.shared): output vector
          c_ (theano.shared): memory state
          W (theano.shared): input transform matrix
          U (theano.shared): inner-state transform matrix
          V (theano.shared): output transform matrix
          b (theano.shared): bias vector
          w_do (TT.col): dropout unit for the W matrix
          u_do (TT.col): dropout unit for the U matrix

        Returns:
          2-tuple(h, c): new hidden and memory states

        """
        # pre-compute common terms:
        # W \in R^{236 x 100},   x \in R^{1 x 100}
        # U \in R^{236 x 59},    h \in R^{1 x 59}
        # b \in R^{1 x 236}
        # w_do \in R^{236 x 1},  u_do \in R^{236 x 1}
        # xhb \in R^{1 x 236}
        xhb = (TT.dot(W * w_do.dimshuffle((0, 'x')), x_.T)
               + TT.dot(U * u_do.dimshuffle((0, 'x')), h_.T)).T + b
        # i \in R^{1 x 59}
        i = TT.nnet.sigmoid(_slice(xhb, 0, intm_dim))
        # f \in R^{1 x 59}
        f = TT.nnet.sigmoid(_slice(xhb, 1, intm_dim))
        # c \in R^{1 x 59}
        c = TT.tanh(_slice(xhb, 2, intm_dim))
        c = i * c + f * c_
        # V \in R^{59 x 59}, o \in R^{1 x 59}
        o = TT.nnet.sigmoid(_slice(xhb, 3, intm_dim) + TT.dot(V, c.T).T)
        # h \in R^{1 x 59}
        h = o * TT.tanh(c)
        # return current output and memory state
        return h.flatten(), c.flatten()

    m = 0
    n = intm_dim
    ov = None
    outvars = []
    for iv, igbw in a_invars:
        m = iv.shape[0]
        ret, _ = theano.scan(_step,
                             sequences=[iv],
                             outputs_info=[floatX(np.zeros((n,))),
                                           floatX(np.zeros((n,)))],
                             non_sequences=[W, U, V, b, w_do, u_do],
                             name="LSTM" + str(iv) + a_sfx,
                             n_steps=m,
                             truncate_gradient=TRUNCATE_GRADIENT,
                             go_backwards=igbw)
        ov = ret[0]
        outvars.append(ov)
    return params, outvars
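# An illustrative numpy sketch (not part of the module) of the gate layout
# that _slice relies on: the pre-activations of the four gates i, f, c, and
# o are stored side by side in one (1, 4 * d) row, and block n is recovered
# as columns [n * d, (n + 1) * d).
def _gate_slice_demo():
    d = 3
    xhb = floatX(np.arange(4 * d)).reshape(1, 4 * d)
    i_blk = xhb[:, 0 * d:1 * d]  # input gate pre-activation
    f_blk = xhb[:, 1 * d:2 * d]  # forget gate pre-activation
    c_blk = xhb[:, 2 * d:3 * d]  # candidate cell state
    o_blk = xhb[:, 3 * d:4 * d]  # output gate pre-activation
    return i_blk, f_blk, c_blk, o_blk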
def train(self, a_train_data, a_dev_data=None, a_n_y=-1, a_i=-1,
          a_train_out=None, a_dev_out=None):
    """Method for training the model.

    Args:
      a_train_data (2-tuple(list, dict)): list of training JSON data
      a_dev_data (2-tuple(list, dict) or None): list of development JSON
        data
      a_n_y (int): number of distinct classes
      a_i (int): row index for the output predictions
      a_train_out (np.array or None): predictions for the training set
      a_dev_out (np.array or None): predictions for the development set

    Returns:
      void:

    Note:
      updates ``a_train_out`` and ``a_dev_out`` in place

    """
    self.n_y = a_n_y
    # allocate data to development set if there is none
    if a_dev_data is None or not a_dev_data[0]:
        train_rels, parses = a_train_data
        docs = parses.keys()
        n_docs = len(docs)
        n_dev = max(n_docs / 10, 1)
        # sample without replacement
        dev_docs = set(np.random.choice(docs, n_dev, False))
        for ddname in dev_docs:
            print("dev_doc = '{:s}'".format(ddname).encode(ENCODING),
                  file=sys.stderr)
        new_train_rels, dev_rels = [], []
        for irel in train_rels:
            # relations are numbered at this point
            if irel[-1][DOC_ID] in dev_docs:
                dev_rels.append(irel)
            else:
                new_train_rels.append(irel)
        a_train_data = (new_train_rels, parses)
        a_dev_data = (dev_rels, parses)
    # convert training and development sets to features
    x_train, y_train = self._generate_ts(a_train_data,
                                         self.get_train_w_emb_i,
                                         self.get_train_c_emb_i)
    x_dev, y_dev = self._generate_ts(a_dev_data,
                                     self.get_test_w_emb_i,
                                     self.get_test_c_emb_i)
    # initialize the network
    self._init_nn()
    # activate dropout for training
    self.use_dropout.set_value(1.)
    # perform the training
    best_params = []
    dev_err = dev_cost = 0.
    prev_train_cost = train_cost = 0.
    min_dev_err = min_dev_cost = INF
    try:
        for i in xrange(self.max_iters):
            train_cost = 0.
            start_time = datetime.utcnow()
            # perform one training iteration
            for (_, (emb1, emb2, conn)), y in zip(x_train, y_train):
                train_cost += self._grad_shared(emb1, emb2, conn, y)
                self._update()
            # estimate the model on the dev set
            dev_err = dev_cost = 0.
            # temporarily deactivate dropout
            self.use_dropout.set_value(0.)
            for (_, (emb1, emb2, conn)), y in zip(x_dev, y_dev):
                dev_err += (y[self._predict_class(emb1, emb2, conn)] == 0)
                dev_cost += self._compute_dev_cost(emb1, emb2, conn, y)
            # switch dropout on again
            self.use_dropout.set_value(1.)
            end_time = datetime.utcnow()
            time_delta = (end_time - start_time).seconds
            if min_dev_err == INF or dev_err < min_dev_err or \
               (dev_err == min_dev_err and dev_cost < min_dev_cost):
                best_params = [p.get_value() for p in self._params]
                min_dev_err = dev_err
                min_dev_cost = dev_cost
            print("Iteration {:d}:\ttrain_cost = {:f}\t"
                  "dev_err={:d}\tdev_cost={:f}\t({:.2f} sec)".format(
                      i, train_cost, int(dev_err), dev_cost, time_delta),
                  file=sys.stderr)
            if abs(prev_train_cost - train_cost) < CONV_EPS:
                break
            prev_train_cost = train_cost
    except BaseException as e:
        print("ERROR: '{:s}'".format(e.message))
    # deactivate dropout
    self.use_dropout.set_value(0.)
    # set best parameter values seen during training
    if best_params:
        for p, val in zip(self._params, best_params):
            p.set_value(val)
    else:
        raise RuntimeError("Network could not be trained.")
    # make predictions for the judge
    if a_i >= 0:
        if a_train_out is not None:
            for i, x_i in x_train:
                self._predict(x_i, a_train_out[i], a_i)
        if a_dev_out:
            for i, x_i in x_dev:
                self._predict(x_i, a_dev_out[i], a_i)
        else:
            # the dev set was carved out of the training data, so its
            # predictions go into the training output
            for i, x_i in x_dev:
                self._predict(x_i, a_train_out[i], a_i)
    # reset function members to allow cPickle to store this model
    self._reset_funcs()
    self._cleanup(self._rms_params)
    if self.w2v:
        self.W_EMB = floatX(self.W_EMB.get_value())
        if self.lstsq:
            self.w2v.load()
            self._init_w2emb()
    if self.w2v is not None:
        self.w2v.unload()
        self.w2v = None
    self._trained = True