def logp_t(cls, value, transport, inputs):
    #print(value.tag.test_value)
    #print(mu.tag.test_value)
    #print(mapping.inv(value).tag.test_value)
    value = debug(value, 'value', force=False)
    delta = transport.inv(inputs, value, noise=True)
    det_m = transport.logdet_dinv(inputs, value)
    delta = debug(delta, 'delta', force=False)

    npi = np.float32(-0.5) * value.shape[0].astype(th.config.floatX) * tt.log(np.float32(2.0 * np.pi))
    dot2 = np.float32(-0.5) * delta.dot(delta.T)

    npi = debug(npi, 'npi', force=False)
    dot2 = debug(dot2, 'dot2', force=False)
    det_m = debug(det_m, 'det_m', force=False)

    r = npi + dot2 + det_m

    cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
    cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))

    return ifelse(cond1, np.float32(-1e30),
                  ifelse(cond2, np.float32(-1e30), r))
def get_nesterov_sgd_updates(param_list, gradients, velocities, lr, mu):
    """Do SGD updates with Nesterov momentum."""
    updates = []
    for p, g, v in zip(param_list, gradients, velocities):
        new_v = mu * v - lr * g
        new_p = p - mu * v + (1 + mu) * new_v
        has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                          T.any(T.isnan(new_v) + T.isinf(new_v)))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
        updates.append((v, ifelse(has_non_finite, v, new_v)))
    return updates
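A minimal usage sketch (the shared variables, data, and loss below are hypothetical, not from the original project): the guarded updates plug straight into theano.function.

import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse

# Hypothetical parameter, velocity buffer, and quadratic loss for illustration.
W = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX), name='W')
V = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX), name='V')
x = T.matrix('x')
loss = T.sum(T.sqr(x.dot(W)))
grads = T.grad(loss, [W])

updates = get_nesterov_sgd_updates([W], grads, [V], lr=0.01, mu=0.9)
train = theano.function([x], loss, updates=updates, allow_input_downcast=True)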
def __init__(self, n_comp=10, verbose=False):
    # Theano initialization
    self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
    self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

    T_p_x_white = T.fmatrix()
    T_lrate = T.fscalar()
    T_block = T.fscalar()

    T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(self.T_bias, 1)
    T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))
    T_out = self.T_weights + T_lrate * T.dot(
        T_block * T.identity_like(self.T_weights) +
        T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
    T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1, 1))
    T_max_w = T.max(self.T_weights)
    T_isnan = T.any(T.isnan(self.T_weights))

    self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                    [T_max_w, T_isnan],
                                    updates=[(self.T_weights, T_out),
                                             (self.T_bias, T_bias_out)],
                                    allow_input_downcast=True)

    T_matrix = T.fmatrix()
    T_cov = T.dot(T_matrix, T.transpose(T_matrix)) / T_block
    self.cov_fun = theano.function([T_matrix, T_block], T_cov,
                                   allow_input_downcast=True)

    self.loading = None
    self.sources = None
    self.weights = None
    self.n_comp = n_comp
    self.verbose = verbose
def accurate_pixels_class(self, y):
    """
    Returns number of correctly classified pixels per class
    and total number of pixels per class.
    (pair of numpy 1d arrays)

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError(
            'y should have the same shape as self.y_pred',
            ('y', y.type, 'y_pred', self.y_pred.type)
        )
    # check if y is of the correct datatype
    if not y.dtype.startswith('int'):
        raise NotImplementedError()

    correct = T.zeros((self.n_classes), dtype='int32')
    total = T.zeros((self.n_classes), dtype='int32')
    for i in range(self.n_classes):
        correct = T.set_subtensor(
            correct[i],
            T.switch(
                T.any(T.eq(y, i)),
                T.sum(T.eq(y[T.eq(y, i).nonzero()],
                           self.y_pred[T.eq(y, i).nonzero()])),
                0)
        )
        total = T.set_subtensor(total[i], T.sum(T.eq(y, i)))
    return correct, total
def compile_eval_function(nnet):
    X = T.tensor4()
    y = T.ivector()

    # get prediction by fully convolutional network
    prediction = lasagne.layers.get_output(nnet.dense3_conv_layer,
                                           deterministic=True, inputs=X)

    # get output scores on first dim
    # before flattening on 2dim and then get scores on second dim
    prediction = prediction.transpose((1, 0, 2, 3)) \
        .flatten(2).transpose((1, 0))
    prediction = T.nnet.softmax(prediction)

    # spatial averaging
    prediction = T.mean(prediction, axis=0)

    # compute top1 and top5 accuracies
    sorted_pred = T.argsort(prediction)
    top1_acc = T.mean(T.eq(sorted_pred[-1], y), dtype='floatX')
    top5_acc = T.mean(T.any(T.eq(sorted_pred[-5:], T.shape_padright(y)),
                            axis=1), dtype='floatX')

    return theano.function([X, y], [top1_acc, top5_acc])
def in_transit(self, t, r=None, texp=None, light_delay=False):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    if light_delay:
        raise NotImplementedError(
            "Light travel time delay is not implemented for simple orbits"
        )
    dt = tt.mod(tt.shape_padright(t) - self._ref_time, self.period)
    dt -= self._half_period
    if r is None:
        tol = 0.5 * self.duration
    else:
        x = (r + self.r_star) ** 2 - self._b_norm ** 2
        tol = tt.sqrt(x) / self.speed
    if texp is not None:
        tol += 0.5 * texp
    mask = tt.any(tt.abs_(dt) < tol, axis=-1)
    return tt.arange(t.size)[mask]
def posdef(self, x, diag):
    """
    Check to determine positive definiteness of the Kronecker-structured
    covariance matrix. This operation is slow, and is thus not recommended
    to be called repeatedly as a check during optimization. Rather, the
    user should use this function as a guide to ensuring positive
    definiteness of the model for varying values of the kernel parameters.

    Args:
        tensor x: The input coordinates.
        tensor diag: The white noise variances. This should be an NxM array
            where N is the length of x and M is the size of alpha.

    Returns:
        isposdef: A boolean that is True if the covariance matrix is
            positive definite and False otherwise. The user will need to
            call ``isposdef.eval()`` to compute the returned value from
            the theano tensor variable.
    """
    diag = tt.as_tensor_variable(diag)
    diag = tt.reshape(diag.T, (1, diag.size))[0]
    x = tt.as_tensor_variable(x)

    T = self.term.value(x[:, None] - x[None, :])
    if 'alpha' in vars(self):
        R = self.alpha[:, None] * self.alpha[None, :]
        K = tt.slinalg.kron(T, R)
    elif 'R' in vars(self):
        # tt.slinalg is a module, not a callable; the Kronecker product is the intent here.
        K = tt.slinalg.kron(T, self.R)

    chol = tt.slinalg.Cholesky(on_error='nan')
    L = chol(K + tt.diag(diag))
    return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
def cost(self):
    """
    :rtype: (theano.Variable | None, dict[theano.Variable,theano.Variable] | None)
    :returns: cost, known_grads
    """
    known_grads = None
    if self.loss == 'ce' or self.loss == 'priori':
        if self.attrs.get("target", "").endswith("[sparse:coo]"):
            assert isinstance(self.y, tuple)
            assert len(self.y) == 3
            from NativeOp import crossentropy_softmax_and_gradient_z_sparse
            y_mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
            ce, grad_z = crossentropy_softmax_and_gradient_z_sparse(
                self.z, self.index, self.y[0], self.y[1], self.y[2], y_mask)
            return self.norm * T.sum(ce), {self.z: grad_z}
        if self.y_data_flat.type == T.ivector().type:
            # Use crossentropy_softmax_1hot to have a more stable and more optimized gradient calculation.
            # Theano fails to use it automatically; I guess our self.i indexing is too confusing.
            #idx = self.index.flatten().dimshuffle(0,'x').repeat(self.y_m.shape[1],axis=1)  # faster than line below
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m * idx, y_idx=self.y_data_flat * self.index.flatten())
            nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = -T.log(T.nnet.softmax(self.y_m)[self.i,self.y_data_flat[self.i]])
            #z_c = T.exp(self.z[:,self.y])
            #nll = -T.log(z_c / T.sum(z_c,axis=2,keepdims=True))
            #nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)
            #nll = T.set_subtensor(nll[self.j], T.constant(0.0))
        else:
            nll = -T.dot(T.log(T.clip(self.p_y_given_x[self.i], 1.e-38, 1.e20)),
                         self.y_data_flat[self.i].T)
        return self.norm * T.sum(nll), known_grads
    elif self.loss == 'entropy':
        h_e = T.exp(self.y_m)  # (TB)
        pcx = T.clip((h_e / T.sum(h_e, axis=1, keepdims=True)).reshape(
            (self.index.shape[0], self.index.shape[1], self.attrs['n_out'])), 1.e-6, 1.e6)  # TBD
        ee = -T.sum(pcx[self.i] * T.log(pcx[self.i]))  # TB
        #nll, pcxs = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y[self.i])
        nll, _ = T.nnet.crossentropy_softmax_1hot(x=self.y_m, y_idx=self.y_data_flat)  # TB
        ce = nll.reshape(self.index.shape) * self.index  # TB
        y = self.y_data_flat.reshape(self.index.shape) * self.index  # TB
        f = T.any(T.gt(y, 0), axis=0)  # B
        return T.sum(f * T.sum(ce, axis=0) + (1 - f) * T.sum(ee, axis=0)), known_grads
        #return T.sum(T.switch(T.gt(T.sum(y,axis=0),0), T.sum(ce, axis=0), -T.sum(ee, axis=0))), known_grads
        #return T.switch(T.gt(T.sum(self.y_m[self.i]),0), T.sum(nll), -T.sum(pcx * T.log(pcx))), known_grads
    elif self.loss == 'priori':
        pcx = self.p_y_given_x[self.i, self.y_data_flat[self.i]]
        pcx = T.clip(pcx, 1.e-38, 1.e20)  # For pcx near zero, the gradient will likely explode.
        return -T.sum(T.log(pcx)), known_grads
    elif self.loss == 'sse':
        if self.y_data_flat.dtype.startswith('int'):
            y_f = T.cast(T.reshape(self.y_data_flat,
                                   (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim=1), 'int32')
            y_oh = T.eq(T.shape_padleft(T.arange(self.attrs['n_out']), y_f.ndim),
                        T.shape_padright(y_f, 1))
            return T.mean(T.sqr(self.p_y_given_x[self.i] - y_oh[self.i])), known_grads
        else:
            #return T.sum(T.sum(T.sqr(self.y_m - self.y.reshape(self.y_m.shape)), axis=1)[self.i]), known_grads
            return T.sum(T.sqr(self.y_m[self.i] - self.y_data_flat.reshape(self.y_m.shape)[self.i])), known_grads
            #return T.sum(T.sum(T.sqr(self.z - (self.y.reshape((self.index.shape[0], self.index.shape[1], self.attrs['n_out']))[:self.z.shape[0]])), axis=2).flatten()[self.i]), known_grads
            #y_z = T.set_subtensor(T.zeros((self.index.shape[0],self.index.shape[1],self.attrs['n_out']), dtype='float32')[:self.z.shape[0]], self.z).flatten()
            #return T.sum(T.sqr(y_z[self.i] - self.y[self.i])), known_grads
            #return T.sum(T.sqr(self.y_m - self.y[:self.z.shape[0]*self.index.shape[1]]).flatten()[self.i]), known_grads
    else:
        assert False, "unknown loss: %s" % self.loss
def in_transit(self, t, r=0.0, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r) + z
    R = self.r_star + z

    # Wrap the times into time since transit
    hp = 0.5 * self.period
    dt = tt.mod(self._warp_times(t) + hp, self.period) - hp

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / R
        arg = tt.square(1 + k) - tt.square(self.b)
        factor = R / (self.a * self.sin_incl)
        hdur = hp * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
        t_start = -hdur
        t_end = hdur
        flag = z
    else:
        M_contact = self.contact_points_op(
            self.a,
            self.ecc,
            self.cos_omega,
            self.sin_omega,
            self.cos_incl + z,
            self.sin_incl + z,
            R + r,
        )
        flag = M_contact[2]

        t_start = (M_contact[0] - self.M0) / self.n
        t_start = tt.mod(t_start + hp, self.period) - hp
        t_end = (M_contact[1] - self.M0) / self.n
        t_end = tt.mod(t_end + hp, self.period) - hp

        t_start = tt.switch(tt.gt(t_start, 0.0), t_start - self.period, t_start)
        t_end = tt.switch(tt.lt(t_end, 0.0), t_end + self.period, t_end)

    if texp is not None:
        t_start -= 0.5 * texp
        t_end += 0.5 * texp

    mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)

    result = ifelse(tt.all(tt.eq(flag, 0)),
                    tt.arange(t.size)[mask],
                    tt.arange(t.size))

    return result
def compute_step(self, param, previous_step):
    not_finite = tensor.any(tensor.or_(
        tensor.isnan(previous_step), tensor.isinf(previous_step)))
    step = tensor.switch(not_finite, self.scaler * param, previous_step)
    return step, []
def accurate_pixels_class(self, y):
    """
    Returns number of correctly classified pixels per class
    and total number of pixels per class.
    (pair of numpy 1d arrays)

    :type y: theano.tensor.TensorType
    :param y: corresponds to a vector that gives for each example the
              correct label
    """
    # check if y has same dimension of y_pred
    if y.ndim != self.y_pred.ndim:
        raise TypeError('y should have the same shape as self.y_pred',
                        ('y', y.type, 'y_pred', self.y_pred.type))
    # check if y is of the correct datatype
    if not y.dtype.startswith('int'):
        raise NotImplementedError()

    correct = T.zeros((self.n_classes), dtype='int32')
    total = T.zeros((self.n_classes), dtype='int32')
    for i in range(self.n_classes):
        correct = T.set_subtensor(
            correct[i],
            T.switch(
                T.any(T.eq(y, i)),
                T.sum(T.eq(y[T.eq(y, i).nonzero()],
                           self.y_pred[T.eq(y, i).nonzero()])),
                0))
        total = T.set_subtensor(total[i], T.sum(T.eq(y, i)))
    return correct, total
def get_vanilla_sgd_updates(param_list, gradients, lr):
    """Do SGD updates with vanilla step rule."""
    updates = []
    for p, g in zip(param_list, gradients):
        new_p = p - lr * g
        has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
        updates.append((p, ifelse(has_non_finite, p, new_p)))
    return updates
def any(x, axis=None, keepdims=False):
    """Bitwise reduction (logical OR).
    """
    y = T.any(x, axis=axis, keepdims=keepdims)
    if isinstance(get_shape(x), (tuple, list)):
        output_shape = auto_infer_shape(T.any, x, axis=axis, keepdims=keepdims)
        add_shape(y, output_shape)
    return y
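For reference, the underlying reduction without the project-specific shape bookkeeping (get_shape / auto_infer_shape / add_shape) behaves like this; the names below are illustrative only.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
row_has_nonzero = T.any(T.neq(x, 0), axis=1)      # logical OR over each row
f = theano.function([x], row_has_nonzero, allow_input_downcast=True)
print(f(np.array([[0, 0], [0, 3]])))              # -> [0 1], i.e. False, True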
def in_transit(self, t, r=0.0, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r) + z
    R = self.r_star + z

    # Wrap the times into time since transit
    hp = 0.5 * self.period
    dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp

    if self.ecc is None:
        # Equation 14 from Winn (2010)
        k = r / R
        arg = tt.square(1 + k) - tt.square(self.b)
        factor = R / (self.a * self.sin_incl)
        hdur = hp * tt.arcsin(factor * tt.sqrt(arg)) / np.pi
        t_start = -hdur
        t_end = hdur
        flag = z
    else:
        M_contact = self.contact_points_op(
            self.a, self.ecc, self.cos_omega, self.sin_omega,
            self.cos_incl + z, self.sin_incl + z, R + r)
        flag = M_contact[2]

        t_start = (M_contact[0] - self.M0) / self.n
        t_start = tt.mod(t_start + hp, self.period) - hp
        t_end = (M_contact[1] - self.M0) / self.n
        t_end = tt.mod(t_end + hp, self.period) - hp

        t_start = tt.switch(tt.gt(t_start, 0.0), t_start - self.period, t_start)
        t_end = tt.switch(tt.lt(t_end, 0.0), t_end + self.period, t_end)

    if texp is not None:
        t_start -= 0.5 * texp
        t_end += 0.5 * texp

    mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)

    result = ifelse(tt.all(tt.eq(flag, 0)),
                    tt.arange(t.size)[mask],
                    tt.arange(t.size))

    return result
def model_4(gamble, exclude, A_v, B_v, C_v, amb_A, amb_B, amb_C, rho, lambda_param,
            alpha_noloss_context, alpha_loss_context, gamma, amb_gain_value, amb_loss_value):

    # When using non-centered parameterisation, parameters can go below zero - need to stop this
    rho = T.switch(T.lt(rho, 0.01), 0.01, rho)
    lambda_param = T.switch(T.lt(lambda_param, 0.01), 0.01, lambda_param)
    gamma = T.switch(T.lt(gamma, 0.01), 0.01, gamma)
    alpha_noloss_context = T.switch(T.lt(alpha_noloss_context, 0.01), 0.01, alpha_noloss_context)
    alpha_loss_context = T.switch(T.lt(alpha_loss_context, 0.01), 0.01, alpha_loss_context)

    alpha = T.switch(T.any(T.stack([A_v, B_v, C_v]).squeeze() < 0, axis=0),
                     alpha_loss_context, alpha_noloss_context)

    # Calculate values for the 3 options (one of these may not be an option,
    # in which case its value ends up being zero)
    u_A = T.switch(T.gt(A_v, 0),
                   ((1 - amb_A) * T.power(A_v, rho)) +
                   amb_A * (gamble[0] * (alpha * T.power(amb_gain_value, rho)) +
                            ((1 - gamble[0]) * (alpha * T.power(amb_gain_value, rho)))),
                   -((1 - amb_A) * lambda_param * T.power(T.abs_(A_v), rho) +
                     amb_A * (gamble[0] * (alpha * T.power(amb_loss_value, rho)) +
                              ((1 - gamble[0]) * (alpha * T.power(amb_loss_value, rho))))))

    u_B = T.switch(T.gt(B_v, 0),
                   ((1 - amb_B) * T.power(B_v, rho)) +
                   amb_B * (gamble[1] * (alpha * T.power(amb_gain_value, rho)) +
                            ((1 - gamble[1]) * (alpha * T.power(amb_gain_value, rho)))),
                   -((1 - amb_B) * lambda_param * T.power(T.abs_(B_v), rho) +
                     amb_B * (gamble[1] * (alpha * T.power(amb_loss_value, rho)) +
                              ((1 - gamble[1]) * (alpha * T.power(amb_loss_value, rho))))))

    u_C = T.switch(T.gt(C_v, 0),
                   ((1 - amb_C) * T.power(C_v, rho)) +
                   amb_C * (gamble[2] * (alpha * T.power(amb_gain_value, rho)) +
                            ((1 - gamble[2]) * (alpha * T.power(amb_gain_value, rho)))),
                   -((1 - amb_C) * lambda_param * T.power(T.abs_(C_v), rho) +
                     amb_C * (gamble[2] * (alpha * T.power(amb_loss_value, rho)) +
                              ((1 - gamble[2]) * (alpha * T.power(amb_loss_value, rho))))))

    # If we have only two choices (i.e. no gamble), the ambiguous option should be labelled as a gamble
    gamble = T.switch(T.eq(exclude.sum(axis=0), 1),
                      T.stack([amb_A, amb_B, amb_C]).squeeze(), gamble)

    # Get value of gamble option
    gamble_weighting = gamble / gamble.sum(axis=0)
    u_gamble = gamble_weighting[0] * (u_A * (1 - exclude[0])) + \
               gamble_weighting[1] * (u_B * (1 - exclude[1])) + \
               gamble_weighting[2] * (u_C * (1 - exclude[2]))

    # Get value of sure option
    sure_weighting = (1 - gamble) / ((1 - gamble).sum(axis=0) - exclude.sum(axis=0))
    u_sure = sure_weighting[0] * (u_A * (1 - exclude[0])) + \
             sure_weighting[1] * (u_B * (1 - exclude[1])) + \
             sure_weighting[2] * (u_C * (1 - exclude[2]))

    # Calculate choice probability
    p = inv_logit(gamma * (u_gamble - u_sure))

    return p
def logp_cho(cls, value, mu, cho, mapping):
    """
    Calculates the log p of the parameters given the data
    :param value: the data
    :param mu: the location (obtained from the hyperparameters)
    :param cho: the cholesky decomposition of the dispersion matrix
    :param mapping: the mapping of the warped.
    :return: it returns the value of the log p of the parameters given the data (values)
    """
    #print(value.tag.test_value)
    #print(mu.tag.test_value)
    #print(mapping.inv(value).tag.test_value)

    #mu = debug(mu, 'mu', force=True)
    #value = debug(value, 'value', force=False)

    delta = mapping.inv(value) - mu
    #delta = debug(delta, 'delta', force=True)
    #cho = debug(cho, 'cho', force=True)

    lcho = tsl.solve_lower_triangular(cho, delta)
    #lcho = debug(lcho, 'lcho', force=False)

    lcho2 = lcho.T.dot(lcho)
    #lcho2 = debug(lcho2, 'lcho2', force=True)

    npi = np.float32(-0.5) * cho.shape[0].astype(th.config.floatX) * tt.log(np.float32(2.0 * np.pi))
    dot2 = np.float32(-0.5) * lcho2

    #diag = debug(tnl.diag(cho), 'diag', force=True)
    #_log = debug(tt.log(diag), 'log', force=True)

    det_k = -tt.sum(tt.log(tnl.diag(cho)))
    det_m = mapping.logdet_dinv(value)

    #npi = debug(npi, 'npi', force=False)
    #dot2 = debug(dot2, 'dot2', force=False)
    #det_k = debug(det_k, 'det_k', force=False)
    #det_m = debug(det_m, 'det_m', force=False)

    r = npi + dot2 + det_k + det_m

    cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
    cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))
    cond3 = tt.or_(tt.any(tt.isinf_(cho)), tt.any(tt.isnan_(cho)))
    cond4 = tt.or_(tt.any(tt.isinf_(lcho)), tt.any(tt.isnan_(lcho)))

    return ifelse(
        cond1, np.float32(-1e30),
        ifelse(
            cond2, np.float32(-1e30),
            ifelse(cond3, np.float32(-1e30),
                   ifelse(cond4, np.float32(-1e30), r))))
def _step(input, *states):
    output, new_states = step_function(input, states)
    if masking:
        # if all-zero input timestep, return
        # all-zero output and unchanged states
        switch = T.any(input, axis=-1, keepdims=True)
        output = T.switch(switch, output, 0. * output)
        return_states = []
        for state, new_state in zip(states, new_states):
            return_states.append(T.switch(switch, new_state, state))
        return [output] + return_states
    else:
        return [output] + new_states
def do_compute(self, quiet):
    if quiet:
        self._d, self._W, _ = ops.factor_quiet(
            self._a, self._U, self._V, self._P)
        self._log_det = tt.switch(
            tt.any(self._d < 0.0), -np.inf, tt.sum(tt.log(self._d)))
    else:
        self._d, self._W, _ = ops.factor(self._a, self._U, self._V, self._P)
        self._log_det = tt.sum(tt.log(self._d))

    self._norm = -0.5 * (self._log_det + self._size * np.log(2 * np.pi))
def grad(self, inputs, gradients):
    """
    Cholesky decomposition reverse-mode gradient update.

    Symbolic expression for reverse-mode Cholesky gradient taken from [0]_

    References
    ----------
    .. [0] I. Murray, "Differentiation of the Cholesky decomposition",
       http://arxiv.org/abs/1602.07527
    """
    x = inputs[0]
    dz = gradients[0]
    chol_x = self(x)

    # Replace the cholesky decomposition with 1 if there are nans
    # or solve_upper_triangular will throw a ValueError.
    if self.on_error == 'nan':
        ok = ~tensor.any(tensor.isnan(chol_x))
        chol_x = tensor.switch(ok, chol_x, 1)
        dz = tensor.switch(ok, dz, 1)

    # deal with upper triangular by converting to lower triangular
    if not self.lower:
        chol_x = chol_x.T
        dz = dz.T

    def tril_and_halve_diagonal(mtx):
        """Extracts lower triangle of square matrix and halves diagonal."""
        return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

    def conjugate_solve_triangular(outer, inner):
        """Computes L^{-T} P L^{-1} for lower-triangular L."""
        return solve_upper_triangular(
            outer.T, solve_upper_triangular(outer.T, inner.T).T)

    s = conjugate_solve_triangular(
        chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

    if self.lower:
        grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
    else:
        grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))

    if self.on_error == 'nan':
        return [tensor.switch(ok, grad, np.nan)]
    else:
        return [grad]
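A small, hedged illustration of the on_error='nan' behaviour this gradient relies on (standalone, with a made-up test matrix): a non-positive-definite input yields NaNs instead of an exception, which is exactly what the `ok` guard above detects.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.slinalg import Cholesky

A = T.matrix('A')
L = Cholesky(on_error='nan')(A)
f = theano.function([A], L, allow_input_downcast=True)

bad = np.array([[1., 2.], [2., 1.]])   # indefinite, so no Cholesky factor exists
print(np.isnan(f(bad)).any())          # -> True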
def build_aligner(self):
    tgt_action_seq = ndim_itensor(3, 'tgt_action_seq')
    tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type')
    tgt_node_seq = ndim_itensor(2, 'tgt_node_seq')
    tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq')
    tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq')
    tgt_node_embed = self.node_embedding[tgt_node_seq]
    query_tokens = ndim_itensor(2, 'query_tokens')
    query_token_embed, query_token_embed_mask = self.query_embedding(
        query_tokens, mask_zero=True)
    batch_size = tgt_action_seq.shape[0]
    max_example_action_num = tgt_action_seq.shape[1]

    tgt_action_seq_embed = T.switch(
        T.shape_padright(tgt_action_seq[:, :, 0] > 0),
        self.rule_embedding_W[tgt_action_seq[:, :, 0]],
        self.vocab_embedding_W[tgt_action_seq[:, :, 1]])
    tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed)
    tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0,
                                  T.alloc(0., 1, config.rule_embed_dim),
                                  self.rule_embedding_W[tgt_par_rule_seq])

    if not config.frontier_node_type_feed:
        tgt_node_embed *= 0.
    if not config.parent_action_feed:
        tgt_par_rule_embed *= 0.

    decoder_input = T.concatenate(
        [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1)
    query_embed = self.query_encoder_lstm(query_token_embed,
                                          mask=query_token_embed_mask,
                                          dropout=0, srng=self.srng)
    tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1)

    alignments = self.decoder_lstm.align(decoder_input,
                                         context=query_embed,
                                         context_mask=query_token_embed_mask,
                                         mask=tgt_action_seq_mask,
                                         parent_t_seq=tgt_par_t_seq,
                                         srng=self.srng)

    alignment_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type,
                        tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq]
    self.align = theano.function(alignment_inputs, [alignments])
def get_idx(q_nbrs, q_mem):
    """Gets the index of sample in memory for computing loss.

    We first look to see if the query label can be found in the
    retrieved neighbours, and if not, look to memory for a key with
    the same value.

    We keep track of a boolean mask, which indicates whether or not we
    were able to find a sample with a label that matches the query.
    """

    # Whether a matching sample can be found in neighbours or memory
    any_match_nbrs = T.any(q_nbrs, axis=1)
    any_match_mem = T.any(q_mem, axis=1)
    any_match = T.or_(any_match_nbrs, any_match_mem)

    # Look in neighbours then memory for corresponding sample.
    # If from neighbours, we need to retrieve the full mem idx.
    rows = T.arange(nbrs.shape[0])
    idx = T.switch(any_match_nbrs,
                   nbrs[rows, tensor_choose_k(q_nbrs, self.rng, k=1)],
                   tensor_choose_k(q_mem, self.rng, k=1, random=True))

    return (idx, any_match)
def _get_accuracy(self, top_range, data_type):
    return_list = isinstance(top_range, list)
    if not return_list:
        top_range = [top_range]
    max_top_range = max(top_range)

    expanded = self._correct_answers.dimshuffle(0, 'x')
    expanded = expanded.repeat(max_top_range, axis=1)
    eq = T.eq(expanded, self.answers[:, :max_top_range])

    # Compile new function only if top range or data type has changed
    if self._accuracy_config != [top_range, data_type]:
        self._accuracy = theano.function(
            inputs=[self._batch_index],
            outputs=[T.any(eq[:, :top], axis=1).mean() for top in top_range],
            givens={
                self._input:
                    self.data_loader.input(self._batch_index, data_type),
                self._correct_answers:
                    self.data_loader.output(self._batch_index, data_type)
            },
        )
        self._accuracy_config = [top_range, data_type]

    n_batches = self.data_loader.n_batches(data_type)
    accuracy = np.zeros(shape=(n_batches, len(top_range)))
    interval = n_batches / 10
    if interval == 0:
        interval = 1

    for batch_index in xrange(n_batches):
        self.data_loader.load_data(batch_index, data_type)
        accuracy[batch_index, :] = np.asarray(self._accuracy(batch_index))
        if self.verbosity >= 3 or \
                (self.verbosity >= 2 and batch_index % interval == 0):
            partial_accuracy = accuracy[:batch_index + 1, :].mean(axis=0)
            text = ''
            for a in partial_accuracy:
                text += ' {:.2f}%'.format(100 * a)
            overwrite('{}/{} minibatches accuracy:{}'.format(
                batch_index + 1, n_batches, text))
    overwrite()

    accuracy = accuracy.mean(axis=0).tolist()
    if not return_list:
        return accuracy[0]
    return accuracy
def compile_prop_f(self, signals, has_input, min_tau=0.0):
    tau_in = T.scalar('min_tau', dtype=FLOATX)
    inputs = [tau_in]
    x = self.signal(signals)

    # Get estimate of the state from layer above
    estimate = self.estimate(signals)

    # Feedforward originates from previous layer's state or given input
    if not has_input:
        feedforward = self.feedforward(signals)
        has_nans = T.as_tensor_variable(0)
        nans = 0.0
    else:
        input_t = T.matrix('input', dtype=FLOATX)
        inputs += [input_t]
        nans = T.isnan(input_t)
        has_nans = T.any(nans)
        feedforward = T.where(nans, 0.0, input_t)

    self.info('Compiling propagation: [%6s] -> %4s <- [%6s]' %
              (",".join([p.name for p in self.prev] if self.prev else 'u/y'),
               self.name,
               ",".join([p.name for p in self.next] if self.next else '')))

    # Apply nonlinearity to feedforward path only
    if self.nonlin:
        feedforward = self.nonlin(feedforward)

    if self.merge_op:
        assert not self.persistent, 'cannot combine with merge_op'
        new_value = self.merge_op(feedforward, estimate)
    elif self.persistent:
        new_value = feedforward
    else:
        new_value = feedforward - estimate

    # If predicting missing values, force them to zero in residual so
    # that they don't influence learning
    new_value = ifelse(has_nans, T.where(nans, 0.0, new_value), new_value)

    (new_X, t, d) = lerp(x.var, new_value, tau_in)
    d = T.max(d)

    updates = [(x.var, ifelse(self.enabled, new_X, x.var))]

    return theano.function(inputs=inputs, outputs=d, updates=updates)
def _step(*args):
    global single_result
    input = args[0]
    states = args[1:]
    output, new_states = step_function(input, states)
    if masking:
        # if all-zero input timestep, return
        # all-zero output and unchanged states
        switch = T.any(input)
        output = T.switch(switch, output, 0. * output)
        return_states = []
        for state, new_state in zip(states, new_states):
            return_states.append(T.switch(switch, new_state, state))
        return [output] + return_states
    else:
        return [output] + new_states
def compile_prop_f(self, signals, has_input, min_tau=0.0):
    tau_in = T.scalar('min_tau', dtype=FLOATX)
    inputs = [tau_in]
    x = self.signal(signals)

    # Get estimate of the state from layer above
    estimate = self.estimate(signals)

    # Feedforward originates from previous layer's state or given input
    if not has_input:
        feedforward = self.feedforward(signals)
        has_nans = T.as_tensor_variable(0)
        nans = 0.0
    else:
        input_t = T.matrix('input', dtype=FLOATX)
        inputs += [input_t]
        nans = T.isnan(input_t)
        has_nans = T.any(nans)
        feedforward = T.where(nans, 0.0, input_t)

    self.info(
        'Compiling propagation: [%6s] -> %4s <- [%6s]' %
        (",".join([p.name for p in self.prev] if self.prev else 'u/y'),
         self.name,
         ",".join([p.name for p in self.next] if self.next else '')))

    # Apply nonlinearity to feedforward path only
    if self.nonlin:
        feedforward = self.nonlin(feedforward)

    if self.merge_op:
        assert not self.persistent, 'cannot combine with merge_op'
        new_value = self.merge_op(feedforward, estimate)
    elif self.persistent:
        new_value = feedforward
    else:
        new_value = feedforward - estimate

    # If predicting missing values, force them to zero in residual so
    # that they don't influence learning
    new_value = ifelse(has_nans, T.where(nans, 0.0, new_value), new_value)

    (new_X, t, d) = lerp(x.var, new_value, tau_in)
    d = T.max(d)

    updates = [(x.var, ifelse(self.enabled, new_X, x.var))]

    return theano.function(inputs=inputs, outputs=d, updates=updates)
def _step(input, *args):
    # separate states and contexts
    states = args[0:nb_states]
    output, other_outputs, new_states = step_function(input, args)
    if masking:
        # if all-zero input timestep, return
        # all-zero output and unchanged states
        switch = T.any(input, axis=-1, keepdims=True)
        output = T.switch(switch, output, 0. * output)
        for other_output in other_outputs:
            other_output = T.switch(switch, other_output, 0. * other_output)
        return_states = []
        for state, new_state in zip(states, new_states):
            return_states.append(T.switch(switch, new_state, state))
        return [output] + other_outputs + return_states
    else:
        return [output] + other_outputs + new_states
def categorical_acc(predictions, targets, top_k=1):
    if targets.ndim == predictions.ndim:
        targets = T.argmax(targets, axis=-1)
    elif targets.ndim != predictions.ndim - 1:
        raise TypeError('rank mismatch between targets and predictions')

    if top_k == 1:
        # standard categorical accuracy
        top = T.argmax(predictions, axis=-1)
        return T.eq(top, targets)
    else:
        # top-k accuracy
        top = T.argsort(predictions, axis=-1)
        # (Theano cannot index with [..., -top_k:], we need to simulate that)
        top = top[[slice(None) for _ in range(top.ndim - 1)] +
                  [slice(-top_k, None)]]
        targets = T.shape_padaxis(targets, axis=-1)
        return T.any(T.eq(top, targets), axis=-1)
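A brief usage sketch (variable names are illustrative): the helper returns a per-example 0/1 vector, so accuracies are obtained by taking the mean.

import theano
import theano.tensor as T

predictions = T.matrix('predictions')   # (batch, n_classes) scores
targets = T.ivector('targets')          # (batch,) integer class labels

top1 = T.mean(categorical_acc(predictions, targets, top_k=1))
top3 = T.mean(categorical_acc(predictions, targets, top_k=3))
accuracy_fn = theano.function([predictions, targets], [top1, top3],
                              allow_input_downcast=True)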
def get_net_fun(phonemeViseme, networkType, k=5, print_network=False):
    outputLayer, inputs = load_model(phonemeViseme, networkType, print_network)

    targets = T.ivector('targets')

    all_predictions = lasagne.layers.get_output(outputLayer, deterministic=True)
    get_all_prob = theano.function([inputs], all_predictions)

    maxprob = T.argmax(all_predictions, axis=1)
    get_first_prediction = theano.function([inputs], maxprob)

    accuracy = T.eq(maxprob, targets)
    avg_accuracy = T.mean(accuracy, dtype=theano.config.floatX)
    get_accuracy = theano.function([inputs, targets], avg_accuracy)

    # Top k accuracy
    # topk_accuracy = T.mean(T.any(T.eq(T.argsort(all_predictions, axis=1)[:, -k:], targets.dimshuffle(0, 'x')), axis=1), axis=1)
    topk_accuracy = T.any(T.eq(T.argsort(all_predictions, axis=1)[:, -k:],
                               targets.dimshuffle(0, 'x')), axis=1)
    avg_topk_accuracy = T.mean(topk_accuracy, dtype=theano.config.floatX)
    get_topk_accuracy = theano.function([inputs, targets], avg_topk_accuracy)

    val_fn = theano.function([inputs, targets],
                             [all_predictions, maxprob, avg_accuracy, avg_topk_accuracy])

    def print_topk(im_path, k):
        im = prep_image(im_path)
        prob = get_all_prob(im)[0]
        #print(prob)
        phonemeNumberMap = classToPhoneme39
        pred = []
        for i in range(0, len(prob)):
            p = prob[i]
            prob_phoneme = phonemeNumberMap[i]
            pred.append([prob_phoneme, p])
            # print(p, " ", prob_phoneme)
        pred = sorted(pred, key=lambda t: t[1], reverse=True)
        pred = pred[:k]
        for p in pred:
            print(p)

    return get_all_prob, get_first_prediction, print_topk, get_accuracy, get_topk_accuracy, val_fn
def posdef(self, x, diag):
    diag = tt.as_tensor_variable(diag)
    diag = tt.reshape(diag.T, (1, diag.size))[0]
    x = tt.as_tensor_variable(x)

    T = self.terms[0].value(x[:, None] - x[None, :])
    if self.terms[0].alpha.ndim == 1:
        R = self.terms[0].alpha[:, None] * self.terms[0].alpha[None, :]
        K = tt.slinalg.kron(T, R)
    else:
        # tt.slinalg is a module, not a callable; the Kronecker product is the intent here.
        K = tt.slinalg.kron(T, self.terms[0].alpha)

    # The first term is already accumulated above, so only add the remaining ones.
    for term in self.terms[1:]:
        T = term.value(x[:, None] - x[None, :])
        if term.alpha.ndim == 1:
            R = term.alpha[:, None] * term.alpha[None, :]
            K += tt.slinalg.kron(T, R)
        else:
            K += tt.slinalg.kron(T, term.alpha)

    chol = tt.slinalg.Cholesky(on_error='nan')
    L = chol(K + tt.diag(diag))
    return tt.switch(tt.any(tt.isnan(L)), np.array(False), np.array(True))
def logp_cho(cls, value, mu, cho, freedom, mapping):
    delta = mapping.inv(value) - mu
    lcho = tsl.solve_lower_triangular(cho, delta)
    beta = lcho.T.dot(lcho)
    n = cho.shape[0].astype(th.config.floatX)

    np5 = np.float32(0.5)
    np2 = np.float32(2.0)
    npi = np.float32(np.pi)

    r1 = -np5 * (freedom + n) * tt.log1p(beta / (freedom - np2))
    r2 = ifelse(tt.le(np.float32(1e6), freedom),
                -n * np5 * np.log(np2 * npi),
                tt.gammaln((freedom + n) * np5) - tt.gammaln(freedom * np5)
                - np5 * n * tt.log((freedom - np2) * npi))
    r3 = -tt.sum(tt.log(tnl.diag(cho)))
    det_m = mapping.logdet_dinv(value)

    r1 = debug(r1, name='r1', force=True)
    r2 = debug(r2, name='r2', force=True)
    r3 = debug(r3, name='r3', force=True)
    det_m = debug(det_m, name='det_m', force=True)

    r = r1 + r2 + r3 + det_m

    cond1 = tt.or_(tt.any(tt.isinf_(delta)), tt.any(tt.isnan_(delta)))
    cond2 = tt.or_(tt.any(tt.isinf_(det_m)), tt.any(tt.isnan_(det_m)))
    cond3 = tt.or_(tt.any(tt.isinf_(cho)), tt.any(tt.isnan_(cho)))
    cond4 = tt.or_(tt.any(tt.isinf_(lcho)), tt.any(tt.isnan_(lcho)))

    return ifelse(
        cond1, np.float32(-1e30),
        ifelse(
            cond2, np.float32(-1e30),
            ifelse(cond3, np.float32(-1e30),
                   ifelse(cond4, np.float32(-1e30), r))))
def __init__(self, n_comp=10, verbose=False):
    # Theano initialization
    self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
    self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

    T_p_x_white = T.fmatrix()
    T_lrate = T.fscalar()
    T_block = T.fscalar()

    T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(
        self.T_bias, 1)
    T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))
    T_out = self.T_weights + T_lrate * T.dot(
        T_block * T.identity_like(self.T_weights) +
        T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
    T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1, 1))
    T_max_w = T.max(self.T_weights)
    T_isnan = T.any(T.isnan(self.T_weights))

    self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                    [T_max_w, T_isnan],
                                    updates=[(self.T_weights, T_out),
                                             (self.T_bias, T_bias_out)],
                                    allow_input_downcast=True)

    T_matrix = T.fmatrix()
    T_cov = T.dot(T_matrix, T.transpose(T_matrix)) / T_block
    self.cov_fun = theano.function([T_matrix, T_block], T_cov,
                                   allow_input_downcast=True)

    self.loading = None
    self.sources = None
    self.weights = None
    self.n_comp = n_comp
    self.verbose = verbose
def in_transit(self, t, r=None, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    dt = tt.mod(tt.shape_padright(t) - self._ref_time, self.period)
    dt -= self._half_period
    if self.r is None:
        tol = 0.5 * self.duration
    else:
        x = (r + self.r_star)**2 - self._b_norm**2
        tol = tt.sqrt(x) / self.speed
    if texp is not None:
        tol += 0.5 * texp
    mask = tt.any(tt.abs_(dt) < tol, axis=-1)
    return tt.arange(t.size)[mask]
def in_transit(self, t, r=0.0, texp=None):
    """Get a list of timestamps that are in transit

    Args:
        t (vector): A vector of timestamps to be evaluated.
        r (Optional): The radii of the planets.
        texp (Optional[float]): The exposure time.

    Returns:
        The indices of the timestamps that are in transit.

    """
    z = tt.zeros_like(self.a)
    r = tt.as_tensor_variable(r) + z
    R = self.r_star + z

    if self.ecc is None:
        M_contact = self.contact_points_op(self.a, self.incl + z, r, R)
    else:
        M_contact = self.contact_points_op(self.a, self.ecc, self.omega,
                                           self.incl + z, r, R)

    # Wrap the times into time since transit
    hp = 0.5 * self.period
    t_start = (M_contact[0] - self.M0) / self.n
    t_start = tt.mod(t_start + hp, self.period) - hp
    t_end = (M_contact[3] - self.M0) / self.n
    t_end = tt.mod(t_end + hp, self.period) - hp
    dt = tt.mod(self._warp_times(t) - self.t0 + hp, self.period) - hp
    if texp is not None:
        t_start -= 0.5 * texp
        t_end += 0.5 * texp

    mask = tt.any(tt.and_(dt >= t_start, dt <= t_end), axis=-1)
    return tt.arange(t.size)[mask]
def any(x, axis=None, keepdims=False):
    '''Bitwise reduction (logical OR).
    '''
    return T.any(x, axis=axis, keepdims=keepdims)
def get_output_mask(self, train=False):
    X = self.get_input(train)
    return T.any(T.ones_like(X) * (1. - T.eq(X, self.mask_value)), axis=-1)
def _compile_functions(self):
    self._gradnorm = T.zeros([])
    for _param, _grad in zip(self._params, self._grads):
        # apply rmsprop before clipping gradients
        if self.rmsprop:
            avg_grad_sqr = self._avg_grad_sqrs[_param]
            new_avg_grad_sqr = self.averaging_coeff * avg_grad_sqr + \
                (1 - self.averaging_coeff) * T.sqr(_grad)
            self._avg_grad_sqrs_updates[avg_grad_sqr] = new_avg_grad_sqr
            rms_grad_t = T.sqrt(new_avg_grad_sqr)
            rms_grad_t = T.maximum(rms_grad_t, self.stabilizer)
            _grad = _grad / rms_grad_t
        self._gradnorm += T.sum(_grad**2)
    self._gradnorm = T.sqrt(self._gradnorm)

    self._givens = {}
    self._givens[self._inputvar] = self._inputs_theano[
        self._batch_idx * self.batchsize:
        (self._batch_idx + 1) * self.batchsize]
    if self.is_supervised:
        self._givens[self._targetvar] = self._outputs_theano[
            self._batch_idx * self.batchsize:
            (self._batch_idx + 1) * self.batchsize]
    if self.has_masks:
        self._givens[self._maskvar] = self._masks_theano[
            self._batch_idx * self.batchsize:
            (self._batch_idx + 1) * self.batchsize]

    self.gradnorm = theano.function(
        inputs=[], outputs=self._gradnorm, givens=self._givens)

    avg_gradnorm_update = {
        self._avg_gradnorm: self._avg_gradnorm * .8 + self._gradnorm * .2}

    self._update_weight_norm_ratios = []
    for _param, _grad in zip(self._params, self._grads):
        if hasattr(self._model, 'skip_params'):
            if _param.name in self._model.skip_params:
                continue

        _clip_grad = T.switch(
            T.gt(self._gradnorm, self._gradient_clip_threshold),
            _grad * self._gradient_clip_threshold / self._gradnorm, _grad)

        try:  # ... to apply learningrate_modifiers
            # Cliphid version:
            self._inc_updates[self._incs[_param]] = \
                self._momentum * self._incs[_param] - \
                self._learningrate * \
                self._model.learningrate_modifiers[_param.name] * _clip_grad
            self._updates[_param] = _param + self._incs[_param]
            self._updates_nomomentum[_param] = _param - \
                self._learningrate * \
                self._model.learningrate_modifiers[_param.name] * \
                _clip_grad
            print 'Learning rate modifier for {0}: {1}'.format(
                _param.name, self._model.learningrate_modifiers[_param.name])
        except (AttributeError, KeyError):
            self._inc_updates[self._incs[_param]] = self._momentum * \
                self._incs[_param] - self._learningrate * _clip_grad
            self._updates[_param] = _param + self._incs[_param]
            self._updates_nomomentum[_param] = _param - \
                self._learningrate * _clip_grad

        if self.monitor_update_weight_norm_ratio:
            print 'building update weight norm ratio graph for ', _param.name
            self._update_weight_norm_ratios.append(
                self._incs[_param].norm(2) / _param.norm(2))

    self.any_isnan = T.any(T.isnan(
        T.concatenate([x.flatten() for x in self._grads], axis=0)))

    # compute function to get update_weight_norm_ratios (returned in same
    # order as params list)
    print 'compiling update weight norm ratio function...'
    self.get_update_weight_norm_ratios = theano.function(
        [], self._update_weight_norm_ratios)
    print 'done'

    # first update gradient norm running avg
    ordered_updates = collections.OrderedDict()
    try:
        ordered_updates.update(self._model.updates)
    except AttributeError:
        pass
    ordered_updates.update(avg_gradnorm_update)

    # so that it is considered in the parameter update computations
    ordered_updates.update(self._inc_updates)
    print 'compiling updateincs...'
    self._updateincs = theano.function(
        [], [self._cost, self._avg_gradnorm, self.any_isnan],
        updates=ordered_updates, givens=self._givens)
    print 'done'
    print 'compiling trainmodel...'
    self._trainmodel = theano.function(
        [self._n], self._noop, updates=self._updates)
    print 'done'
    print 'compiling trainmodel_nomomentum...'
    self._trainmodel_nomomentum = theano.function(
        [self._n], self._noop, updates=self._updates_nomomentum,
        givens=self._givens)
    print 'done'
    self._momentum_batchcounter = 0
def setup_backprop(self):
    eta = T.scalar('eta_for_backprop')
    x = T.lvector('x_for_backprop')
    y = T.lvector('y_for_backprop')
    y_in_x_inds = T.lmatrix('y_in_x_inds_for_backprop')
    dec_init_state, annotations = self._symb_encoder(x)

    def decoder_recurrence(y_t, cur_y_in_x_inds, h_prev, annotations, *params):
        h_for_write = self.spec.decoder.get_h_for_write(h_prev)
        scores = self.spec.get_attention_scores(h_for_write, annotations)
        alpha = self.spec.get_alpha(scores)
        c_t = self.spec.get_context(alpha, annotations)
        write_dist = self.spec.f_write(h_for_write, c_t, scores)
        base_p_y_t = write_dist[y_t]
        if self.spec.attention_copying:
            copying_p_y_t = T.dot(
                write_dist[self.out_vocabulary.size():], cur_y_in_x_inds)
            p_y_t = base_p_y_t + copying_p_y_t
        else:
            p_y_t = base_p_y_t
        h_t = self.spec.f_dec(y_t, c_t, h_prev)
        return (h_t, p_y_t)

    dec_results, _ = theano.scan(
        fn=decoder_recurrence,
        sequences=[y, y_in_x_inds],
        outputs_info=[dec_init_state, None],
        non_sequences=[annotations] + self.spec.get_all_shared())
    p_y_seq = dec_results[1]
    log_p_y = T.sum(T.log(p_y_seq))
    gradients = T.grad(log_p_y, self.params)

    # Do the updates here
    updates = []
    if self.spec.step_rule in ('adagrad', 'rmsprop'):
        # Adagrad updates
        for p, g, c in zip(self.params, gradients, self.grad_cache):
            grad_norm = g.norm(2)
            clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                  g * CLIP_THRESH / grad_norm, g)
            if self.spec.step_rule == 'adagrad':
                new_c = c + clipped_grad ** 2
            else:  # rmsprop
                decay_rate = 0.9  # Use fixed decay rate of 0.9
                new_c = decay_rate * c + (1.0 - decay_rate) * clipped_grad ** 2
            new_p = p + eta * clipped_grad / T.sqrt(new_c + 1e-4)
            has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
            updates.append((p, ifelse(has_non_finite, p, new_p)))
            updates.append((c, ifelse(has_non_finite, c, new_c)))
    elif self.spec.step_rule == 'nesterov':
        # Nesterov momentum
        for p, g, v in zip(self.params, gradients, self.grad_cache):
            grad_norm = g.norm(2)
            clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                  g * CLIP_THRESH / grad_norm, g)
            new_v = NESTEROV_MU * v + eta * clipped_grad
            new_p = p - NESTEROV_MU * v + (1 + NESTEROV_MU) * new_v
            has_non_finite = (T.any(T.isnan(new_p) + T.isinf(new_p)) +
                              T.any(T.isnan(new_v) + T.isinf(new_v)))
            updates.append((p, ifelse(has_non_finite, p, new_p)))
            updates.append((v, ifelse(has_non_finite, v, new_v)))
    else:
        # Simple SGD updates
        for p, g in zip(self.params, gradients):
            grad_norm = g.norm(2)
            clipped_grad = ifelse(grad_norm >= CLIP_THRESH,
                                  g * CLIP_THRESH / grad_norm, g)
            new_p = p + eta * clipped_grad
            has_non_finite = T.any(T.isnan(new_p) + T.isinf(new_p))
            updates.append((p, ifelse(has_non_finite, p, new_p)))
            #updates.append((p, new_p))

    self._backprop = theano.function(
        inputs=[x, y, eta, y_in_x_inds],
        outputs=[p_y_seq, log_p_y],
        updates=updates)
def build(self): # (batch_size, max_example_action_num, action_type) tgt_action_seq = ndim_itensor(3, 'tgt_action_seq') # (batch_size, max_example_action_num, action_type) tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type') # (batch_size, max_example_action_num) tgt_node_seq = ndim_itensor(2, 'tgt_node_seq') # (batch_size, max_example_action_num) tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq') # (batch_size, max_example_action_num) tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq') # (batch_size, max_example_action_num, symbol_embed_dim) # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False) tgt_node_embed = self.node_embedding[tgt_node_seq] # (batch_size, max_query_length) query_tokens = ndim_itensor(2, 'query_tokens') mask = T.TensorType(dtype='int32', name='mask', broadcastable=(True, False))() # (batch_size, max_query_length, query_token_embed_dim) # (batch_size, max_query_length) query_token_embed, query_token_embed_mask = self.query_embedding( query_tokens, mask_zero=True) # if WORD_DROPOUT > 0: # logging.info('used word dropout for source, p = %f', WORD_DROPOUT) # query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False) batch_size = tgt_action_seq.shape[0] max_example_action_num = tgt_action_seq.shape[1] # previous action embeddings # (batch_size, max_example_action_num, action_embed_dim) tgt_action_seq_embed = T.switch( T.shape_padright(tgt_action_seq[:, :, 0] > 0), self.rule_embedding_W[tgt_action_seq[:, :, 0]], self.vocab_embedding_W[tgt_action_seq[:, :, 1]]) tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed) # parent rule application embeddings tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0, T.alloc(0., 1, config.rule_embed_dim), self.rule_embedding_W[tgt_par_rule_seq]) if not config.frontier_node_type_feed: tgt_node_embed *= 0. if not config.parent_action_feed: tgt_par_rule_embed *= 0. 
# (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim) decoder_input = T.concatenate( [tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1) # (batch_size, max_query_length, query_embed_dim) query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask, dropout=config.dropout, srng=self.srng) # (batch_size, max_example_action_num) tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1) # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state) # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim) decoder_hidden_states, _, ctx_vectors = self.decoder_lstm( decoder_input, context=query_embed, context_mask=query_token_embed_mask, mask=tgt_action_seq_mask, parent_t_seq=tgt_par_t_seq, dropout=config.dropout, srng=self.srng) # if DECODER_DROPOUT > 0: # logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT) # decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states) # ==================================================== # apply additional non-linearity transformation before # predicting actions # ==================================================== decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule( decoder_hidden_states) decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token( T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)) # (batch_size, max_example_action_num, rule_num) rule_predict = softmax( T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b) # (batch_size, max_example_action_num, 2) terminal_gen_action_prob = self.terminal_gen_softmax( decoder_hidden_states) # (batch_size, max_example_action_num, target_vocab_size) logits = T.dot(decoder_hidden_state_trans_token, T.transpose( self.vocab_embedding_W)) + self.vocab_embedding_b # vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b) vocab_predict = softmax( logits.transpose(1, 0, 2) * mask + (T.min(logits.transpose(1, 0, 2), axis=1, keepdims=True) - 1) * (1 - mask)).transpose(1, 0, 2) # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim) ptr_net_decoder_state = T.concatenate( [decoder_hidden_states, ctx_vectors], axis=-1) # (batch_size, max_example_action_num, max_query_length) copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state) # (batch_size, max_example_action_num) rule_tgt_prob = rule_predict[ T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 0]] # (batch_size, max_example_action_num) vocab_tgt_prob = vocab_predict[ T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 1]] # (batch_size, max_example_action_num) copy_tgt_prob = copy_prob[ T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 2]] # (batch_size, max_example_action_num) tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \ tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \ tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask)) loss = -(likelihood * tgt_action_seq_mask).sum( axis=-1) # / tgt_action_seq_mask.sum(axis=-1) loss = T.mean(loss) # let's build the function! 
train_inputs = [ query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq, mask ] optimizer = optimizers.get(config.optimizer) optimizer.clip_grad = config.clip_grad updates, grads = optimizer.get_updates(self.params, loss) self.train_func = theano.function( train_inputs, [loss], # [loss, tgt_action_seq_type, tgt_action_seq, # rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob, # copy_prob, terminal_gen_action_prob], updates=updates) # if WORD_DROPOUT > 0: # self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask) # else: # self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask) self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask, mask)
def logpow(x, m):
    """
    Calculates log(x**m) since m*log(x) will fail when m, x = 0.
    """
    # return m * log(x)
    return T.switch(T.any(T.eq(x, 0)), -np.inf, m * T.log(x))
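Because T.any reduces over the whole tensor, a single zero anywhere in x switches the entire result to -inf; a quick sketch of that behaviour (names are illustrative):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
m = T.vector('m')
f = theano.function([x, m], logpow(x, m), allow_input_downcast=True)

print(f([2., 3.], [3., 2.]))   # -> [log(8), log(9)]
print(f([0., 3.], [3., 2.]))   # -> [-inf, -inf]: one zero switches everything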
def pgrad(g_out):
    g_out = T.clip(g_out, self.clip_lower_bound, self.clip_upper_bound)
    g_out = ifelse(T.any(T.isnan(g_out)), T.ones_like(g_out) * 0.00001, g_out)
    return g_out
def mask(self, train=False):
    X = self.get_input('input')(train)
    return T.any(T.ones_like(X) * (1. - T.eq(X, self.mask_value)), axis=-1)
def get_output_mask(self, train=False):
    X = self.get_input(train)
    return T.any(T.ones_like(X) * (1.0 - T.eq(X, self.mask_value)), axis=-1)
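The mask helpers above all compute the same reduction; a standalone sketch (assuming mask_value = 0. and a made-up batch) shows which timesteps survive:

import numpy as np
import theano
import theano.tensor as T

X = T.tensor3('X')                               # (batch, time, features)
mask = T.any(T.ones_like(X) * (1. - T.eq(X, 0.)), axis=-1)
f = theano.function([X], mask, allow_input_downcast=True)

x = np.zeros((1, 3, 2))
x[0, 1] = 1.                                     # only the middle timestep is non-zero
print(f(x))                                      # -> [[0 1 0]]: padded steps are masked out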
def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', dropout_rate=0.0, optimizer='sgd', loss_function='MMSE', rnn_batch_training=False): """ This function initialises a neural network :param n_in: Dimensionality of input features :type in: Integer :param hidden_layer_size: The layer size for each hidden layer :type hidden_layer_size: A list of integers :param n_out: Dimensionality of output features :type n_out: Integrer :param hidden_layer_type: the activation types of each hidden layers, e.g., TANH, LSTM, GRU, BLSTM :param L1_reg: the L1 regulasation weight :param L2_reg: the L2 regulasation weight :param output_type: the activation type of the output layer, by default is 'LINEAR', linear regression. :param dropout_rate: probability of dropout, a float number between 0 and 1. """ logger = logging.getLogger("DNN initialization") self.n_in = int(n_in) self.n_out = int(n_out) self.n_layers = len(hidden_layer_size) self.dropout_rate = dropout_rate self.optimizer = optimizer self.loss_function = loss_function self.is_train = T.iscalar('is_train') self.rnn_batch_training = rnn_batch_training assert len(hidden_layer_size) == len(hidden_layer_type) self.list_of_activations = ['TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU'] if self.rnn_batch_training: self.x = T.tensor3('x') self.y = T.tensor3('y') else: self.x = T.matrix('x') self.y = T.matrix('y') self.L1_reg = L1_reg self.L2_reg = L2_reg self.rnn_layers = [] self.params = [] self.delta_params = [] rng = np.random.RandomState(123) for i in range(self.n_layers): if i == 0: input_size = n_in else: input_size = hidden_layer_size[i-1] if i == 0: layer_input = self.x else: layer_input = self.rnn_layers[i-1].output if hidden_layer_type[i-1] == 'BSLSTM' or hidden_layer_type[i-1] == 'BLSTM': input_size = hidden_layer_size[i-1]*2 if hidden_layer_type[i] in self.list_of_activations: hidden_activation = hidden_layer_type[i].lower() hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'TANH_LHUC': hidden_layer = SigmoidLayer_LHUC(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'SLSTM': hidden_layer = SimplifiedLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'SGRU': hidden_layer = SimplifiedGRU(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'GRU': hidden_layer = GatedRecurrentUnit(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NFG': hidden_layer = LstmNFG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NOG': hidden_layer = LstmNOG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_NIG': hidden_layer = LstmNIG(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif 
hidden_layer_type[i] == 'LSTM_NPH': hidden_layer = LstmNoPeepholes(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM': hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BSLSTM': hidden_layer = BidirectionSLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BLSTM': hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'RNN': hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_LHUC': hidden_layer = VanillaLstm_LHUC(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) else: logger.critical("This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" %(hidden_layer_type[i])) sys.exit(1) self.rnn_layers.append(hidden_layer) self.params.extend(hidden_layer.params) input_size = hidden_layer_size[-1] if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[-1] == 'BLSTM': input_size = hidden_layer_size[-1]*2 output_activation = output_type.lower() if output_activation == 'linear': self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out) elif output_activation == 'recurrent': self.final_layer = RecurrentOutputLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, rnn_batch_training=self.rnn_batch_training) elif output_type.upper() in self.list_of_activations: self.final_layer = GeneralLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=output_activation) else: logger.critical("This output layer type: %s is not supported right now! 
\n Please use one of the following: LINEAR, BSLSTM\n" %(output_type)) sys.exit(1) self.params.extend(self.final_layer.params) self.updates = {} for param in self.params: self.updates[param] = theano.shared(value = np.zeros(param.get_value(borrow = True).shape, dtype = theano.config.floatX), name = 'updates') if self.loss_function == 'CCE': self.finetune_cost = self.categorical_crossentropy_loss(self.final_layer.output, self.y) self.errors = self.categorical_crossentropy_loss(self.final_layer.output, self.y) elif self.loss_function == 'Hinge': self.finetune_cost = self.multiclass_hinge_loss(self.final_layer.output, self.y) self.errors = self.multiclass_hinge_loss(self.final_layer.output, self.y) elif self.loss_function == 'MMSE': if self.rnn_batch_training: self.y_mod = T.reshape(self.y, (-1, n_out)) self.final_layer_output = T.reshape(self.final_layer.output, (-1, n_out)) nonzero_rows = T.any(self.y_mod, 1).nonzero() self.y_mod = self.y_mod[nonzero_rows] self.final_layer_output = self.final_layer_output[nonzero_rows] self.finetune_cost = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1)) self.errors = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1)) else: self.finetune_cost = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1)) self.errors = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
def get_reward(self, session_states, session_actions, batch_i):
    """
    WARNING! this runs on a single session, not on a batch.
    Reward given for taking the action in current environment state.

    arguments:
        session_states float[batch_id, memory_id]: environment state before taking action
        session_actions int[batch_id]: agent action at this tick

    returns:
        reward float[batch_id]: reward for taking action from the given state
    """
    # unpack states and actions
    session_states = check_list(session_states)[0]
    session_actions = check_list(session_actions)[0]

    time_range = T.arange(session_actions.shape[0])

    has_tried_already = session_states[time_range, session_actions]
    session_is_active = T.eq(session_states[:, self.end_action_id], 0)

    has_finished_now = T.eq(session_actions, self.end_action_id)
    has_finished_now = T.set_subtensor(has_finished_now[-1], 1)
    end_tick = has_finished_now.nonzero()[0][0]

    action_is_categorical = in1d(session_actions, self.category_action_ids)

    response = self.joint_data[batch_i, session_actions].ravel()

    at_least_one_category_guessed = T.any(
        action_is_categorical[:end_tick] & (response[:end_tick] > 0))

    # categorical and attributes
    reward_for_intermediate_action = T.switch(
        action_is_categorical,
        response * (self.rw["category_positive"] - self.rw["category_negative"]) +
        self.rw["category_negative"],
        response * (self.rw["attribute_positive"] - self.rw["attribute_negative"]) +
        self.rw["attribute_negative"]
    )
    reward_for_intermediate_action_first_time = T.switch(
        has_tried_already,
        self.rw["repeated_poll"],
        reward_for_intermediate_action,
    )

    # ending session
    reward_for_end_action = T.switch(
        at_least_one_category_guessed,          # if chosen at least 1 category
        self.rw["end_action"],                  # do not penalize
        self.rw["end_action_if_no_category_predicted"])  # else punish

    # include end action
    reward_for_action = T.switch(
        has_finished_now,
        reward_for_end_action,
        reward_for_intermediate_action_first_time,
    )

    final_reward = T.switch(
        session_is_active,
        reward_for_action,
        0,
    )

    return final_reward.astype(theano.config.floatX)
def unroll_scan(fn, sequences=(), outputs_info=(), non_sequences=(), n_steps=None, go_backwards=False): """ Helper function to unroll for loops. Can be used to unroll theano.scan. The parameter names are identical to those of theano.scan; please refer to the theano.scan documentation for more information. Note that this function does not support the truncate_gradient setting from theano.scan. Code adapted from https://github.com/Lasagne/Lasagne. Thank you! Parameters ---------- fn : function Function that defines calculations at each step. sequences : TensorVariable or list of TensorVariables List of TensorVariable with sequence data. The function iterates over the first dimension of each TensorVariable. outputs_info : list of TensorVariables List of tensors specifying the initial values for each recurrent value. non_sequences: list of TensorVariables List of theano.shared variables that are used in the step function. n_steps: int Number of steps to unroll. go_backwards: bool If True, the recursion starts at sequences[-1] and iterates backwards. Returns ------- Tuple of the form (outputs, updates). outputs is a list of TensorVariables. Each element in the list gives the recurrent values at each time step. updates is an empty dict for now. """ if not isinstance(sequences, (list, tuple)): sequences = [sequences] sequences = list(sequences) outputs_info = list(outputs_info) non_sequences = list(non_sequences) # When going backwards, reverse the recursion direction counter = range(n_steps) if go_backwards: counter = counter[::-1] output = [] prev_vals = outputs_info until = [] for i in counter: assert len(prev_vals) == len(outputs_info) prev_vals = [prev for prev, out_info in zip(prev_vals, outputs_info) if out_info is not None] step_input = [s[i] for s in sequences] + prev_vals + non_sequences out_ = fn(*step_input) # The returned values from step can be either a TensorVariable, # a list, or a tuple. Below, we force it to always be a list. if isinstance(out_, T.TensorVariable): out_ = [out_] if isinstance(out_, tuple): if len(out_) >= 1 and isinstance(out_[0], (list, tuple)): if len(out_) >= 2: assert not out_[1], "shared var updates not supported" if len(out_) >= 3: assert isinstance(out_[2], theano.scan_module.until) until.append(T.neq(out_[2].condition, 0)) out_ = list(out_[0]) else: out_ = list(out_) output.append(out_) prev_vals = output[-1] # iterate over each scan output and convert it to same format as scan: # [[output11, output12,...output1n], # [output21, output22,...output2n],...] output_scan = [] for i in range(len(output[0])): l = map(lambda x: x[i], output) output_scan.append(T.stack(*l)) if until: assert len(until) == n_steps until_conds = T.stack(*until) new_len = T.switch(T.any(until_conds), T.minimum(T.argmax(until_conds) + 1, n_steps), n_steps) output_scan = [out[:new_len] for out in output_scan] if len(output_scan) == 1: output_scan = output_scan[0] updates = {} return output_scan, updates
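# A minimal usage sketch of unroll_scan as defined above: an unrolled cumulative
# sum over a fixed number of steps (unlike theano.scan, n_steps must be a plain
# Python int, and the input sequence must be at least that long).
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
acc0 = T.zeros(())  # initial value of the recurrent accumulator

outputs, updates = unroll_scan(fn=lambda x_t, acc: acc + x_t,
                               sequences=[x],
                               outputs_info=[acc0],
                               n_steps=4)
cumsum = theano.function([x], outputs)
print(cumsum(np.arange(4, dtype=theano.config.floatX)))  # [0. 1. 3. 6.]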
def __init__(self, n_in, hidden_layer_size, n_out, L1_reg, L2_reg, hidden_layer_type, output_type='LINEAR', network_type='S2S', ed_type='HED', dropout_rate=0.0, optimizer='sgd', MLU_div_lengths = [], loss_function='MMSE', rnn_batch_training=False): """ This function initialises a neural network :param n_in: Dimensionality of input features :type n_in: Integer :param hidden_layer_size: The layer size for each hidden layer :type hidden_layer_size: A list of integers :param n_out: Dimensionality of output features :type n_out: Integer :param hidden_layer_type: the activation type of each hidden layer, e.g., TANH, LSTM, GRU, BLSTM :param L1_reg: the L1 regularisation weight :param L2_reg: the L2 regularisation weight :param output_type: the activation type of the output layer, by default 'LINEAR', i.e. linear regression. :param dropout_rate: probability of dropout, a float number between 0 and 1. """ logger = logging.getLogger("DNN initialization") self.n_in = int(n_in) self.n_out = int(n_out) self.n_layers = len(hidden_layer_size) self.dropout_rate = dropout_rate self.optimizer = optimizer self.loss_function = loss_function self.is_train = T.iscalar('is_train') self.rnn_batch_training = rnn_batch_training assert len(hidden_layer_size) == len(hidden_layer_type) self.list_of_activations = ['TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU'] BLSTM_variants = ['BLSTM', 'BSLSTM', 'BLSTME', 'BSLSTME'] Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME', 'TANHE'] Decoder_variants = ['RNND', 'LSTMD', 'SLSTMD'] if self.rnn_batch_training: self.x = T.tensor3('x') self.y = T.tensor3('y') else: self.x = T.matrix('x') self.y = T.matrix('y') if network_type == "S2S": self.d = T.ivector('d') self.f = T.matrix('f') self.L1_reg = L1_reg self.L2_reg = L2_reg self.rnn_layers = [] self.params = [] self.delta_params = [] rng = np.random.RandomState(123) prev_seg_end = 0 encoder_count = 0 MLU_div = MLU_div_lengths for i in range(self.n_layers): if i == 0: input_size = n_in else: input_size = hidden_layer_size[i-1] if hidden_layer_type[i-1] in BLSTM_variants: input_size = hidden_layer_size[i-1]*2 if i == 0: layer_input = self.x else: layer_input = self.rnn_layers[i-1].output ### sequence-to-sequence mapping ### if hidden_layer_type[i-1] in Encoder_variants: dur_input = self.d frame_feat_input = self.f # vanilla encoder-decoder (phone-level features) if ed_type == "VED": seq2seq_model = DistributedSequenceEncoder(rng, layer_input, dur_input) layer_input = T.concatenate((seq2seq_model.encoded_output, frame_feat_input), axis=1) input_size = input_size+4 # hierarchical encoder-decoder elif ed_type == "HED": seg_len = layer_input.size//input_size seg_dur_input = dur_input[prev_seg_end: prev_seg_end+seg_len] num_of_segs = T.sum(seg_dur_input) seq2seq_model = DistributedSequenceEncoder(rng, layer_input, seg_dur_input) addfeat_input = frame_feat_input[0:num_of_segs, MLU_div[encoder_count]:MLU_div[encoder_count+1]] layer_input = T.concatenate((seq2seq_model.encoded_output, addfeat_input), axis=1) input_size = input_size + (MLU_div[encoder_count+1]-MLU_div[encoder_count]) prev_seg_end = prev_seg_end + seg_len encoder_count = encoder_count + 1
# hidden layer activation if hidden_layer_type[i] in self.list_of_activations: hidden_activation = hidden_layer_type[i].lower() hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'TANHE' or hidden_layer_type[i] == 'SIGMOIDE': hidden_activation = hidden_layer_type[i][0:-1].lower() hidden_layer = GeneralLayer(rng, layer_input, input_size, hidden_layer_size[i], activation=hidden_activation, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'TANH_LHUC': hidden_layer = SigmoidLayer_LHUC(rng, layer_input, input_size, hidden_layer_size[i], activation=T.tanh, p=self.dropout_rate, training=self.is_train) elif hidden_layer_type[i] == 'SLSTM' or hidden_layer_type[i] == 'SLSTME': hidden_layer = SimplifiedLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'SLSTMD': hidden_layer = SimplifiedLstmDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'SGRU': hidden_layer = SimplifiedGRU(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'GRU': hidden_layer = GatedRecurrentUnit(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM' or hidden_layer_type[i] == 'LSTME': hidden_layer = VanillaLstm(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTMD': hidden_layer = VanillaLstmDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BSLSTM' or hidden_layer_type[i] == 'BSLSTME': hidden_layer = BidirectionSLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'BLSTM' or hidden_layer_type[i] == 'BLSTME': hidden_layer = BidirectionLstm(rng, layer_input, input_size, hidden_layer_size[i], hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'RNN' or hidden_layer_type[i] == 'RNNE': hidden_layer = VanillaRNN(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'RNND': hidden_layer = VanillaRNNDecoder(rng, layer_input, input_size, hidden_layer_size[i], self.n_out, p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) elif hidden_layer_type[i] == 'LSTM_LHUC': hidden_layer = VanillaLstm_LHUC(rng, layer_input, input_size, hidden_layer_size[i], p=self.dropout_rate, training=self.is_train, rnn_batch_training=self.rnn_batch_training) else: logger.critical("This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n" %(hidden_layer_type[i])) sys.exit(1)
self.rnn_layers.append(hidden_layer) self.params.extend(hidden_layer.params) input_size = hidden_layer_size[-1] if hidden_layer_type[-1] in BLSTM_variants: input_size = hidden_layer_size[-1]*2 if hidden_layer_type[-1] in Decoder_variants: self.final_layer = self.rnn_layers[-1] else: output_activation = output_type.lower() if output_activation == 'linear': self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out) elif output_activation == 'recurrent': self.final_layer = RecurrentOutputLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, rnn_batch_training=self.rnn_batch_training) elif output_type.upper() in self.list_of_activations: self.final_layer = GeneralLayer(rng, self.rnn_layers[-1].output, input_size, self.n_out, activation=output_activation) else: logger.critical("This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n" %(output_type)) sys.exit(1) self.params.extend(self.final_layer.params) self.updates = {} for param in self.params: self.updates[param] = theano.shared(value = np.zeros(param.get_value(borrow = True).shape, dtype = theano.config.floatX), name = 'updates') if self.loss_function == 'CCE': self.finetune_cost = self.categorical_crossentropy_loss(self.final_layer.output, self.y) self.errors = self.categorical_crossentropy_loss(self.final_layer.output, self.y) elif self.loss_function == 'Hinge': self.finetune_cost = self.multiclass_hinge_loss(self.final_layer.output, self.y) self.errors = self.multiclass_hinge_loss(self.final_layer.output, self.y) elif self.loss_function == 'MMSE': if self.rnn_batch_training: self.y_mod = T.reshape(self.y, (-1, n_out)) self.final_layer_output = T.reshape(self.final_layer.output, (-1, n_out)) nonzero_rows = T.any(self.y_mod, 1).nonzero() self.y_mod = self.y_mod[nonzero_rows] self.final_layer_output = self.final_layer_output[nonzero_rows] self.finetune_cost = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1)) self.errors = T.mean(T.sum((self.final_layer_output - self.y_mod) ** 2, axis=1)) else: self.finetune_cost = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1)) self.errors = T.mean(T.sum((self.final_layer.output - self.y) ** 2, axis=1))
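# A small plain-Python illustration (hypothetical layer sizes) of the input-size
# bookkeeping in the constructor above: bidirectional variants feed 2 * hidden
# units into the following layer, everything else feeds hidden units unchanged.
hidden_layer_size = [256, 256, 128]
hidden_layer_type = ['TANH', 'BLSTM', 'LSTM']
BLSTM_variants = ['BLSTM', 'BSLSTM', 'BLSTME', 'BSLSTME']
n_in = 425
for i in range(len(hidden_layer_size)):
    if i == 0:
        input_size = n_in
    else:
        input_size = hidden_layer_size[i - 1]
        if hidden_layer_type[i - 1] in BLSTM_variants:
            input_size *= 2
    print(i, hidden_layer_type[i], 'input_size =', input_size)
# 0 TANH input_size = 425
# 1 BLSTM input_size = 256
# 2 LSTM input_size = 512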
def compute_step(self, param, previous_step): not_finite = tensor.any(tensor.or_( tensor.isnan(previous_step), tensor.isinf(previous_step))) step = tensor.switch(not_finite, self.scaler * param, previous_step) return step, []
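# A minimal sketch of the non-finite guard in compute_step above: if the proposed
# step contains NaN or Inf, fall back to a scaled copy of the parameter instead
# (self.scaler is replaced here by a hypothetical local constant).
import numpy as np
import theano
from theano import tensor

param = tensor.vector('param')
previous_step = tensor.vector('previous_step')
scaler = np.float32(0.1)  # hypothetical fallback scale

not_finite = tensor.any(tensor.or_(
    tensor.isnan(previous_step), tensor.isinf(previous_step)))
step = tensor.switch(not_finite, scaler * param, previous_step)

f = theano.function([param, previous_step], step, allow_input_downcast=True)
print(f([1., 2.], [0.5, np.nan]))  # [0.1 0.2] -- the NaN-contaminated step is discarded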
def logpow(x, m): """ Calculates log(x**m) since m*log(x) will fail when m, x = 0. """ # return m * log(x) return switch(any(eq(x, 0)), -inf, m * log(x))
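# A minimal sketch of evaluating logpow, assuming the bare names (switch, any,
# eq, log) come from theano.tensor and inf from numpy; written here with explicit
# prefixes. Note the condition is a scalar, so a single zero base collapses the
# whole result to -inf.
import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
m = T.vector('m')
out = T.switch(T.any(T.eq(x, 0)), -np.inf, m * T.log(x))

f = theano.function([x, m], out, allow_input_downcast=True)
print(f([2., 3.], [1., 2.]))  # [log(2), 2*log(3)]
print(f([2., 0.], [1., 2.]))  # [-inf, -inf]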
def get_output(self, train=False): X = self.get_input(train) return X * T.shape_padright(T.any((1.0 - T.eq(X, self.mask_value)), axis=-1))
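# A minimal sketch of the masking trick in get_output above: any timestep whose
# features all equal mask_value gets a 0 mask and is zeroed out; shape_padright
# restores the feature axis so the mask broadcasts. mask_value = -1.0 is a
# hypothetical choice for illustration.
import numpy as np
import theano
import theano.tensor as T

mask_value = -1.0
X = T.tensor3('X')                                # (batch, time, features)
mask = T.any(1.0 - T.eq(X, mask_value), axis=-1)  # (batch, time): 0 only for fully masked steps
masked = X * T.shape_padright(mask)

f = theano.function([X], masked)
x = np.array([[[1., 2.], [-1., -1.]]], dtype=theano.config.floatX)
print(f(x))  # [[[1. 2.] [0. 0.]]] -- the fully masked timestep is zeroed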
def build(self): # (batch_size, max_example_action_num, action_type) tgt_action_seq = ndim_itensor(3, 'tgt_action_seq') # (batch_size, max_example_action_num, action_type) tgt_action_seq_type = ndim_itensor(3, 'tgt_action_seq_type') # (batch_size, max_example_action_num) tgt_node_seq = ndim_itensor(2, 'tgt_node_seq') # (batch_size, max_example_action_num) tgt_par_rule_seq = ndim_itensor(2, 'tgt_par_rule_seq') # (batch_size, max_example_action_num) tgt_par_t_seq = ndim_itensor(2, 'tgt_par_t_seq') # (batch_size, max_example_action_num, symbol_embed_dim) # tgt_node_embed = self.node_embedding(tgt_node_seq, mask_zero=False) tgt_node_embed = self.node_embedding[tgt_node_seq] # (batch_size, max_query_length) query_tokens = ndim_itensor(2, 'query_tokens') # (batch_size, max_query_length, query_token_embed_dim) # (batch_size, max_query_length) query_token_embed, query_token_embed_mask = self.query_embedding(query_tokens, mask_zero=True) # if WORD_DROPOUT > 0: # logging.info('used word dropout for source, p = %f', WORD_DROPOUT) # query_token_embed, query_token_embed_intact = WordDropout(WORD_DROPOUT, self.srng)(query_token_embed, False) batch_size = tgt_action_seq.shape[0] max_example_action_num = tgt_action_seq.shape[1] # previous action embeddings # (batch_size, max_example_action_num, action_embed_dim) tgt_action_seq_embed = T.switch(T.shape_padright(tgt_action_seq[:, :, 0] > 0), self.rule_embedding_W[tgt_action_seq[:, :, 0]], self.vocab_embedding_W[tgt_action_seq[:, :, 1]]) tgt_action_seq_embed_tm1 = tensor_right_shift(tgt_action_seq_embed) # parent rule application embeddings tgt_par_rule_embed = T.switch(tgt_par_rule_seq[:, :, None] < 0, T.alloc(0., 1, config.rule_embed_dim), self.rule_embedding_W[tgt_par_rule_seq]) if not config.frontier_node_type_feed: tgt_node_embed *= 0. if not config.parent_action_feed: tgt_par_rule_embed *= 0. 
# (batch_size, max_example_action_num, action_embed_dim + symbol_embed_dim + action_embed_dim) decoder_input = T.concatenate([tgt_action_seq_embed_tm1, tgt_node_embed, tgt_par_rule_embed], axis=-1) # (batch_size, max_query_length, query_embed_dim) query_embed = self.query_encoder_lstm(query_token_embed, mask=query_token_embed_mask, dropout=config.dropout, srng=self.srng) # (batch_size, max_example_action_num) tgt_action_seq_mask = T.any(tgt_action_seq_type, axis=-1) # decoder_hidden_states: (batch_size, max_example_action_num, lstm_hidden_state) # ctx_vectors: (batch_size, max_example_action_num, encoder_hidden_dim) decoder_hidden_states, _, ctx_vectors = self.decoder_lstm(decoder_input, context=query_embed, context_mask=query_token_embed_mask, mask=tgt_action_seq_mask, parent_t_seq=tgt_par_t_seq, dropout=config.dropout, srng=self.srng) # if DECODER_DROPOUT > 0: # logging.info('used dropout for decoder output, p = %f', DECODER_DROPOUT) # decoder_hidden_states = Dropout(DECODER_DROPOUT, self.srng)(decoder_hidden_states) # ==================================================== # apply additional non-linearity transformation before # predicting actions # ==================================================== decoder_hidden_state_trans_rule = self.decoder_hidden_state_W_rule(decoder_hidden_states) decoder_hidden_state_trans_token = self.decoder_hidden_state_W_token(T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1)) # (batch_size, max_example_action_num, rule_num) rule_predict = softmax(T.dot(decoder_hidden_state_trans_rule, T.transpose(self.rule_embedding_W)) + self.rule_embedding_b) # (batch_size, max_example_action_num, 2) terminal_gen_action_prob = self.terminal_gen_softmax(decoder_hidden_states) # (batch_size, max_example_action_num, target_vocab_size) vocab_predict = softmax(T.dot(decoder_hidden_state_trans_token, T.transpose(self.vocab_embedding_W)) + self.vocab_embedding_b) # (batch_size, max_example_action_num, lstm_hidden_state + encoder_hidden_dim) ptr_net_decoder_state = T.concatenate([decoder_hidden_states, ctx_vectors], axis=-1) # (batch_size, max_example_action_num, max_query_length) copy_prob = self.src_ptr_net(query_embed, query_token_embed_mask, ptr_net_decoder_state) # (batch_size, max_example_action_num) rule_tgt_prob = rule_predict[T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 0]] # (batch_size, max_example_action_num) vocab_tgt_prob = vocab_predict[T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 1]] # (batch_size, max_example_action_num) copy_tgt_prob = copy_prob[T.shape_padright(T.arange(batch_size)), T.shape_padleft(T.arange(max_example_action_num)), tgt_action_seq[:, :, 2]] # (batch_size, max_example_action_num) tgt_prob = tgt_action_seq_type[:, :, 0] * rule_tgt_prob + \ tgt_action_seq_type[:, :, 1] * terminal_gen_action_prob[:, :, 0] * vocab_tgt_prob + \ tgt_action_seq_type[:, :, 2] * terminal_gen_action_prob[:, :, 1] * copy_tgt_prob likelihood = T.log(tgt_prob + 1.e-7 * (1 - tgt_action_seq_mask)) loss = - (likelihood * tgt_action_seq_mask).sum(axis=-1) # / tgt_action_seq_mask.sum(axis=-1) loss = T.mean(loss) # let's build the function! 
train_inputs = [query_tokens, tgt_action_seq, tgt_action_seq_type, tgt_node_seq, tgt_par_rule_seq, tgt_par_t_seq] optimizer = optimizers.get(config.optimizer) optimizer.clip_grad = config.clip_grad updates, grads = optimizer.get_updates(self.params, loss) self.train_func = theano.function(train_inputs, [loss], # [loss, tgt_action_seq_type, tgt_action_seq, # rule_tgt_prob, vocab_tgt_prob, copy_tgt_prob, # copy_prob, terminal_gen_action_prob], updates=updates) # if WORD_DROPOUT > 0: # self.build_decoder(query_tokens, query_token_embed_intact, query_token_embed_mask) # else: # self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask) self.build_decoder(query_tokens, query_token_embed, query_token_embed_mask)
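# A minimal sketch (hypothetical names predict / target) of the fancy-indexing
# gather used above for rule_tgt_prob, vocab_tgt_prob and copy_tgt_prob: two
# broadcasted aranges pick predict[b, t, target[b, t]] for every (b, t).
import numpy as np
import theano
import theano.tensor as T

predict = T.tensor3('predict')  # (batch, time, vocab) scores
target = T.imatrix('target')    # (batch, time) indices into the vocab axis

batch_size, time_steps = target.shape[0], target.shape[1]
tgt_prob = predict[T.shape_padright(T.arange(batch_size)),
                   T.shape_padleft(T.arange(time_steps)),
                   target]

f = theano.function([predict, target], tgt_prob)
p = np.random.rand(2, 3, 5).astype(theano.config.floatX)
t = np.array([[0, 1, 2], [3, 4, 0]], dtype=np.int32)
print(np.allclose(f(p, t), p[np.arange(2)[:, None], np.arange(3)[None, :], t]))  # True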