def arch_class_00(dim_desc, dim_labels, param_arch, logger):
    logger.info('Architecture:')
    # input layers
    desc = LL.InputLayer(shape=(None, dim_desc))
    patch_op = LL.InputLayer(input_var=Tsp.csc_fmatrix('patch_op'), shape=(None, None))
    logger.info(' input : dim = %d' % dim_desc)
    # layer 1: dimensionality reduction to 16
    n_dim = 16
    net = LL.DenseLayer(desc, n_dim)
    logger.info(' layer 1: FC%d' % n_dim)
    # layer 2: anisotropic convolution layer with 16 filters
    n_filters = 16
    net = CL.GCNNLayer([net, patch_op], n_filters, nrings=5, nrays=16)
    string = ' layer 2: IC%d' % n_filters
    if param_arch['flag_batchnorm'] is True:
        net = LL.batch_norm(net)
        string = string + ' + batch normalization'
    logger.info(string)
    # layer 3: anisotropic convolution layer with 32 filters
    n_filters = 32
    net = CL.GCNNLayer([net, patch_op], n_filters, nrings=5, nrays=16)
    string = ' layer 3: IC%d' % n_filters
    if param_arch['flag_batchnorm'] is True:
        net = LL.batch_norm(net)
        string = string + ' + batch normalization'
    logger.info(string)
    # layer 4: anisotropic convolution layer with 64 filters
    n_filters = 64
    net = CL.GCNNLayer([net, patch_op], n_filters, nrings=5, nrays=16)
    string = ' layer 4: IC%d' % n_filters
    if param_arch['flag_batchnorm'] is True:
        net = LL.batch_norm(net)
        string = string + ' + batch normalization'
    logger.info(string)
    # layer 5: fully connected layer with 256 units
    n_dim = 256
    net = LL.DenseLayer(net, n_dim)
    string = ' layer 5: FC%d' % n_dim
    if param_arch['flag_batchnorm'] is True:
        net = LL.batch_norm(net)
        string = string + ' + batch normalization'
    logger.info(string)
    # layer 6: softmax layer producing a probability over the labels
    if param_arch['flag_nonlinearity'] == 'softmax':
        cla = LL.DenseLayer(net, dim_labels, nonlinearity=LN.softmax)
        string = ' layer 6: softmax'
    elif param_arch['flag_nonlinearity'] == 'log_softmax':
        cla = LL.DenseLayer(net, dim_labels, nonlinearity=log_softmax)
        string = ' layer 6: log-softmax'
    else:
        raise Exception('[e] the chosen non-linearity is not supported!')
    logger.info(string)
    # outputs
    return desc, patch_op, cla, net, logger
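# Usage sketch (an assumption, not part of the original code): it presumes the module above is
# importable together with the custom `CL.GCNNLayer` and `log_softmax` it relies on, and uses
# only standard Theano/Lasagne calls to compile a prediction function. The dimensions and the
# `param_arch` values are hypothetical.
import logging
import theano
import lasagne.layers as LL

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('arch_class_00')

param_arch = {'flag_batchnorm': True, 'flag_nonlinearity': 'softmax'}
desc, patch_op, cla, net, logger = arch_class_00(dim_desc=150, dim_labels=10,
                                                 param_arch=param_arch, logger=logger)

# deterministic=True uses the stored batch-norm statistics at inference time
pred = LL.get_output(cla, deterministic=True)
predict = theano.function([desc.input_var, patch_op.input_var], pred,
                          allow_input_downcast=True)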
def get_model(inp, patch_op):
    icnn = LL.DenseLayer(inp, 16)
    icnn = batch_norm(utils_lasagne.GCNNLayer([icnn, patch_op], 16, nrings=4, nrays=8))
    icnn = batch_norm(utils_lasagne.GCNNLayer([icnn, patch_op], 32, nrings=4, nrays=8))
    icnn = batch_norm(utils_lasagne.GCNNLayer([icnn, patch_op], 64, nrings=4, nrays=8))
    ffn = batch_norm(LL.DenseLayer(icnn, 512))
    ffn = LL.DenseLayer(ffn, nclasses, nonlinearity=utils_lasagne.log_softmax)
    return ffn

inp = LL.InputLayer(shape=(None, nin))
patch_op = LL.InputLayer(input_var=Tsp.csc_fmatrix('patch_op'), shape=(None, None))

ffn = get_model(inp, patch_op)

# LL.get_output -> theano variable representing the network output
output = LL.get_output(ffn)
pred = LL.get_output(ffn, deterministic=True)  # in case we use dropout

# target theano variable indicating the index a vertex should be mapped to wrt the latent space
target = T.ivector('idxs')

# work with log-domain predictions, which is better behaved numerically
cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean()
acc = LO.categorical_accuracy(pred, target).mean()
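# A minimal training-setup sketch (an assumption, not shown in this excerpt): add L2 weight
# decay to the log-domain cross-entropy above and compile train/test functions with Adam.
# `l2_weight` and the learning rate are hypothetical values.
import theano
import lasagne as L
import lasagne.updates as LU

l2_weight = 1e-5
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)
cost = cla + l2_weight * regL2

params = LL.get_all_params(ffn, trainable=True)
updates = LU.adam(cost, params, learning_rate=1e-3)

funcs = dict()
funcs['train'] = theano.function([inp.input_var, patch_op.input_var, target],
                                 [cost, cla, acc], updates=updates,
                                 allow_input_downcast=True)
funcs['test'] = theano.function([inp.input_var, patch_op.input_var, target],
                                [acc, pred], allow_input_downcast=True)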
def __init__(self, d, V, r,  # nf,
             embeddings=None, nc=2, nf=0, pairwise_constraint=False,
             lambda_w=0.01, lambda_e=0.01, lambda_f=0.01,
             learning_rate='optimal', rnn=True, l1_ratio=0.15,
             beta=None, fixed_beta=True):

    assert(0 <= l1_ratio <= 1)

    if not rnn:
        print('skipping rnn...')

    # d = dimensionality of embeddings
    # V = size of vocabulary
    # r = number of dependency relations
    # nc = number of classes for classification
    self.learning_rate = learning_rate

    # |V| x d embedding matrix
    if embeddings is None:
        self.We = theano.shared(name='embeddings',
                                value=0.2 * np.random.uniform(-1.0, 1.0, (V, d))
                                ).astype(theano.config.floatX)
    else:
        self.We = theano.shared(name='embeddings',
                                value=embeddings,
                                borrow=True
                                ).astype(theano.config.floatX)

    # r x d x d tensor (matrix for each dependency relation)
    self.Wr = theano.shared(name='dependencies',
                            value=0.2 * np.random.uniform(-1.0, 1.0, (r, d, d))
                            ).astype(theano.config.floatX)

    # d x d map from embedding to hidden vector
    self.Wv = theano.shared(name='Wv',
                            value=0.2 * np.random.uniform(-1.0, 1.0, (d, d))
                            ).astype(theano.config.floatX)

    # d-long bias vector
    self.b = theano.shared(name='b',
                           value=np.zeros(d, dtype=theano.config.floatX))

    if nc > 2:
        self.gamma = theano.shared(name='gamma',
                                   value=0.2 * np.random.uniform(-1.0, 1.0, (d, nc))
                                   ).astype(theano.config.floatX)
        if nf > 0:
            # weights for fine-grained features plus bias
            self.beta = theano.shared(name='beta',
                                      value=0.2 * np.random.uniform(-1.0, 1.0, (nf, nc))
                                      ).astype(theano.config.floatX)
    else:
        self.gamma = theano.shared(name='gamma',
                                   value=0.2 * np.random.uniform(-1.0, 1.0, (d))
                                   ).astype(theano.config.floatX)
        if nf > 0:
            # weights for fine-grained features plus bias
            self.beta = theano.shared(name='beta',
                                      value=0.2 * np.random.uniform(-1.0, 1.0, (nf))
                                      ).astype(theano.config.floatX)

    if nf > 0 and beta is not None:
        self.beta = theano.shared(name='beta',
                                  value=beta
                                  ).astype(theano.config.floatX)

    self.params = []
    if rnn:
        self.params += [self.We, self.Wr, self.Wv, self.b, self.gamma]
    if nf > 0 and (beta is None or not fixed_beta):
        self.params += [self.beta]

    if learning_rate == 'adagrad':
        self.descender = Adagrad(self.params)

    # self.f = T.tanh
    self.f = normalized_tanh

    def recurrence(k, hidden_states, hidden_sums, x, r, p, mask):
        # at each node n in the tree, calculate Wr(p,n) . f(Wv . We_word(n) + b + sum_n)
        # and add the result to sum_p
        h_k = self.f((T.dot(self.Wv, x[k].T) + hidden_sums[k].T).T + self.b).T * mask[k]  # D x N
        sum_k = T.batched_dot(r[k], h_k.T)  # N x D
        return (T.set_subtensor(hidden_states[k], h_k.T),
                T.inc_subtensor(hidden_sums[p[k], T.arange(sum_k.shape[0])], sum_k))

    y = T.ivector('y')

    # all N x K matrices, where N is the batch size and K is the max sentence length (padded)
    x_idxs = T.imatrix('x')
    x_parents = T.imatrix('x_parents')
    x_rel_idxs = T.imatrix('x_rel')
    x_mask = T.imatrix('x_mask')

    # now these are K x N x D tensors
    X = self.We[x_idxs.T]
    # these are K x N x D x D tensors
    X_rel = self.Wr[x_rel_idxs.T]

    X_hidden_states = T.zeros((x_idxs.shape[1], x_idxs.shape[0], d),
                              dtype=theano.config.floatX)
    X_hidden_sums = T.zeros((x_idxs.shape[1] + 1, x_idxs.shape[0], d),
                            dtype=theano.config.floatX)

    # these are K(+1) x K x N x D
    [X_h, X_s], updates = theano.scan(fn=recurrence,
                                      sequences=T.arange(x_idxs.shape[1]),
                                      outputs_info=[X_hidden_states, X_hidden_sums],
                                      non_sequences=[X, X_rel, x_parents.T, x_mask.T])

    phi = sp.csc_fmatrix('phi')

    # X_h[-1, -1] is N x D
    base = 0
    if rnn:
        base = base + T.dot(X_h[-1, -1], self.gamma)
    if nf > 0:
        if nc > 2:
            base = base + sp.structured_dot(phi, self.beta)
        else:
            base = base + sp.structured_dot(phi, self.beta.dimshuffle(0, 'x')).flatten()

    if nc > 2:
        p_y_given_x = T.nnet.softmax(base)
        y_pred = T.argmax(p_y_given_x, axis=1)
        costs = -T.log(p_y_given_x)[T.arange(y.shape[0]), y]
    else:
        p_y_given_x = T.nnet.sigmoid(base)
        y_pred = p_y_given_x > 0.5
        costs = -y * T.log(p_y_given_x) - (1 - y) * T.log(1 - p_y_given_x)

    cost = costs.mean()

    if rnn:
        cost = (cost
                + lambda_w * (self.We ** 2).sum()
                + lambda_w * (self.Wr ** 2).sum()
                + lambda_w * (self.Wv ** 2).sum()
                + lambda_w * (self.b ** 2).sum()
                + lambda_w * (self.gamma ** 2).sum())
        if pairwise_constraint:
            cost = cost - lambda_e * T.batched_dot(X_h[-1, -1][::2], X_h[-1, -1][1::2]).mean()

    if nf > 0 and (beta is None or not fixed_beta):
        cost = cost + lambda_f * (l1_ratio * T.abs_(self.beta).sum()
                                  + (1 - l1_ratio) * (self.beta ** 2).sum())

    grad = T.grad(cost, self.params)

    if learning_rate == 'optimal':
        def dloss(p, y):
            z = p * y
            if z > 18:
                return np.exp(-z) * -y
            if z < -18:
                return -y
            return -y / (np.exp(z) + 1)

        typw = np.sqrt(1.0 / np.sqrt(lambda_w))
        initial_eta0 = typw / max(1.0, dloss(-typw, 1.0))
        optimal_init = 1.0 / (initial_eta0 * lambda_w)
        print(typw, initial_eta0, optimal_init)

        self.t = theano.shared(name='t', value=0.).astype(theano.config.floatX)
        eta = 1.0 / (lambda_w * (optimal_init + self.t))
        updates = [(p, p - eta * g) for p, g in zip(self.params, grad)]
    else:
        updates = []

    inputs = []
    if rnn:
        inputs += [x_idxs, x_parents, x_rel_idxs, x_mask]
    if nf > 0:
        inputs += [phi]
    inputs += [y]

    self.cost_and_grad = theano.function(inputs=inputs,
                                         outputs=[cost] + grad,
                                         updates=updates,
                                         allow_input_downcast=True)
    self.sums = theano.function(inputs=[x_idxs, x_parents, x_rel_idxs, x_mask],
                                outputs=X_s,
                                allow_input_downcast=True)
    self.states = theano.function(inputs=[x_idxs, x_parents, x_rel_idxs, x_mask],
                                  outputs=X_h,
                                  allow_input_downcast=True)
    self.classify = theano.function(inputs=inputs[:-1],
                                    outputs=y_pred,
                                    allow_input_downcast=True)
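# Hypothetical usage sketch: the __init__ above is assumed to belong to a class importable as
# DependencyRNN (the real class name is not shown), with its helper normalized_tanh available.
# All sizes and toy inputs below are illustrative only.
import numpy as np

d, V, r = 50, 1000, 40                      # embedding size, vocabulary size, #relations
model = DependencyRNN(d, V, r, nc=2, nf=0,  # binary classification, no extra features
                      learning_rate='sgd')  # anything but 'optimal'/'adagrad': no updates applied

N, K = 2, 3                                       # batch size, padded sentence length
x_idxs = np.random.randint(0, V, (N, K))          # word indices
x_parents = np.tile(np.arange(1, K + 1), (N, 1))  # chain tree: parent of token k is k+1
x_rel_idxs = np.random.randint(0, r, (N, K))      # dependency-relation indices
x_mask = np.ones((N, K), dtype=np.int32)          # no padding in this toy batch
y = np.array([0, 1], dtype=np.int32)

cost_and_grads = model.cost_and_grad(x_idxs, x_parents, x_rel_idxs, x_mask, y)
predictions = model.classify(x_idxs, x_parents, x_rel_idxs, x_mask)
print(cost_and_grads[0], predictions)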
def get_model(inp, patch_op):
    # (the first layer producing `icnn` is not shown in this excerpt)
    icnn1 = batch_norm(utils_lasagne.GCNNLayer([icnn, patch_op], 16, nrings=5, nrays=16))
    ffn1 = icnn1
    icnn2 = batch_norm(utils_lasagne.GCNNLayer([icnn1, patch_op], 32, nrings=5, nrays=16))
    ffn2 = icnn2
    # skip connections: concatenate the raw input with the outputs of both GCNN blocks
    ffn4 = LL.ConcatLayer([inp, ffn1, ffn2], axis=1, cropping=None)
    ffn = LL.DenseLayer(ffn4, nclasses, nonlinearity=utils_lasagne.log_softmax)
    return ffn

inp = LL.InputLayer(shape=(None, nin))
patch_op = LL.InputLayer(input_var=Tsp.csc_fmatrix('patch_op'), shape=(None, None))
print(patch_op.shape[0])

ffn = get_model(inp, patch_op)

output = LL.get_output(ffn)
pred = LL.get_output(ffn, deterministic=True)
target = T.ivector('idxs')

cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean()
acc = LO.categorical_accuracy(pred, target).mean()

# L2 weight decay over all network parameters
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)
cost = cla + l2_weight * regL2

params = LL.get_all_params(ffn, trainable=True)
grads = T.grad(cost, params)
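# A follow-up sketch (an assumption, not shown in the excerpt): one way to use the explicit
# `grads` is to post-process them, e.g. clip the global gradient norm before building the
# update rule. The clipping threshold and learning rate are hypothetical.
import theano
import lasagne.updates as LU

grads = LU.total_norm_constraint(grads, max_norm=5.0)
updates = LU.adam(grads, params, learning_rate=1e-3)

train_fn = theano.function([inp.input_var, patch_op.input_var, target],
                           [cost, cla, acc], updates=updates,
                           allow_input_downcast=True)
test_fn = theano.function([inp.input_var, patch_op.input_var, target],
                          [acc, pred], allow_input_downcast=True)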