Example #1
    def test_lop_override(self, cls_ofg):
        x = T.vector()
        y = 1. / (1. + T.exp(-x))

        def lop_ov(inps, outs, grads):
            y_, = outs
            dedy_, = grads
            return [2. * y_ * (1. - y_) * dedy_]

        y_, dedy = T.vector(), T.vector()
        op_lop_ov = cls_ofg([x, y_, dedy], [2. * y_ * (1. - y_) * dedy])

        xx = T.vector()
        yy1 = T.sum(T.nnet.sigmoid(xx))
        gyy1 = 2. * T.grad(yy1, xx)

        for ov in [lop_ov, op_lop_ov]:
            op = cls_ofg([x], [y], lop_overrides=ov)
            yy2 = T.sum(op(xx))
            gyy2 = T.grad(yy2, xx)
            fn = function([xx], [gyy1, gyy2])

            xval = np.random.rand(32).astype(config.floatX)
            y1val, y2val = fn(xval)
            assert np.allclose(y1val, y2val)
Example #2
def _compile_func():
    beta = T.vector('beta')
    b = T.scalar('b')
    X = T.matrix('X')
    y = T.vector('y')
    C = T.scalar('C')
    params = [beta, b, X, y, C]
    cost = 0.5 * (T.dot(beta, beta) + b * b) + C * T.sum(
        T.nnet.softplus(
            -T.dot(T.diag(y), T.dot(X, beta) + b)
        )
    )
    # Function computing in one go the cost, its gradient
    # with regard to beta and with regard to the bias.
    cost_grad = theano.function(params,[
        cost,
        T.grad(cost, beta),
        T.grad(cost, b)
    ])

    # Function for computing element-wise sigmoid, used for
    # prediction.
    log_predict = theano.function(
        [beta, b, X],
        T.nnet.sigmoid(b + T.dot(X, beta)),
        on_unused_input='warn'
    )

    return (cost_grad, log_predict)
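For reference, a minimal sketch of how the two compiled functions above could be driven with NumPy data; the shapes, the {-1, +1} labels, and the value of C are illustrative assumptions, not part of the original code.

import numpy as np
import theano

floatX = theano.config.floatX
cost_grad, log_predict = _compile_func()

# toy data: 10 samples, 3 features, labels in {-1, +1} as the diag(y) construction expects
X = np.random.randn(10, 3).astype(floatX)
y = np.sign(np.random.randn(10)).astype(floatX)
beta = np.zeros(3, dtype=floatX)
b = np.asarray(0.0, dtype=floatX)
C = np.asarray(1.0, dtype=floatX)

cost, d_beta, d_b = cost_grad(beta, b, X, y, C)   # objective value and both gradients in one call
probs = log_predict(beta, b, X)                   # element-wise sigmoid predictions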
Example #3
File: mlp.py Project: zhenwendai/DeepGP
 def _build_hidden_layers(self, input, add_cost, Y, updates, external_grad=None):
     
     lin_output = tensor.dot(input, self.W_theano.T)+self.b_theano[None,:]
     if self.activation=='tanh':
         output = tensor.tanh(lin_output)
     elif self.activation=='softplus':
         output = tensor.nnet.softplus(lin_output)
     elif self.activation is None:
         output = lin_output
     else:
         raise ValueError('Unsupported activation function!')
     
     if self.regularization == 'L1':
         add_cost = add_cost - self.reg_weight*tensor.abs_(self.W_theano).sum()
     elif self.regularization == 'L2':
         add_cost = add_cost -self.reg_weight*(self.W_theano**2).sum()
     
     # Compute the cost function
     if self.layer_forward is None:
         if external_grad is None:
             cost = -((output-Y)**2).sum()/self.sigma2_theano[0]+add_cost
         else:
             cost = (external_grad*output).sum()
         Y_out = output
     else:
         cost, Y_out = self.layer_forward._build_hidden_layers(output, add_cost, Y, updates, external_grad=external_grad)
         
     # Update parameter gradients
     W_grad = tensor.grad(cost, self.W_theano)
     b_grad = tensor.grad(cost, self.b_theano)
     updates.extend([(self.W_grad_theano,self.W_grad_theano+W_grad), (self.b_grad_theano,self.b_grad_theano+b_grad)])
     
     return cost, Y_out
Example #4
    def fit(self, X, learning_rate=0.5, mu=0.99, epochs=1, batch_sz=100, show_fig=False):
        N, D = X.shape
        n_batches = N / batch_sz

        W0 = init_weights((D, self.M))
        self.W = theano.shared(W0, 'W_%s' % self.id)
        self.bh = theano.shared(np.zeros(self.M), 'bh_%s' % self.id)
        self.bo = theano.shared(np.zeros(D), 'bo_%s' % self.id)
        self.params = [self.W, self.bh, self.bo]
        self.forward_params = [self.W, self.bh]

        # TODO: technically these should be reset before doing backprop
        self.dW = theano.shared(np.zeros(W0.shape), 'dW_%s' % self.id)
        self.dbh = theano.shared(np.zeros(self.M), 'dbh_%s' % self.id)
        self.dbo = theano.shared(np.zeros(D), 'dbo_%s' % self.id)
        self.dparams = [self.dW, self.dbh, self.dbo]
        self.forward_dparams = [self.dW, self.dbh]

        X_in = T.matrix('X_%s' % self.id)
        X_hat = self.forward_output(X_in)

        # attach it to the object so it can be used later
        # must be sigmoidal because the output is also a sigmoid
        H = T.nnet.sigmoid(X_in.dot(self.W) + self.bh)
        self.hidden_op = theano.function(
            inputs=[X_in],
            outputs=H,
        )

        # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N
        cost = -(X_in * T.log(X_hat) + (1 - X_in) * T.log(1 - X_hat)).sum() / (batch_sz * D)
        cost_op = theano.function(
            inputs=[X_in],
            outputs=cost,
        )

        updates = [
            (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
        ] + [
            (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
        ]
        train_op = theano.function(
            inputs=[X_in],
            updates=updates,
        )

        costs = []
        print "training autoencoder: %s" % self.id
        for i in xrange(epochs):
            print "epoch:", i
            X = shuffle(X)
            for j in xrange(n_batches):
                batch = X[j*batch_sz:(j*batch_sz + batch_sz)]
                train_op(batch)
                the_cost = cost_op(X) # technically we could also get the cost for Xtest here
                print "j / n_batches:", j, "/", n_batches, "cost:", the_cost
                costs.append(the_cost)
        if show_fig:
            plt.plot(costs)
            plt.show()
Example #5
    def __init__(self,
                 input=tensor.dvector('input'),
                 target=tensor.dvector('target'),
                 n_input=1, n_hidden=1, n_output=1, lr=1e-3, **kw):
        super(NNet, self).__init__(**kw)

        self.input = input
        self.target = target
        self.lr = shared(lr, 'learning_rate')
        self.w1 = shared(numpy.zeros((n_hidden, n_input)), 'w1')
        self.w2 = shared(numpy.zeros((n_output, n_hidden)), 'w2')
        # print self.lr.type

        self.hidden = sigmoid(tensor.dot(self.w1, self.input))
        self.output = tensor.dot(self.w2, self.hidden)
        self.cost = tensor.sum((self.output - self.target)**2)

        self.sgd_updates = {
            self.w1: self.w1 - self.lr * tensor.grad(self.cost, self.w1),
            self.w2: self.w2 - self.lr * tensor.grad(self.cost, self.w2)}

        self.sgd_step = pfunc(
            params=[self.input, self.target],
            outputs=[self.output, self.cost],
            updates=self.sgd_updates)

        self.compute_output = pfunc([self.input], self.output)

        self.output_from_hidden = pfunc([self.hidden], self.output)
Example #6
def test_batch_normalization_test():
    for axes in ('per-activation', 'spatial', (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ('x', 'scale', 'bias', 'mean', 'var'))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean,
                                              var, axes, eps)
            # reference forward pass
            if axes == 'per-activation':
                axes2 = (0,)
            elif axes == 'spatial':
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias, mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype('dy')
            grads = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var], known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op, (bn.AbstractBatchNormTrain,
                                              bn.AbstractBatchNormInference,
                                              bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
                Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
                Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5], atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5], atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5], rtol=2e-3, atol=4e-5)  # dvar
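The backward passes above rely on T.grad(None, wrt=..., known_grads=...), i.e. back-propagating an externally supplied output gradient instead of differentiating a scalar cost. A minimal sketch of that mechanism in isolation (variable names are illustrative):

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
y = T.exp(x)
dy = T.vector('dy')                              # gradient flowing in from "above"
gx = T.grad(None, wrt=x, known_grads={y: dy})    # equals dy * exp(x)
f = theano.function([x, dy], gx)

xv = numpy.zeros(3, dtype=theano.config.floatX)
dyv = numpy.ones(3, dtype=theano.config.floatX)
print(f(xv, dyv))                                # -> [ 1.  1.  1.]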
Example #7
def sgd_optimization(learning_rate=0.13, n_epochs=1000, batch_size=100):
    dataset = generate_data()
    train_x, train_y = dataset[0]
    print train_x.type, train_y.type
    validate_x, validate_y = dataset[1]
    test_x, test_y = dataset[2]

    print 'train set size %d' %(train_x.get_value().shape[0])
    print 'validate set size %d' %(validate_x.get_value().shape[0])
    print 'test set size %d' %(test_x.get_value().shape[0])    
    
    n_batches = train_x.get_value(borrow=True).shape[0] / batch_size
    
    index = T.lscalar()
    
    x = T.matrix('x')
    y = T.ivector('y')
    
    lr = LogisticRegression(x, train_x.get_value().shape[1])
    cost = lr.negative_log_likelihood(y)
    
    print 'compile function test_model...'
    test_model = theano.function(inputs=[index], 
                                 outputs=lr.errors(y), 
                                 givens={
                                    x : train_x[index*batch_size : (index+1)*batch_size], 
                                    y : train_y[index*batch_size : (index+1)*batch_size]
                                 })
    
    g_w = T.grad(cost=cost, wrt=lr.w)
    g_b = T.grad(cost=cost, wrt=lr.b)
    updates = [(lr.w, lr.w-learning_rate*g_w), 
               (lr.b, lr.b-learning_rate*g_b)]
    
    print 'compile function train_model...'
    train_model = theano.function(inputs=[index], 
                                  outputs=cost, 
                                  updates=updates, 
                                  givens={
                                      x : train_x[index*batch_size : (index+1)*batch_size],
                                      y : train_y[index*batch_size : (index+1)*batch_size]
                                  })
    
    
    best_train_error = numpy.Inf    
    start_time = time.clock()
    for epoch in xrange(n_epochs):
        for minibatch_index in xrange(n_batches):
            batch_cost = train_model(minibatch_index)
            
        train_errors = [test_model(i) for i in xrange(n_batches)]
        train_error = numpy.mean(train_errors)
        if best_train_error > train_error:
            best_train_error = train_error
            
        print 'epoch %d, best_train_error %lf, train_error %lf' \
            %(epoch, best_train_error, train_error)
            #print 'iterator %d %lf' %(epoch*n_batches + minibatch_index+1, batch_cost)
    end_time = time.clock()
    print 'cost %d' %(end_time-start_time)
Example #8
    def __gradients(self, mini_batch):
        objective = self.__objective(mini_batch)
        gradient_entity = T.grad(objective, wrt=self.Entity)
        gradient_relation = T.grad(objective, wrt=self.Relation)
        gradient_surface = T.grad(objective, wrt=self.RelationNormal)

        return gradient_entity, gradient_relation, gradient_surface
Example #9
    def build(self):
        self.debug = []
        lM = []
        lpullerror = []
        lpusherror = []
        lupdate = []
        for i in xrange(self.M):
            if not self.localM: 
                lM.append(theano.shared(value=np.eye(self.dim, dtype='float32'), name='M', borrow=True))
                lpullerror.append(0.0)
                lpusherror.append(0.0)
                continue
            M = theano.shared(value=np.eye(self.dim, dtype='float32'), name='M', borrow=True)
            pullerror, pusherror = self._local_error(M, i)
            pullerror *= (1-self.mu)
            pusherror *= self.mu
            error = pullerror + pusherror
            update = (M, M - self._lr[i] * T.grad(error, M))

            lM.append(M)
            lpullerror.append((1-self.mu)*pullerror)
            lpusherror.append(self.mu*pusherror)
            lupdate.append(update)

        self.lM = lM
        self.lpusherror = lpusherror
        self.lpullerror = lpullerror
        self.lupdate = lupdate

        #gError = 0.0
        gM = []
        gpullerror = []
        gpusherror = []
        gupdate = []
        for i in xrange(self.M):
            if not self.globalM: 
                gM.append(theano.shared(value=np.eye(self.dim, dtype='float32'), name='M', borrow=True))
                gpullerror.append(0.0)
                gpusherror.append(0.0)
                continue
            M = theano.shared(value=np.eye(self.dim, dtype='float32'), name='M', borrow=True)
            if i == 0:
                pullerror, pusherror = self._global_error(M, i, None)
            else:
                pullerror, pusherror = self._global_error(M, i, gM[-1])
            error = (1-self.mu) * pullerror + self.mu * pusherror
        #    gError += error#*(float(i+1)/self.M)
            update = (M, M - self._lr[i+self.M] * T.grad(error, M))

            gM.append(M)
            gpullerror.append((1-self.mu)*pullerror)
            gpusherror.append(self.mu*pusherror)
            gupdate.append(update)
#       if self.globalM: 
#           gupdate = [(gM[i], gM[i] - self._lr[i+self.M]*T.grad(gError, M)) for i in xrange(self.M)]

        self.gM = gM
        self.gpusherror = gpusherror
        self.gpullerror = gpullerror
        self.gupdate = gupdate
Example #10
File: model.py Project: while519/tranpes
def create_TrainFunc_tranPES(simfn, embeddings,  marge=0.5, alpha=1., beta=1.):

    # parse the embedding data
    embedding = embeddings[0] # D x N matrix
    lembedding = embeddings[1]

    # declare the symbolic variables for training triples
    hp = S.csr_matrix('head positive') # N x batchsize matrix
    rp = S.csr_matrix('relation')
    tp = S.csr_matrix('tail positive')

    hn = S.csr_matrix('head negative')
    tn = S.csr_matrix('tail negative')

    lemb = T.scalar('embedding learning rate')
    lremb = T.scalar('relation learning rate')

    subtensorE = T.ivector('batch entities set')
    subtensorR = T.ivector('batch link set')

    # Generate the training positive and negative triples
    hpmat = S.dot(embedding.E, hp).T #  batchsize x D dense matrix
    rpmat = S.dot(lembedding.E, rp).T
    tpmat = S.dot(embedding.E, tp).T

    hnmat = S.dot(embedding.E, hn).T
    tnmat = S.dot(embedding.E, tn).T

    # calculate the score
    pos = tranPES3(simfn, T.concatenate([hpmat, tpmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tpmat)


    negh = tranPES3(simfn, T.concatenate([hnmat, tpmat], axis=1).reshape((hnmat.shape[0], 2, hnmat.shape[1])).dimshuffle(0, 2, 1), hnmat, rpmat, tpmat)
    negt = tranPES3(simfn, T.concatenate([hpmat, tnmat], axis=1).reshape((hpmat.shape[0], 2, hpmat.shape[1])).dimshuffle(0, 2, 1), hpmat, rpmat, tnmat)

    costh, outh = margeCost(pos, negh, marge)
    costt, outt = margeCost(pos, negt, marge)

    embreg = regEmb(embedding, subtensorE, alpha)
    lembreg = regLink(lembedding, subtensorR, beta)
    

    cost = costh + costt + embreg[0] + lembreg
    out = T.concatenate([outh, outt])
    outc = embreg[1]

    # list of inputs to the function
    list_in = [lemb, lremb, hp, rp, tp, hn, tn, subtensorE, subtensorR]

    # update the embeddings using gradient descent
    emb_grad = T.grad(cost, embedding.E)
    New_embedding = embedding.E - lemb*emb_grad

    remb_grad = T.grad(cost, lembedding.E)
    New_rembedding = lembedding.E - lremb * remb_grad

    updates = OrderedDict({embedding.E: New_embedding, lembedding.E: New_rembedding})

    return theano.function(list_in, [cost, T.mean(out), T.mean(outc), embreg[0], lembreg],
                          updates=updates, on_unused_input='ignore')
Example #11
    def get_gradients(self, X, Y, weights=1.0):
        W_mean, W_ls, b_mean, b_ls = self.parameters

        mean, log_sigma = self.sample_expected(Y)
        sigma = tensor.exp(log_sigma)

        cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma)
        if weights != 1.0:
            cost = -weights.dimshuffle(0, "x") * cost

        cost_scaled = sigma ** 2 * cost
        cost_gscale = (sigma ** 2).sum(axis=1).dimshuffle([0, "x"])
        cost_gscale = cost_gscale * cost

        gradients = OrderedDict()

        params = Selector(self.mlp).get_parameters()
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost_gscale.sum(), param, consider_constant=[X, Y])

        gradients[W_mean] = tensor.grad(cost_scaled.sum(), W_mean, consider_constant=[X, Y])
        gradients[b_mean] = tensor.grad(cost_scaled.sum(), b_mean, consider_constant=[X, Y])

        gradients[W_ls] = tensor.grad(cost_scaled.sum(), W_ls, consider_constant=[X, Y])
        gradients[b_ls] = tensor.grad(cost_scaled.sum(), b_ls, consider_constant=[X, Y])

        return gradients
Example #12
    def theano_setup(self):
    
        # The matrices Wb and Wc were originally tied.
        # Because of that, I decided to keep Wb and Wc with
        # the same shape (instead of being transposed) to
        # avoid disturbing the code as much as possible.

        Wb = T.dmatrix('Wb')
        Wc = T.dmatrix('Wc')
        b = T.dvector('b')
        c = T.dvector('c')
        s = T.dscalar('s')
        x = T.dmatrix('x')
    
        h_act = T.dot(x, Wc) + c
        if self.act_func[0] == 'tanh':
            h = T.tanh(h_act)
        elif self.act_func[0] == 'sigmoid':
            h = T.nnet.sigmoid(h_act)
        elif self.act_func[0] == 'id':
            # bad idea
            h = h_act
        else:
            raise ValueError("Invalid act_func[0]")

        r_act = T.dot(h, Wb.T) + b
        if self.act_func[1] == 'tanh':
            r = s * T.tanh(r_act)
        elif self.act_func[1] == 'sigmoid':
            r = s * T.nnet.sigmoid(r_act)
        elif self.act_func[1] == 'id':
            r = s * r_act
        else:
            raise ValueError("Invalid act_func[1]")


        # Another variable to be able to call a function
        # with a noisy x and compare it to a reference x.
        y = T.dmatrix('y')

        loss = ((r - y)**2)
        sum_loss = T.sum(loss)
        
        # theano_encode_decode : vectorial function in argument X.
        # theano_loss : vectorial function in argument X.
        # theano_gradients : returns triplet of gradients, each of
        #                    which involves the all data X summed
        #                    so it's not a "vectorial" function.

        self.theano_encode_decode = function([Wb,Wc,b,c,s,x], r)
        self.theano_loss = function([Wb,Wc,b,c,s,x,y], loss)

        self.theano_gradients = function([Wb,Wc,b,c,s,x,y],
                                         [T.grad(sum_loss, Wb), T.grad(sum_loss, Wc),
                                          T.grad(sum_loss, b),  T.grad(sum_loss, c),
                                          T.grad(sum_loss, s)])
        # other useful theano functions for the experiments that involve
        # adding noise to the hidden states
        self.theano_encode = function([Wc,c,x], h)
        self.theano_decode = function([Wb,b,s,h], r)
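A hedged sketch of how the compiled functions above might be called, assuming an instance `dae` on which theano_setup() has already run; the dimensions and random parameter values are illustrative assumptions.

import numpy as np

n_in, n_hid = 6, 4
Wb = np.random.randn(n_in, n_hid)   # kept in the same (n_in, n_hid) shape as Wc, per the comment above
Wc = np.random.randn(n_in, n_hid)
b = np.zeros(n_in)
c = np.zeros(n_hid)
s = 1.0
x = np.random.randn(5, n_in)
y = x.copy()                        # reference x for the loss

r = dae.theano_encode_decode(Wb, Wc, b, c, s, x)       # reconstructions
loss = dae.theano_loss(Wb, Wc, b, c, s, x, y)          # element-wise squared error
grads = dae.theano_gradients(Wb, Wc, b, c, s, x, y)    # d(sum_loss)/d(Wb, Wc, b, c, s)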
Example #13
def test_gradient_batch_normalization_op():
    epsilon = 1e-8
    op = gn.GradientBatchNormalizationOp(subtract_mean=True,
                                         keep_mean=False,
                                         epsilon=epsilon)

    X = np.random.randn(3, 4).astype(fX)
    W = np.random.randn(2, 3).astype(fX)

    x = T.matrix("x")
    w = T.matrix("w")

    orig_grad = T.grad(w.dot(x).sum(), x).eval({x: X, w: W})
    new_grad = T.grad(w.dot(op(x)).sum(), x).eval({x: X, w: W})
    mu = orig_grad.mean(axis=0, keepdims=True)
    sigma = orig_grad.std(axis=0, keepdims=True) + epsilon
    ans = (orig_grad - mu) / sigma
    np.testing.assert_allclose(ans,
                               new_grad,
                               rtol=1e-5)
    np.testing.assert_allclose(np.zeros(4),
                               new_grad.mean(axis=0),
                               atol=1e-5)
    np.testing.assert_allclose(np.ones(4),
                               new_grad.std(axis=0),
                               rtol=1e-5)
Example #14
	def __init__(self, sizes, input_dim, output_dim):
		self.layers = len(sizes) + 1
				
		in_dim = [input_dim] + sizes
		out_dim = sizes + [output_dim]
		x = T.dvector('x')
		y = T.dvector('y')
		self.hyp_params = []
		for i, (r,c) in enumerate(zip(in_dim,out_dim)):
			if i == 0:
				obj = HiddenLayer(x, r, c)
			else:
				obj = HiddenLayer(obj.output,r,c)
			self.hyp_params.append(obj.params)

		

		yhat = obj.output

		prediction = T.argmax(yhat)
		self.predict = theano.function([x],[yhat])
		o_error = T.sum(T.sqr(yhat - y))
		# o_error = T.sum(T.nnet.categorical_crossentropy(yhat, y))
		updates = []
		learning_rate = T.scalar('learning_rate')
		for param in self.hyp_params:
			updates.append((param['W'], param['W'] - learning_rate * T.grad(o_error,param['W'])))
			updates.append((param['b'], param['b'] - learning_rate * T.grad(o_error,param['b'])))

		self.train_step = theano.function([x,y,learning_rate],[o_error],
						updates = updates)
Example #15
	def train(self, epochs = 1000, learning_rate = 0.1):
		regression = self.regression
		X = self.X
		Y = self.Y
		
		x = T.matrix('x')  # data, presented as rasterized images
		y = T.vector('y')  # labels, presented as 1D vector of [int] labels
		
		error = regression.error(x, y)
		g_W = T.grad(cost=error, wrt=regression.W)
		g_b = T.grad(cost=error, wrt=regression.b)
		
		# start-snippet-3
		# specify how to update the parameters of the model as a list of
		# (variable, update expression) pairs.
		updates = [(regression.W, regression.W - learning_rate * g_W),
					(regression.b, regression.b - learning_rate * g_b)]
		
		# compiling a Theano function `train_model` that returns the cost, but in
		# the same time updates the parameter of the model based on the rules
		# defined in `updates`
		train_model = tn.function(
			inputs=[],
			outputs=error,
			updates=updates,
			givens={
				x: X,
				y: Y
			}
		)
		
		print('training start:')
		start_time = timeit.default_timer()
		epoch = 0
		while(epoch < epochs):
			avg_error = train_model()
			print('epoch {0}, error {1}'.format(epoch, avg_error), end='\r')
			epoch += 1
		print('training finished (final error: {0}), took {1} seconds.'.format(regression.error(X, Y).eval(), timeit.default_timer() - start_time))
		
		
		# z = regression.compute(data_x).ravel()
		# e = regression.error(data_y, z)
		# l = regression.loss(e)
		# epoch = 0
		# while(epoch < epochs):
		# 	g = regression.grad(data_y, z)
		# 	d = regression.delta(g, data_x)
		# 	regression.W -= learning_rate * d[0]
		# 	regression.b -= learning_rate * d[1]
		# 	
		# 	z = regression.compute(data_x).ravel()
		# 	e = regression.error(data_y, z)
		# 	l = regression.loss(e)
		# 	# print(l.eval())
		# 	
		# 	epoch += 1
		# 	print('epoch:', epoch, end='\r')
		
		pass
Example #16
    def test_relu_grad(self):
        seed = utt.fetch_seed()
        rng = numpy.random.RandomState(seed)

        imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
        n, c = 4, 2

        axis = 1

        image = T.dtensor4('image')
        image1 = T.dtensor4('image1')
        for imgsize in imgsize_list:
            imval = rng.rand(n, c, imgsize[0], imgsize[1])

            out = T.concatenate([image, image1], axis)
            sum_ref = T.sum(out)
            gx_ref = T.grad(sum_ref, [image, image1])
            f_ref = theano.function([image, image1], outputs=gx_ref, mode=mode_without_mkl)
            output_ref = f_ref(imval, imval)

            out_mkl = self.mkl_concatenate_func(axis, image, image1)
            sum_mkl = T.sum(out_mkl)
            gx_mkl = T.grad(sum_mkl, [image, image1])
            f_mkl = theano.function([image, image1], outputs=gx_mkl)
            output_mkl = f_mkl(imval, imval)

            utt.assert_allclose(output_mkl, output_ref)
Example #17
    def test_reduce_custom_dtype(self):
        """
        Test the ability to provide your own output dtype for a reduce.
        """
        # We try multiple axis combinations even though axis should not matter.
        idx = 0
        for method in self.methods:
            for input_dtype in self.dtypes:
                x = tensor.matrix(dtype=input_dtype)
                for output_dtype in self.dtypes:
                # If the output is a complex, the gradient of the reduce will
                # cast the complex to the input dtype. We can't call the normal
                # cast on a complex to a not complex as this is ambiguous.
                    if (not input_dtype.startswith('complex') and
                        output_dtype.startswith('complex')):
                        continue

                    axis = self.axes[idx % len(self.axes)]
                    var = getattr(x, method)(dtype=output_dtype, axis=axis)
                    assert var.dtype == output_dtype

                    f = theano.function([x], var, mode=self.mode)
                    topo = f.maker.fgraph.toposort()
                    assert [n for n in topo if isinstance(n.op, self.op)], (topo,
                                                                            output_dtype)
                    data = numpy.random.rand(3, 4) * 10
                    data = data.astype(input_dtype)
                    f(data)
                    if "complex" in input_dtype:
                        continue
                    # Check that we can take the gradient
                    tensor.grad(var.sum(), x,
                                disconnected_inputs='ignore')
                    idx += 1
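A standalone sketch of the feature exercised above: requesting an explicit output dtype on a reduce and then taking its gradient (the dtypes and shapes are illustrative):

import numpy
import theano
import theano.tensor as tensor

x = tensor.matrix(dtype='float32')
s = x.sum(axis=0, dtype='float64')    # reduce with a custom output dtype
assert s.dtype == 'float64'
g = tensor.grad(s.sum(), x)           # gradient is cast back to the input dtype
f = theano.function([x], [s, g])
sv, gv = f(numpy.ones((3, 4), dtype='float32'))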
Example #18
    def _training_updates(self, **kwargs):
        """Computes the update expression for updating the model parameters
        during training.
        
        .. note::
        
          This method should only be called from the ``setup()`` class method.

        :type learning_rate: theano.config.floatX
        :param learning_rate: A coefficient by which the gradient is
                              scaled on one update step.
        
        :type cost: theano.tensor.TensorType
        :param cost: The cost expression.

        :returns: A list of ``(param, update_expr)`` tuples that can be
                  passed directly to ``theano.function`` as the ``updates``
                  field.
        """
        utils.check_kwargs(kwargs, ['learning_rate', 'cost'])

        learning_rate = kwargs['learning_rate']
        bound_cost = kwargs['cost']

        # Problem: need symbolic 'y' for self.negative_log_likelihood(y)
        # TODO: test behavior with dummy TT.ivector symbolic variable
        g_W = TT.grad(cost = bound_cost, wrt = self.W)
        g_b = TT.grad(cost = bound_cost, wrt = self.b)
        return [(self.W, self.W - learning_rate * g_W),
                (self.b, self.b - learning_rate * g_b)]
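The (param, update_expr) pairs returned above plug straight into theano.function; a self-contained SGD sketch of the same pattern (the toy linear model, shapes and learning rate are assumptions, not the class this method belongs to):

import numpy
import theano
import theano.tensor as TT

W = theano.shared(numpy.zeros((3, 2)), name='W')
b = theano.shared(numpy.zeros(2), name='b')
x = TT.matrix('x')
y = TT.matrix('y')

cost = TT.sqr(TT.dot(x, W) + b - y).mean()
learning_rate = 0.01
updates = [(W, W - learning_rate * TT.grad(cost, W)),
           (b, b - learning_rate * TT.grad(cost, b))]

train_model = theano.function([x, y], cost, updates=updates)
# each call train_model(x_batch, y_batch) now performs one gradient step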
Example #19
File: zae.py Project: memisevic/zae
    def __init__(self, numvis, numhid, vistype, init_features, selectionthreshold=1.0, weightcost=0.0):
        self.numvis = numvis
        self.numhid  = numhid
        self.vistype = vistype
        self.weightcost = weightcost
        self.selectionthreshold = theano.shared(value=selectionthreshold, name='selectionthreshold')
        self.W_init = init_features.astype(theano.config.floatX)
        self.W = theano.shared(value = self.W_init, name='W')
        self.bvis = theano.shared(value=numpy.zeros(numvis, dtype=theano.config.floatX), name='bvis')
        self.inputs = T.matrix(name = 'inputs') 
        self.params = [self.W, self.bvis]

        self._prehiddens = T.dot(self.inputs, self.W) 
        self._hiddens = (self._prehiddens > self.selectionthreshold) * self._prehiddens
        if self.vistype == 'binary':
            self._outputs = T.nnet.sigmoid(T.dot(self._hiddens, self.W.T) + self.bvis)
            costpercase = -T.sum(self.inputs*T.log(self._outputs) + (1-self.inputs)*T.log(1-self._outputs), axis=1) 
        elif self.vistype == 'real':
            self._outputs = T.dot(self._hiddens, self.W.T) + self.bvis 
            costpercase = T.sum(0.5 * ((self.inputs - self._outputs)**2), axis=1) 

        self._cost = T.mean(costpercase)
        self._cost += self.weightcost * T.sum(self.W**2)
        self._grads = T.grad(self._cost, self.params)

        self.cost = theano.function([self.inputs], self._cost)
        self.grad = theano.function([self.inputs], T.grad(self._cost, self.params))
        self.prehiddens = theano.function([self.inputs], self._prehiddens)
        self.hiddens = theano.function([self.inputs], self._hiddens)
        self.recons_from_prehiddens = theano.function([self._prehiddens], self._outputs)
        self.recons_from_inputs = theano.function([self.inputs], self._outputs)
Example #20
    def mcmc(ll, *frvs):
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, frvs)]))
        
        loglik = -full_log_likelihood(full_observations)

        proposals = free_RVs_prop
        H = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + loglik

# -- this should be an inner loop
        g = []
        g.append(tensor.grad(loglik, frvs))
        
        proposals = [(p - epsilon*gg[0]/2.) for p, gg in zip(proposals, g)]

        rvsp = [(rvs + epsilon*rvp) for rvs,rvp in zip(frvs, proposals)]
        
        full_observations = dict(observations)
        full_observations.update(dict([(rv, s) for rv, s in zip(free_RVs, rvsp)]))
        new_loglik = -full_log_likelihood(full_observations)
        
        gnew = []
        gnew.append(tensor.grad(new_loglik, rvsp))
        proposals = [(p - epsilon*gn[0]/2.) for p, gn in zip(proposals, gnew)]
# --
        
        Hnew = tensor.add(*[tensor.sum(tensor.sqr(p)) for p in proposals])/2. + new_loglik

        dH = Hnew - H
        accept = tensor.or_(dH < 0., U < tensor.exp(-dH))

        return [tensor.switch(accept, -new_loglik, ll)] + \
            [tensor.switch(accept, p, f) for p, f in zip(rvsp, frvs)], \
            {}, theano.scan_module.until(accept)
Example #21
    def test_conv_no_bias(self):
        images = T.dtensor4('input_conv')
        weights = T.dtensor4('weights')

        images_internal = U2IConv(imshp=(12, 3, 256, 256), kshp=(12, 3, 3, 3))(images)

        convOut = Conv2D(imshp=(12, 3, 256, 256), kshp=(12, 3, 3, 3), filter_flip=False)(images_internal, weights)
        convOut_user = I2U()(convOut)
        convOutLoss = T.mean(convOut_user)
        conv_op_di = T.grad(convOutLoss, images)
        conv_op_dk = T.grad(convOutLoss, weights)
        convOutBack = [conv_op_di, conv_op_dk]

        ival = numpy.random.rand(12, 3, 256, 256).astype(numpy.float64)
        wval = numpy.random.rand(12, 3, 3, 3).astype(numpy.float64)

        fopt = theano.function(inputs=[images, weights], outputs=convOutBack, mode=mode_with_mkl)
        new_out = fopt(ival, wval)

        convOut = conv2d(images, weights, input_shape=(12, 3, 256, 256), filter_shape=(12, 3, 3, 3), filter_flip=False)
        convOutLoss = T.mean(convOut)
        conv_op_di = T.grad(convOutLoss, images)
        conv_op_dk = T.grad(convOutLoss, weights)
        convOutBack = [conv_op_di, conv_op_dk]

        fori = theano.function(inputs=[images, weights], outputs=convOutBack, mode=mode_without_mkl)
        old_out = fori(ival, wval)

        assert len(fopt.maker.fgraph.toposort()) != len(fori.maker.fgraph.toposort())
        assert numpy.allclose(old_out[0], new_out[0])
        assert new_out[0].dtype == 'float64'
Example #22
File: test_rop.py Project: onze/Theano
    def check_mat_rop_lop(self, y, out_shape):
        vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
        vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
        yv = tensor.Rop(y, self.mx, self.mv)
        rop_f = function([self.mx, self.mv], yv)
        sy, _ = theano.scan( lambda i,y,x,v: (tensor.grad(y[i],x)*v).sum(),
                           sequences = tensor.arange(y.shape[0]),
                           non_sequences = [y,self.mx,self.mv])
        scan_f = function([self.mx,self.mv], sy)


        v1 = rop_f(vx,vv)
        v2 = scan_f(vx,vv)

        assert numpy.allclose(v1,v2), ('ROP mismatch: %s %s' % (v1, v2))

        self.check_nondiff_rop( theano.clone(y,
                                             replace={self.mx:break_op(self.mx)}))

        vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
        yv = tensor.Lop(y, self.mx, self.v)
        lop_f = function([self.mx, self.v], yv)

        sy = tensor.grad((self.v*y).sum(), self.mx)
        scan_f = function([self.mx, self.v], sy)


        v1 = lop_f(vx,vv)
        v2 = scan_f(vx,vv)
        assert numpy.allclose(v1,v2), ('LOP mismatch: %s %s' % (v1, v2))
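The test above checks tensor.Lop against an explicit tensor.grad reference; the underlying identity, Lop(y, x, v) == grad((v * y).sum(), x) for v independent of x, in a minimal form (names are illustrative):

import numpy
import theano
from theano import tensor

x = tensor.vector('x')
v = tensor.vector('v')
y = tensor.tanh(x)

lop = tensor.Lop(y, x, v)              # v^T times the Jacobian dy/dx
ref = tensor.grad((v * y).sum(), x)    # the same quantity via an ordinary gradient
f = theano.function([x, v], [lop, ref])

xv = numpy.random.rand(5).astype(theano.config.floatX)
vv = numpy.random.rand(5).astype(theano.config.floatX)
assert numpy.allclose(*f(xv, vv))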
Example #23
def get_net(net_cfg, args={"lambda":0.5}):
    l_out = net_cfg(args)

    X = T.tensor4('X')
    X_noise = X + srng.normal(X.shape, std=1.)
    b_prime = theano.shared( np.zeros( (1, 28, 28) ) )
    net_out = get_output(l_out, X)
    net_out_noise = get_output(l_out, X_noise)
    energy = args["lambda"]*((X-b_prime)**2).sum() - net_out.sum()
    energy_noise = args["lambda"]*((X_noise-b_prime)**2).sum() - net_out_noise.sum()
    # reconstruction
    fx = X - T.grad(energy, X)
    fx_noise = X_noise - T.grad(energy_noise, X_noise)
    loss = ((X-fx_noise)**2).sum(axis=[1,2,3]).mean()

    
    
    params = get_all_params(l_out, trainable=True)
    params += [b_prime]
    lr = theano.shared(floatX(args["learning_rate"]))
    #updates = nesterov_momentum(loss, params, learning_rate=lr, momentum=0.9)
    updates = adadelta(loss, params, learning_rate=lr)
    #updates = rmsprop(loss, params, learning_rate=lr)
    train_fn = theano.function([X], [loss,energy], updates=updates)
    energy_fn = theano.function([X], energy)
    out_fn = theano.function([X], fx)
    
    return {
        "train_fn": train_fn,
        "energy_fn": energy_fn,
        "out_fn": out_fn,
        "lr": lr,
        "b_prime": b_prime,
        "l_out": l_out
    }
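The reconstruction fx above is just one gradient step on the energy with respect to the input; the same trick in isolation, with a toy quadratic energy (names and shapes are illustrative):

import numpy as np
import theano
import theano.tensor as T

X = T.matrix('X')
b_prime = theano.shared(np.zeros(5, dtype=theano.config.floatX), name='b_prime')

energy = 0.5 * ((X - b_prime) ** 2).sum()   # toy energy function
fx = X - T.grad(energy, X)                  # one gradient step on the input
recon = theano.function([X], fx)

print(recon(np.ones((2, 5), dtype=theano.config.floatX)))   # pulled onto b_prime (all zeros here)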
Example #24
                def calculate_Rl(v_input):
                    # Sample a h_sample according to one v_input
                    _, hl_mean, hl_sample = self.sample_h_given_v(v_input)
                    # Calculate the probability of visible output according to h_sample
                    _, vn_mean = self.propdown(hl_sample)
                    # - Part1.
                    #   Desc: Multiply each element in grad with T.log(vn_mean).sum()
                    #   Hint: [array(...), array(...), array(...)] = T.grad(..., self.params)
                    #         The number of elements in gradient is the number of params which are partial derivation.

                    # part1 = map(lambda x: x * T.log(vn_mean).sum(),
                    #             T.grad(T.log(hl_mean).sum(),
                    #                    self.params,
                    #                    disconnected_inputs='warn'))
                    part1 = [x * T.log(vn_mean).sum() for x in T.grad(
                        T.log(hl_mean).sum(),
                        self.params,
                        disconnected_inputs='warn')]

                    # - Part2.
                    part2 = T.grad((T.log(self.propdown(hl_sample)[1]).sum()),
                                    self.params,
                                    consider_constant=[hl_sample],
                                    disconnected_inputs='warn')
                    # Rl is the result that add corresponding elements in two gradient.
                    # Rl = log(p(v^n|h^l;\theta)) * grad(log(p(h^l|v^n;\theta))) + grad(log(p(v^n|h^l;\theta)))
                    # Rl = map(lambda p1, p2: p1 + p2, part1, part2)
                    Rl = [x + y for x, y in zip(part1, part2)]

                    mi_cost_xi = T.log(vn_mean).sum()

                    Rl.append(mi_cost_xi)
                    return Rl
Example #25
def test_downsample():
    shps = [
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
    ]

    numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)

    for shp in shps:
        for ds in (2, 2), (3, 2), (1, 1):
            if ds[0] > shp[2]:
                continue
            if ds[1] > shp[3]:
                continue
            # GpuDownsampleFactorMax doesn't like having more than 512 columns
            # in the output tensor.
            if float(shp[3]) / ds[1] > 512:
                continue
            for ignore_border in (True, False):
                print "test_downsample", shp, ds, ignore_border
                ds_op = DownsampleFactorMax(ds, ignore_border=ignore_border)

                a = tcn.shared_constructor(my_rand(*shp), "a")
                f = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_with_gpu)
                f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)), mode=mode_without_gpu)
                assert any([isinstance(node.op, tcn.blas.GpuDownsampleFactorMax) for node in f.maker.env.toposort()])
                assert any([isinstance(node.op, DownsampleFactorMax) for node in f2.maker.env.toposort()])
                assert numpy.allclose(f(), f2())

                g = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_with_gpu)
                g2 = pfunc([], tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), a), mode=mode_without_gpu)
                assert any(
                    [isinstance(node.op, tcn.blas.GpuDownsampleFactorMaxGrad) for node in g.maker.env.toposort()]
                )
                assert any([isinstance(node.op, DownsampleFactorMaxGrad) for node in g2.maker.env.toposort()])
                assert numpy.allclose(g(), g2())
Example #26
File: utils.py Project: samim23/dagbldr
def get_params_and_grads(graph, cost, verbose=False):
    params = []
    for k, p in graph.items():
        if k == DATASETS_ID:
            # skip datasets
            continue
        if k == RANDOM_ID:
            # skip random
            continue
        params.append(p)

    if verbose:
        grads = []
        for k, p in graph.items():
            if k == DATASETS_ID:
                # skip datasets
                continue
            if k == RANDOM_ID:
                # skip random
                continue
            print("Computing grad w.r.t %s" % k)
            grad = tensor.grad(cost, p)
            grads.append(grad)
    else:
        grads = tensor.grad(cost, params)
    return params, grads
Example #27
 def fit(self,data_x,data_y):
     print "Training"
     start = time.clock()
     n_batches = data_x.get_value(borrow=True).shape[0]/self.batch_size
     tensor_x = T.matrix('x')
     tensor_y = T.ivector('y')
     index = T.lscalar('index')
     self.single_layer = Layer(self.n_in,self.n_out,T.nnet.softmax)
     cost = self.single_layer.negative_log_likelihood(tensor_x, tensor_y)
     g_W = T.grad(cost,self.single_layer.W)
     g_b = T.grad(cost,self.single_layer.b)
     updates = [(self.single_layer.W,self.single_layer.W - g_W*self.learning_rate),
                 (self.single_layer.b,self.single_layer.b - g_b*self.learning_rate)]
     train_batch = theano.function([index],[cost],
                                   updates=updates,
                                   givens={tensor_x : data_x[index*self.batch_size : (index + 1)*self.batch_size],
                                           tensor_y : data_y[index*self.batch_size : (index + 1)*self.batch_size]})
     train_batch_costs = [0 for i in xrange(n_batches)]
     for iter in xrange(self.iters):
         for minibatch_index in xrange(n_batches):
             train_batch_costs[minibatch_index] = train_batch(minibatch_index)
         if self.verbose==1: print "Iter %d --> %f" % (iter,np.mean(train_batch_costs))
     end = time.clock()
     print "Finished Training Logistic Regression Model\n" \
           "Iterations %d\n" \
           "Time Taken : %d secs" % (self.iters,end - start)
Example #28
File: rnn.py Project: zerkh/theano-fun
 def __build_theano__(self):
     x = ivector(name="x")
     y = ivector(name="y")
     U, V, W = self.U, self.V, self.W
     
     def forword_prop_step(x_t, s_t_prev, U, V, W):
         s_t = T.tanh(U[:,x_t] + V.dot(s_t_prev))
         o_t = T.nnet.softmax(W.dot(s_t))
         return [o_t[0], s_t]
     
     [o,s], updates = theano.scan(forword_prop_step, sequences=x, 
                                  outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))], 
                                  non_sequences=[U,V,W], truncate_gradient=4, strict=True)
     
     prediction = T.argmax(o, axis=1)
     o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
     
     dU = T.grad(o_error, U)
     dV = T.grad(o_error, V)
     dW = T.grad(o_error, W)
     
     self.forward = theano.function([x], o)
     self.predict = theano.function([x], prediction)
     self.c_error = theano.function([x, y], o_error)
     self.bptt = theano.function([x, y], [dU, dV, dW])
     
     learning_rate = scalar(name="learning_rate")
     self.sgd_step = theano.function([x, y, learning_rate], [], 
                                     updates=[(self.U, self.U-learning_rate*dU),
                                              (self.V, self.V-learning_rate*dV),
                                              (self.W, self.W-learning_rate*dW)])
Example #29
def get_mean_square_norm_gradients_variance_method_00(D_by_layer, cost, accum = 0):

    # This returns a theano variable that will be of shape (minibatch_size, ).
    # It will contain, for each training example, the associated mean of the
    # variance wrt the gradient of that minibatch.

    for (layer_name, D) in D_by_layer.items():

        input = D['input']
        input_square_norms = tensor.sqr(D['input']).sum(axis=1)
        backprop_output = tensor.grad(cost, D['output'])
        # I don't think that theano recomputes this.
        # It should be just redundant nodes in the computational graph
        # that end up being computed only once anyways.
        grad_weight = tensor.grad(cost, D['weight'])
        grad_bias = tensor.grad(cost, D['bias'])
        backprop_output_square_norms = tensor.sqr(backprop_output).sum(axis=1)

        if D.has_key('weight'):
            A = input_square_norms * backprop_output_square_norms
            C = tensor.sqr(grad_weight).sum() # all the terms get this "middle" expression added to them
            B = (backprop_output.dot(grad_weight.T) * input).sum(axis=1)

            accum += (A - 2*B + C)

        if D.has_key('bias'):
            # this last `sum` could be a component-wise `max` if we wanted
            # to carry the maximum of the variances instead of the sum of squares
            accum = accum + tensor.sqr(backprop_output - grad_bias.reshape((1,-1))).sum(axis=1)


    return accum
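A hedged usage sketch for the helper above, assuming a single dense layer whose input, output, weight and bias are exposed through D_by_layer; all names and shapes below are illustrative.

import numpy as np
import theano
from theano import tensor

floatX = theano.config.floatX
X = tensor.matrix('X')                                         # (minibatch_size, n_in)
W = theano.shared(np.random.randn(5, 3).astype(floatX), 'W')
b = theano.shared(np.zeros(3, dtype=floatX), 'b')
output = tensor.dot(X, W) + b
cost = tensor.sqr(output).mean()

D_by_layer = {'dense': {'input': X, 'output': output, 'weight': W, 'bias': b}}
sq_norms = get_mean_square_norm_gradients_variance_method_00(D_by_layer, cost)
f = theano.function([X], sq_norms)
print(f(np.random.randn(4, 5).astype(floatX)).shape)          # -> (4,)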
Example #30
 def __theano_build__(self):
   U, V, W = self.U, self.V, self.W
   x = T.ivector('x')
   y = T.ivector('y')
   def forward_prop_step(x_t, s_t_prev, U, V, W):
     s_t = T.tanh(U[:,x_t] + W.dot(s_t_prev))
     o_t = T.nnet.softmax(V.dot(s_t))
     return [o_t[0], s_t]
   [o,s], updates = theano.scan(
     forward_prop_step,
     sequences=x,
     outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))],
     non_sequences=[U, V, W],
     truncate_gradient=self.bptt_truncate,
     strict=True)
   prediction = T.argmax(o, axis=1)
   o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
   # Gradients
   dU = T.grad(o_error, U)
   dV = T.grad(o_error, V)
   dW = T.grad(o_error, W)      
   # Assign functions
   self.forward_propagation = theano.function([x], o)
   self.predict = theano.function([x], prediction)
   self.ce_error = theano.function([x, y], o_error)
   self.bptt = theano.function([x, y], [dU, dV, dW])
   # SGD
   learning_rate = T.scalar('learning_rate')
   self.sgd_step = theano.function([x,y,learning_rate], [], 
                   updates=[(self.U, self.U - learning_rate * dU),
                            (self.V, self.V - learning_rate * dV),
                            (self.W, self.W - learning_rate * dW)])
Example #31
    def fit(self,
            X,
            Y,
            learning_rate=1e-4,
            mu=0.9,
            decay=0.9,
            epochs=8,
            batch_sz=100,
            show_fig=False):
        # make a validation set
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid, Yvalid = X[-1000:], Y[-1000:]
        X, Y = X[:-1000], Y[:-1000]

        self.rng = RandomStreams()

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = np.random.randn(M1, K) / np.sqrt(M1)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)

        # this cost is for training
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

        # gradients wrt each param
        grads = T.grad(cost, self.params)

        # for momentum
        dparams = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]

        # for rmsprop
        cache = [
            theano.shared(np.ones_like(p.get_value())) for p in self.params
        ]

        new_cache = [
            decay * c + (1 - decay) * g * g
            for p, c, g in zip(self.params, cache, grads)
        ]
        new_dparams = [
            mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10)
            for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads)
        ]
        updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [
            (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams)
        ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)]

        # momentum only
        # updates = [
        #     (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
        # ] + [
        #     (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, dparams)
        # ]

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N / batch_sz
        costs = []
        for i in xrange(epochs):
            X, Y = shuffle(X, Y)
            for j in xrange(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print "i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #32
    input3_var: x3
},
                                      deterministic=False)

#Evaluate
eval_out = lasagne.layers.get_output(output, {
    input_var: x1,
    input2_var: x2,
    input3_var: x3
},
                                     deterministic=True)

all_params = lasagne.layers.get_all_params(output, trainable=True)
cost = T.nnet.binary_crossentropy(train_out, target_var).mean()

all_grads = T.grad(cost, all_params)

# Set the update function for parameters
# you might want to experiment with more advanced update schemes like rmsprop, adadelta, etc.
updates = lasagne.updates.nesterov_momentum(all_grads,
                                            all_params,
                                            learning_rate=0.01,
                                            momentum=0.75)

f_eval = theano.function([input_var, input2_var, input3_var], eval_out)

f_train = theano.function([input_var, input2_var, input3_var, target_var],
                          [cost],
                          updates=updates)

from confusionmatrix import ConfusionMatrix
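As the comment above suggests, swapping in a different Lasagne update rule is a one-line change; for instance (a sketch, learning-rate values are illustrative):

# updates = lasagne.updates.adadelta(all_grads, all_params, learning_rate=1.0)
# updates = lasagne.updates.rmsprop(all_grads, all_params, learning_rate=0.001)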
Example #33
def test_scan_debugprint5():

    k = tensor.iscalar("k")
    A = tensor.dvector("A")

    # Symbolic description of the result
    result, updates = theano.scan(fn=lambda prior_result, A: prior_result * A,
                                  outputs_info=tensor.ones_like(A),
                                  non_sequences=A,
                                  n_steps=k)

    final_result = tensor.grad(result[-1].sum(), A)

    output_str = theano.printing.debugprint(final_result, file='str')
    lines = output_str.split('\n')

    expected_output = """Subtensor{int64} [id A] ''
    |for{cpu,grad_of_scan_fn}.1 [id B] ''
    | |Elemwise{sub,no_inplace} [id C] ''
    | | |Subtensor{int64} [id D] ''
    | | | |Shape [id E] ''
    | | | | |for{cpu,scan_fn} [id F] ''
    | | | |   |k [id G]
    | | | |   |IncSubtensor{Set;:int64:} [id H] ''
    | | | |   | |AllocEmpty{dtype='float64'} [id I] ''
    | | | |   | | |Elemwise{add,no_inplace} [id J] ''
    | | | |   | | | |k [id G]
    | | | |   | | | |Subtensor{int64} [id K] ''
    | | | |   | | |   |Shape [id L] ''
    | | | |   | | |   | |Rebroadcast{0} [id M] ''
    | | | |   | | |   |   |InplaceDimShuffle{x,0} [id N] ''
    | | | |   | | |   |     |Elemwise{second,no_inplace} [id O] ''
    | | | |   | | |   |       |A [id P]
    | | | |   | | |   |       |InplaceDimShuffle{x} [id Q] ''
    | | | |   | | |   |         |TensorConstant{1.0} [id R]
    | | | |   | | |   |Constant{0} [id S]
    | | | |   | | |Subtensor{int64} [id T] ''
    | | | |   | |   |Shape [id U] ''
    | | | |   | |   | |Rebroadcast{0} [id M] ''
    | | | |   | |   |Constant{1} [id V]
    | | | |   | |Rebroadcast{0} [id M] ''
    | | | |   | |ScalarFromTensor [id W] ''
    | | | |   |   |Subtensor{int64} [id K] ''
    | | | |   |A [id P]
    | | | |Constant{0} [id X]
    | | |TensorConstant{1} [id Y]
    | |Subtensor{:int64:} [id Z] ''
    | | |Subtensor{::int64} [id BA] ''
    | | | |Subtensor{:int64:} [id BB] ''
    | | | | |for{cpu,scan_fn} [id F] ''
    | | | | |Constant{-1} [id BC]
    | | | |Constant{-1} [id BD]
    | | |ScalarFromTensor [id BE] ''
    | |   |Elemwise{sub,no_inplace} [id C] ''
    | |Subtensor{:int64:} [id BF] ''
    | | |Subtensor{:int64:} [id BG] ''
    | | | |Subtensor{::int64} [id BH] ''
    | | | | |for{cpu,scan_fn} [id F] ''
    | | | | |Constant{-1} [id BI]
    | | | |Constant{-1} [id BJ]
    | | |ScalarFromTensor [id BK] ''
    | |   |Elemwise{sub,no_inplace} [id C] ''
    | |Subtensor{::int64} [id BL] ''
    | | |IncSubtensor{Inc;int64::} [id BM] ''
    | | | |Elemwise{second,no_inplace} [id BN] ''
    | | | | |for{cpu,scan_fn} [id F] ''
    | | | | |InplaceDimShuffle{x,x} [id BO] ''
    | | | |   |TensorConstant{0.0} [id BP]
    | | | |IncSubtensor{Inc;int64} [id BQ] ''
    | | | | |Elemwise{second,no_inplace} [id BR] ''
    | | | | | |Subtensor{int64::} [id BS] ''
    | | | | | | |for{cpu,scan_fn} [id F] ''
    | | | | | | |Constant{1} [id BT]
    | | | | | |InplaceDimShuffle{x,x} [id BU] ''
    | | | | |   |TensorConstant{0.0} [id BP]
    | | | | |Elemwise{second} [id BV] ''
    | | | | | |Subtensor{int64} [id BW] ''
    | | | | | | |Subtensor{int64::} [id BS] ''
    | | | | | | |Constant{-1} [id BX]
    | | | | | |InplaceDimShuffle{x} [id BY] ''
    | | | | |   |Elemwise{second,no_inplace} [id BZ] ''
    | | | | |     |Sum{acc_dtype=float64} [id CA] ''
    | | | | |     | |Subtensor{int64} [id BW] ''
    | | | | |     |TensorConstant{1.0} [id R]
    | | | | |Constant{-1} [id BX]
    | | | |Constant{1} [id BT]
    | | |Constant{-1} [id CB]
    | |Alloc [id CC] ''
    | | |TensorConstant{0.0} [id BP]
    | | |Elemwise{add,no_inplace} [id CD] ''
    | | | |Elemwise{sub,no_inplace} [id C] ''
    | | | |TensorConstant{1} [id Y]
    | | |Subtensor{int64} [id CE] ''
    | |   |Shape [id CF] ''
    | |   | |A [id P]
    | |   |Constant{0} [id CG]
    | |A [id P]
    |Constant{-1} [id CH]

    Inner graphs of the scan ops:

    for{cpu,grad_of_scan_fn}.1 [id B] ''
    >Elemwise{add,no_inplace} [id CI] ''
    > |Elemwise{mul} [id CJ] ''
    > | |<TensorType(float64, vector)> [id CK] -> [id BL]
    > | |A_copy [id CL] -> [id P]
    > |<TensorType(float64, vector)> [id CM] -> [id BL]
    >Elemwise{add,no_inplace} [id CN] ''
    > |Elemwise{mul} [id CO] ''
    > | |<TensorType(float64, vector)> [id CK] -> [id BL]
    > | |<TensorType(float64, vector)> [id CP] -> [id Z]
    > |<TensorType(float64, vector)> [id CQ] -> [id CC]

    for{cpu,scan_fn} [id F] ''
    >Elemwise{mul,no_inplace} [id CR] ''
    > |<TensorType(float64, vector)> [id CP] -> [id H]
    > |A_copy [id CL] -> [id P]

    for{cpu,scan_fn} [id F] ''
    >Elemwise{mul,no_inplace} [id CR] ''

    for{cpu,scan_fn} [id F] ''
    >Elemwise{mul,no_inplace} [id CR] ''

    for{cpu,scan_fn} [id F] ''
    >Elemwise{mul,no_inplace} [id CR] ''

    for{cpu,scan_fn} [id F] ''
    >Elemwise{mul,no_inplace} [id CR] ''"""

    for truth, out in zip(expected_output.split("\n"), lines):
        assert truth.strip() == out.strip()
Example #34
    def __theano_build__(self):

        E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

        max_x = T.iscalar('max_x')
        x = tlist.TypedListType(T.ivector)()
        l = tlist.length(x)

        def batch_padding(index, x_t, max_x):

            #f = func([wl_t], word_length, updates = {(num_zeros, 10-word_length[0])})
            #f(wl_t)

            shape_ex = T.shape(x_t[index])
            zero_vec = T.arange(max_x - shape_ex[0], dtype='int64')

            padded_x_t = T.concatenate(
                [x_t[index], T.zeros_like(zero_vec)], axis=0)
            return padded_x_t

        x_padded, updates = theano.scan(fn=batch_padding,
                                        outputs_info=None,
                                        non_sequences=[x, max_x],
                                        sequences=[T.arange(l, dtype='int64')])

        #x = T.imatrix('x')
        # y is still needed below for the cost and the compiled functions
        y = T.imatrix('y')

        def forward_prop_step(x_t_padded, s_t1_prev, s_t2_prev):
            # This is how we calculated the hidden state in a simple RNN. No longer!
            # s_t = T.tanh(U[:,x_t] + W.dot(s_t1_prev))

            # Word embedding layer
            x_e = E[:, x_t_padded]

            # GRU Layer 1
            z_t1 = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_t1_prev) +
                                       b[0])
            r_t1 = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_t1_prev) +
                                       b[1])
            c_t1 = T.tanh(U[2].dot(x_e) + W[2].dot(s_t1_prev * r_t1) + b[2])
            s_t1 = (T.ones_like(z_t1) - z_t1) * c_t1 + z_t1 * s_t1_prev

            # GRU Layer 2
            z_t2 = T.nnet.hard_sigmoid(U[3].dot(s_t1) + W[3].dot(s_t2_prev) +
                                       b[3])
            r_t2 = T.nnet.hard_sigmoid(U[4].dot(s_t1) + W[4].dot(s_t2_prev) +
                                       b[4])
            c_t2 = T.tanh(U[5].dot(s_t1) + W[5].dot(s_t2_prev * r_t2) + b[5])
            s_t2 = (T.ones_like(z_t2) - z_t2) * c_t2 + z_t2 * s_t2_prev

            # Final output calculation
            # Theano's softmax returns a matrix with one row, we only need the row
            o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]

            return [o_t, s_t1, s_t2]

        [o, s,
         s2], updates = theano.scan(forward_prop_step,
                                    sequences=x_padded,
                                    truncate_gradient=self.bptt_truncate,
                                    outputs_info=[
                                        None,
                                        dict(initial=T.zeros(self.hidden_dim)),
                                        dict(initial=T.zeros(self.hidden_dim))
                                    ])

        prediction = T.argmax(o, axis=1)
        o_error = T.sum(T.nnet.categorical_crossentropy(o + 1e-6, y))

        # Total cost (could add regularization here)
        cost = o_error

        # Gradients
        dE = T.grad(cost, E)
        dU = T.grad(cost, U)
        dW = T.grad(cost, W)
        db = T.grad(cost, b)
        dV = T.grad(cost, V)
        dc = T.grad(cost, c)

        # Assign functions.  max_x is a symbolic input of the padding scan
        # above, so it has to be supplied to every compiled function as well.
        self.predict = theano.function([x, max_x], o)
        self.predict_class = theano.function([x, max_x], prediction)
        self.ce_error = theano.function([x, max_x, y], cost)
        self.bptt = theano.function([x, max_x, y], [dE, dU, dW, db, dV, dc])

        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')

        # rmsprop cache updates
        mE = decay * self.mE + (1 - decay) * dE**2
        mU = decay * self.mU + (1 - decay) * dU**2
        mW = decay * self.mW + (1 - decay) * dW**2
        mV = decay * self.mV + (1 - decay) * dV**2
        mb = decay * self.mb + (1 - decay) * db**2
        mc = decay * self.mc + (1 - decay) * dc**2

        self.sgd_step = theano.function(
            [x, max_x, y, learning_rate,
             theano.Param(decay, default=0.9)], [],
            updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                     (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                     (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                     (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                     (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                     (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                     (self.mE, mE), (self.mU, mU), (self.mW, mW),
                     (self.mV, mV), (self.mb, mb), (self.mc, mc)])
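
# Why forward_prop_step above indexes T.nnet.softmax(...)[0]: Theano's softmax
# always returns a 2-D result, even for a 1-D input.  A standalone sketch:
import numpy as np
import theano
import theano.tensor as T

v = T.vector('v')
softmax_fn = theano.function([v], T.nnet.softmax(v))

out = softmax_fn(np.asarray([1., 2., 3.], dtype=theano.config.floatX))
print(out.shape)     # (1, 3) -- a one-row matrix, hence the [0] above
print(out[0].sum())  # sums to 1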
예제 #35
0
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    # save the best model
                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code ran for %d epochs, at %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' %
         ((end_time - start_time))),
        file=sys.stderr)
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    datasets = load_data(
        dataset
    )  # return value datasets is a three-element list, with each element a two-element tuple

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    # asarray.shape returns the size of the asarray as a tuple
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    #Build Model
    print('...building the model')

    index = T.lscalar()
    # variable declaration, roughly equivalent to an "int64 index" in C++:
    # a 0-dimensional (ndim=0) variable with no name

    x = T.matrix('x')  # 'x' is the name of the matrix variable x
    y = T.ivector('y')

    classifier = LogisticRegression(
        input=x, n_in=28 * 28,
        n_out=10)  # instantiate a LogisticRegression object called classifier

    cost = classifier.negative_log_likelihood(y)

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #Train the model
    print('... training the model')
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        # for each epoch, all minibatches are used for training
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]

                    test_score = numpy.mean(test_losses)

                    print(('	epoch %i, minibatch %i/%i, test error of'
                           ' best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience < iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code ran for %d epochs, at %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' %
         ((end_time - start_time))),
        file=sys.stderr)
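
# The training loops above pickle the best classifier to best_model.pkl.  A
# minimal reload-and-predict sketch; it assumes (not shown in this snippet)
# that LogisticRegression keeps its symbolic input and y_pred as attributes,
# and it reuses this module's pickle, theano and load_data.
def predict(dataset='mnist.pkl.gz'):
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)

    # compile a predictor from the stored symbolic graph
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)

    test_set_x, test_set_y = load_data(dataset)[2]
    predicted_values = predict_model(test_set_x.get_value()[:10])
    print('Predicted labels for the first 10 test examples:')
    print(predicted_values)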
예제 #37
0
                         shape=(None, None))
print(patch_op.shape[0])
ffn = get_model(inp, patch_op)

output = LL.get_output(ffn)
pred = LL.get_output(ffn, deterministic=True)

target = T.ivector('idxs')
cla = utils_lasagne.categorical_crossentropy_logdomain(output, target,
                                                       nclasses).mean()
acc = LO.categorical_accuracy(pred, target).mean()
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)

cost = cla + l2_weight * regL2
params = LL.get_all_params(ffn, trainable=True)
grads = T.grad(cost, params)
grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2)
updates = L.updates.adam(grads, params, learning_rate=0.001)
funcs = dict()

funcs['train'] = theano.function(
    [inp.input_var, patch_op.input_var, target],
    [cost, cla, l2_weight * regL2, grads_norm, acc],
    updates=updates,
    on_unused_input='warn')
funcs['acc_loss'] = theano.function(
    [inp.input_var, patch_op.input_var, target], [acc, cost],
    on_unused_input='warn')
funcs['predict'] = theano.function([inp.input_var, patch_op.input_var], [pred],
                                   on_unused_input='warn')
예제 #38
0
    def __init__(self,
                 d,
                 V,
                 r,
                 nc,
                 nf,
                 pairwise_constraint=False,
                 embeddings=None,
                 fix_embeddings=False):
        #d = dimensionality of embeddings
        #V = size of vocabulary
        #r = number of dependency relations
        #nc = number of classes for classification

        #|V| x d embedding matrix
        if embeddings is None:
            self.We = theano.shared(
                name='embeddings',
                value=0.2 * np.random.uniform(-1.0, 1.0, (V, d))).astype(
                    theano.config.floatX)
        else:
            self.We = theano.shared(name='embeddings',
                                    value=embeddings).astype(
                                        theano.config.floatX)

        #r x d x d tensor (matrix for each dependency relation)
        self.Wr = theano.shared(
            name='dependencies',
            value=0.2 * np.random.uniform(-1.0, 1.0, (r, d, d))).astype(
                theano.config.floatX)

        #d x d map from embedding to hidden vector
        self.Wv = theano.shared(
            name='Wv',
            value=0.2 * np.random.uniform(-1.0, 1.0,
                                          (d, d))).astype(theano.config.floatX)

        #d long bias vector
        self.b = theano.shared(name='b',
                               value=np.zeros(d, dtype=theano.config.floatX))

        #weights for fine grained features plus bias
        #self.beta = theano.shared(name='beta',
        #                          value=0.2 * np.random.uniform(-1.0, 1.0, (nc, nf))
        #                          ).astype(theano.config.floatX)

        #low dimension approximation to classification parameters
        self.a = []
        for i in range(nc):
            a = []
            for j in range(3):
                a.append(
                    theano.shared(name='a_{}_{}'.format(i, j),
                                  value=0.2 *
                                  np.random.uniform(-1.0, 1.0, d)).astype(
                                      theano.config.floatX))
                #value=np.zeros(d, dtype=theano.config.floatX)))
            self.a.append(a)

        self.pairwise_constraint = pairwise_constraint

        if fix_embeddings:
            self.params = [self.Wr, self.Wv, self.b
                           ] + [j for i in self.a for j in i]  # + [self.beta]
        else:
            self.params = [self.We, self.Wr, self.Wv, self.b
                           ] + [j for i in self.a for j in i]  # + [self.beta]

        self.descender = Adagrad(self.params)

        #self.f = T.tanh
        self.f = normalized_tanh

        def recurrence(n, hidden_states, hidden_sums, x, r, p):
            #at each node n in the tree, calculate Wr(p,n) \dot f(W_v \dot We_word(n) + b + sum_n) and add to sum_p
            h_n = self.f(T.dot(self.Wv, x[n]) + self.b + hidden_sums[n])
            sum_n = T.dot(r[n], h_n)

            return T.set_subtensor(hidden_states[n], h_n), T.inc_subtensor(
                hidden_sums[p[n]], sum_n)

        idxs = []
        x = []
        rel_idxs = []
        r = []
        p = []
        hidden_sums = []
        hidden_states = []
        h = []
        s = []
        if pairwise_constraint:
            num_events = 4
        else:
            num_events = 2

        for i in range(num_events):
            idxs.append(T.ivector('idxs'))
            x.append(self.We[idxs[i]])

            rel_idxs.append(T.ivector('rel_idxs'))
            r.append(self.Wr[rel_idxs[i]])

            p.append(T.ivector('parents'))

            hidden_states.append(
                T.zeros((idxs[i].shape[0], d), dtype=theano.config.floatX))
            #needs to be sent_length + 1 to store final sum
            hidden_sums.append(
                T.zeros((idxs[i].shape[0] + 1, d), dtype=theano.config.floatX))

            h.append(None)
            s.append(None)
            [h[i], s[i]], updates = theano.scan(
                fn=recurrence,
                sequences=T.arange(x[i].shape[0]),
                outputs_info=[hidden_states[i], hidden_sums[i]],
                non_sequences=[x[i], r[i], p[i]])

        #A = T.dot(self.a_1, self.a_2.reshape((1, d))) + T.nlinalg.diag(self.a_3)
        #cost = T.dot(T.dot(h[0][-1, -1], A), h[1][-1, -1])
        #cost = T.dot(h[0][-1, -1], h[1][-1, -1])
        #grad = T.grad(cost, self.params)
        #self.cost_and_grad = theano.function(inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
        #                                     outputs=[cost] + grad)

        A_stack = []
        for i in range(len(self.a)):
            A_stack.append(
                T.dot(self.a[i][0].reshape((d, 1)), self.a[i][1].reshape(
                    (1, d))) + T.nlinalg.diag(self.a[i][2]))
        A = T.vertical_stack(*A_stack).reshape((d, d, nc))

        self.states = theano.function(
            inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1]],
            outputs=[h[0], h[1]])

        #add fine-grained features
        #phi = T.vector('phi')

        p_y_given_x = T.nnet.softmax(
            T.dot(h[0][-1, -1], A).T.dot(h[1][-1,
                                              -1]))  # + T.dot(self.beta, phi))
        y_pred = T.argmax(p_y_given_x, axis=1)

        self.classify = theano.function(
            inputs=[idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1],
                    p[1]],  # , phi],
            outputs=y_pred)

        y = T.iscalar('y')

        if not pairwise_constraint:
            sentence_nll = -(T.log(p_y_given_x)[0, y])

            grad = T.grad(sentence_nll, self.params)

            self.cost_and_grad = theano.function(
                inputs=[
                    idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1], y
                ],  #, phi, y],
                outputs=[sentence_nll] + grad)
        else:
            lambda_e = T.scalar('lambda_e')

            # self.beta and the fine-grained feature vectors (phi, phi2) are
            # disabled above, so they are omitted here as well, mirroring the
            # non-pairwise branch
            p_y_given_x1 = T.nnet.softmax(
                T.dot(h[0][-1, -1], A).T.dot(h[1][-1, -1]))
            p_y_given_x2 = T.nnet.softmax(
                T.dot(h[2][-1, -1], A).T.dot(h[3][-1, -1]))

            sentence_nll = -(T.log(p_y_given_x1)[0, y]) - (
                T.log(p_y_given_x2)[0, y])

            #add constraint that events should be maximally similar
            cost = sentence_nll - lambda_e * T.dot(h[0][-1, -1], h[2][
                -1, -1]) - lambda_e * T.dot(h[1][-1, -1], h[3][-1, -1])

            #grad = T.grad(sentence_nll, self.params[:4] + [A])
            grad = T.grad(cost, self.params)

            self.cost_and_grad = theano.function(inputs=[
                idxs[0], rel_idxs[0], p[0], idxs[1], rel_idxs[1], p[1],
                idxs[2], rel_idxs[2], p[2], idxs[3], rel_idxs[3], p[3],
                y,
                theano.In(lambda_e, value=1)
            ],
                                                 outputs=[cost] + grad)
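
# Each per-class block of A above is a rank-1 outer product plus a diagonal,
# a[i][0] a[i][1]^T + diag(a[i][2]).  A standalone numpy sketch of that
# construction (illustration only, independent of the Theano graph):
import numpy as np

d = 4
a0, a1, a2 = (np.random.uniform(-1.0, 1.0, d) for _ in range(3))
A_block = np.outer(a0, a1) + np.diag(a2)        # d x d bilinear form, one class
print(A_block.shape)                            # (4, 4)
print(np.linalg.matrix_rank(np.outer(a0, a1)))  # 1 -- the low-rank part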
def run(subBatchSize=500,
        maxEpochNum=100,
        eta=0.1,
        trainErrPeriod=5,
        testErrPeriod=10,
        logfile='./log.txt',
        saveWeightFile=None,
        saveWeightsFor='train',
        loadWeightFile=None):

    my = myCNN()
    # Read dataset
    base = './datasets/10class'
    trainSet_dir = base + '/'
    train_filename = ('subset1.pkl', 'subset2.pkl', 'subset3.pkl',
                      'subset4.pkl', 'subset5.pkl', 'subset6.pkl',
                      'subset7.pkl', 'subset8.pkl')
    (trainImages, trainLabels) = load_pet_dataset(trainSet_dir, train_filename)
    testSet_dir = base + '/'
    test_filename = ('subset9.pkl', 'subset10.pkl')
    (testImages, testLabels) = load_pet_dataset(testSet_dir, test_filename)

    # Get the number of images in the training set
    numOfTrainImages = trainImages.get_value().shape[0]
    # Get the number of images in the test set
    numOfTestImages = testImages.get_value().shape[0]
    # Get the sub batch size for training set
    assert (
        numOfTrainImages % subBatchSize == 0
    ), "The subbatch size must be a divisor of the number of train images"
    numOfTrainSubBatches = numOfTrainImages / subBatchSize
    # Get the sub batch size for test set
    assert (
        numOfTestImages % subBatchSize == 0
    ), "The subbatch size must be a divisor of the number of test images"
    numOfTestSubBatches = numOfTestImages / subBatchSize

    x = T.matrix('x')  # data input symbolic variable
    y = T.ivector('y')  # labels symbolic variable

    # -----< Construction of Network Model >-----

    layer0 = x.reshape((subBatchSize, 64, 64, 3)).transpose(0, 3, 1, 2)

    [layer1, layer1_w,
     layer1_b] = my.convolutionLayer(featureMaps=layer0,
                                     featureMapShape=(subBatchSize, 3, 64, 64),
                                     kernelShape=(16, 3, 7, 7),
                                     bias=0.1)
    layer2 = my.maxPoolingLayer(featureMaps=layer1,
                                poolingShape=(2, 2),
                                stride=2)

    layer3 = my.reLuLayer(featureMaps=layer2)

    [layer4, layer4_w, layer4_b
     ] = my.convolutionLayer(featureMaps=layer3,
                             # layer1 outputs 16 feature maps, so the input
                             # here has 16 channels rather than 32
                             featureMapShape=(subBatchSize, 16, 29, 29),
                             kernelShape=(32, 16, 4, 4))

    layer5 = my.maxPoolingLayer(featureMaps=layer4,
                                poolingShape=(2, 2),
                                stride=2)

    layer6 = my.reLuLayer(featureMaps=layer5)

    [layer7, layer7_w, layer7_b
     ] = my.convolutionLayer(featureMaps=layer6,
                             featureMapShape=(subBatchSize, 32, 13, 13),
                             kernelShape=(64, 32, 4, 4))

    layer8 = my.maxPoolingLayer(featureMaps=layer7,
                                poolingShape=(2, 2),
                                stride=2)

    layer9 = my.reLuLayer(featureMaps=layer8)

    layer9 = layer9.flatten(2)

    #[layer10, layer10_w, layer10_b] = my.dropoutLayer(inputUnits=layer9,
    #                                                  inputDim=64*5*5,
    #                                                  outputDim=64,
    #                                                  prob=0.5)
    [layer10, layer10_w,
     layer10_b] = my.fullyConnectedLayer(inputUnits=layer9,
                                         inputDim=64 * 5 * 5,
                                         outputDim=64)

    layer10 = layer10.reshape((subBatchSize, 64))

    [error, numOfWrongClass, layer11_w,
     layer11_b] = my.softmaxLayer(inputVect=layer10,
                                  labels=y,
                                  inputDim=64,
                                  numOfClasses=10)

    # --------------------< Construction of Training Function >--------------------

    # Load weight if it is desired
    loadweight = True
    if loadweight is True and loadWeightFile is not None:
        with open(loadWeightFile, 'rb') as w:
            weights = pickle.load(w)
            (param1, param2, param3, param4, param5, param6, param7, param8,
             param9, param10) = weights
            layer1_w.set_value(param1)
            layer1_b.set_value(param2)
            layer4_w.set_value(param3)
            layer4_b.set_value(param4)
            layer7_w.set_value(param5)
            layer7_b.set_value(param6)
            layer10_w.set_value(param7)
            layer10_b.set_value(param8)
            layer11_w.set_value(param9)
            layer11_b.set_value(param10)
        loadweight = False
        print "Pretrained weights were loaded!"

    # Define symbolic index variable
    index = T.iscalar('index')
    # Define parameters
    params = [
        layer1_w, layer1_b, layer4_w, layer4_b, layer7_w, layer7_b, layer10_w,
        layer10_b, layer11_w, layer11_b
    ]
    # Take the derivative of error function with respect to parameters
    grads = T.grad(cost=error, wrt=params)

    # Define updates
    updates = [(w, w - eta * delta) for w, delta in zip(params, grads)]

    # Definition of symbolic training function
    training = function(
        [index],
        error,
        givens={
            x: trainImages[index * subBatchSize:(index + 1) * subBatchSize],
            y: trainLabels[index * subBatchSize:(index + 1) * subBatchSize]
        },
        updates=updates,
    )

    # Definiton of the symbolic function computing the training error
    computeTrainingError = function(
        [index],
        numOfWrongClass,
        givens={
            x: trainImages[index * subBatchSize:(index + 1) * subBatchSize],
            y: trainLabels[index * subBatchSize:(index + 1) * subBatchSize]
        })

    # Definiton of the symbolic testing function
    testing = function(
        [index],
        numOfWrongClass,
        givens={
            x: testImages[index * subBatchSize:(index + 1) * subBatchSize],
            y: testLabels[index * subBatchSize:(index + 1) * subBatchSize]
        })

    print "The total number of training images in the dataset : " + str(
        numOfTrainImages)
    print "The total number of test images in the dataset : " + str(
        numOfTestImages)
    # Log file

    with open(logfile, "a") as logf:
        logf.write('The total number of training images in the dataset : ' +
                   str(numOfTrainImages) + '\n')
        logf.write('The total number of test images in the dataset : ' +
                   str(numOfTestImages) + '\n')

    minErr = numOfTrainImages + numOfTestImages

    for epoch in range(1, maxEpochNum + 1):

        for subBatchIndex in range(numOfTrainSubBatches):

            err = training(subBatchIndex)

        if (epoch % trainErrPeriod == 0) or (epoch == 1):
            # Compute the training error
            trainingError = [
                computeTrainingError(inx)
                for inx in range(numOfTrainSubBatches)
            ]

            # Get the total wrong classified number of elements in the training set
            totalWrongClass = np.sum(trainingError)
            print "Epoch : " + str(epoch) + " Training error : %" + str(
                totalWrongClass * 100.0 /
                numOfTrainImages) + " " + str(totalWrongClass)
            # Write log file
            with open(logfile, "a") as logf:
                logf.write('Epoch : ' + str(epoch) + '\n')
                logf.write('Training : ' +
                           str(totalWrongClass * 100.0 / numOfTrainImages) +
                           ' ' + str(totalWrongClass) + '\n')

        if (epoch % testErrPeriod == 0) or (epoch == 1):
            # Compute the testing error
            testingError = [testing(inx) for inx in range(numOfTestSubBatches)]
            # Get the total wrong classified number of elements in the test set
            totalTestWrongClass = np.sum(testingError)
            print "\t\t  Testing error : %" + str(
                totalTestWrongClass * 100.0 /
                numOfTestImages) + " " + str(totalTestWrongClass)
            # Write log file
            with open(logfile, "a") as logf:
                logf.write('Testing : ' +
                           str(totalTestWrongClass * 100.0 / numOfTestImages) +
                           ' ' + str(totalTestWrongClass) + '\n')

        # Save weights
        if saveWeightsFor == 'train':
            currentErr = totalWrongClass
        elif saveWeightsFor == 'test':
            currentErr = totalTestWrongClass
        else:
            print "Please enter the option name to save weights for training or test!"

        if minErr > currentErr and saveWeightFile is not None:
            print "Weights are saved!"
            minErr = currentErr
            with open(saveWeightFile, 'wb') as w:
                pickle.dump((layer1_w.get_value(), layer1_b.get_value(),
                             layer4_w.get_value(), layer4_b.get_value(),
                             layer7_w.get_value(), layer7_b.get_value(),
                             layer10_w.get_value(), layer10_b.get_value(),
                             layer11_w.get_value(), layer11_b.get_value()),
                            w,
                            protocol=pickle.HIGHEST_PROTOCOL)
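
# A standalone sanity check (not part of the training script) of the feature
# map sizes implied above: a valid convolution shrinks each side by kernel - 1,
# and 2x2 pooling with stride 2 halves it, which is where 64 * 5 * 5 comes from.
def valid_conv(size, kernel):
    return size - kernel + 1

def pool_2x2(size):
    return size // 2

side = 64
side = pool_2x2(valid_conv(side, 7))   # layer1 + layer2 -> 29
side = pool_2x2(valid_conv(side, 4))   # layer4 + layer5 -> 13
side = pool_2x2(valid_conv(side, 4))   # layer7 + layer8 -> 5
print(side)  # 5, so the flattened fully connected input is 64 * 5 * 5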
예제 #40
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        decoder='gru_cond',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        datasets=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionaries and invert them
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x,
                                                y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
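
# The clip_c block above rescales all gradients jointly when their global L2
# norm exceeds clip_c.  A small numpy illustration of the same rule
# (standalone sketch, not used by train()):
import numpy

def clip_global_norm(grads, clip_c):
    norm = numpy.sqrt(sum((g ** 2).sum() for g in grads))
    if norm > clip_c:
        grads = [g / norm * clip_c for g in grads]
    return grads

gs = [numpy.ones(3) * 4.0, numpy.ones(2) * 3.0]            # global norm ~ 8.12
clipped = clip_global_norm(gs, 1.0)
print(numpy.sqrt(sum((g ** 2).sum() for g in clipped)))    # ~ 1.0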
예제 #41
0
                  word_embeddings=id_vec,
                  batch_size=params['batch_size'],
                  max_sequence_len=params['max_sequence_len'],
                  embedding_size=params['embedding_size'],
                  filter_sizes=params["filter_size"],
                  num_filters=params["num_filters"])

    dbg_x1 = model.dbg_x1  # = que_x
    dbg_outputs_que = model.dbg_outputs_que  # = que_vec[0].shape
    # The class only defines the computation graph.  Actually launching the
    # graph -- declaring the function inputs/outputs that trigger its
    # evaluation -- and the gradient backpropagation step are not set up yet;
    # the backprop updates are tied to the graph through the theano.function
    # definition below.

    cost, cos_sim = model.cost, model.cos_sim
    graph_params = model.params
    grads = T.grad(cost, graph_params)
    learning_rate = T.dscalar("learning_rate")
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(graph_params, grads)]

    qt, at, lt = T.matrix("q1"), T.matrix("a1"), T.vector("l1")
    prob = T.fscalar("prob")
    train_model = theano.function(inputs=[qt, at, lt, prob, learning_rate],
                                  outputs=[cost, dbg_x1, dbg_outputs_que],
                                  updates=updates,
                                  givens={
                                      que: qt,
                                      ans: at,
                                      label: lt,
                                      keep_prob: prob
                                  })
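
# A standalone illustration of the translated comment above: defining the
# symbolic graph computes nothing by itself; compiling a theano.function with
# an updates list is what ties the gradient step to the graph.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.asarray(0.0), name='w')
x = T.dscalar('x')
loss = (w * x - 1.0) ** 2          # graph definition only -- nothing runs yet
g = T.grad(loss, w)

# compilation attaches inputs, outputs and the SGD update to the graph
step = theano.function([x], loss, updates=[(w, w - 0.1 * g)])
for _ in range(50):
    step(1.0)
print(w.get_value())               # close to 1.0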
예제 #42
0
def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # idem dito for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream, iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator's seed for consistency
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h,
                                      n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x, x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text every 40 minibatches
            if iteration % 40 == 0:
                try:
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h, ), dtype=config.floatX)
                    c_p = numpy.zeros((n_h, ), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(
                            x_t, h_p.flatten(), c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(
                            x_t, h_p.flatten(), c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, '  minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', numpy.mean(
                    val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
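
# The energy normalisation above is the numerically stable softmax: subtracting
# the row maximum before exponentiating leaves the result unchanged but keeps
# exp() from overflowing.  A standalone numpy sketch:
import numpy

def stable_softmax(energy):
    e = numpy.exp(energy - energy.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

print(stable_softmax(numpy.array([[1000.0, 1001.0, 1002.0]])))  # no overflow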
예제 #43
0
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=300, momentum_coeff=0.):
    """

    :param learning_rate: learning rate used for the parameters
    :param L1_reg: lambda for the L1 regularization
    :param L2_reg: lambda for the L2-squared regularization
    :param n_epochs: number of epochs on which to train the data.
    :param dataset: pickled mnist data file
    :param batch_size: size of the mini-batch to be used with
    sgd
    :param n_hidden: number of hidden units
    :param momentum_coeff: Controls the amount of damping of the velocity
    as a result of previous gradients in sgd
    """

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Compute the number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches  = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Allocate symbolic variables for the data
    index = T.lscalar() # index to minibatch
    x = T.matrix('x')
    y = T.ivector('y')

    is_train = T.iscalar('is_train') # pseudo boolean for switching between training and prediction

    rng = numpy.random.RandomState(1234)
    theano_rng = T.shared_randomstreams.RandomStreams(rng.randint(999999))

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        theano_rng=theano_rng,
        input=x,
        n_in=28 * 28,
        n_hidden=n_hidden,
        n_out=10,
        is_train=is_train
    )

    # The cost that we minimize during training is the negative log likelihood
    # of the model plus the regularization terms
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    # We compile a Theano function that computes the mistakes that are
    # made by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size],
            is_train: numpy.asarray([0], dtype='int32')[0]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size],
            is_train: numpy.asarray([0], dtype='int32')[0]
        }
    )

    train_loss = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            is_train: numpy.asarray([0], dtype='int32')[0]
        }
    )

    # Compute the gradient of the cost w.r.t theta
    #check
    gparams = [T.grad(cost, param) for param in classifier.params]

    # # specify how to update the parameters of the model as a list of
    # # (variable, update expression) pairs
    # updates = [
    #     (param, param - learning_rate * gparam)
    #     for param, gparam in zip(classifier.params, gparams)
    # ]

    # List of updates for every set of parameters
    updates = []

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs
    for param, gparam in zip(classifier.params, gparams):

        # Each parameter is updated by taking a step in the direction of the gradient.
        # However, we also "mix in" previous gradients i.e. when the previous momenta
        # have the same direction, this contributes to the velocity of the gradient descent
        # and therefore, we take larger steps. Here, the velocity `dict` tracks old gradients.

        velocity = theano.shared(theano._asarray(param.get_value()*0., dtype=theano.config.floatX))
        updated_velocity = momentum_coeff * velocity - learning_rate * gparam

        updates.append((velocity, updated_velocity))
        updates.append((param, param + updated_velocity))


    # compiling a Theano function which returns the cost, but at the
    # same time updates the parameters of the model
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
            is_train: numpy.asarray([1], dtype='int32')[0]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############

    print '... training'

    # early-stopping parameters
    patience = 10000  # The number of iterations to execute regardless of the validation error
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    best_W = None
    best_epoch = 0
    start_time = time.clock()

    epoch = 0
    done_looping = False

    # Keeping track of training, testing and validation errors
    # per epoch
    validations = []
    tests = []
    trainings = []

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)

            # A fancy way of keeping track of the current iteration
            iter = (epoch - 1) * n_train_batches + minibatch_index

            # Check the validation error every validation frequency
            # (in this case, we check every epoch)
            if (iter + 1) % validation_frequency == 0:

                # Compute the validation error i.e. the zero-one
                # loss on the validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]

                # The validation error is the mean over all the minibatches
                # of the validation set
                this_validation_loss = numpy.mean(validation_losses)

                # test the current model using the test set,
                # averaging over the test scores obtained by
                # all minibatches
                test_losses = [test_model(i) for i
                               in xrange(n_test_batches)]
                test_score = numpy.mean(test_losses)

                # The error achieved by the current model
                # on the training dataset
                train_losses = [train_loss(i) for i
                                in xrange(n_train_batches)]
                train_score = numpy.mean(train_losses)

                # For plotting error curve
                validations.append(this_validation_loss * 100)
                tests.append(test_score * 100)
                trainings.append(train_score * 100)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # Maintain global best for validation loss
                if this_validation_loss < best_validation_loss:

                    # If the improvement in the validation loss surpasses
                    # the improvement threshold, we allow an increase in
                    # patience
                    if(this_validation_loss <
                               best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # Update the global best
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_W = classifier.hiddenLayer.W.get_value(borrow=False)
                    best_epoch = epoch

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break


    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    image = Image.fromarray(
        tile_raster_images(best_W.T,
                           img_shape=(28, 28), tile_shape=(3, 10),
                           tile_spacing=(1, 1)))
    image.save('repflds.png')

    # Plot the errors against the epochs
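    # Note: this assumes one validation check per epoch and that training ran
    # for all n_epochs without early stopping; otherwise these error lists are
    # shorter than `epochs` and plt.plot raises a length-mismatch error.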
    epochs = numpy.arange(1, n_epochs + 1)
    plt.plot(epochs, trainings, 'b', epochs, validations, 'g', epochs, tests, 'r')
    green_circle, = plt.plot(best_epoch, best_validation_loss * 100., 'o', mec='g', ms=15, mew=1, mfc='none',
                             label="Best Validation Error")

    # Create plot legend
    blue_patch = mpatches.Patch(color='blue', label='Train')
    green_patch = mpatches.Patch(color='green', label='Validation')
    red_patch = mpatches.Patch(color='red', label='Test')
    plt.legend(handles=[blue_patch, green_patch, red_patch, green_circle], numpoints = 1)
    plt.savefig('error.png')
Example #44
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #45
0
def sgd(cost, params, lr=0.01):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates
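
A minimal usage sketch (not part of the original snippet) of the sgd helper above on a toy quadratic cost; the names w, cost and train are illustrative only:

import numpy as np
import theano
import theano.tensor as T

# toy shared parameter and a quadratic cost with its minimum at w = 3
w = theano.shared(np.asarray(5.0, dtype=theano.config.floatX), name='w')
cost = (w - 3.0) ** 2

train = theano.function([], cost, updates=sgd(cost, [w], lr=0.1))
for _ in range(100):
    train()
print w.get_value()  # converges towards 3.0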
Example #46
0
    def build_model(self):
        trng = RandomStreams(self.seed)

        # Used for dropout.
        self.use_noise = theano.shared(numpy_floatX(0.))

        if self.reload_model:
            self.load_params()

        self.tparams = self.init_tparams()

        self.lr = tensor.scalar(dtype=config.floatX)
        self.x, self.mask_x, emb_x, self.y, self.mask_y, emb_y, self.z = self.emb_layer.build(
            self.tparams)

        emb_x = dropout_layer(emb_x, self.use_noise, trng, self.dropout_rate)
        emb_y = dropout_layer(emb_y, self.use_noise, trng, self.dropout_rate)

        proj_x_fw = self.encoder_lstm_fw_layer.build(self.tparams, emb_x,
                                                     self.mask_x)
        proj_x_bw = reverse(
            self.encoder_lstm_bw_layer.build(self.tparams, reverse(emb_x),
                                             reverse(self.mask_x)))

        proj_x = tensor.concatenate([proj_x_fw, proj_x_bw],
                                    axis=-1) * self.mask_x[:, :, None]

        proj_y_fw = self.encoder_lstm_fw_layer.build(self.tparams, emb_y,
                                                     self.mask_y)
        proj_y_bw = reverse(
            self.encoder_lstm_bw_layer.build(self.tparams, reverse(emb_y),
                                             reverse(self.mask_y)))

        proj_y = tensor.concatenate([proj_y_fw, proj_y_bw],
                                    axis=-1) * self.mask_y[:, :, None]

        weight = tensor.batched_dot(proj_x.dimshuffle(1, 0, 2),
                                    proj_y.dimshuffle(1, 2,
                                                      0)).dimshuffle(1, 2, 0)
        weight_x = tensor.exp(weight - weight.max(axis=0, keepdims=True))
        weight_y = tensor.exp(weight - weight.max(axis=1, keepdims=True))

        weight_x = weight_x * self.mask_x[:, None, :]
        weight_y = weight_y * self.mask_y[None, :, :]

        alpha = weight_x / weight_x.sum(axis=0, keepdims=True)
        beta = weight_y / weight_y.sum(axis=1, keepdims=True)
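        # alpha and beta are soft-alignment (attention) weights: alpha
        # normalizes over the positions of x (axis 0), beta over the positions
        # of y (axis 1); subtracting the per-axis max above keeps the
        # exponentials numerically stable.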

        proj_y_att = (proj_x.dimshuffle(0, 'x', 1, 2) *
                      alpha.dimshuffle(0, 1, 2, 'x')).sum(axis=0)
        proj_x_att = (proj_y.dimshuffle('x', 0, 1, 2) *
                      beta.dimshuffle(0, 1, 2, 'x')).sum(axis=1)

        proj_x_cat = tensor.concatenate(
            [proj_x, proj_x_att, proj_x - proj_x_att, proj_x * proj_x_att],
            axis=-1)
        proj_y_cat = tensor.concatenate(
            [proj_y, proj_y_att, proj_y - proj_y_att, proj_y * proj_y_att],
            axis=-1)

        fusion_mlp_x = ReLU(
            self.fusion_mlp_layer.build(self.tparams, proj_x_cat))
        fusion_mlp_y = ReLU(
            self.fusion_mlp_layer.build(self.tparams, proj_y_cat))

        fusion_mlp_x = dropout_layer(fusion_mlp_x, self.use_noise, trng,
                                     self.dropout_rate)
        fusion_mlp_y = dropout_layer(fusion_mlp_y, self.use_noise, trng,
                                     self.dropout_rate)

        fusion_lstm_fw_x = self.fusion_lstm_fw_layer.build(
            self.tparams, fusion_mlp_x, self.mask_x)
        fusion_lstm_bw_x = reverse(
            self.fusion_lstm_bw_layer.build(self.tparams,
                                            reverse(fusion_mlp_x),
                                            reverse(self.mask_x)))

        fusion_lstm_x = tensor.concatenate(
            [fusion_lstm_fw_x, fusion_lstm_bw_x], axis=-1)

        fusion_lstm_fw_y = self.fusion_lstm_fw_layer.build(
            self.tparams, fusion_mlp_y, self.mask_y)
        fusion_lstm_bw_y = reverse(
            self.fusion_lstm_bw_layer.build(self.tparams,
                                            reverse(fusion_mlp_y),
                                            reverse(self.mask_y)))

        fusion_lstm_y = tensor.concatenate(
            [fusion_lstm_fw_y, fusion_lstm_bw_y], axis=-1)

        logit_x_mean = (fusion_lstm_x * self.mask_x[:, :, None]).sum(
            axis=0) / self.mask_x.sum(axis=0)[:, None]
        logit_x_max = (fusion_lstm_x * self.mask_x[:, :, None]).max(axis=0)

        logit_y_mean = (fusion_lstm_y * self.mask_y[:, :, None]).sum(
            axis=0) / self.mask_y.sum(axis=0)[:, None]
        logit_y_max = (fusion_lstm_y * self.mask_y[:, :, None]).max(axis=0)

        logit = tensor.concatenate(
            [logit_x_mean, logit_x_max, logit_y_mean, logit_y_max], axis=-1)

        logit = dropout_layer(logit, self.use_noise, trng, self.dropout_rate)

        logit = tensor.tanh(self.dense_mlp_layer.build(self.tparams, logit))

        logit = dropout_layer(logit, self.use_noise, trng, self.dropout_rate)

        self.pred_prob = tensor.nnet.nnet.softmax(
            self.class_mlp_layer.build(self.tparams, logit))
        self.pred = self.pred_prob.argmax(axis=-1)

        off = 1e-8
        if self.pred_prob.dtype == 'float16':
            off = 1e-6

        self.log_cost = -tensor.log(self.pred_prob[
            tensor.arange(self.x.shape[1]), self.z] + off).mean()
        self.cost = self.log_cost
        if self.decay_c > 0.:
            decay_c = theano.shared(numpy.float32(self.decay_c),
                                    name='decay_c')
            weight_decay = 0.
            for kk, vv in self.tparams.iteritems():
                weight_decay += (vv**2).sum()
            weight_decay *= decay_c
            self.cost += weight_decay

        self.grads = tensor.grad(self.cost, wrt=self.tparams.values())
        g2 = 0.
        for g in self.grads:
            g2 += (g**2).sum()
        self.grad_norm = tensor.sqrt(g2)
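
        # Global-norm gradient clipping below: if the joint L2 norm of all
        # gradients exceeds clip_c, every gradient is rescaled by
        # clip_c / ||g||.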

        if self.clip_c > 0.:
            new_grads = []
            for g in self.grads:
                new_grads.append(
                    tensor.switch(g2 > self.clip_c**2,
                                  g * self.clip_c / tensor.sqrt(g2), g))
            self.grads = new_grads
Example #47
0
    def fit(self,
            X,
            Y,
            V=None,
            K=None,
            D=50,
            lr=10e-1,
            mu=0.99,
            batch_sz=100,
            epochs=6):
        if V is None:
            V = len(set(X))
        if K is None:
            K = len(set(Y))
        N = len(X)

        W = np.random.randn(V, K) / np.sqrt(V + K)
        b = np.zeros(K)
        self.W = theano.shared(W)
        self.b = theano.shared(b)
        self.params = [self.W, self.b]

        thX = T.ivector('X')
        thY = T.ivector('Y')

        py_x = T.nnet.softmax(self.W[thX] + self.b)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]
        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )
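
        # Momentum SGD: dparams holds one velocity per parameter; each update
        # moves a parameter by mu * velocity - lr * gradient and refreshes the
        # velocity with that same quantity.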

        updates = [(p, p + mu * dp - lr * g)
                   for p, dp, g in zip(self.params, dparams, grads)] + [
                       (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads)
                   ]
        train_op = theano.function(inputs=[thX, thY],
                                   outputs=[cost, prediction],
                                   updates=updates,
                                   allow_input_downcast=True)

        costs = []
        n_batches = N / batch_sz
        for i in xrange(epochs):
            X, Y = shuffle(X, Y)
            print "epoch:", i
            for j in xrange(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)
                costs.append(c)
                if j % 200 == 0:
                    print "i:", i, "j:", j, "n_batches:", n_batches, "cost:", c, "error:", np.mean(
                        p != Ybatch)
        plt.plot(costs)
        plt.show()
Example #48
0
_, y = apply_along_axis(lambda row: random.multinomial(1, exp(row)),
                        axis=1,
                        arr=lP).nonzero()
y = y.astype(int32)
W = W.astype(float32)
X = X.astype(float32)

#setup theano
tW = T.matrix('W')
tX = T.matrix('X')
ty = T.ivector('y')
tlambda = T.scalar('lambda')

#symbolic representation
tEta = T.dot(tX, tW)
tP = T.nnet.softmax(tEta)
terror = (T.nnet.categorical_crossentropy(tP, ty).mean()
          + tlambda * tW.norm(2)**2)  # cross-entropy plus Tikhonov (L2) regularization
tgrad = T.grad(terror, tW)
f = theano.function([tW, tX, ty, tlambda], terror)
g = theano.function([tW, tX, ty, tlambda], tgrad)

W0 = random.randn(D, K).astype(float32)

#gradient descent
for it in xrange(500):
    ft = f(W0, X, y, 0.1)
    gt = g(W0, X, y, 0.1)
    W0 -= 0.1 * gt
    print it, "objective:", ft, "gradnorm:", linalg.norm(gt, ord=inf)
Example #49
0
    def build_chain_trainer(self):
        bs = self.bs
        td = self.td

        wi = T.ivector('wi')  # bs (disamb. word indices)
        nwi = T.ivector('nwi')  # negative samples
        lr = T.dscalar('lr').astype(theano.config.floatX)  # learning rate
        lam = T.dscalar('lam').astype(theano.config.floatX)
        L = self.params['L']
        L1 = self.params['L1']  # hd x td
        #Wt = self.params['Wt']
        if not self.hinge_cost:
            L2 = self.params['L2']
            B = self.params['B']  # td
            B2 = self.params['B2']

        dwe = self.params['dwe']
        df = self.dat[wi, :]  #T.itensor3('df')# bs x mw x ms
        pr = self.sense_priors[wi, :]  # bs x mw x ms
        mk = self.dmask[wi, :]  #T.itensor3('mk')# bs x mw x ms
        pd = self.pd[
            wi, :]  #T.imatrix('pd') # bs x mdw (plain definition sentence)
        pe = self.ex[wi, :]  # plain example sentences bs x mew
        dw = dwe[wi, :]  # bs x td
        msk = self.wmask[wi, :].dimshuffle(0, 1, 'x')  # bs x mw x 1
        ndw = dwe[nwi, :]  # negative words

        def to_vect(d, m, p):
            hid_inp = dwe[d, :]  # mw x ms x hd
            logit = T.exp(T.dot(hid_inp, L0)[:, :, p])  # (mw x ms) x mw
            mk = T.switch(T.lt(p, 0), 0,
                          1)  # mw: word-level mask (different mask from m)
            mask = mk.dimshuffle(0, 'x', 'x')
            l2 = logit * mask  # mw x ms x mw
            l2 = T.sum(l2 * mk.dimshuffle('x', 'x', 0), axis=2) * m  # mw x ms
            w0 = l2 / T.sum(l2, axis=1).dimshuffle(0, 'x')
            w1 = T.switch(T.isnan(w0), 0, w0)
            w = w1.dimshuffle(0, 1, 'x')  # mw x ms x 1
            res = T.sum(w * hid_inp, axis=1)  # mw x hd
            return res  #, logit, weights
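
        # NOTE: to_vect is only referenced from the commented-out scan below,
        # and it uses L0, which is not defined in this scope; the active code
        # path goes through to_weight instead.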

        def to_weight(d, m, p, prior):
            logit = T.tensordot(dwe[d, :], dwe.T,
                                axes=1)[:, :, d]  # mw x ms x mw x ms
            cnt = T.sum(m, axis=1).dimshuffle('x', 'x', 0)  # 1 x 1 x mw
            logit = T.sum(logit * m.dimshuffle('x', 'x', 0, 1),
                          axis=3) / cnt  # mw x ms x mw
            logit = T.exp(10 *
                          T.switch(T.isnan(logit), 0, logit))  # mw x ms x mw
            logit = T.prod(logit, axis=2) * prior  # mw x ms
            sm = T.sum(logit * m, axis=1, keepdims=True)  # mw x 1
            #mask = T.switch(T.lt(p, 0), 0, 1).dimshuffle(0, 'x') #
            logit = (logit * m) / sm  # mw x ms
            return T.switch(T.or_(T.isnan(logit), T.isinf(logit)), 0, logit)

        '''def to_weight(d, m, p, prior):
			A = dwe[d, :] # mw x ms x td
			#tmp = T.tensordot(T.dot(A, Wt), A.T, axes=1) # mw x ms x ms x mw
			#B = A * Wt.dimshuffle('x', 'x', 0) # 'diag' setting
			#tmp = T.tensordot(B, B.T, axes = 1)
			tmp = T.tensordot(A, A.T, axes = 1) # 'iden' setting
			tmp = T.exp(1000 * tmp.dimshuffle(0, 1, 3, 2)) # mw x ms x mw x ms
			tmp = tmp * m.dimshuffle('x', 'x', 0, 1)
			nrm = T.sum(tmp, axis=3)
			tmp = tmp / nrm.dimshuffle(0, 1, 2, 'x')
			tmp = T.switch(T.isnan(tmp), 0, tmp)
			mk = T.switch(T.lt(p, 0), 0, 1) # mw: word-level mask (different mask from m)
			tmp = T.max(tmp, axis=3) * mk.dimshuffle('x', 'x', 0) # mw x ms x mw
			tmp = T.exp(T.sum(T.log(T.switch(T.eq(tmp, 0), 1, tmp)), axis=2)) * m # mw x ms
			tmp = tmp * prior
			tmp = tmp / T.sum(tmp, axis=1).dimshuffle(0, 'x')
			return T.switch(T.isnan(tmp), 0, tmp)'''

        def cosim(x, y):
            return T.mean(
                T.sum(x * y, axis=1) / (x.norm(2, axis=1) * y.norm(2, axis=1)))

        #dat, _ = theano.scan(fn=to_vect, sequences=[df, mk, pd]) # bs x mw x td
        #ndat, _ = theano.scan(fn=to_vect_tmp, sequences=[ndf, nmk, npd]) # bs x mw x td
        weights, _ = theano.scan(fn=to_weight, sequences=[df, mk, pd,
                                                          pr])  # bs x mw x ms
        hid_inp = dwe[df, :]  # bs x mw x ms x td
        dat = T.sum(weights.dimshuffle(0, 1, 2, 'x') * hid_inp,
                    axis=2)  # bs x mw x td '''
        inp = dat.astype(theano.config.floatX)
        def_emb = T.sum(T.dot(inp, L) * msk, axis=1)  # bs x hd
        #neg_inp = ndat.astype(theano.config.floatX)
        #def_emb = get_sentence(inp, msk) # bs x hd

        #neg_def_emb = get_sentence(neg_inp, neg_msk)

        #w_cost = T.sum((def_emb - dw) ** 2)
        #w_neg_cost = T.sum((def_emb - ndw) ** 2)
        if self.hinge_cost:
            def_emb = T.dot(def_emb, L1)
            w_cost = -cosim(def_emb, dw)
            rep = nwi.shape[0] / wi.shape[
                0]  # b/c there are more negative samples than pos.
            de = T.extra_ops.repeat(def_emb, rep, axis=0)
            w_neg_cost = -cosim(de, ndw)
            cost = T.mean(T.maximum(0,
                                    0.01 + w_cost - w_neg_cost))  # hingeloss
        else:
            regress = T.dot(T.nnet.sigmoid(T.dot(def_emb, L1) + B),
                            L2) + B2  # bs x td
            cost = T.mean(
                (regress - dw)**
                2) + 0.01 * T.sum(abs(L2))  # only regularize the last

        if self.reg_alpha:
            cost += 0.1 * T.sum(abs(weights))
        #w_cost = get_word_probs(def_emb, wi, L1) #dwe.T) # dwe instead of L1
        #w_neg_cost = get_word_probs(def_emb, nwi, L1) #dwe.T) # dwe instead of L1

        #c_cost = -get_context_probs(def_emb, pe, L0) # negative of the likelihood
        #c_neg_cost = -get_context_probs(def_emb, npe, L0)

        #all_params = [self.params[k] for k in self.params if k != 'dwe' and not k.startswith('L')]
        all_params = [self.params[k] for k in self.params if k != 'dwe']
        #L_params = [L0]
        def apply_nesterov_momentum(ups, mom, shape=None):
            '''Copy of the same function in Lasagne (with minor changes)'''
            params = ups.keys()
            ups = OrderedDict(ups)
            if shape is None:
                shape = [p.get_value(borrow=True).shape for p in params]

            for (param, shp) in zip(params, shape):
                velocity = theano.shared(np.zeros(shp,
                                                  dtype=theano.config.floatX),
                                         broadcastable=param.broadcastable)
                x = mom * velocity + ups[param] - param
                ups[velocity] = x
                ups[param] = mom * x + ups[param]
            return ups

        dwe_params = [dw, ndw]
        if self.do_sgd:
            grads = T.grad(cost, all_params)
            updates = OrderedDict()
            for (p, g) in zip(all_params, grads):
                updates[p] = p - lr * g
            apply_nesterov_momentum(updates, mom=0.9)
            if self.no_alt or not self.do_fixedpoint:
                dgrads = T.grad(cost, dwe_params)
                dwe_update = OrderedDict()
                for (p, g) in zip(dwe_params, dgrads):
                    dwe_update[p] = p - lr * g
                    foo = lr * g
                apply_nesterov_momentum(dwe_update,
                                        mom=0.9,
                                        shape=[(bs, td), (bs, td)])
        else:
            updates = adadelta(cost, all_params, learning_rate=lr)
            #L_update = adadelta(cost, L_params, learning_rate = lr)
            if self.no_alt or not self.do_fixedpoint:
                dwe_update = adadelta(cost, dwe_params, learning_rate=lr)

        if not self.no_alt and self.do_fixedpoint:  # fixed-point embedding update; with no_alt the embeddings are updated by the gradient-based optimizer instead
            if self.do_rw:
                #posword = self.base[wi] + 0.3 * def_emb #0.3 * ((1 - self.lam) * def_emb + self.lam * dw)
                idf = self.idf[wi].dimshuffle(
                    0, 1, 'x')  # bs x mw x 1  (dat is bs x mw x hd)
                rw_term = T.sum(dat * idf, axis=1)  # bs x hd
                disc_fact = 0.9
                if self.init_dwe:
                    #posword = disc_fact * rw_term # + self.base[wi] # truerw
                    posword = (
                        1 - lam
                    ) * dw + lam * disc_fact * rw_term  # + self.base[wi] # truerw
                else:
                    base = self.lam * def_emb + (1 - self.lam) * dw
                    posword = base + disc_fact * rw_term
                word_update = T.set_subtensor(
                    dw, posword.astype(theano.config.floatX))
                dwe_update = {dwe: word_update}
                dwe_ret = T.max(T.abs_(posword -
                                       dw))  # max-norm of the increment
            else:
                posword = (1 - self.lam) * def_emb + self.lam * dw
                word_update = T.set_subtensor(dw, posword - self.lam * ndw)
                dwe_update = {dwe: word_update}
                dwe_ret = word_update
        else:  #elif not self.do_fixedpoint or self.no_alt:
            word_update = dwe_update[dw]
            word_update = T.set_subtensor(dw, word_update)
            nword_update = dwe_update[ndw]
            word_update = T.set_subtensor(word_update[nwi, :], nword_update)
            dwe_update = {dwe: word_update}  #T.set_subtensor(dw, word_update)
            if self.no_alt:
                updates.update({dwe: word_update})
            dwe_ret = word_update
            #updates.update({dwe: dwe_update[dwe]}) #word_update})
        #updates.update({dwe: word_update})

        self.train_step = theano.function([wi, nwi, lr], [cost, weights],
                                          updates=updates)
        if not self.no_alt:
            self.dwe_train_step = theano.function([wi, nwi, lam],
                                                  [cost, dwe_ret, weights],
                                                  updates=dwe_update)
Example #50
0
def evaluate_mnist_1(learning_rate=0.1,
                     n_epochs=100,
                     nkerns=[4, 6],
                     batch_size=2):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(3)
    xs = []
    ys = []
    # f = open('temp_value', 'r+')
    # f = open('out_10', 'r+')
    f = open('out_10_10', 'r+')

    while (1):
        line = f.readline()
        line2 = f.readline()
        if not line:
            break
        line = line.replace("\n", "")

        values = [float(i) for i in line.split()]
        value = float(line2)

        xs.append(values)
        ys.append(value)

    print(len(xs))
    print(len(xs[0]))
    print(len(ys))
    # print(ys)
    # print(xs)

    test_set_x, test_set_y = shared_dataset([xs, ys])
    valid_set_x, valid_set_y = shared_dataset([xs, ys])
    train_set_x, train_set_y = shared_dataset([xs, ys])

    # train_set_x, train_set_y = datasets[0]
    # valid_set_x, valid_set_y = datasets[1]
    # test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = len(ys)
    # batch_size=1
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    # n_train_batches = 1
    # n_valid_batches = 1
    # n_test_batches = 1

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4)
    layer2_input = layer1.output.flatten(2)
    # myprint=theano.function([x],x)
    # myprint([layer2_input])

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=20,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    prob = layer3.prob_y_given_x(y)

    f1 = open('weights', 'w+')
    print "layer 0 weights"
    for w in layer0.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    # print layer0.W.get_value()
    # print layer0.b.get_value()
    print "layer 1 weights"
    # print layer1.W.get_value()
    # print layer1.b.get_value()
    for w in layer1.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    print "layer 2 weights"
    # print layer2.W.get_value()
    w = layer2.W.get_value()
    # for d in w:
    #     print d
    for i in range(len(w[0])):
        for j in range(len(w)):
            f1.write(str(w[j][i]) + '\n')
    # print layer2.b.get_value()

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    prob_model = theano.function(
        [index],
        prob,
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    conv_model0 = theano.function(
        [index],
        layer0.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model0_conv = theano.function(
        [index],
        layer0.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    conv_model1 = theano.function(
        [index],
        layer1.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model1_conv = theano.function(
        [index],
        layer1.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model2 = theano.function(
        [index],
        layer2.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # params = layer0.params + layer1.params + layer2.params + layer3.params

    # x_printed = theano.printing.Print('this is a very important value')(x)
    # f_with_print = theano.function([x], x_printed)
    # f_with_print(layer3.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    val_grads = T.grad(cost, layer3.p_y_given_x)
    # print "AAAA"
    # theano.printing.debugprint(temp_grads)
    # print "AAAA"

    grad_model = theano.function(
        [index],
        grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    val_grad_model = theano.function(
        [index],
        val_grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []

    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    bestConvW = layer0.W.get_value()

    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index
            val_grads_ij = val_grad_model(minibatch_index)
            grads_ij = grad_model(minibatch_index)
            conv0_ij = conv_model0(minibatch_index)
            conv1_ij = conv_model1(minibatch_index)
            conv2_ij = conv_model2(minibatch_index)
            conv0_conv_ij = conv_model0_conv(minibatch_index)
            conv1_conv_ij = conv_model1_conv(minibatch_index)

            print 'training @ iter = ', iter
            print "last layer var grads"
            print val_grads_ij[0]

            # print "Layer 0 convolution"
            # for c in conv0_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            # print "Layer 1 convolution"
            # for c in conv1_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            probs = prob_model(minibatch_index)
            print "Probs"
            print probs
            # print "layer 0 grads"
            # print grads_ij[6]
            # print grads_ij[7]
            # print "layer 1 grads"
            # print grads_ij[4]
            # print grads_ij[5]
            # print "layer 2 grads"
            # print grads_ij[2]
            # print grads_ij[3]
            print "log reg layer grads"
            print grads_ij[0]
            print grads_ij[1]
            print "Layer 0 output"
            # for c in conv0_ij:
            #     for d in c:
            #         print d
            # print conv0_ij[0][0]
            print "Layer 1 output"
            # print conv1_ij[0][0]
            # for c in conv1_ij:
            #     for d in c:
            #         print d
            print "Layer 2 output"
            # for c in conv2_ij:
            #     print c
            cost_ij = train_model(minibatch_index)

            # for c in conv0_conv_ij[1]:
            #     print c
            #     print ""

            print "learning_rate"
            print learning_rate
            print "layer 0 weights"
            # print layer0.W.get_value()
            # print layer0.b.get_value()
            print "layer 1 weights"
            # print layer1.W.get_value()
            # print layer1.b.get_value()
            print "layer 2 weights"
            w = layer2.W.get_value()
            # print w[0]
            # print w[1]

            # for c in layer2.W.get_value():
            #     print c
            # print layer2.b.get_value()
            print "log reg layer weights"
            print layer3.W.get_value()
            print layer3.b.get_value()
            print "COST"
            print cost_ij

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestConvW = layer0.W.get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % (epoch, minibatch_index + 1,
                                           n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #51
0
    def get_cost_updates(self, lr=0.1, persistent=None, k=1):
        """This functions implements one step of CD-k or PCD-k

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable
            containing old state of Gibbs chain. This must be a shared
            variable of size (batch size, number of hidden units).

        :param k: number of Gibbs steps to do in CD-k/PCD-k

        Returns a proxy for the cost and the updates dictionary. The
        dictionary contains the update rules for weights and biases but
        also an update of the shared variable used to store the persistent
        chain, if one is used.

        """

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent
        # end-snippet-2
        # perform actual negative phase
        # in order to implement CD-k/PCD-k we need to scan over the
        # function that implements one gibbs step k times.
        # Read Theano tutorial on scan for more information :
        # http://deeplearning.net/software/theano/library/scan.html
        # the scan will return the entire Gibbs chain
        ([
            pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means,
            nh_samples
        ], updates) = theano.scan(
            self.gibbs_hvh,
            # the None are place holders, saying that
            # chain_start is the initial state corresponding to the
            # 6th output
            outputs_info=[None, None, None, None, None, chain_start],
            n_steps=k)
        # start-snippet-3
        # determine gradients on RBM parameters
        # note that we only need the sample at the end of the chain
        chain_end = nv_samples[-1]

        cost = T.mean(self.free_energy(self.input)) - T.mean(
            self.free_energy(chain_end))
        # We must not compute the gradient through the gibbs sampling
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])
        # end-snippet-3 start-snippet-4
        # constructs the update dictionary
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - gparam * T.cast(
                lr, dtype=theano.config.floatX)
        if persistent:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = nh_samples[-1]
            # pseudo-likelihood is a better proxy for PCD
            monitoring_cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(
                updates, pre_sigmoid_nvs[-1])

        return monitoring_cost, updates
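
The consider_constant argument above is what prevents gradients from being backpropagated through the Gibbs samples. A tiny standalone sketch of the same mechanism (the names a, b, cost, g_full and g_stopped are illustrative only, not part of the RBM code):

import theano
import theano.tensor as T

a = T.dscalar('a')
b = a ** 2                 # stand-in for a quantity produced by sampling
cost = a * b               # equals a**3 when b is followed symbolically

g_full = T.grad(cost, a)                             # 3 * a**2
g_stopped = T.grad(cost, a, consider_constant=[b])   # treats b as a constant, so just b

f = theano.function([a], [g_full, g_stopped])
print f(2.0)  # approximately [12.0, 4.0]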
x = theano.shared(D[0], name="x")
y = theano.shared(D[1], name="y")
w = theano.shared(rng.randn(feats).astype(theano.config.floatX), name="w")
b = theano.shared(np.asarray(0., dtype=theano.config.floatX), name="b")
x.tag.test_value = D[0]
y.tag.test_value = D[1]
#print "Initial model:"
#print w.get_value(), b.get_value()

# Construct Theano expression graph
p_1 = 1 / (1 + tt.exp(-tt.dot(x, w) - b))  # Probability of having a one
prediction = p_1 > 0.5  # The prediction that is done: 0 or 1
xent = -y * tt.log(p_1) - (1 - y) * tt.log(1 - p_1)  # Cross-entropy
cost = tt.cast(xent.mean(), 'float32') + \
    0.01 * (w ** 2).sum()  # The cost to optimize
gw, gb = tt.grad(cost, [w, b])

# Compile expressions to functions
train = theano.function(inputs=[],
                        outputs=[prediction, xent],
                        updates=[(w, w - 0.01 * gw), (b, b - 0.01 * gb)],
                        name="train")
predict = theano.function(inputs=[], outputs=prediction, name="predict")

if any([
        n.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm']
        for n in train.maker.fgraph.toposort()
]):
    print('Used the cpu')
elif any([
        n.op.__class__.__name__ in ['GpuGemm', 'GpuGemv']
        for n in train.maker.fgraph.toposort()
]):
    print('Used the gpu')
Example #53
0
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the op loops when there are too many threads.

    """

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    y = T.lvector('y')

    b = T.fvector('b')

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check for the corner
    # case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')

    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)
    # theano.printing.debugprint(classify)
    # theano.printing.debugprint(classify_gpu)

    assert any([isinstance(node.op,
                           T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
                for node in classify.maker.fgraph.toposort()])
    assert any([isinstance(node.op,
                           cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
                for node in classify_gpu.maker.fgraph.toposort()])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
        gout - out).max()
    assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
                                             for id, val in enumerate(out[1] -
                                                                      gout[1])
                                             if val != 0]
Example #54
0
    def __init__(self,
                 n_words=20,
                 n_embedding=100,
                 lr=0.01,
                 momentum=0.9,
                 word_to_id=None,
                 null_word_id=-1,
                 load_from_file=None):
        if load_from_file:
            self.load_model(load_from_file)
        else:
            self.regularization = 0.01
            self.n_embedding = n_embedding
            self.lr = lr
            self.momentum = momentum
            self.n_words = n_words
            self.batch_size = 4

            self.word_to_id = word_to_id
            self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems())
            self.null_word_id = null_word_id

            # Question embedding
            # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1)

            # Statement input, output embeddings
            self.weights = init_shared_normal_tensor(4, self.n_words,
                                                     self.n_embedding, 0.1)

            # Linear mapping between layers
            self.H = init_shared_normal(self.n_embedding, self.n_embedding,
                                        0.1)

            # Final output weight matrix
            # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1)

            # Answer embedding matrix
            self.A = init_shared_normal(self.n_words, self.n_embedding, 0.1)

            # Final scoring matrix
            self.U = init_shared_normal(self.n_embedding, self.n_embedding,
                                        0.1)

        zero_vector = T.vector('zv', dtype=theano.config.floatX)

        # Statement
        x = T.imatrix('x')
        xbatch = T.tensor3('xb', dtype='int32')

        # Positional encoding matrix
        pe = T.tensor3('pe')

        # Question
        q = T.ivector('q')
        qbatch = T.imatrix('qb')

        # True word
        r = T.iscalar('r')
        rbatch = T.ivector('rb')

        # Stacked answer vectors
        a = T.imatrix('a')
        abatch = T.tensor3('ab', dtype='int32')

        memory_cost = self.memnn_cost(x, q, a, pe)
        # memory_loss = -T.log(memory_cost[r]) # cross entropy on softmax
        memory_loss = self.memnn_batch_cost(xbatch, qbatch, rbatch, abatch, pe)

        params = [
            self.weights,
            # self.B,
            # self.W,
            self.H,
            self.A,
            self.U,
        ]

        regularization_cost = reduce(
            lambda x, y: x + y,
            map(lambda x: self.regularization * T.sum(x**2), params))

        cost = memory_loss + regularization_cost

        grads = T.grad(cost, params)

        l_rate = T.scalar('l_rate')

        # Parameter updates
        updates = get_param_updates(params,
                                    grads,
                                    lr=l_rate,
                                    method='adagrad',
                                    momentum=0.9,
                                    constraint=self._constrain_embedding(
                                        self.null_word_id, zero_vector))

        self.train_function = theano.function(
            inputs=[
                xbatch, qbatch, rbatch, abatch, pe,
                theano.Param(l_rate, default=self.lr),
                theano.Param(zero_vector,
                             default=np.zeros((self.n_embedding, ),
                                              theano.config.floatX))
            ],
            outputs=cost,
            updates=updates,
            allow_input_downcast=True,
            # mode='FAST_COMPILE',
            #mode='DebugMode'
            #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs)
            on_unused_input='warn')

        self.predict_function = theano.function(
            inputs=[x, q, a, pe],
            outputs=memory_cost,
            allow_input_downcast=True,
            # mode='FAST_COMPILE',
            on_unused_input='warn')
Example #55
0
    mu_phase, sigma_phase, coeff_phase = _slice_outs(phase_outs)

    target_split = n_out // 2
    mag_target = target[:, :, :target_split]
    phase_target = target[:, :, target_split:]

    mag_cost = single_dimensional_gmms(mag_target, mu_mag, sigma_mag,
                                       coeff_mag)
    phase_cost = single_dimensional_phase_gmms(phase_target, mu_phase,
                                               sigma_phase, coeff_phase)

    cost = mag_cost + phase_cost

    cost = cost * mask
    cost = cost.sum() / cut_len
    grads = tensor.grad(cost, params)
    grads = gradient_clipping(grads, 10.)

    learning_rate = 1E-4

    opt = adam(params, learning_rate)
    updates = opt.updates(params, grads)

    train_function = theano.function([
        X_sym, X_mask_sym, c_sym, c_mask_sym, init_h1, init_h2, init_h3,
        init_kappa, init_w, bias_sym
    ], [cost, h1, h2, h3, kappa, w],
                                     updates=updates)
    cost_function = theano.function([
        X_sym, X_mask_sym, c_sym, c_mask_sym, init_h1, init_h2, init_h3,
        init_kappa, init_w, bias_sym
Example #56
# output = T.nnet.sigmoid(T.dot(output, W2) + b2)
print lasagne.layers.get_output(l_decoder, inputs={
    l_in: x_sym
}).eval({
    x_sym: ftest_x
}).shape

loss_all_target = lasagne.objectives.squared_error(output, t_sym).sum()

loss_mean_target = loss_all_target / n_batch

# print loss_mean_target.eval({x_sym:test_x,mask_x_sym:mask_test_x, t_sym: target_train, mask_t_sym: mask_target_train})

all_params_target = lasagne.layers.get_all_params([l_decoder])
all_grads_target = [
    T.clip(g, -10, 10) for g in T.grad(loss_mean_target, all_params_target)
]
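# total_norm_constraint jointly rescales the clipped gradients so that their
# combined L2 norm does not exceed 10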
all_grads_target = lasagne.updates.total_norm_constraint(all_grads_target, 10)
updates_target = adam(all_grads_target, all_params_target)

train_model = theano.function([x_sym, t_sym], [loss_mean_target, output],
                              updates=updates_target)

test_model = theano.function([x_sym, t_sym], [loss_mean_target, output])

num_min_batches = 100
n_batch = 100
epochs = 100

for i in range(epochs):
    start_time = time.time()
Example #57
    def trainCompile(self):

        # Activation
        for i in xrange(self.lastArrayNum):
            self.architecture[i].compileActivation(self, i)

        # Sparse penalty
        for i in xrange(self.lastArrayNum):
            l = self.architecture[i]
            if l.sparsity:
                l.compileSparsity(self, i, self.options.minibatch_size)

        # Weight decay penalty
        for i in xrange(self.lastArrayNum):
            l = self.architecture[i]
            if l.weightDecay:
                l.compileWeightDecayPenalty(self, i)

        # Error (mean squared error over the minibatch; kept under the
        # historical name XENT even though it is not a cross-entropy)
        XENT = 1.0 / self.options.minibatch_size * T.sum((self.y - self.varArrayA[-1]) ** 2 * 0.5)
        self.cost = XENT
        for err in self.regularize:
            self.cost += err

        # Update output array
        self.outputArray.append(self.cost)
        self.outputArray.append(XENT)
        self.outputArray.append(self.varArrayA[-1])

        # Derivatives
        # Collect every trainable variable into gradArray so Theano knows which
        # variables to differentiate the cost with respect to
        gradArray = []
        for i in xrange(self.lastArrayNum):
            for k in self.varWeights[i].keys():
                gradArray.append(self.varWeights[i][k])
        self.derivativesArray = T.grad(self.cost, gradArray)

        # RMS
        if self.options.rmsProp:
            for i in xrange(len(self.derivativesArray)):
                mmsp = theano.shared(np.tile(0.0, gradArray[i].get_value().shape).astype(theano.config.floatX),
                                     name="mmsp%s" % (i + 1))  # 0.0 - 1.0 maybe
                self.MMSprev.append(mmsp)
                mmsn = self.options.rmsProp * mmsp + (1 - self.options.rmsProp) * self.derivativesArray[i] ** 2
                #mmsn = T.clip(mmsn, self.options.mmsmin, 1e+15)  # Fix nan if rmsProp
                mmsn = T.clip(mmsn, self.options.mmsmin, np.finfo(np.float32).max)  # Fix nan if rmsProp
                self.MMSnew.append(mmsn)

        # Update values
        for i in xrange(len(self.derivativesArray)):
            if self.options.rmsProp:
                updateVar = self.options.learnStep * self.derivativesArray[i] / self.MMSnew[i] ** 0.5
                self.updatesArray.append((self.MMSprev[i], self.MMSnew[i]))
            else:
                updateVar = self.options.learnStep * self.derivativesArray[i]
            self.updatesArray.append((gradArray[i], gradArray[i] - updateVar))

        self.train = theano.function(inputs=[self.x, self.y],
                                     outputs=self.outputArray,
                                     updates=self.updatesArray,
                                     allow_input_downcast=True)
        return self
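For reference, the rmsProp branch above keeps a running mean of the squared gradients (MMSprev/MMSnew), clips it away from zero, and scales each gradient step by the inverse square root of that mean. Below is a minimal NumPy sketch of this update rule; rmsprop_step, ms, rho, lr and mms_min are hypothetical names standing in for the shared variables and options used in the class.

import numpy as np

def rmsprop_step(param, grad, ms, rho=0.9, lr=0.01, mms_min=1e-8):
    # running mean of squared gradients (mmsn in the code above)
    ms_new = rho * ms + (1.0 - rho) * grad ** 2
    # keep the accumulator away from zero to avoid NaN / division blow-ups
    ms_new = np.clip(ms_new, mms_min, np.finfo(np.float32).max)
    # gradient step scaled by the root-mean-square of past gradients
    param_new = param - lr * grad / np.sqrt(ms_new)
    return param_new, ms_new

# toy usage
w, m = np.ones(3), np.zeros(3)
w, m = rmsprop_step(w, np.array([0.5, -0.2, 0.1]), m)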
Example #58
def train_med2vec(seqFile='seqFile.txt',
                  demoFile='demoFile.txt',
                  labelFile='labelFile.txt',
                  outFile='outFile.txt',
                  modelFile='modelFile.txt',
                  L2_reg=0.001,
                  numXcodes=20000,
                  numYcodes=20000,
                  embDimSize=1000,
                  hiddenDimSize=2000,
                  batchSize=100,
                  demoSize=2,
                  logEps=1e-8,
                  windowSize=1,
                  verbose=False,
                  maxEpochs=1000):

    options = locals().copy()
    print('initializing parameters')
    params = init_params(options)
    #params = load_params(options)
    tparams = init_tparams(params)

    print('building models')
    f_grad_shared = None
    f_update = None
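    # adadelta (defined elsewhere) returns two compiled functions:
    # f_grad_shared evaluates the cost and stores the gradients in shared
    # variables, and f_update then applies the Adadelta step to the parameters.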
    if demoSize > 0 and numYcodes > 0:
        x, d, y, mask, iVector, jVector, cost = build_model(tparams, options)
        grads = T.grad(cost, wrt=list(tparams.values()))
        f_grad_shared, f_update = adadelta(tparams,
                                           grads,
                                           x,
                                           mask,
                                           iVector,
                                           jVector,
                                           cost,
                                           options,
                                           d=d,
                                           y=y)
    elif demoSize == 0 and numYcodes > 0:
        x, y, mask, iVector, jVector, cost = build_model(tparams, options)
        grads = T.grad(cost, wrt=list(tparams.values()))
        f_grad_shared, f_update = adadelta(tparams,
                                           grads,
                                           x,
                                           mask,
                                           iVector,
                                           jVector,
                                           cost,
                                           options,
                                           y=y)
    elif demoSize > 0 and numYcodes == 0:
        x, d, mask, iVector, jVector, cost = build_model(tparams, options)
        grads = T.grad(cost, wrt=list(tparams.values()))
        f_grad_shared, f_update = adadelta(tparams,
                                           grads,
                                           x,
                                           mask,
                                           iVector,
                                           jVector,
                                           cost,
                                           options,
                                           d=d)
    else:
        x, mask, iVector, jVector, cost = build_model(tparams, options)
        grads = T.grad(cost, wrt=list(tparams.values()))
        f_grad_shared, f_update = adadelta(tparams, grads, x, mask, iVector,
                                           jVector, cost, options)

    print('loading data')
    seqs, demos, labels = load_data(seqFile, demoFile, labelFile)
    n_batches = int(np.ceil(float(len(seqs)) / float(batchSize)))

    print('training start')
    for epoch in range(maxEpochs):
        iteration = 0
        costVector = []
        for index in random.sample(list(range(n_batches)), n_batches):
            batchX = seqs[batchSize * index:batchSize * (index + 1)]
            batchY = []
            batchD = []
            if demoSize > 0 and numYcodes > 0:
                batchY = labels[batchSize * index:batchSize * (index + 1)]
                x, y, mask, iVector, jVector = padMatrix(
                    batchX, batchY, options)
                batchD = demos[batchSize * index:batchSize * (index + 1)]
                cost = f_grad_shared(x, batchD, y, mask, iVector, jVector)
            elif demoSize == 0 and numYcodes > 0:
                batchY = labels[batchSize * index:batchSize * (index + 1)]
                x, y, mask, iVector, jVector = padMatrix(
                    batchX, batchY, options)
                cost = f_grad_shared(x, y, mask, iVector, jVector)
            elif demoSize > 0 and numYcodes == 0:
                x, mask, iVector, jVector = padMatrix(batchX, batchY, options)
                batchD = demos[batchSize * index:batchSize * (index + 1)]
                cost = f_grad_shared(x, batchD, mask, iVector, jVector)
            else:
                x, mask, iVector, jVector = padMatrix(batchX, batchY, options)
                cost = f_grad_shared(x, mask, iVector, jVector)
            costVector.append(cost)
            f_update()
            if (iteration % 10 == 0) and verbose:
                print('epoch:%d, iteration:%d/%d, cost:%f' %
                      (epoch, iteration, n_batches, cost))
            iteration += 1
        print('epoch:%d, mean_cost:%f' % (epoch, np.mean(costVector)))
        tempParams = unzip(tparams)
        np.savez_compressed(outFile + '.' + str(epoch), **tempParams)
Example #59
#--------------------
#declare theano variables
#--------------------
X = theano.tensor.matrix('X', dtype='floatX')
Y = theano.tensor.matrix('Y', dtype='floatX')
theta = theano.shared(numpy.zeros((n, 1)), name="theta")

#--------------------
#declare theano expressions for logistic regression
#--------------------
#hypothesis
h = T.nnet.sigmoid(T.dot(X, theta))
#cost function
cost = 1.0 / m * T.sum(-Y * T.log(h) - (1 - Y) * T.log(1 - h))
#grad function
gtheta = T.grad(cost, theta)
#train function
train = theano.function(inputs=[X, Y],
                        outputs=[
                            cost,
                        ],
                        updates=((theta, theta - alpha * gtheta), ))

#predict function
predict = theano.function(inputs=[
    X,
], outputs=[
    h > 0.5,
])
#-----------------
#train the logistic regression
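The snippet ends before the training loop itself. A minimal sketch of how the compiled functions might be driven is shown below; it assumes the preceding code has been run with m, n and alpha already defined, and X_data, Y_data and num_iters are hypothetical names used only for illustration.

# hypothetical driver for the compiled train/predict functions above
X_data = numpy.random.randn(m, n).astype(theano.config.floatX)
Y_data = (numpy.random.rand(m, 1) > 0.5).astype(theano.config.floatX)

num_iters = 1000
for i in range(num_iters):
    cost_val, = train(X_data, Y_data)   # one full-batch gradient descent step
    if i % 100 == 0:
        print('iteration %d, cost %f' % (i, cost_val))

predictions, = predict(X_data)          # boolean predictions (h > 0.5)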
Example #60
def test_mlp(learning_rate=0.01,
             L1_reg=0.00,
             L2_reg=0.0001,
             n_epochs=1000,
             dataset='mnist.pkl.gz',
             batch_size=20,
             n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient descent updates)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10)

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # start-snippet-5
    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = [T.grad(cost, param) for param in classifier.params]

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))