Пример #1
0
def run_experiment(constraint_adj=False):
	"""
	Train an 8-3-8 autoencoder on one-hot inputs with adadelta and return
	a hinton diagram of the learned hidden-layer activations.

	constraint_adj: forwarded to build_network; presumably toggles the
	adjacency constraint on the hidden layer -- TODO confirm against
	build_network's definition.
	"""
	X,output,cost,P= build_network(8,3,constraint_adj)
	parameters = P.values()
	grads = T.grad(cost,wrt=parameters)
	train = theano.function(
			inputs=[X],
			outputs=cost,
			updates=updates.adadelta(parameters,grads)
			)
	# Compiled but unused below; kept for interactive inspection.
	test = theano.function(
			inputs=[X],
			outputs=output,
			)
	# Training data: the 8 one-hot row vectors (identity reconstruction task).
	data = np.eye(8,dtype=np.int8)
#	data = np.vstack((data,))
	print "Training..."
	for _ in xrange(100000):
		np.random.shuffle(data)
		train(data)

	# Recompute hidden activations directly from the learned parameters
	# (assumes P exposes W_input_hidden and b_hidden -- TODO confirm).
	hidden_activations = theano.function(
			inputs=[X],
			outputs=T.nnet.sigmoid(T.dot(X,P.W_input_hidden)+P.b_hidden)
		)
	#print_arr(test(np.eye(8,dtype=np.int32)))
	#print_arr(1/(1 + np.exp(-parameters[0].get_value())),1)
	return hinton(hidden_activations(np.eye(8,dtype=np.int8)))
Пример #2
0
def make_train(input_size,output_size,mem_size,mem_width,hidden_size=100):
	"""
	Build the NTM-style model and compile its adadelta training step.

	Returns (P, train) where P is the Parameters container and
	train(input_seq, output_seq) performs one update and returns the
	scalar cost (summed cross-entropy + 1e-3 * L2 penalty).

	Fix: the original body mixed tab and space indentation (a TabError
	under Python 3 and fragile under `python -tt`); indentation is now
	uniformly tabs. No behavioral change.
	"""
	P = Parameters()

	# Build controller. ctrl is a network that takes an external and read
	# input and returns the output of the network and its hidden layer.
	ctrl = controller.build(P,input_size,output_size,mem_size,mem_width,hidden_size)

	# Build model that predicts output sequence given input sequence.
	predict = model.build(P,mem_size,mem_width,hidden_size,ctrl)

	input_seq = T.matrix('input_sequence')
	output_seq = T.matrix('output_sequence')
	[M,weights,output_seq_pred] = predict(input_seq)

	# Squash predictions into (5e-6, 1 - 5e-6) so the log inside the
	# cross-entropy never sees an exact 0 or 1.
	cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output_seq_pred,output_seq),axis=1)
	params = P.values()
	l2 = T.sum(0)
	for p in params:
		l2 = l2 + (p ** 2).sum()
	cost = T.sum(cross_entropy) + 1e-3*l2
	# Clip gradients elementwise to [-100, 100] to avoid blow-ups.
	grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]

	train = theano.function(
			inputs=[input_seq,output_seq],
			outputs=cost,
			updates=updates.adadelta(params,grads)
		)

	return P,train
Пример #3
0
def make_train(input_size, output_size, mem_size, mem_width, hidden_sizes=[100]):
    """Compile an adadelta training step for the memory-augmented model.

    Returns (P, train) where train(input_seq, output_seq) applies one
    parameter update and returns the scalar cost (summed cross-entropy
    plus a 1e-3 L2 penalty).
    """
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size,
                            mem_size, mem_width, hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    in_seq = T.matrix('input_sequence')
    target_seq = T.matrix('output_sequence')
    # The model returns several sequences; the last one is the prediction.
    prediction = predict(in_seq)[-1]

    # Keep predictions strictly inside (0, 1) before taking the log.
    eps = 5e-6
    per_step_ce = T.sum(
        T.nnet.binary_crossentropy(eps + (1 - 2 * eps) * prediction,
                                   target_seq),
        axis=1)

    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 += (p ** 2).sum()
    cost = T.sum(per_step_ce) + 1e-3 * l2
    clipped = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(
        inputs=[in_seq, target_seq],
        outputs=cost,
        updates=updates.adadelta(params, clipped),
    )

    return P, train
Пример #4
0
def make_train(input_size,
               output_size,
               mem_size,
               mem_width,
               hidden_sizes=[100]):
    """Compile the adadelta training step (variant without L2 penalty).

    Returns (P, train); train(input_seq, output_seq) performs one update
    and returns the summed per-step cross-entropy.
    """
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    in_seq = T.matrix('input_sequence')
    target_seq = T.matrix('output_sequence')
    # Last element of the returned sequences is the predicted output.
    prediction = predict(in_seq)[-1]

    # Nudge predictions away from exact 0/1 so the log is finite.
    eps = 5e-6
    cross_entropy = T.sum(
        T.nnet.binary_crossentropy(eps + (1 - 2 * eps) * prediction,
                                   target_seq),
        axis=1)
    cost = T.sum(cross_entropy)
    params = P.values()
    clipped_grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[in_seq, target_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, clipped_grads))

    return P, train
Пример #5
0
def make_functions(inputs,outputs,params,grads,lr):
	"""
	Compile a two-phase training pair.

	`acc` accumulates gradients (and a call counter) into shared buffers;
	`update` applies the averaged, norm-clipped gradients via adadelta and
	zeroes the accumulators.

	NOTE(review): `output_ans` and `ans_lbl` are free variables resolved
	from an enclosing/global scope not visible here -- confirm they are
	defined wherever this function is used.
	"""
	shapes = [ p.get_value().shape for p in params ]
	acc_grads = [ theano.shared(np.zeros(s,dtype=np.float32)) for s in shapes ]
	count = theano.shared(np.float32(0))
	acc_update = [ (a,a+g) for a,g in zip(acc_grads,grads) ] + [ (count,count + 1.) ]

#	deltas = acc_grads
	# Average the accumulated gradients, then rescale any whose L2 norm
	# exceeds 1 down to unit norm (gradient norm clipping).
	deltas	  = [ ag / count for ag in acc_grads ]
	grads_norms = [ T.sqrt(T.sum(g**2)) for g in deltas ]
	deltas	  = [ T.switch(T.gt(n,1.),1.*g/n,g) for n,g in zip(grads_norms,deltas) ]
	
#	param_update = [ (p, p - lr * g) for p,g in zip(params,deltas) ]
	param_update = updates.adadelta(params,deltas,learning_rate=lr) # ,learning_rate=lr,rho=np.float32(0.95)

	# Reset accumulators and counter after every parameter update.
	clear_update = [ 
			(a,np.zeros(s,dtype=np.float32)) 
			for a,s in zip(acc_grads,shapes) 
			] + [ (count,0) ]
	acc = theano.function(
			inputs  = inputs,
			outputs = [outputs,output_ans[ans_lbl]],
			updates = acc_update,
			on_unused_input='warn',
#			mode=theano.compile.MonitorMode(post_func=detect_nan)
		)
	# `update` also returns the L2 norm of each applied delta for monitoring.
	update = theano.function(
			inputs=[lr],
			updates = param_update + clear_update,
			outputs = [ T.sqrt(T.sum(T.sqr(w))) for w in deltas ],
			on_unused_input='warn',
#			mode=theano.compile.MonitorMode(post_func=detect_nan)
		)
	return acc,update
Пример #6
0
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    """Build the NTM model and compile one adadelta training step.

    Returns (P, train); train(input_seq, output_seq) returns the cost
    (summed cross-entropy plus 1e-3 * L2 over all parameters).
    """
    P = Parameters()

    # Controller: maps (external input, read vector) -> (output, hidden).
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)

    # Full model: input sequence -> (memory, weights, predicted sequence).
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    seq_in = T.matrix('input_sequence')
    seq_target = T.matrix('output_sequence')
    M, weights, seq_pred = predict(seq_in)

    # Keep predictions strictly within (0, 1) before the log.
    eps = 5e-6
    per_step_ce = T.sum(
        T.nnet.binary_crossentropy(eps + (1 - 2 * eps) * seq_pred,
                                   seq_target),
        axis=1)
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 += (p ** 2).sum()
    cost = T.sum(per_step_ce) + 1e-3 * l2
    # Elementwise gradient clipping to [-100, 100].
    clipped_grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[seq_in, seq_target],
                            outputs=cost,
                            updates=updates.adadelta(params, clipped_grads))

    return P, train
Пример #7
0
def build_model(hidden_size, predict_only=False):
    """Build a 2-input, 2-class MLP with one sigmoid hidden layer.

    Returns (train, predict, hidden_p, params): `train` performs one
    adadelta step and reports accuracy plus layer snapshots, `predict`
    gives P(class 0), and `hidden_p(X, i)` reads the i-th hidden unit.
    """
    X = T.matrix('X')
    Y = T.ivector('Y')
    W_in = U.create_shared(U.initial_weights(2, hidden_size))
    b_in = U.create_shared(U.initial_weights(hidden_size))
    W_out = U.create_shared(U.initial_weights(hidden_size, 2))
    b_out = U.create_shared(U.initial_weights(2))
    params = [W_in, b_in, W_out, b_out]

    hidden_lin = T.dot(X, W_in) + b_in
    hidden = T.nnet.sigmoid(hidden_lin)
    predict = T.nnet.softmax(T.dot(hidden, W_out) + b_out)

    # Negative log-likelihood of the target class, plus a penalty from
    # adjacency_constraint on the hidden pre-activations.
    nll = -T.mean(T.log(predict[T.arange(Y.shape[0]), Y]))
    cost = nll + 1e-3 * adjacency_constraint(hidden_lin)
    accuracy = T.mean(T.eq(T.argmax(predict, axis=1), Y))
    grad = T.grad(cost, params)

    train = theano.function(
        inputs=[X, Y],
        updates=updates.adadelta(params, grad),
        outputs=[accuracy, W_in, b_in, (hidden > 0.5)])
    # Rebind `predict` to the compiled function returning P(class 0).
    predict = theano.function(inputs=[X], outputs=predict[:, 0])

    i = T.iscalar('i')
    hidden_p = theano.function(inputs=[X, i], outputs=hidden[:, i])

    return train, predict, hidden_p, params
Пример #8
0
    def turing_updates(cost , lr) :
        """
        Return adadelta update rules for every parameter in `P` (captured
        from the enclosing scope) given scalar `cost` and learning rate
        `lr`. Adds a 1e-3 L2 weight-decay term and clips gradients with
        updates.clip(5.) before applying adadelta.
        """
        params = P.values()
        # L2 weight decay accumulated over all parameters of P.
        l2 = T.sum(0)
        for p in params:
            l2 = l2 + (p ** 2).sum()
        all_cost = cost + 1e-3 * l2 
        # updates.clip(5.) returns a callable that clips a gradient list;
        # presumably norm- or element-clipping at 5 -- confirm in `updates`.
        clipper = updates.clip(5.)
        g = T.grad(all_cost, wrt=params)
        grads = clipper(g)
#        grads = [T.clip(g, -5, 5) for g in T.grad(all_cost, wrt=params)]
#        return updates.rmsprop(params, grads, learning_rate=lr)
        return updates.adadelta(params, grads, learning_rate=lr)        
Пример #9
0
def make_train_functions():
    """
    Build (acc, update, test) functions for the stack-RNN model.

    `acc(X, Y)` accumulates gradients over calls and returns mean error;
    `update()` applies averaged, clipped gradients via adadelta and
    clears the accumulators; `test(X)` returns predicted symbols for the
    second half of the sequence.

    Fix: the averaged gradients were previously discarded -- the second
    `avg_grads` assignment clipped the RAW accumulators, so the effective
    gradient scaled with the number of accumulated batches. Now the
    gradients are averaged first and then clipped.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X,aux=aux)
    # NLL of the target symbols; (128+1+Y)%(128+1) maps targets (which may
    # include negative sentinels) into valid column indices.
    error = - T.log(output[T.arange(Y.shape[0]),((128+1 + Y)%(128+1))])
    # Score only the second half of the sequence (the part to reproduce).
    error = error[-(Y.shape[0]/2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error),wrt=parameters)
    shapes = [ p.get_value().shape for p in parameters ]
    count = theano.shared(np.float32(0))
    acc_grads  = [
        theano.shared(np.zeros(s,dtype=np.float32))
        for s in shapes
    ]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    # Average over the number of accumulated batches, THEN clip.
    avg_grads = [ (g / count) for g in acc_grads ]
    avg_grads = [ clip(g,1) for g in avg_grads ]


    acc = theano.function(
            inputs=[X,Y],
            outputs=T.mean(error),
            updates = acc_update,
        )
    update = theano.function(
            inputs=[],
            updates=updates.adadelta(parameters,avg_grads,learning_rate=1e-8) + acc_clear
        )

    test = theano.function(
            inputs=[X],
            outputs=T.argmax(output,axis=1)[-(X.shape[0]/2):],
        )
    return acc,update,test
Пример #10
0
def make_train_functions():
    """
    Build (acc, update, test) functions for the stack-RNN model.

    `acc(X, Y)` accumulates gradients over calls and returns mean error;
    `update()` applies averaged, clipped gradients via adadelta and
    clears the accumulators; `test(X)` returns predicted symbols for the
    second half of the sequence.

    Fix: the averaged gradients were previously discarded -- the second
    `avg_grads` assignment clipped the RAW accumulators, so the effective
    gradient scaled with the number of accumulated batches. Now the
    gradients are averaged first and then clipped.
    """
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X, aux=aux)
    # NLL of the target symbols; (128+1+Y)%(128+1) maps targets (which may
    # include negative sentinels) into valid column indices.
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    # Score only the second half of the sequence (the part to reproduce).
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)
    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    # Average over the number of accumulated batches, THEN clip.
    avg_grads = [(g / count) for g in acc_grads]
    avg_grads = [clip(g, 1) for g in avg_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear)

    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
Пример #11
0
def train_model(docs,
                wordvec_size,
                hidden_size,
                error_threshold,
                update_mu=1e-3,
                update_eps=0.95):
    """
    Train the autoencoder on `docs` for 10 epochs with adadelta and
    return a compiled function mapping X to the hidden/reproduction
    outputs.

    NOTE(review): `error_threshold` is unused -- the loop always runs 10
    epochs; confirm whether early stopping was intended.
    """
    X, parameters, hidden, hidden1_reproduction, input_reproduction, unrolled = build_network(
        wordvec_size, hidden_size)

    #hidden, hidden_rep, input_rep, unrlld  = f(docs)

    error = build_error(X, hidden, hidden1_reproduction, input_reproduction)
    cost = error  # + 1e-6*sum( T.sum(abs(p)) for p in parameters )
    gradients = T.grad(cost, wrt=parameters)

    # Learning-rate / decay scalars are fed at call time, not baked in.
    eps = T.dscalar('eps')
    mu = T.dscalar('mu')

    train = theano.function(inputs=[X, eps, mu],
                            updates=updates.adadelta(parameters, gradients, mu,
                                                     eps),
                            outputs=error)

    error = 10
    count = 0
    for i in range(10):
        start_time = time.time()
        error = 0
        for doc in docs:
            error += train(doc, update_mu, update_eps)
        # `count % 1 == 0` is always true, so this prints every epoch.
        if count % 1 == 0:
            print "iter=%d" % count, time.time() - start_time, error / len(
                docs)
        count += 1

    f = theano.function(
        inputs=[X],
        outputs=[hidden, hidden1_reproduction, input_reproduction, unrolled])
    print "Finish count=%d error=%f" % (count, error)
    return f
Пример #12
0
def make_functions(inputs, outputs, params, grads, lr):
    """
    Compile a two-phase training pair.

    `acc` accumulates gradients (and a call counter) into shared buffers;
    `update` applies the averaged, norm-clipped gradients via adadelta
    and zeroes the accumulators.

    NOTE(review): `output_ans` and `ans_lbl` are free variables resolved
    from an enclosing/global scope not visible here -- confirm they are
    defined wherever this function is used.
    """
    shapes = [p.get_value().shape for p in params]
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]
    count = theano.shared(np.float32(0))
    acc_update = [(a, a + g)
                  for a, g in zip(acc_grads, grads)] + [(count, count + 1.)]

    #	deltas = acc_grads
    # Average the accumulated gradients, then rescale any whose L2 norm
    # exceeds 1 down to unit norm (gradient norm clipping).
    deltas = [ag / count for ag in acc_grads]
    grads_norms = [T.sqrt(T.sum(g**2)) for g in deltas]
    deltas = [
        T.switch(T.gt(n, 1.), 1. * g / n, g)
        for n, g in zip(grads_norms, deltas)
    ]

    #	param_update = [ (p, p - lr * g) for p,g in zip(params,deltas) ]
    param_update = updates.adadelta(
        params, deltas,
        learning_rate=lr)  # ,learning_rate=lr,rho=np.float32(0.95)

    # Reset accumulators and counter after every parameter update.
    clear_update = [(a, np.zeros(s, dtype=np.float32))
                    for a, s in zip(acc_grads, shapes)] + [(count, 0)]
    acc = theano.function(
        inputs=inputs,
        outputs=[outputs, output_ans[ans_lbl]],
        updates=acc_update,
        on_unused_input='warn',
        #			mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    # `update` also returns the L2 norm of each applied delta for monitoring.
    update = theano.function(
        inputs=[lr],
        updates=param_update + clear_update,
        outputs=[T.sqrt(T.sum(T.sqr(w))) for w in deltas],
        on_unused_input='warn',
        #			mode=theano.compile.MonitorMode(post_func=detect_nan)
    )
    return acc, update
Пример #13
0
def build_model(hidden_size,predict_only=False):
    """Construct the one-hidden-layer 2-class classifier.

    Returns (train, predict, hidden_p, params); `train` performs one
    adadelta step and returns accuracy plus layer snapshots, `predict`
    gives P(class 0), `hidden_p(X, i)` reads the i-th hidden unit.
    """
    X = T.matrix('X')
    Y = T.ivector('Y')
    weights_ih = U.create_shared(U.initial_weights(2,hidden_size))
    bias_h = U.create_shared(U.initial_weights(hidden_size))
    weights_ho = U.create_shared(U.initial_weights(hidden_size,2))
    bias_o = U.create_shared(U.initial_weights(2))
    params = [weights_ih, bias_h, weights_ho, bias_o]

    pre_activation = T.dot(X, weights_ih) + bias_h
    hidden = T.nnet.sigmoid(pre_activation)
    probs = T.nnet.softmax(T.dot(hidden, weights_ho) + bias_o)

    # Target-class NLL plus the adjacency penalty on hidden pre-activations.
    cost = (-T.mean(T.log(probs[T.arange(Y.shape[0]), Y]))
            + 1e-3 * adjacency_constraint(pre_activation))
    accuracy = T.mean(T.eq(T.argmax(probs, axis=1), Y))
    grad = T.grad(cost, params)

    train = theano.function(
        inputs=[X, Y],
        updates=updates.adadelta(params, grad),
        outputs=[accuracy, weights_ih, bias_h, (hidden > 0.5)],
    )
    predict = theano.function(inputs=[X], outputs=probs[:, 0])

    unit_index = T.iscalar('i')
    hidden_p = theano.function(inputs=[X, unit_index],
                               outputs=hidden[:, unit_index])

    return train, predict, hidden_p, params
Пример #14
0
def run_experiment(constraint_adj=False):
    X, output, cost, P = build_network(8, 3, constraint_adj)
    parameters = P.values()
    grads = T.grad(cost, wrt=parameters)
    train = theano.function(inputs=[X],
                            outputs=cost,
                            updates=updates.adadelta(parameters, grads))
    test = theano.function(
        inputs=[X],
        outputs=output,
    )
    data = np.eye(8, dtype=np.int8)
    #	data = np.vstack((data,))
    print "Training..."
    for _ in xrange(100000):
        np.random.shuffle(data)
        train(data)

    hidden_activations = theano.function(
        inputs=[X],
        outputs=T.nnet.sigmoid(T.dot(X, P.W_input_hidden) + P.b_hidden))
    #print_arr(test(np.eye(8,dtype=np.int32)))
    #print_arr(1/(1 + np.exp(-parameters[0].get_value())),1)
    return hinton(hidden_activations(np.eye(8, dtype=np.int8)))
Пример #15
0
	X,parameters,hidden,hidden1_reproduction,input_reproduction,unrolled = build_network(8,64)
	f = theano.function(
			inputs  = [X],
			outputs = [hidden,hidden1_reproduction,input_reproduction,unrolled]
		)

	error = build_error(X,hidden,hidden1_reproduction,input_reproduction)
	cost  = error # + 1e-6*sum( T.sum(abs(p)) for p in parameters )
	gradients = T.grad(cost,wrt=parameters)
	
	eps = T.dscalar('eps')
	mu  = T.dscalar('mu')
	
	train = theano.function(
			inputs = [X,eps,mu],
			updates = updates.adadelta(parameters,gradients,mu,eps),
			outputs = error
		)

	#example = np.vstack((np.eye(8),np.eye(8)))
	example = np.eye(8)
	error = 10
	lr = 0.0001
	t = 0
	while error > 0.0001:
		np.random.shuffle(example)
		#error = train(example,lr,min(1 - 3.0/(t+5),0.999))
		error = train(example,1e-6,0.95)
		#error = train(example,lr,0)
		print error
		t += 1
Пример #16
0
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)

    return X, predict


def label_seq(string):
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1, ), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    X, predict = build_model(P, X, 10, 10, 10)

    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates.adadelta(params, grad))

    for _ in xrange(10):
        print train(
            np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))
Пример #17
0
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)

    return X, predict


def label_seq(string):
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1,), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    X, predict = build_model(P, X, 10, 10, 10)

    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(
        inputs=[X, Y],
        outputs=cost,
        updates=updates.adadelta(params, grad)
    )

    for _ in xrange(10):
        print train(np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))