Example #1
def run():
    # params
    dims = 10
    negrate = 1
    batsize = 300
    epochs = 300

    #paths
    datafileprefix = "../../data/nycfilms/"
    dirfwdsuffix = "direct_forward.plustypes.ssd"

    # get the data and split
    dirfwdf = open(datafileprefix+dirfwdsuffix)
    datadf = readdata(dirfwdf)
    traind, validd, testd = datadf.split((70, 15, 15), random=True)

    numents = int(datadf.ix[:, 0].max())+1
    print numents
    numrels = int(datadf.ix[:, 1].max())+1
    print numrels

    # define model
    inp = Input(T.imatrix())

    eemb = VectorEmbed.indim(numents).outdim(dims).Wreg(l2reg(0.00001))()
    remb = VectorEmbed.indim(numrels).outdim(dims).Wreg(l2reg(0.00001))()

    # for debugging
    eembd = SymTensor(T.fmatrix())
    rembd = SymTensor(T.fmatrix())
    dotp = SymTensor(T.fmatrix())

    out = ((inp[:, 0] >> eemb >> eembd) & (inp[:, 1] >> remb >> rembd)) >> DotProduct() >> dotp >> Tanh()

    # for plotting purposes: relation to relation dot product (or relation-type)
    r2rinp = Input(T.imatrix())
    rel2rel = ((r2rinp[:, 0] >> remb) & (r2rinp[:, 1] >> remb)) >> DotProduct()

    outtest = Output(T.fvector())

    loss = (out & outtest) >> HingeLoss()
    trainer = Trainer\
        .batsize(batsize)\
        .epochs(epochs)\
        .onrun(getonrun())\
        .offrun(offrun)\
        .offepoch(getoffepoch(out, rel2rel))\
        .onbatch(getonbatch(negrate, numents, numrels))\
        .optimizer(sgd(lr=1.))\
        .batchtransformer(transbat)
    trainer\
        .loss(loss)

    trainer.train(traind.values, validd.values)\
           .test(testd.values)

    explore(eemb, remb)
    # functions for interactive exploration

    embed()
Example #2
 def make_node(self, x, x2, x3, x4, x5):
     # check that the theano version has support for __props__.
     # This next line looks like it has a typo,
     # but it's actually a way to detect the theano version
     # is sufficiently recent to support the use of __props__.
     assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
     x = tensor.as_tensor_variable(x)
     x2 = tensor.as_tensor_variable(x2)
     x3 = tensor.as_tensor_variable(x3)
     x4 = tensor.as_tensor_variable(x4)
     x5 = tensor.as_tensor_variable(x5)
     
     if prm.att_doc:
         if prm.compute_emb:
             td = tensor.itensor4().type()
         else:
             td = tensor.ftensor4().type()
         tm = tensor.ftensor3().type()
     else:
         if prm.compute_emb:
             td = tensor.itensor3().type()
         else:
             td = tensor.ftensor3().type()
         tm = tensor.fmatrix().type()
     return theano.Apply(self, [x,x2,x3,x4,x5], [td, tm, \
                                        tensor.fmatrix().type(), tensor.ivector().type()])
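For context, the hasattr(self, '_props') assertion above guards against Theano versions that predate __props__. A minimal sketch of an Op that relies on __props__, following the pattern of the Theano "Creating a new Op" tutorial rather than anything from this repository:

import theano
import theano.tensor as tensor

class DoubleOp(theano.Op):
    # declaring __props__ lets Theano derive __eq__ and __hash__ automatically
    __props__ = ()

    def make_node(self, x):
        x = tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        # doubles the input; output_storage holds one slot per output
        output_storage[0][0] = 2 * inputs[0]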
	def __init__(self,
				 word_vec_width,
				 batch_size,
				 num_hidden,
				 learning_rate=0.1):
		self.num_hidden = num_hidden
		self.learning_rate = learning_rate
		self.word_vec_width = word_vec_width
		self.batch_size = batch_size

		self.vocab_mat = T.fmatrix('vocab')
		self.word_onehot = T.fmatrix('word_onehot')
		b = T.fvector('b')
		W = T.fmatrix('W')
		f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat) + b))))
		s = T.sum(f)

		self.exec_fn = theano.function(
			[self.word_onehot, b, W, self.vocab_mat],
			f,
			allow_input_downcast=True)

		self.word_onehot_c = T.fmatrix('word_onehot_c')
		f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b)))
		s_c = T.sum(f_c)

		J = T.largest(0, 1 - s + s_c)
		self.grad = theano.grad(J, [b, W, self.vocab_mat])

		self.grad_fn = theano.function(
			[self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
			self.grad,
			allow_input_downcast=True)
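Written out, the objective J built above is the standard pairwise ranking hinge loss between the score s of the observed window and the score s_c of the corrupted window:

    J = \max(0,\; 1 - s + s_c)

so minimizing J pushes the score of the true sample at least one margin unit above the corrupted one.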
Example #4
    def __init__(self, input_layers, *args, **kwargs):
        super(LogLossObjective, self).__init__(input_layers, *args, **kwargs)
        self.input_systole = input_layers["systole:onehot"]
        self.input_diastole = input_layers["diastole:onehot"]

        self.target_vars["systole:onehot"]  = T.fmatrix("systole_target_onehot")
        self.target_vars["diastole:onehot"] = T.fmatrix("diastole_target_onehot")
def test_pickle_unpickle_without_reoptimization():
    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = T.fmatrix('x1')
    x2 = T.fmatrix('x2')
    x3 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    x4 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    y = T.sum(T.sum(T.sum(x1**2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = theano.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled theano fn
    string_pkl = pickle.dumps(f, -1)

    # compute f value
    in1 = numpy.ones((10, 10), dtype=floatX)
    in2 = numpy.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = theano.config.reoptimize_unpickled_function
    try:
        # the default is True
        theano.config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        theano.config.reoptimize_unpickled_function = default
Example #6
    def cmp(a_shp, b_shp):
        a0 = my_rand(*a_shp)
        a = tcn.shared_constructor(a0, 'a')

        b = tensor.fmatrix('b')
        c = tensor.fmatrix('c')

        f = pfunc([b, c], [], updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
                mode=mode_with_gpu)
        assert any([node.op == tcn.blas.gpu_gemm_inplace
            for node in f.maker.fgraph.toposort()])

        bval = my_rand(*b_shp)
        cval = my_rand(a_shp[0], b_shp[1])
        f(bval, cval)

        assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval),
                a.get_value())

        # Try with a matrix equal to a0, but with strides in both dims
        a.set_value(a0)
        a.set_value(
                a.get_value(borrow=True,
                    return_internal_type=True)[::-1, ::-1],
                borrow=True)
        f(bval, cval)
Example #7
    def cmp(a_shp, b_shp):
        a = tensor.fmatrix()
        b = tensor.fmatrix()
        scalar = tensor.fscalar()
        av = my_rand(*a_shp)
        bv = my_rand(*b_shp)

        f = theano.function(
                [a, b],
                tensor.dot(a, b) * numpy.asarray(4, 'float32'),
                mode=mode_with_gpu)
        f2 = theano.function(
                [a, b],
                tensor.dot(a, b) * numpy.asarray(4, 'float32'))
        t = f.maker.fgraph.toposort()
        assert len(t) == 4
        assert isinstance(t[0].op, tcn.GpuFromHost)
        assert isinstance(t[1].op, tcn.GpuFromHost)
        assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
        assert isinstance(t[3].op, tcn.HostFromGpu)
        assert numpy.allclose(f(av, bv), f2(av, bv))

        f = theano.function([a, b, scalar], tensor.dot(a, b) * scalar,
                mode=mode_with_gpu)
        f2 = theano.function([a, b, scalar], tensor.dot(a, b) * scalar)
        t = f.maker.fgraph.toposort()
        assert len(t) == 4
        assert isinstance(t[0].op, tcn.GpuFromHost)
        assert isinstance(t[1].op, tcn.GpuFromHost)
        assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
        assert isinstance(t[3].op, tcn.HostFromGpu)
        assert numpy.allclose(f(av, bv, 0.5), f2(av, bv, 0.5))
Example #8
def test_gpujoin_gpualloc():
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b], T.join(0, T.zeros_like(a),T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a),
                                           T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
    assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu2.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.env.toposort()]) == 1
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
Example #9
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)
Example #10
def show_patches_on_frames(ims, locations_, scales_,
                           image_shape=(100, 100), patch_shape=(16, 16)):
    hyperparameters = {}
    hyperparameters["cutoff"] = 3
    hyperparameters["batched_window"] = True
    location = T.fmatrix()
    scale = T.fmatrix()
    x = T.fvector()
    cropper = LocallySoftRectangularCropper(
        patch_shape=patch_shape,
        hyperparameters=hyperparameters,
        kernel=Gaussian())
    patch = cropper.apply(
        x.reshape((1, 1,) + image_shape),
        np.array([list(image_shape)]),
        location,
        scale)
    get_patch = theano.function([x, location, scale], patch,
                                allow_input_downcast=True)
    final_shape = (image_shape[0], image_shape[0] + patch_shape[0] + 5)
    ret = np.ones((ims.shape[0], ) + final_shape + (3,), dtype=np.float32)
    for i in range(ims.shape[0]):
        im = ims[i]
        location_ = locations_[i]
        scale_ = scales_[i]
        patch_on_frame = show_patch_on_frame(im, location_, scale_)
        ret[i, :, :image_shape[1], :] = patch_on_frame
        ret[i, -patch_shape[0]:, image_shape[1] + 5:, :] = to_rgb1(
            get_patch(im, [location_], [scale_])[0, 0])
    return ret
Example #11
def test_elemwise_composite_float64():
    # test that we don't fuse a composite elemwise with a float64 somewhere inside:
    # nvcc by default downcasts float64 to float32. We would need to tell it not
    # to do so, but that is only possible on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l
    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
                                               b),
                                     'float32'), mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
Example #12
    def __init__(self, name, input_neurons, output_neurons):
        self.input_neurons=input_neurons
        self.output_neurons=output_neurons

        self.name = name

        #Initialize theano variables:
        self.W_forget_theano = T.fmatrix(self.name + '_forget_weight')
        self.W_input_theano = T.fmatrix(self.name + '_input_weight')
        self.W_candidate_theano = T.fmatrix(self.name + '_candidate_weight')
        self.W_output_theano = T.fmatrix(self.name + '_output_weight')

        #Initialize python variables:

        high_init = np.sqrt(6)/np.sqrt(self.input_neurons + 2*self.output_neurons)
        low_init = -high_init
        
        s = (self.output_neurons, self.input_neurons + self.output_neurons + 1)
        self.W_forget = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
        self.W_input = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
        self.W_candidate = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
        self.W_output = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)

        #Initialize forget bias to one:
        self.W_forget[-1] = np.ones_like(self.W_forget[-1], dtype=np.float32)
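Read directly from the code above, the weight matrices are drawn with a Glorot-style uniform bound,

    W \sim \mathcal{U}(-b,\, b), \qquad b = \frac{\sqrt{6}}{\sqrt{n_\text{in} + 2\,n_\text{out}}}

with n_in = input_neurons and n_out = output_neurons, after which the forget-gate bias row is overwritten with ones.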
Example #13
    def __theano_build__(self):
        params = self.params
        param_names = self.param_names
        hidden_dim = self.hidden_dim

        x1  = T.imatrix('x1')    # first sentence
        x2  = T.imatrix('x2')    # second sentence
        x1_mask = T.fmatrix('x1_mask')    #mask
        x2_mask = T.fmatrix('x2_mask')
        y   = T.ivector('y')     # label
        y_c = T.ivector('y_c')   # class weights 
        
        # Embdding words
        _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
        _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
        statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
        statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])
        
        def rnn_cell(x, mx, ph, Wh):
            h = T.tanh(ph.dot(Wh) + x)
            h = mx[:, None] * h + (1-mx[:, None]) * ph
            return [h] 
            
        [h1], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex1, x1_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
            non_sequences=params["W"][2])
        
        [h2], updates = theano.scan(
            fn=rnn_cell,
            sequences=[statex2, x2_mask],
            truncate_gradient=self.truncate,
            outputs_info=[dict(initial=h1[-1])],
            non_sequences=params["W"][3])
       
        #predict
        _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) + h2[-1].dot(params["lrW"][1]) + params["lrb"])
        _p = T.argmax(_s, axis=1)
        _c = T.nnet.categorical_crossentropy(_s, y)
        _c = T.sum(_c * y_c)
        _l = T.sum(params["lrW"]**2)
        _cost = _c + 0.01 * _l
        
        # SGD parameters
        learning_rate = T.scalar('learning_rate')
        decay = T.scalar('decay')
        
        # Gradients and updates
        _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)
        
        # Assign functions
        self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
        self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
        self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
        self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
        self.sgd_step = theano.function(
            [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
            updates=_updates)
Example #14
	def setup_theano(self):
		self.vocab_mat = T.fmatrix('vocab')
		self.sample = T.fmatrix('sample')
		b = T.fvector('b')
		W = T.fmatrix('W')
		f = self.transform_function(
			W, 
			b, 
			self.wordvec_transform(self.sample, self.vocab_mat))
		s = T.sum(f)

		self.corrupt_sample = T.fmatrix('corrupt-sample')
		f_corrupt = self.transform_function(
			W,
			b,
			self.wordvec_transform(self.corrupt_sample, self.vocab_mat))
		s_corrupt = T.sum(f_corrupt)
		J = T.largest(0, 1 - s + s_corrupt)
		self.grad = theano.grad(J, [b, W, self.vocab_mat])

		self.grad_fn = theano.function(
			[self.sample, self.corrupt_sample, b, W, self.vocab_mat],
			self.grad,
			allow_input_downcast=True)

		self.exec_fn = theano.function([self.sample, b, W, self.vocab_mat],
			f,
			allow_input_downcast=True)
Example #15
    def build_loss_graph(self, saved_graph=None):
        print("Building loss graph...")

        for l in self.layers:
            l.set_training(False)

        Sentence = T.fmatrix('Sentence')
        Characters = T.ftensor3('Characters')
        WordLengths = T.ivector('WordLengths')
        GoldPredictions = T.fmatrix('GoldPredictions')
        
        weight_list = self.get_theano_weight_list()

        if self.feature_mode == 'character':
            result = self.theano_sentence_loss(Characters, WordLengths, GoldPredictions)
            input_list = [Characters, WordLengths, GoldPredictions] + list(weight_list)
        elif self.feature_mode == 'sentence':
            result = self.theano_sentence_loss(Sentence, GoldPredictions)
            input_list = [Sentence, GoldPredictions] + list(weight_list)
        elif self.feature_mode == 'both':
            result = self.theano_sentence_loss(Sentence, Characters, WordLengths, GoldPredictions)
            input_list = [Sentence, Characters, WordLengths, GoldPredictions] + list(weight_list)

        cgraph = theano.function(inputs=input_list, outputs=result, mode='FAST_RUN', allow_input_downcast=True)

        print("Done building graph.")
        
        return cgraph
Example #16
 def _training_DNN(self):   
     trX, trY, self.missing_filename_list,  = read_features(self.test_number, self.n_input_f, self.n_output_f)     
     trX = trX[:,1:self.n_input_f]
     trY = trY[:,1:self.n_output_f]
     print trX.shape
     print trY.shape   
     print self.nloop, self.n_hidden_layer, self.n_input_f, self.n_hidden_f, self.n_output_f
     
     X = T.fmatrix()
     Y = T.fmatrix()
     py_x = self._model(X, self.params, self.bias)
     y_x = py_x
     cost = T.mean(T.sqr(py_x - Y))
     updates = self._sgd(cost, self.params, self.bias)
     train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
     self.predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
           
     for i in range(self.nloop, self.nloop + 0 ):
         print i
         #logging.debug('loop' + str(i))
         error_total = 0
         arr_X_Y = zip(range(0, len(trX), 128), range(128, len(trX), 128))
         for start, end in arr_X_Y:
             cost = train(trX[start:end], trY[start:end])
             error_total += cost
             #print cost
         last_element = arr_X_Y[len(arr_X_Y)-1][0] 
         if last_element < len(trX):
             cost = train(trX[last_element: len(trX)], trY[last_element:len(trY)])    
             error_total += cost
         print error_total / len(trX)
         save_weight_info( self.filename, i, self.n_hidden_layer, self.n_input_f, self.n_hidden_f, self.n_output_f, self.params, error_total, self.bias)
         self.id_file = 1 - self.id_file
         self.filename = self.weight_folder + 'id_' + str(self.id_file) + ".txt"            
Example #17
def test_graph_opt_caching():
    opt_db_file = os.path.join(theano.config.compiledir, 'optimized_graphs.pkl')
    if os.path.exists(opt_db_file):
        os.remove(opt_db_file)

    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    default = theano.config.cache_optimizations
    try:
        theano.config.cache_optimizations = True
        a = T.fmatrix('a')
        b = T.fmatrix('b')
        c = theano.shared(np.ones((10, 10), dtype=floatX))
        d = theano.shared(np.ones((10, 10), dtype=floatX))
        e = T.sum(T.sum(T.sum(a ** 2 + b) + c) + d)
        f1 = theano.function([a, b], e, mode=mode)

        m = T.fmatrix('x1')
        n = T.fmatrix('x2')
        p = theano.shared(np.ones((10, 10), dtype=floatX))
        q = theano.shared(np.ones((10, 10), dtype=floatX))
        j = T.sum(T.sum(T.sum(m ** 2 + n) + p) + q)
        f2 = theano.function([m, n], j, mode=mode)

        in1 = np.ones((10, 10), dtype=floatX)
        in2 = np.ones((10, 10), dtype=floatX)
        assert f1(in1, in2) == f2(in1, in2)
    finally:
        theano.config.cache_optimizations = default
Example #18
def multiclass_logistic_regr(mnist):
    
    def floatX(X):
        return np.asarray(X, dtype=theano.config.floatX)

    def init_weights(shape):
        return theano.shared(floatX(np.random.randn(*shape)*0.01))

    def model(X, w):
        return T.nnet.softmax(T.dot(X, w))

    # each image is 28x28
    # trX: 60,000x784
    # trY: 60,000x10
    trX, teX, trY, teY = mnist(onehot=True)

    X = T.fmatrix()
    Y = T.fmatrix()

    w = init_weights([784, 10])

    py_x = model(X, w)
    y_pred = T.argmax(py_x, axis=1)

    cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
    gradient = T.grad(cost=cost, wrt=w)
    update = [[w, w - 0.05*gradient]]

    train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
    predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

    mbsize = 128
    for start, end in zip(xrange(0, len(trX), mbsize), xrange(mbsize, len(trX), mbsize)):
        c = train(trX[start:end], trY[start:end])
        print np.mean(np.argmax(teY, axis=1) == predict(teX))
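The update list above is one step of plain gradient descent on the cross-entropy cost, i.e.

    w \leftarrow w - \eta\,\nabla_w \mathcal{L}(w)

with a fixed learning rate \eta = 0.05.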
Example #19
    def build_ann(self, number_of_input_nodes, no, nr_of_hidden_layers, nr_of_nodes_in_layers, act_functions):
        weights = []
        a = theano.shared(np.random.uniform(low=-.1, high=.1, size=(number_of_input_nodes, nr_of_nodes_in_layers[0])))
        weights.append(a)
        for i in range(1, nr_of_hidden_layers):
            weights.append(theano.shared(np.random.uniform(low=-.1, high=.1, size=(nr_of_nodes_in_layers[i-1], nr_of_nodes_in_layers[i]))))
        weights.append(theano.shared(np.random.uniform(low=-.1, high=.1, size=(nr_of_nodes_in_layers[-1], no))))

        input = T.fmatrix()
        target = T.fmatrix()

        layers = []
        # First hidden layer
        self.add_layer_activation_function(act_functions[0], layers, input, weights[0])
        # Next layers
        for j in range(nr_of_hidden_layers):
            self.add_layer_activation_function(act_functions[j+1], layers, layers[j], weights[j+1])


        error = T.sum(pow((target - layers[-1]), 2)) # Sum of squared errors
        params = [w for w in weights]
        gradients = T.grad(error, params)
        backprops = self.backprop_acts(params, gradients)

        #self.get_x1 = theano.function(inputs=[input, target], outputs=error, allow_input_downcast=True)
        self.trainer = theano.function(inputs=[input, target], outputs=error, updates=backprops, allow_input_downcast=True)
        self.predictor = theano.function(inputs=[input], outputs=layers[-1], allow_input_downcast=True)
def create_encoder_decoder_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # reconstruction loss
    encoder_decoder_loss = T.mean(
        T.mean(T.sqr(X - X_hat), axis=1)
    )

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(
            layers['l_decoder_out'], trainable=True)

        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, encoder_decoder_params, 0.01, 0.9)
    else:
        encoder_decoder_updates = None

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={
            X: X_batch,
        },
    )

    return encoder_decoder_func
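A minimal usage sketch for create_encoder_decoder_func, assuming a toy Lasagne autoencoder registered under the 'l_decoder_out' key it expects; the layer sizes and data here are hypothetical:

import numpy as np
import lasagne
from lasagne.layers import InputLayer, DenseLayer

# tiny stand-in autoencoder: 784 -> 64 -> 784
l_in = InputLayer((None, 784))
l_enc = DenseLayer(l_in, num_units=64)
layers = {'l_decoder_out': DenseLayer(l_enc, num_units=784,
                                      nonlinearity=lasagne.nonlinearities.sigmoid)}

train_fn = create_encoder_decoder_func(layers, apply_updates=True)
batch = np.random.rand(32, 784).astype(np.float32)
print(train_fn(batch))  # mean squared reconstruction error for this batch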
Example #21
def main_train():
    trX, teX, trY, teY = mnist(onehot=True)

    X = T.fmatrix()
    Y = T.fmatrix()

    w_h = init_weights((784, 625))
    w_h2 = init_weights((625, 625))
    w_o = init_weights((625, 10))
    params = [w_h, w_h2, w_o]

    noise_h, noise_h2, noise_py_x = model(X, params, 0.2, 0.5)
    h, h2, py_x = model(X, params, 0., 0.)
    y_x = T.argmax(py_x, axis=1)

    cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
    updates = RMSprop(cost, params, lr=0.001)

    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
    predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

    for i in range(100):
        for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
            cost = train(trX[start:end], trY[start:end])
        print np.mean(np.argmax(teY, axis=1) == predict(teX))
        if i % 10 == 0:
            name = 'media/model/modnet-{0}.model'.format(str(i))
            save_model(name, params)
    name = 'media/model/modnet-final.model'
    save_model(name, params)
Example #22
    def __init__(self, dimX, dimZ, hls, acts):
        self.dimZ = dimZ
        self.f = MLP(dimX, dimZ, [1200], [tanh, tanh])
        self.g = MLP(dimZ, dimX, [1200], [tanh, sigm])
        self.generator = MLP(dimZ, dimX, [1200, 1200], [tanh, tanh, sigm])
        self.params = self.f.params + self.g.params + self.generator.params
        x = T.fmatrix('x')
        lr = T.scalar('lr')
        noise = T.scalar('noise')
        z = self.f(2*x-1)
        rx = self.g(z)
        cost_recons = ce(rx, x).mean(axis=1).mean(axis=0)

        rand = rng_theano.uniform(low=0, high=1, size=z.shape)
        nz = self.nearest_neighbour_of_in(rand, z) # nn of rand in z
        xnz = self.g(nz)
        rxx = self.generator(rand)
        cost_gen = ce(rxx, xnz).mean(axis=1).mean(axis=0)
        grads_f = T.grad(cost_recons, self.f.params)
        grads_g = T.grad(cost_recons, self.g.params)
        grads_gen = T.grad(cost_gen, self.generator.params)
        grads = grads_f + grads_g + grads_gen
        updates = map(lambda (param, grad): (param, param - lr * grad), zip(self.params, grads))
        nnd = self.nearest_neighbour_distances(z)
        self.train_fn = theano.function([x, lr], [cost_recons, cost_gen, nnd.mean(), nnd.std()], updates=updates)

        z = T.fmatrix('z')
        self.sample_fn = theano.function([z], self.g(z), allow_input_downcast=True)
        self.infer_fn = theano.function([x], self.f(2*x-1), allow_input_downcast=True)
        self.generator_fn = theano.function([z], self.generator(z), allow_input_downcast=True)
Example #23
def test_does_not_crash():
  Z = T.ftensor3('Z')
  W_re = T.fmatrix('W_re')
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')
  Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)

  f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)

  n_T = 5
  n_batch = 4
  n_inp_dim = 3
  n_cells = 8
  numpy.random.seed(1234)
  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  #i_val = numpy.ones((n_T, n_batch), dtype='int8')
  i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T

  Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
  #print Y_val
  print("success")
Example #24
    def predict_df(self, input_df = None ):

        f = open('/tmp/obj.save', 'rb')
        neural_model = cPickle.load(f)
        f.close()

        X, y = neural_model['enc'].transform(input_df)
        # X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)
        trX, teX, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

        trY = one_hot(Y_train, n=2)
        teY = one_hot(Y_test, n=2)

        X = T.fmatrix()
        Y = T.fmatrix()

        h, h2, py_x = model(X, neural_model['w_h'], neural_model['w_h2'], neural_model['w_o'], 0., 0.)
        y_pred = T.argmax(py_x, axis=1)

        cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
        params = [neural_model['w_h'], neural_model['w_h2'], neural_model['w_o']]
        gradient = T.grad(cost=cost, wrt=params)
        update = [[p, p - g * 0.05] for p, g in zip(params, gradient)]

        train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
        predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)
        print('Loaded precision:' , np.mean(np.argmax(teY, axis=1) == predict(teX)))


        return predict(teX)
    def train(self, trX, teX, trY, teY, plot=True, epochs=TIMES, shortcard=SHORTCARD, speed=SPEED, drop_input=DROP_INPUT, drop_hidden=DROP_HIDDEN, step_show=STEP_SHOW, rho=RHO, epsilon=EPSILON):
        X = T.fmatrix()
        Y = T.fmatrix()
        train_set_n = len(trY)
        test_set_n = len(teY)
        accuracy_arr = []
        diff_arr = []
        i_arr = []

        noise_py_x = self._model(X, drop_input, drop_hidden)
        cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
        updates = self._RMSprop(cost, lr=speed, rho=rho, epsilon=epsilon)

        train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)

        for i in range(epochs):
            for start, end in zip(range(0, train_set_n, shortcard), range(shortcard, train_set_n, shortcard)):
                cost = train(trX[start:end], trY[start:end])
            if i % step_show == 0:
                acc = np.mean(np.argmax(teY, axis=1) == self.predict(teX))
                accuracy_arr.append(acc)
                di = self.get_diff(teX, teY)
                diff_arr.append(di)
                i_arr.append(i)
                print "{0} {1:.3f}% {2:.1f}".format(i, acc * 100, di)
        if plot:
            self._name = "Epochs: {0}, Shortcard: {1}, Speed: {2:.5f}\n Structure: {3}\n Train: {4}, Test: {5}".format(epochs, shortcard, speed, self._struct, train_set_n, test_set_n)
            self._name_f = "epochs_{0}_shortcard_{1}_speed_{2:.5f}_structure_{3}_train_{4}_test_{5}".format(epochs, shortcard, speed, self._struct, train_set_n, test_set_n)
            self._plot(i_arr, accuracy_arr, diff_arr)       
Example #26
    def get_adadelta_trainer(self, debug=False):
        batch_x1 = T.fmatrix('batch_x1')
        batch_x2 = T.fmatrix('batch_x2')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        cost = self.cost
        gparams = T.grad(cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        outputs = cost
        if debug:
            outputs = [cost] + self.params + gparams +\
                    [updates[param] for param in self.params]

        train_fn = theano.function(inputs=[theano.Param(batch_x1), 
            theano.Param(batch_x2), theano.Param(batch_y)],
            outputs=outputs,
            updates=updates,
            givens={self.x1: batch_x1, self.x2: batch_x2, self.y: batch_y})

        return train_fn
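Written out, the per-parameter quantities accumulated in the loop above follow Algorithm 1 of Zeiler (2012):

    E[g^2]_t = \rho\,E[g^2]_{t-1} + (1-\rho)\,g_t^2
    \Delta x_t = -\sqrt{\frac{E[\Delta x^2]_{t-1} + \epsilon}{E[g^2]_t + \epsilon}}\; g_t
    E[\Delta x^2]_t = \rho\,E[\Delta x^2]_{t-1} + (1-\rho)\,\Delta x_t^2
    x_t = x_{t-1} + \Delta x_t

where accugrad, accudelta and dx play the roles of E[g^2], E[\Delta x^2] and \Delta x respectively.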
Example #27
def test_pycuda_elemwise_kernel():
    x=T.fmatrix('x')
    y=T.fmatrix('y')
    f=theano.function([x,y],x+y, mode=mode_with_gpu)
    print f.maker.env.toposort()
    f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f2.maker.env.toposort()

    assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])

    val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
    val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
    #val1 = numpy.ones((5,5))
    #val2 = numpy.arange(25).reshape(5,5)
    assert (f(val1,val2) == f2(val1,val2)).all()
    print f(val1,val2)
    print f2(val1,val2)


    x3=T.ftensor3('x')
    y3=T.ftensor3('y')
    z3=T.ftensor3('z')

    f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f4.maker.env.toposort()
    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])

    val1 = numpy.asarray(numpy.random.rand(2,2,2), dtype='float32')
    print val1
    print f4(val1,val1,val1)
    assert numpy.allclose(f4(val1,val1,val1),val1*val1+val1)
Example #28
def test_fwd_pass_compatible_with_OpLSTM():
  Z = T.ftensor3('Z')
  W_re = T.fmatrix('W_re')
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')

  Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)
  W_re_modified = W_re + W_att_in
  Z_modified = T.inc_subtensor(Z[0], T.dot(y0,W_re_modified))
  Y2, H2, d2 = LSTMOpInstance(Z_modified, W_re_modified, c, i)

  f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)
  g = theano.function(inputs=[Z, W_re, c, y0, i, W_att_in], outputs=Y2)

  n_T = 5
  n_batch = 4
  n_inp_dim = 3
  n_cells = 8
  numpy.random.seed(1234)
  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  #i_val = numpy.ones((n_T, n_batch), dtype='int8')
  i_val = numpy.array([[1,1,1,1,1], [0,0,1,1,1], [0,0,1,1,1], [0,0,1,0,0]], dtype='int8').T

  Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
  Y2_val = numpy.asarray(g(Z_val, W_re_val, c_val, y0_val, i_val, W_att_in_val))
  assert numpy.allclose(Y_val, Y2_val)
  print("success")
Example #29
    def __init__(self, input_layers, *args, **kwargs):
        super(KaggleObjective, self).__init__(input_layers, *args, **kwargs)
        self.input_systole = input_layers["systole"]
        self.input_diastole = input_layers["diastole"]

        self.target_vars["systole"]  = T.fmatrix("systole_target")
        self.target_vars["diastole"] = T.fmatrix("diastole_target")
Example #30
    def get_SGD_trainer(self, debug=False):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x1 = T.fmatrix('batch_x1')
        batch_x2 = T.fmatrix('batch_x2')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        # using mean_cost so that the learning rate is not too dependent on the batch size
        cost = self.mean_cos_sim_cost
        gparams = T.grad(cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate 

        outputs = cost
        if debug:
            outputs = [cost] + self.params + gparams +\
                    [updates[param] for param in self.params]

        train_fn = theano.function(inputs=[theano.Param(batch_x1), 
            theano.Param(batch_x2), theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=outputs,
            updates=updates,
            givens={self.x1: batch_x1, self.x2: batch_x2, self.y: batch_y})

        return train_fn
Example #31
    def __init__(
            self,
            Nbranches=1,  # number of branches (parallel models to be fused)
            Nlayers=1,  # number of layers
            Ndirs=1,  # unidirectional or bidirectional
            Nx=100,  # input size
            Nh=100,  # hidden layer size
            Ny=100,  # output size
            Ah="relu",  # hidden unit activation (e.g. relu, tanh, lstm)
            Ay="linear",  # output unit activation (e.g. linear, sigmoid, softmax)
            predictPer="frame",  # frame or sequence
            loss=None,  # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
            L1reg=0.0,  # L1 regularization
            L2reg=0.0,  # L2 regularization
            multiReg=0.0,  # regularization of agreement of predictions on data of different conditions
            momentum=0.0,  # SGD momentum
            seed=15213,  # random seed for initializing the weights
            frontEnd=None,  # a lambda function for transforming the input
            filename=None,  # initialize from file
            initParams=None,  # initialize from given dict
    ):

        if filename is not None:  # load parameters from file
            with smart_open(filename, "rb") as f:
                initParams = dill.load(f)
        if initParams is not None:  # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.iteritems():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
            # locals()[k] = v doesn't work here, so these have to be assigned statically
            Nbranches, Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \
                = self.Nbranches, self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd
        else:  # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = [
                "Nbranches", "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay",
                "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"
            ]
            for name in self.paramNames:
                value = locals()[name]
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam("Win",
                          rand_init((Nbranches, Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam(
                "Wrec",
                rand_init((Nbranches, Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam(
                "Wup",
                rand_init(
                    (Nbranches, Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm),
                    Ah))
            self.addParam("Wout", rand_init((Nbranches, Nh * Ndirs, Ny), Ay))
            if Ah != "lstm":
                self.addParam("Bhid", zeros((Nbranches, Nlayers, Nh * Ndirs)))
            else:
                self.addParam(
                    "Bhid",
                    numpy.tile(
                        numpy.concatenate([
                            full((Nbranches, Nlayers, Nh), 1.0),
                            zeros((Nbranches, Nlayers, Nh * 3))
                        ], 2), (1, 1, Ndirs)))
            self.addParam("Bout", zeros((Nbranches, Ny)))
            self.addParam("h0", zeros((Nbranches, Nlayers, Ndirs, Nh)))
            if Ah == "lstm":
                self.addParam("c0", zeros((Nbranches, Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [
                theano.shared(zeros(x.get_value().shape)) for x in self.params
            ]

        # Build computation graph
        input = T.ftensor3()
        mask = T.imatrix()
        mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
        mask_float = [
            T.cast((mask % 2).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
            T.cast((mask >= 2).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
        ]

        # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
            f_t = T.nnet.sigmoid(a[:, :Nh])
            i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
            o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
            c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)
        outputs = []
        for k in range(Nbranches):
            for i in range(Nlayers):
                h = (x.dimshuffle((1, 0, 2)).dot(self.Win[k]) if i == 0 else
                     h.dot(self.Wup[k, i - 1])) + self.Bhid[k, i]
                rep = lambda x: T.extra_ops.repeat(
                    x.reshape((1, -1)), h.shape[1], axis=0)
                if Ah != "lstm":
                    h = T.concatenate([
                        theano.scan(
                            fn=step_rnn,
                            sequences=[
                                h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                            ],
                            outputs_info=[rep(self.h0[k, i, d])],
                            non_sequences=[
                                self.Wrec[k, i, d],
                                rep(self.h0[k, i, d])
                            ],
                            go_backwards=(d == 1),
                        )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
                    ],
                                      axis=2)
                else:
                    h = T.concatenate([
                        theano.scan(
                            fn=step_lstm,
                            sequences=[
                                h[:, :, Nh * 4 * d:Nh * 4 *
                                  (d + 1)], mask_float[d]
                            ],
                            outputs_info=[
                                rep(self.c0[k, i, d]),
                                rep(self.h0[k, i, d])
                            ],
                            non_sequences=[
                                self.Wrec[k, i, d],
                                rep(self.c0[k, i, d]),
                                rep(self.h0[k, i, d])
                            ],
                            go_backwards=(d == 1),
                        )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
                    ],
                                      axis=2)
            h = h.dimshuffle((1, 0, 2))
            if predictPer == "sequence":
                h = T.concatenate([
                    h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)]
                    for d in range(Ndirs)
                ],
                                  axis=1)
            outputs.append(ACTIVATION[Ay](h.dot(self.Wout[k]) + self.Bout[k]))
        output = T.stack(
            *outputs)  # Deprecated in Theano 0.8 but accepted in Theano 0.7
        output_mean = output.mean(axis=0)
        output_var = output.var(axis=0)

        # Compute loss function
        if loss is None:
            loss = {
                "linear": "mse",
                "sigmoid": "ce",
                "softmax": "ce_group"
            }[self.Ay]
        if loss == "ctc":
            label = T.imatrix()
            label_time = T.imatrix()
            tol = T.iscalar()
            cost = sum(
                ctc_cost(prob, mask, label, label_time, tol)
                for prob in outputs) / Nbranches
        else:
            if predictPer == "sequence":
                label = T.fmatrix()
                y = output_mean
                t = label
            elif predictPer == "frame":
                label = T.ftensor3()
                indices = (mask >= 0).nonzero()
                y = output_mean[indices]
                t = label[indices]
            cost = T.mean({
                "ce":
                -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
                "ce_group":
                -T.log((y * t).sum(axis=1)),
                "mse":
                T.mean((y - t)**2, axis=1),
                "hinge":
                T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
                "squared_hinge":
                T.mean(relu(1 - y * (t * 2 - 1))**2, axis=1),
            }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg
        if predictPer == "sequence":
            cost += output_var.mean() * multiReg
        else:
            indices = (mask >= 0).nonzero()
            cost += output_var[indices].mean() * multiReg

        # Compute updates for network parameters
        updates = []
        lrate = T.fscalar()
        clip = T.fscalar()
        grad = T.grad(cost, self.params)
        grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
        if momentum > 0:
            for w, d, g in zip(self.params, self.dparams, grad_clipped):
                updates.append(
                    (w,
                     w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
        else:
            for w, g in zip(self.params, grad_clipped):
                updates.append((w, w - lrate * g))

        # Create functions to be called from outside
        if loss == "ctc":
            inputs = [input, mask, label, label_time, tol, lrate, clip]
        else:
            inputs = [input, mask, label, lrate, clip]
        self.train = theano.function(
            inputs=inputs,
            outputs=cost,
            updates=updates,
        )

        self.predict = theano.function(inputs=[input, mask], outputs=output)
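For reference, the step_lstm helper above implements the standard LSTM recurrence, with the pre-activation a_t = x_t + h_{t-1} W sliced into four blocks of width Nh (forget, input, output, candidate):

    f_t = \sigma(a_t^{(1)}), \quad i_t = \sigma(a_t^{(2)}), \quad o_t = \sigma(a_t^{(3)})
    c_t = \tanh(a_t^{(4)})\, i_t + c_{t-1}\, f_t
    h_t = \tanh(c_t)\, o_t

with c and h reset to c0 and h0 wherever the mask marks a sequence boundary.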
Example #32
def build_encoder_z(li, nc, num_hidden, lr):
    z_var = T.fmatrix('z_var')
    input_var = T.tensor4('inputs')
    encoder = {}
    details = [['Layer Name', 'Dims in', 'shape of layer', 'Dims out']]

    input_shape = (None, nc, li, li)
    name = 'input'
    encoder[name] = lasagne.layers.InputLayer(shape=input_shape,
                                              input_var=input_var)
    output_dims = input_shape

    filter_size = 5
    num_filters = li / 4

    repeat_num = int(np.log2(np.array(li)) - 3) + 1

    for n in range(0, repeat_num):
        num_filters = num_filters * 2
        prev_name = name
        name = 'conv' + str(n)
        prev_num_filters = lasagne.layers.get_output_shape(
            encoder[prev_name])[1]
        encoder[name] = lasagne.layers.batch_norm(
            lasagne.layers.Conv2DLayer(
                encoder[prev_name],
                num_filters,
                filter_size,
                stride=2,
                pad='same',
                nonlinearity=lasagne.nonlinearities.rectify))
        prev_output_dims = output_dims
        output_dims = lasagne.layers.get_output_shape(encoder[name])
        details.append([
            name,
            str(prev_output_dims),
            str((num_filters, prev_num_filters, filter_size, filter_size)),
            str(output_dims)
        ])

    prev_name = name
    name = 'fc'
    num_units = int(li * li)

    encoder[name] = lasagne.layers.DenseLayer(
        encoder[prev_name],
        num_units=num_units,
        nonlinearity=lasagne.nonlinearities.rectify)

    prev_output_dims = output_dims
    output_dims = lasagne.layers.get_output_shape(encoder[name])
    details.append([
        name,
        str(prev_output_dims),
        str((product(prev_output_dims[1:]), num_units)),
        str(output_dims)
    ])

    prev_name = name
    name = 'out'
    num_units = num_hidden

    # We restrict output to tanh domain (same as input noise)
    encoder[name] = lasagne.layers.DenseLayer(
        encoder[prev_name],
        num_units=num_units,
        nonlinearity=lasagne.nonlinearities.tanh)

    prev_output_dims = output_dims
    output_dims = lasagne.layers.get_output_shape(encoder[name])
    details.append([
        name,
        str(prev_output_dims),
        str((product(prev_output_dims[1:]), num_units)),
        str(output_dims)
    ])

    train_out = lasagne.layers.get_output(encoder['out'])
    val_out = lasagne.layers.get_output(encoder['out'], deterministic=True)

    loss = lasagne.objectives.squared_error(train_out, z_var).mean()
    params = lasagne.layers.get_all_params(encoder['out'], trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=lr, beta1=0.5)
    train_fn = theano.function([input_var, z_var], [loss], updates=updates)
    val_fn = theano.function([input_var], [val_out])

    try:
        from tabulate import tabulate
        print(tabulate(details))
    except ImportError:
        pass
    return encoder, train_fn, val_fn
Example #33
    def __init__(self, We_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
        l_local = lasagne.layers.DenseLayer(
            concat2, num_units=25, nonlinearity=lasagne.nonlinearities.linear)
        ### the above is for the unary term energy

        if params.emb == 1:
            f = open('F.pickle')
        else:
            f = open('F0_new.pickle')

        para = pickle.load(f)
        f.close()
        f_params = lasagne.layers.get_all_params(l_local, trainable=True)
        for idx, p in enumerate(f_params):
            p.set_value(para[idx])

        Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
        Wyy = theano.shared(Wyy0)
        d_params = lasagne.layers.get_all_params(l_local, trainable=True)
        d_params.append(Wyy)
        self.d_params = d_params

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize,
                                                 l_emb_word.W)
        #l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a,  input_size=We_initial.shape[0] , output_size = embsize, W =We)
        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                  hidden,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                   (-1, hidden))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                   (-1, hidden))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)
        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=25,
            nonlinearity=lasagne.nonlinearities.softmax)

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params
        if params.emb == 1:
            f = open('F.pickle')
        else:
            f = open('F0_new.pickle')
        PARA = pickle.load(f)
        f.close()
        for idx, p in enumerate(a_params):
            p.set_value(PARA[idx])

        y_in = T.ftensor3()
        y = T.imatrix()
        g = T.imatrix()
        gmask = T.fmatrix()
        y_mask = T.fmatrix()
        length = T.iscalar()

        predy0 = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        })
        predy = predy0.reshape((-1, length, 25))

        #predy = predy * gmask[:,:,None]
        #newpredy = T.concatenate([predy, y0] , axis=2)
        # n , L, 46, 46
        # predy0: n, L, 25

        # energy loss
        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
            :param targets_one_step: [batch_size, t]
            :param prev_label: [batch_size, t]
            :param tg_energy: [batch_size]
            :return:
            """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy = tg_energy + T.sum(new_ta_energy * targets_one_step,
                                              axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy, tg_energy)
            return [targets_one_step, tg_energy_t]

        # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
        local_energy = lasagne.layers.get_output(l_local, {
            l_in_word: g,
            l_mask_word: gmask
        })
        local_energy = local_energy.reshape((-1, length, 25))
        local_energy = local_energy * gmask[:, :, None]
        targets_shuffled = y_in.dimshuffle(1, 0, 2)
        masks_shuffled = gmask.dimshuffle(1, 0)
        # initials should be energies_shuffles[0, :, -1, :]

        target_time0 = targets_shuffled[0]
        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
        length_index = T.sum(gmask, axis=1) - 1
        length_index = T.cast(length_index, 'int32')

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]
        pos_cost = target_energies[-1] + T.sum(
            T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(
                pos_end_target, Wyy[:-1, -1])
        check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)

        negtargets_shuffled = predy.dimshuffle(1, 0, 2)
        negtarget_time0 = negtargets_shuffled[0]
        neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])
        neginitials = [negtarget_time0, neginitial_energy0]
        [_, negtarget_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials,
            sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])
        neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
        neg_cost = negtarget_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(
                neg_end_target, Wyy[:-1, -1])

        y_f = y.flatten()
        predy_f = predy.reshape((-1, 25))

        ce_hinge = lasagne.objectives.categorical_crossentropy(
            predy_f + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * gmask, axis=1)

        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * gmask, axis=1)

        delta0 = T.sum(abs((y_in - predy)), axis=2) * gmask
        delta0 = T.sum(delta0, axis=1)

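        # margin_type selects the structured hinge variant used below:
        #   1: fixed margin of 1;          2: no margin (perceptron-style)
        #   0: margin rescaled by the L1 distance delta0
        #   3: slack rescaled by delta0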
        if (params.margin_type == 1):
            hinge_cost = 1 + neg_cost - pos_cost
        elif (params.margin_type == 2):
            hinge_cost = neg_cost - pos_cost
        elif (params.margin_type == 0):
            hinge_cost = delta0 + neg_cost - pos_cost
        elif (params.margin_type == 3):
            hinge_cost = delta0 * (1.0 + neg_cost - pos_cost)

        hinge_cost = hinge_cost * T.gt(hinge_cost, 0)
        d_cost = T.mean(hinge_cost)
        d_cost0 = d_cost
        l2_term = sum(
            lasagne.regularization.l2(x - PARA[index])
            for index, x in enumerate(a_params))
        """select different regulizer"""
        g_cost = -d_cost0 + params.l2 * sum(
            lasagne.regularization.l2(x)
            for x in a_params) + params.l3 * T.mean(ce_hinge)
        d_cost = d_cost0 + params.l2 * sum(
            lasagne.regularization.l2(x) for x in d_params)

        self.a_params = a_params
        updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
        updates_g = lasagne.updates.apply_momentum(updates_g,
                                                   a_params,
                                                   momentum=0.9)

        self.train_g = theano.function(
            [g, gmask, y, y_in, length],
            [g_cost, d_cost0, pos_cost, neg_cost, delta0, check],
            updates=updates_g,
            on_unused_input='ignore')
        updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
        self.train_d = theano.function(
            [g, gmask, y, y_in, length],
            [d_cost, d_cost0, pos_cost, neg_cost, delta0, check],
            updates=updates_d,
            on_unused_input='ignore')
        # test the model and re-tune it at test time

        predy_test = lasagne.layers.get_output(l_local_a, {
            l_in_word_a: g,
            l_mask_word_a: gmask
        },
                                               deterministic=True)
        predy_test = predy_test.reshape((-1, length, 25))
        pred = T.argmax(predy_test, axis=2)
        pg = T.eq(pred, y)
        pg = pg * gmask
        acc = 1.0 * T.sum(pg) / T.sum(gmask)

        negtargets_shuffled_test = predy_test.dimshuffle(1, 0, 2)
        negtarget_time0_test = negtargets_shuffled_test[0]
        neginitial_energy0_test = T.dot(negtarget_time0_test, Wyy[-1, :-1])
        neginitials_test = [negtarget_time0_test, neginitial_energy0_test]
        [_, negtarget_energies_test], _ = theano.scan(
            fn=inner_function,
            outputs_info=neginitials_test,
            sequences=[negtargets_shuffled_test[1:], masks_shuffled[1:]])
        end_test_target = predy_test[T.arange(length_index.shape[0]),
                                     length_index]
        neg_cost_test = negtarget_energies_test[-1] + T.sum(
            T.sum(local_energy * predy_test, axis=2) * gmask, axis=1) + T.dot(
                end_test_target, Wyy[:-1, -1])
        """ce regulizer"""
        test_cost = -T.mean(neg_cost_test) + params.l3 * T.mean(ce_hinge)
        test_updates = lasagne.updates.sgd(test_cost, a_params, params.eta)
        test_updates = lasagne.updates.apply_momentum(test_updates,
                                                      a_params,
                                                      momentum=0.9)
        self.test_time_turning = theano.function([g, gmask, y, length],
                                                 test_cost,
                                                 updates=test_updates,
                                                 on_unused_input='ignore')
        self.test_time1 = theano.function([g, gmask, y, y_in, length], [
            acc,
            T.mean(neg_cost),
            T.mean(pos_cost), params.l3 * T.mean(ce_hinge)
        ],
                                          on_unused_input='ignore')
        self.test_time = theano.function([g, gmask, y, length], acc)
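
# A minimal NumPy sketch (not part of the original code) of the sequence energy that
# inner_function plus the start/end terms above accumulate, for a single fully-unmasked
# sequence. Sizes and names here are toy assumptions: Wyy is (num_labels+1, num_labels+1),
# its last row holds start transitions and its last column end transitions.
import numpy as np

num_labels_demo, T_len = 3, 4
rng_demo = np.random.RandomState(0)
Wyy_demo = rng_demo.randn(num_labels_demo + 1, num_labels_demo + 1).astype('float32')
unary_demo = rng_demo.randn(T_len, num_labels_demo).astype('float32')  # local_energy for one sentence
tags_demo = [0, 2, 1, 2]                                               # gold label sequence

energy_demo = Wyy_demo[-1, tags_demo[0]]                    # start -> first label (Wyy[-1, :-1])
for t in range(1, T_len):
    energy_demo += Wyy_demo[tags_demo[t - 1], tags_demo[t]] # pairwise term (Wyy[:-1, :-1])
energy_demo += unary_demo[np.arange(T_len), tags_demo].sum()  # unary terms (local_energy * y_in)
energy_demo += Wyy_demo[tags_demo[-1], -1]                  # last label -> end (Wyy[:-1, -1])
print(energy_demo)  # scalar energy; matches pos_cost for a batch of one unmasked sequence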
Example #34
0
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))
    inputs = [input_data, input_cond, input_mask]
    predict_fn = theano.function(inputs=inputs, outputs=[predict_data])
    return predict_fn


if __name__ == '__main__':
    parser = get_arg_parser()
    args = parser.parse_args()

    print(args, file=sys.stderr)

    input_data = T.ftensor3('input_data')
    input_cond = T.ftensor3('input_cond')
    input_mask = T.fmatrix('input_mask')

    network = deep_projection_ivector_ln_model_fix(
        input_var=input_data,
        cond_var=input_cond,
        mask_var=input_mask,
        num_inputs=input_dim,
        num_outputs=output_dim,
        num_conds=args.num_conds,
        num_layers=args.num_layers,
        num_factors=args.num_factors,
        num_units=args.num_units,
        grad_clipping=args.grad_clipping,
        dropout=args.dropout)[0]

    network_params = get_all_params(network, trainable=True)
Example #35
0
    def _build_network(self, load_params: bool = False):
        """Build network, including inputs, weights and the whole structure."""
        # Tweet variables
        self.tweet_input = T.itensor3()
        self.targets_input = T.ivector()
        self.t_mask_input = T.fmatrix()

        self.params = t2v.init_params(n_chars=self.n_char)
        # classification params
        self.params["W_cl"] = theano.shared(
            np.random.normal(
                loc=0.,
                scale=settings_char.SCALE,
                size=(settings_char.WDIM, self.n_classes),
            ).astype("float32"),
            name="W_cl",
        )
        self.params["b_cl"] = theano.shared(np.zeros(
            (self.n_classes, )).astype("float32"),
                                            name="b_cl")

        if load_params:
            self._load_weights()
        # network for prediction
        predictions, net, embeddings = self._classify(
            self.tweet_input,
            self.t_mask_input,
            self.params,
            self.n_classes,
            self.n_char,
        )

        # Theano function
        self._print("Compiling theano functions...")
        self.predict = theano.function([self.tweet_input, self.t_mask_input],
                                       predictions)
        self.encode = theano.function([self.tweet_input, self.t_mask_input],
                                      embeddings)
        self.net = net
        self._print("Building network...")

        # batch loss
        loss = lasagne.objectives.categorical_crossentropy(
            predictions, self.targets_input)
        cost = T.mean(
            loss
        ) + settings_char.REGULARIZATION * lasagne.regularization.regularize_network_params(
            self.net, lasagne.regularization.l2)
        cost_only = T.mean(loss)

        # params and updates
        self._print("Computing updates...")
        lr = settings_char.LEARNING_RATE
        mu = settings_char.MOMENTUM
        updates = lasagne.updates.nesterov_momentum(
            cost, lasagne.layers.get_all_params(self.net), lr, momentum=mu)

        # Theano function
        self._print("Compiling theano functions...")

        inps = [self.tweet_input, self.t_mask_input, self.targets_input]
        self.cost_val = theano.function(inps, [cost_only, embeddings])
        self.train = theano.function(inps, cost, updates=updates)
Example #36
0
import time
import theano
import numpy as np

from theano import tensor as T
from theano.tensor import tanh
import mkl_simplernn_bw_op
from mkl_simplernn_bw_op import SimpleRNN_bw 

X = T.ftensor3('X')
W_x = T.fmatrix('W_x')
W_h = T.fmatrix('W_h')
B = T.fvector('B')
B_mkl = T.fmatrix('B_mkl')
hid = T.fmatrix('hid')
o_real = T.ftensor3('o_real') 

def step(x, h_tm1):
    global W_h, B
    h_t = tanh(x + T.dot(h_tm1, W_h) + B)
    return h_t

def SimpleRNN_theano():
    global X, W_x, hid
    X_r = T.dot(X, W_x)
    fn = lambda x_r, h_tm1: step(x_r, h_tm1)
    result, updates = theano.scan(fn, sequences=[ X_r], outputs_info=hid, name='test_theano_gru_scan')
    return result


if __name__ == '__main__':
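    # NOTE: the original snippet is truncated at this point. The lines below are an
    # assumed minimal driver (shapes and names invented here) that exercises the
    # Theano reference path defined above.
    n_steps, batch, n_in, n_hid = 5, 4, 8, 16
    rnn_fn = theano.function([X, W_x, W_h, B, hid], SimpleRNN_theano(),
                             allow_input_downcast=True)
    x_val = np.random.rand(n_steps, batch, n_in).astype('float32')
    wx_val = np.random.rand(n_in, n_hid).astype('float32')
    wh_val = np.random.rand(n_hid, n_hid).astype('float32')
    b_val = np.zeros(n_hid, dtype='float32')
    h0_val = np.zeros((batch, n_hid), dtype='float32')
    print(rnn_fn(x_val, wx_val, wh_val, b_val, h0_val).shape)  # (n_steps, batch, n_hid)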
Example #37
0
    def __init__(
            self,
            Nlayers=1,  # number of layers
            Ndirs=1,  # unidirectional or bidirectional
            Nx=100,  # input size
            Nh=100,  # hidden layer size
            Ny=100,  # output size
            Ah='relu',  # hidden unit activation (e.g. relu, tanh, lstm)
            Ay='linear',  # output unit activation (e.g. linear, sigmoid, softmax)
            predictPer='frame',  # frame or sequence
            loss=None,  # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
            L1reg=0.0,  # L1 regularization
            L2reg=0.0,  # L2 regularization
            momentum=0.0,  # SGD momentum
            seed=15213,  # random seed for initializing the weights
            frontEnd=None,  # a lambda function for transforming the input
            filename=None,  # initialize from file
            initParams=None,  # initialize from given dict
    ):
        if filename is not None:  # load parameters from file
            with open(filename, 'rb') as f:
                initParams = cPickle.load(f)
        if initParams is not None:  # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.iteritems():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
        else:  # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = [
                'Nlayers', 'Ndirs', 'Nx', 'Nh', 'Ny', 'Ah', 'Ay', 'predictPer',
                'loss', 'L1reg', 'L2reg', 'momentum', 'frontEnd'
            ]
            for name in self.paramNames:
                value = locals()[name]
                if isinstance(value, basestring):
                    value = value.lower()
                locals()[name] = value
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam('Win', rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam('Wrec',
                          rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam(
                'Wup',
                rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
            self.addParam('Wout', rand_init((Nh * Ndirs, Ny), Ay))
            if Ah != 'lstm':
                self.addParam('Bhid', zeros((Nlayers, Nh * Ndirs)))
            else:
                self.addParam(
                    'Bhid',
                    numpy.tile(
                        numpy.hstack([
                            full((Nlayers, Nh), 1.0),
                            zeros((Nlayers, Nh * 3))
                        ]), (1, Ndirs)))
            self.addParam('Bout', zeros(Ny))
            self.addParam('h0', zeros((Nlayers, Ndirs, Nh)))
            if Ah == 'lstm':
                self.addParam('c0', zeros((Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [
                theano.shared(zeros(x.get_value().shape)) for x in self.params
            ]

        # Build computation graph
        input = T.ftensor3()
        mask = T.imatrix()
        mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        mask_float = [
            T.cast((mask & 1).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
            T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape(
                (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
        ]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
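            # pre-activations in `a` are packed along the feature axis as
            # [forget | input | output | candidate], each block of width Nh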
            f_t = T.nnet.sigmoid(a[:, :Nh])
            i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
            o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
            c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)
        for i in range(Nlayers):
            h = (x.dimshuffle((1, 0, 2)).dot(self.Win)
                 if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i]
            rep = lambda x: T.extra_ops.repeat(
                x.reshape((1, -1)), h.shape[1], axis=0)
            if Ah != 'lstm':
                h = T.concatenate([
                    theano.scan(
                        fn=step_rnn,
                        sequences=[
                            h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                        ],
                        outputs_info=[rep(self.h0[i, d])],
                        non_sequences=[self.Wrec[i, d],
                                       rep(self.h0[i, d])],
                        go_backwards=(d == 1),
                    )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ],
                                  axis=2)
            else:
                h = T.concatenate([
                    theano.scan(
                        fn=step_lstm,
                        sequences=[
                            h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)], mask_float[d]
                        ],
                        outputs_info=[rep(self.c0[i, d]),
                                      rep(self.h0[i, d])],
                        non_sequences=[
                            self.Wrec[i, d],
                            rep(self.c0[i, d]),
                            rep(self.h0[i, d])
                        ],
                        go_backwards=(d == 1),
                    )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ],
                                  axis=2)
        h = h.dimshuffle((1, 0, 2))
        if predictPer == 'sequence':
            h = T.concatenate([
                h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)]
                for d in range(Ndirs)
            ],
                              axis=1)
        output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)

        # Compute loss function
        if loss is None:
            loss = {
                'linear': 'mse',
                'sigmoid': 'ce',
                'softmax': 'ce_group'
            }[self.Ay]
        if predictPer == 'sequence':
            label = T.fmatrix()
            y = output
            t = label
        elif predictPer == 'frame':
            label = T.ftensor3()
            indices = (mask >= 0).nonzero()
            y = output[indices]
            t = label[indices]
        cost = T.mean({
            'ce':
            -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
            'ce_group':
            -T.log((y * t).sum(axis=1)),
            'mse':
            T.mean((y - t)**2, axis=1),
            'hinge':
            T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
            'squared_hinge':
            T.mean(relu(1 - y * (t * 2 - 1))**2, axis=1),
        }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg

        # Compute updates for network parameters
        updates = []
        gradient = []
        lrate = T.fscalar()
        if momentum > 0:
            for w, d, g in zip(self.params, self.dparams,
                               T.grad(cost, self.params)):
                updates.append(
                    (w,
                     w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
                gradient.append(g)
        else:
            for w, g in zip(self.params, T.grad(cost, self.params)):
                updates.append((w, w - lrate * g))
                gradient.append(g)

        # Create functions to be called from outside
        self.train = theano.function(
            inputs=[input, mask, label, lrate],
            outputs=[cost, y, gradient[5], h, t,
                     h.dot(self.Wout), self.Wout],
            updates=updates,
        )

        self.predict = theano.function(inputs=[input, mask], outputs=output)
Example #38
0
    def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False):
        D = X[0].shape[1] # X is of size N x T(n) x D
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X')
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=thX,
            n_steps=thX.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction, y],
            updates=updates
        )

        costs = []
        for i in xrange(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in xrange(N):
                c, p, rout = self.train_op(X[j], Y[j])
                # print "p:", p
                cost += c
                if p[-1] == Y[j,-1]:
                    n_correct += 1
            print "shape y:", rout.shape
            print "i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N)
            costs.append(cost)
            if n_correct == N:
                break

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #39
0
    def __init__(self, We_initial, char_embedd_table_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        embsize = We_initial.shape[1]
        hidden = params.hidden

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)

        trans = np.random.uniform(
            -0.01, 0.01,
            (params.num_labels + 1, params.num_labels + 1)).astype('float32')
        transition = theano.shared(trans)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        char_input_var = T.itensor3(name='char-inputs')

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We,
                name='word_embedding')
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters
        #_, sent_length, _ = incoming2.output_shape

        # dropout before cnn?
        if params.dropout:
            layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5)

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for pooling (pool size should go through all time step of cnn)
        _, _, pool_size = cnn_layer.output_shape
        print pool_size
        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                         axis=2)
        if params.dropout:
            incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)

        l_lstm_wordf = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word,
                                                grad_clipping=5.)
        l_lstm_wordb = lasagne.layers.LSTMLayer(incoming,
                                                hidden,
                                                mask_input=l_mask_word,
                                                grad_clipping=5.,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        if params.dropout:
            concat = lasagne.layers.DropoutLayer(concat, p=0.5)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=params.num_labels,
            nonlinearity=lasagne.nonlinearities.linear)

        #bi_lstm_crf = CRFLayer(concat, params.num_labels, mask_input= l_mask_word)

        local_energy = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input: char_input_var
            })
        local_energy = local_energy.reshape((-1, length, params.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        end_term = transition[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        local_energy_eval = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input: char_input_var
            },
            deterministic=True)
        local_energy_eval = local_energy_eval.reshape(
            (-1, length, params.num_labels))
        local_energy_eval = local_energy_eval * mask_var[:, :, None]
        local_energy_eval = local_energy_eval + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        length_index = T.sum(mask_var, axis=1)

        loss_train = crf_loss0(local_energy, transition, target_var,
                               mask_var).mean()
        #loss_train = T.dot(loss_train, length_index)/T.sum(length_index)

        #loss_train = crf_loss0(local_energy, transition, target_var, mask_var).mean()

        prediction, corr = crf_accuracy0(local_energy_eval, transition,
                                         target_var, mask_var)

        ##loss_train = crf_loss(energies_train, target_var, mask_var).mean()

        ##prediction, corr = crf_accuracy(energies_train, target_var)

        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(transition)

        print network_params
        self.network_params = network_params

        loss_train = loss_train + params.L2 * sum(
            lasagne.regularization.l2(x) for x in network_params)

        updates = lasagne.updates.sgd(loss_train, network_params, params.eta)
        updates = lasagne.updates.apply_momentum(updates,
                                                 network_params,
                                                 momentum=0.9)

        self.train_fn = theano.function([
            input_var, char_input_var, target_var, mask_var, mask_var1, length
        ],
                                        loss_train,
                                        updates=updates,
                                        on_unused_input='ignore')

        self.eval_fn = theano.function([
            input_var, char_input_var, target_var, mask_var, mask_var1, length
        ], [corr_train, num_tokens, prediction],
                                       on_unused_input='ignore')
Example #40
0
import theano
from theano import tensor

from dagbldr.datasets import load_digits
from dagbldr.utils import convert_to_one_hot
from dagbldr.nodes import binary_crossentropy, binary_entropy
from dagbldr.nodes import categorical_crossentropy, abs_error
from dagbldr.nodes import squared_error, gaussian_error, log_gaussian_error
from dagbldr.nodes import masked_cost, gaussian_kl, gaussian_log_kl

# Common between tests
digits = load_digits()
X = digits["data"].astype("float32")
y = digits["target"]
n_classes = len(set(y))
y = convert_to_one_hot(y, n_classes).astype("float32")
X_sym = tensor.fmatrix()
y_sym = tensor.fmatrix()


def test_binary_crossentropy():
    cost = binary_crossentropy(.99 * X_sym, X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")


def test_binary_entropy():
    cost = binary_entropy(X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")


def test_categorical_crossentropy():
    cost = categorical_crossentropy(.99 * y_sym + .001, y_sym)
    theano.function([y_sym], cost, mode="FAST_COMPILE")
Example #41
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)
    '''
    combine train and dev
    '''
    train_sents = np.concatenate([train_sents, dev_sents], axis=0)
    train_masks = np.concatenate([train_masks, dev_masks], axis=0)
    train_labels = np.concatenate([train_labels, dev_labels], axis=0)
    train_size = train_size + dev_size

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(tyope_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]

    # NN_para = multiCNN_para+ACNN_para

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at UNK/pad positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out features at UNK/pad positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # the weight matrix hidden_size*2
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a = create_ensemble_para(
        rng, 12, acnn_LR_input_size)  # the weight matrix hidden_size*2
    acnn_LR_b = theano.shared(value=np.zeros((12, ),
                                             dtype=theano.config.floatX),
                              name='LR_b',
                              borrow=True)  #bias for each target class
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)

    acnn_loss = -T.mean(T.log(acnn_prob_pos))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 5)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                             axis=0),
                               axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * (
        cosine_score_matrix + top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            #after every 20 batches, test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #42
0
 def __init__(self,
              gen_fn_dcgan,
              disc_fn_dcgan,
              gen_params_dcgan,
              disc_params_dcgan,
              gen_fn_p2p,
              disc_fn_p2p,
              gen_params_p2p,
              disc_params_p2p,
              in_shp,
              latent_dim,
              is_a_grayscale,
              is_b_grayscale,
              alpha=100,
              opt=adam,
              opt_args={'learning_rate': theano.shared(floatX(1e-3))},
              train_mode='both',
              reconstruction='l1',
              sampler=np.random.rand,
              lsgan=False,
              verbose=True):
     """
     Two-stage DCGAN/pix2pix GAN. Given training data (pairs) in the form [A,B], the DCGAN
       maps from prior samples z -> A, and the pix2pix GAN synthesises B images from A images.
     gen_fn_dcgan: a function that returns the architecture (concretely, the last layer)
       of the DCGAN. This function should have the signature (latent_dim, is_a_grayscale, ...),
       where `latent_dim` is the latent dimension, `is_a_grayscale` denotes whether the 'A'
       image is grayscale or not, and ... denotes optional kwargs.
     gen_params_dcgan: kwargs to pass to `gen_fn_dcgan`.
     disc_fn_dcgan: discriminator for the DCGAN. This function should have the signature
       (in_shp, is_a_grayscale, ...) where `in_shp` denotes the width/height of the
       generated/real image.
     disc_params_dcgan: kwargs to pass to `disc_fn_dcgan`.
     gen_fn_p2p: a function that returns the p2p architecture. This function should have
       the signature (in_shp, is_a_grayscale, is_b_grayscale, ...).
     disc_fn_p2p: should have the signature (in_shp, is_a_grayscale, is_b_grayscale)
       as well. Since this function requires two inputs (the A and B image), it
       returns a dictionary instead of a Lasagne layer (see `discriminator` in
       architectures/p2p.py).
     in_shp: dimensions (width/height) of the A and B image.
     latent_dim: prior sampling dimension for the DCGAN.
     is_a_grayscale: is the A image grayscale?
     is_b_grayscale: is the B image grayscale?
     alpha: weight of the reconstruction loss for the pix2pix
     opt: Lasagne optimiser
     opt_args: kwargs for the optimiser
     train_mode: if 'both', train both dcgan and p2p at the same time. If 'p2p', train
       p2p only, if 'dcgan', train DCGAN only.
     reconstruction: if 'l1', use L1 reconstruction. If 'l2', use L2.
     sampler: random generator for sampling from the prior distribution.
     lsgan: use LSGAN formulation? (Generally more stable than regular GAN.)
      verbose: if True, print a layer-by-layer summary of each network.
     """
     assert train_mode in ['dcgan', 'p2p', 'both']
     self.is_a_grayscale = is_a_grayscale
     self.is_b_grayscale = is_b_grayscale
     self.latent_dim = latent_dim
     self.sampler = sampler
     self.in_shp = in_shp
     self.verbose = verbose
     self.train_mode = train_mode
     # get the networks for the dcgan network
     dcgan_gen = gen_fn_dcgan(latent_dim, is_a_grayscale,
                              **gen_params_dcgan)
     dcgan_disc = disc_fn_dcgan(in_shp, is_a_grayscale, **disc_params_dcgan)
     # get the networks for the p2p network
     p2p_gen = gen_fn_p2p(in_shp, is_a_grayscale, is_b_grayscale,
                          **gen_params_p2p)
     p2p_disc = disc_fn_p2p(in_shp, is_a_grayscale, is_b_grayscale,
                            **disc_params_p2p)
     if verbose:
         print("p2p gen:")
         self._print_network(dcgan_gen)
         print("p2p disc:")
         self._print_network(dcgan_disc)
         print("p2p gen:")
         self._print_network(p2p_gen)
         print("p2p disc:")
         self._print_network(p2p_disc["out"])
     Z = T.fmatrix('Z')  # noise var
     X = T.tensor4('X')  # A
     Y = T.tensor4('Y')  # B
     # construct theano stuff for dcgan gen/disc
     dcgan = {'gen': dcgan_gen, 'disc': dcgan_disc}
     dcgan['gen_out'] = get_output(dcgan_gen, Z)  # G(z)
     dcgan['gen_out_det'] = get_output(dcgan_gen, Z, deterministic=True)
     dcgan['disc_out_real'] = get_output(dcgan_disc, X)  # D(x)
     dcgan['disc_out_fake'] = get_output(dcgan_disc,
                                         dcgan['gen_out'])  # D(G(z))
     # construct theano stuff for the p2p gen/disc
     p2p = {'gen': p2p_gen, 'disc': p2p_disc["out"]}
     p2p['disc_out_real'] = get_output(p2p_disc["out"], {
         p2p_disc["inputs"][0]: X,
         p2p_disc["inputs"][1]: Y
     })  # D(X,Y)
     p2p['gen_out'] = get_output(p2p_gen, X)
     p2p['gen_out_det'] = get_output(p2p_gen, X, deterministic=True)
     p2p['disc_out_fake'] = get_output(p2p_disc["out"], {
         p2p_disc["inputs"][0]: X,
         p2p_disc["inputs"][1]: p2p['gen_out']
     })  # D(X, X_to_y(X))
     if lsgan:
         adv_loss = squared_error
     else:
         adv_loss = binary_crossentropy
     # dcgan loss definitions
     gen_loss_dcgan = adv_loss(dcgan['disc_out_fake'], 1.).mean()
     disc_loss_dcgan = adv_loss(dcgan['disc_out_real'],
                                1.).mean() + adv_loss(
                                    dcgan['disc_out_fake'], 0.).mean()
     # p2p loss definitions
     gen_loss_p2p = adv_loss(p2p['disc_out_fake'], 1.).mean()
     assert reconstruction in ['l1', 'l2']
     if reconstruction == 'l2':
         recon_loss = squared_error(p2p['gen_out'], Y).mean()
     else:
         recon_loss = T.abs_(p2p['gen_out'] - Y).mean()
     #if not reconstruction_only:
     gen_total_loss_p2p = gen_loss_p2p + alpha * recon_loss
     #else:
     #    #log("GAN disabled, using only pixel-wise reconstruction loss...")
     #    gen_total_loss_p2p = recon_loss
     disc_loss_p2p = adv_loss(p2p['disc_out_real'], 1.).mean() + adv_loss(
         p2p['disc_out_fake'], 0.).mean()
     # dcgan params
     gen_params_dcgan = get_all_params(dcgan_gen, trainable=True)
     disc_params_dcgan = get_all_params(dcgan_disc, trainable=True)
     # pix2pix params
     gen_params_p2p = get_all_params(p2p_gen, trainable=True)
     disc_params_p2p = get_all_params(p2p_disc["out"], trainable=True)
     # --------------------
     if verbose:
         print("train_mode: %s" % train_mode)
     if train_mode == 'both':
         updates = opt(gen_loss_dcgan, gen_params_dcgan,
                       **opt_args)  # update dcgan generator
         updates.update(opt(disc_loss_dcgan, disc_params_dcgan,
                            **opt_args))  # update dcgan discriminator
         updates.update(opt(gen_total_loss_p2p, gen_params_p2p,
                            **opt_args))  # update p2p generator
         updates.update(opt(disc_loss_p2p, disc_params_p2p,
                            **opt_args))  # update p2p discriminator
     elif train_mode == 'dcgan':
         updates = opt(gen_loss_dcgan, gen_params_dcgan,
                       **opt_args)  # update dcgan generator
         updates.update(opt(disc_loss_dcgan, disc_params_dcgan,
                            **opt_args))  # update dcgan discriminator
     else:
         updates = opt(gen_total_loss_p2p, gen_params_p2p,
                       **opt_args)  # update p2p generator
         updates.update(opt(disc_loss_p2p, disc_params_p2p,
                            **opt_args))  # update p2p discriminator
     train_fn = theano.function([Z, X, Y], [
         gen_loss_dcgan, disc_loss_dcgan, gen_loss_p2p, recon_loss,
         disc_loss_p2p
     ],
                                updates=updates,
                                on_unused_input='warn')
     loss_fn = theano.function([Z, X, Y], [
         gen_loss_dcgan, disc_loss_dcgan, gen_loss_p2p, recon_loss,
         disc_loss_p2p
     ],
                               on_unused_input='warn')
     gen_fn = theano.function([X], p2p['gen_out'])
     gen_fn_det = theano.function([X], p2p['gen_out_det'])
     z_fn = theano.function([Z], dcgan['gen_out'])
     z_fn_det = theano.function([Z], dcgan['gen_out_det'])
     self.train_fn = train_fn
     self.loss_fn = loss_fn
     self.gen_fn = gen_fn
     self.gen_fn_det = gen_fn_det
     self.z_fn = z_fn
     self.z_fn_det = z_fn_det
     self.dcgan = dcgan
     self.p2p = p2p
     self.lr = opt_args['learning_rate']
     self.train_keys = [
         'dcgan_gen', 'dcgan_disc', 'p2p_gen', 'p2p_recon', 'p2p_disc'
     ]
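For reference, a minimal self-contained sketch (not the author's code; all names here are illustrative) of the combined pix2pix-style generator objective assembled above: an adversarial term, either least-squares or binary cross-entropy, plus an L1/L2 reconstruction term weighted by alpha.

import theano.tensor as T
from theano.tensor.nnet import binary_crossentropy

def p2p_generator_loss(disc_out_fake, gen_out, target, alpha=100.0,
                       lsgan=False, reconstruction='l1'):
    # adversarial term: push the discriminator's score on G(X) towards "real" (1)
    if lsgan:
        adv = T.sqr(disc_out_fake - 1.).mean()
    else:
        adv = binary_crossentropy(disc_out_fake, T.ones_like(disc_out_fake)).mean()
    # reconstruction term: pixel-wise distance between G(X) and the target Y
    if reconstruction == 'l2':
        recon = T.sqr(gen_out - target).mean()
    else:
        recon = T.abs_(gen_out - target).mean()
    return adv + alpha * recon, adv, recon

# build the symbolic losses on illustrative variables
d_fake = T.fvector('disc_out_fake')   # discriminator outputs for generated pairs
g_out = T.ftensor4('gen_out')         # generated images
y_true = T.ftensor4('y')              # ground-truth images
total_loss, adv_term, recon_term = p2p_generator_loss(d_fake, g_out, y_true)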
Example #43
0
    l2 = dropout(l2, p_drop_conv)

    #l3a = rectify(conv2d(T.cast(l2,'float64'), T.cast(w3,'float64')))
    #l3b = pool_2d(l3a, (2, 2))
    l3 = T.flatten(l2, outdim=2)
    #l3 = dropout(l3, p_drop_conv)

    l4 = rectify(T.dot(l3, w4))
    l4 = dropout(l4, p_drop_hidden)

    pyx = softmax(T.dot(l4, w_o))
    return l1, l2, l3, l4, pyx


X = T.ftensor4()
Y = T.fmatrix()
V = T.fscalar()

w = init_weights((32, 1, 3, 3))
w2 = init_weights((64, 32, 3, 3))
#w3 = init_weights((128, 64, 3, 3))
w4 = init_weights((64 * 6 * 6, 625))
w_o = init_weights((625, 10))

noise_l1, noise_l2, noise_l3, noise_l4, noise_py_x = model(
    X, w, w2, w4, 0.2, 0.5)
l1, l2, l3, l4, py_x = model(X, w, w2, w4, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w, w2, w4, w_o]
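The snippet above stops after defining cost and params; a hedged sketch of a typical continuation (an assumption, not the original code) would compile update, training, and prediction functions from the symbols already in scope (X, Y, cost, params, y_x), assuming theano and theano.tensor as T are imported as in the other examples:

lr = 0.001  # illustrative learning rate, not from the source
updates = [(p, p - lr * g) for p, g in zip(params, T.grad(cost, params))]
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                        allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)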
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # the "image" here is the sentence embedding matrix (emb_size x maxSentLength)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_size = 1+1 + 1+1+3*nkerns[0] + 1+1+3*nkerns[0] + 10*10
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                #now, see the results of svm
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
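A standalone restatement, for illustration only, of the clipped-AdaGrad update rule used in the training function above (the helper name and defaults here are assumptions, not the original code):

import numpy
import theano
import theano.tensor as T

def adagrad_clipped_updates(cost, params, learning_rate=0.008, clip=0.5, eps=1e-10):
    updates = []
    grads = T.grad(cost, params)
    for param, grad in zip(params, grads):
        # one accumulator of squared gradients per parameter
        acc = theano.shared(numpy.zeros_like(param.get_value(borrow=True),
                                             dtype=theano.config.floatX),
                            borrow=True)
        g = T.clip(grad, -clip, clip)   # clip each gradient element to [-clip, clip]
        acc_new = acc + T.sqr(g)        # accumulate squared (clipped) gradients
        updates.append((param, param - learning_rate * g / T.sqrt(acc_new + eps)))
        updates.append((acc, acc_new))
    return updates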
Example #45
0
import numpy as np
import pandas as pd
from materials import iris_dataset

# data
iris_data = iris_dataset()
iris_data = iris_data.reindex(np.random.permutation(iris_data.index))
iris_x = iris_data[iris_data.columns[:4]].as_matrix()
iris_y = pd.get_dummies(iris_data[iris_data.columns[4]]).values

input_dim = iris_x.shape[1]
hidden_dim = 9
output_dim = iris_y.shape[1]

# models
X = T.fmatrix('x')
Y = T.fmatrix('y')

W_i = theano.shared(np.random.randn(input_dim, hidden_dim), name='W')
b_i = theano.shared(np.zeros((hidden_dim, )), name='b')
W_h = theano.shared(np.random.randn(hidden_dim, output_dim), name='W')
b_h = theano.shared(np.zeros((output_dim, )), name='b')

o_h = T.nnet.sigmoid(T.dot(X, W_i) + b_i)
p_y_given_x = T.nnet.sigmoid(T.dot(o_h, W_h) + b_h)

# training setup
params = [W_i, b_i, W_h, b_h]
predict_func = theano.function(inputs=[X],
                               outputs=p_y_given_x,
                               allow_input_downcast=True)
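The iris snippet above defines the forward pass and a predict function but cuts off before training; a possible continuation (an assumption, using the X, Y, p_y_given_x and params already defined) could be:

cost = T.nnet.categorical_crossentropy(p_y_given_x, Y).mean()
grads = T.grad(cost, params)
learning_rate = 0.1  # illustrative value
updates = [(p, p - learning_rate * g) for p, g in zip(params, grads)]
train_func = theano.function(inputs=[X, Y], outputs=cost,
                             updates=updates, allow_input_downcast=True)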
Example #46
0
    dense_1     = DenseLayer(input_state,
                             num_units    = n_input,
                             nonlinearity = tanh)

    dense_2     = DenseLayer(dense_1,
                             num_units    = n_input,
                             nonlinearity = tanh)

    probs       = DenseLayer(dense_2,
                             num_units    = n_output,
                             nonlinearity = softmax)

    return probs

X_state          = T.fmatrix()
X_action         = T.bvector()
X_reward         = T.fvector()

X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)

policy_ = get_output(prob_values)
policy  = theano.function(inputs               = [X_state],
                          outputs              = policy_,
                          allow_input_downcast = True)

loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()
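The policy-gradient snippet ends with the reward-weighted cross-entropy loss; one plausible way (an assumption, not shown in the source) to finish it with Lasagne's Adam optimizer would be:

from lasagne.layers import get_all_params
from lasagne.updates import adam

# gather trainable parameters of the policy network and build the update rule
params = get_all_params(prob_values, trainable=True)
updates = adam(loss, params, learning_rate=1e-3)
train_step = theano.function(inputs=[X_state, X_action, X_reward],
                             outputs=loss,
                             updates=updates,
                             allow_input_downcast=True)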
Example #47
0
def test_local_gpu_elemwise():
    """
    Test local_gpu_elemwise when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    return  # Not yet implemented
    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s),
         identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32'))
    f = theano.function([a, b],
                        outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
Example #48
0
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)

        # Number of capitalization features
        if cap_dim:
            n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        alpha_mask = T.fmatrix(name='alpha_mask')
        char_for_ids = T.imatrix(name='char_for_ids')
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        tag_ids = T.ivector(name='tag_ids')
        if cap_dim:
            cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings from %s...' % pre_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word.lower()) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word.lower())
                        ]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing, '
                       '%i after lowercasing + zero.') % (
                          c_found, c_lower, c_zeros
                      )

        #
        # Chars inputs
        #
        if char_dim:
            input_dim += char_lstm_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')

            char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_for')
            char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True,
                                 name='char_lstm_rev')

            char_lstm_for.link(char_layer.link(char_for_ids))
            char_lstm_rev.link(char_layer.link(char_rev_ids))

            char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]
            char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[
                T.arange(s_len), char_pos_ids
            ]

            inputs.append(char_for_output)
            if char_bidirect:
                inputs.append(char_rev_output)
                input_dim += char_lstm_dim

        #
        # Capitalization feature
        #
        if cap_dim:
            input_dim += cap_dim
            cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer')
            inputs.append(cap_layer.link(cap_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)

        # LSTM for words
        word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )
            tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = - (real_path_score - all_paths_scores)

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(char_layer)
            self.add_component(char_lstm_for)
            params.extend(char_layer.params)
            params.extend(char_lstm_for.params)
            if char_bidirect:
                self.add_component(char_lstm_rev)
                params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if cap_dim:
            self.add_component(cap_layer)
            params.extend(cap_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        if cap_dim:
            eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]
        conf_inputs = eval_inputs + [alpha_mask]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {})
            )
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {})
            )

            f_conf = theano.function(
                inputs=conf_inputs,
                outputs=conf(observations, transitions, alpha_mask),
                givens=({is_train: np.cast['int32'](0)} if dropout else {}),
                on_unused_input='ignore'
            )


        return f_train, f_eval, f_conf
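For illustration, a standalone restatement of the lr_method string format parsed inside build() above; e.g. "sgd-lr_0.005" encodes the method name followed by dash-separated key_value pairs (the helper name is an assumption):

def parse_lr_method(lr_method):
    """e.g. "sgd-lr_0.005" -> ("sgd", {"lr": 0.005})"""
    if '-' not in lr_method:
        return lr_method, {}
    name = lr_method[:lr_method.find('-')]
    parameters = {}
    for x in lr_method[lr_method.find('-') + 1:].split('-'):
        key, value = x.split('_')
        parameters[key] = float(value)
    return name, parameters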
Example #49
0
    if type_mod is "alexnet":
        dim_in = 9216
    if type_mod is "vgg_16":
        dim_in = 25088
    if type_mod is "vgg_19":
        dim_in = 9216
    if type_mod is "googlenet":
        dim_in = 9216
    faceset = "lfpw"
    fd_data = "../../inout/data/face/" + faceset + "_data/"
    path_valid = fd_data + type_mod + "valid.pkl"
    w, h = 50, 50
    if type_mod is not None and type_mod != "":
        w, h = dim_in, 1
    input = T.tensor4("x_input")
    output = T.fmatrix("y_output")

    # Create mixed data
    nbr_sup, nbr_xx, nbr_yy = 676, 0, 0
    id_data = type_mod + "ch_tr_" + str(nbr_sup) + '_' + str(nbr_xx) + '_' +\
        str(nbr_yy)
    # List train chuncks
    l_ch_tr = [
        fd_data + id_data + "_" + str(i) + ".pkl" for i in range(0, 1)]

    time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s')
    fold_exp = "../../exps/" + faceset + "_deep_convaeIN_" + time_exp
    if not os.path.exists(fold_exp):
        os.makedirs(fold_exp)
    nbr_layers = 5
    init_w_path = "../../inout/init_weights/deep_conv_ae_IN_" +\
Example #50
0
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates


def model(X, w_h, w_o):
    h = T.nnet.sigmoid(T.dot(X, w_h))
    pyx = T.nnet.softmax(T.dot(h, w_o))
    return pyx


trX, teX, trY, teY = mnist(onehot=True)

X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((784, 625))
w_o = init_weights((625, 10))

py_x = model(X, w_h, w_o)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w_h, w_o]
updates = sgd(cost, params)

train = theano.function(inputs=[X, Y],
                        outputs=cost,
                        updates=updates,
Example #51
0
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that we loop when there are too many threads.

    """

    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    y = T.lvector('y')

    b = T.fvector('b')

    # We precompute the dot product with a big shape beforehand so that the test
    # of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the error
    # (the launch timed out and was terminated) on GPU cards that are not
    # powerful enough. We need the big shape to check the corner case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size, ), dtype='int32')
    b_values = np.zeros((n_out, ), dtype='float32')
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype='float32')

    dot_value = np.asarray(np.dot(xx, W_values), dtype='float32')
    del W_values
    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)

    assert any([
        isinstance(node.op, T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
        for node in classify.maker.fgraph.toposort()
    ])
    assert any([
        isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
        for node in classify_gpu.maker.fgraph.toposort()
    ])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
Example #52
0
        unsup_weight_var = T.scalar('unsup_weight')
    
    learning_rate_var = T.scalar('learning_rate')
    adam_beta1_var = T.scalar('adam_beta1')
    
#    #Left sdp length
#    left_sdp_length=T.imatrix('left_sdp_length')
#    #Sentences length
#    sen_length=T.imatrix('sen_length')
    
    #negative loss
    negative_loss_alpha=T.fvector("negative_loss_alpha")
    negative_loss_lamda=T.fscalar("negative_loss_lamda") 
    
    #input attention entity and root
    input_root=T.fmatrix("input_root")
    input_e1=T.fmatrix("input_e1")
    input_e2=T.fmatrix("input_e2")
    epoch_att=T.iscalar("epoch_att")
    
    """
    2.
    Build GRU network
    ADAM
    """
    gru_network,l_in,l_mask,l_gru_forward,l_split_cnn=model.bulit_gru(input_var,mask_var)
    
    # mask_train_input: entries with "1" are passed through, entries with "0" are masked out.
    mask_train_input=kbp_data.mask_train_input(training_label,num_labels=model.num_labels)
    
    # Create a loss expression for training, i.e., a scalar objective we want
Example #53
0
def make_predict_next(net): 
    out_prev = T.imatrix()
    rep_prev = T.fmatrix()
    rep = net.LM(rep_prev, net.Embed(out_prev))
    out = softmax3d(net.Embed.unembed(net.ToTxt(rep)))
    return theano.function([rep_prev, out_prev], [last(rep), out])
Example #54
0
if __name__ == '__main__':
    import os
    os.environ[
        'THEANO_FLAGS'] = "floatX=float32, mode=FAST_RUN, lib.cnmem=0, warn_float64='raise'"
    import numpy as np, time
    import theano
    from lasagne_ext.objectives import CTC_Logscale
    from theano import tensor
    from torch.autograd import Variable
    # from ctc import best_path_decode
    # np.random.seed(33)
    B = 10
    C = 50
    L = 10
    T = 500
    x1, x2, x3, x4, x5 = tensor.fmatrix(name='queryseq'), \
                         tensor.tensor3(dtype='float32', name='scorematrix'), \
                         tensor.fmatrix(name='queryseq_mask'),\
                         tensor.fmatrix(name='scorematrix_mask'), \
                         tensor.fscalar(name='blank_symbol')

    scorematrix = np.random.rand(T, C + 1, B).astype(np.float32)
    query = np.random.randint(0, C, (L, B)).astype(np.float32)
    query_mask = np.random.rand(L, B) > 0.1
    sm_mask = np.random.rand(T, B) > 0.1

    result = CTC_Logscale.cost(x1, x2, x3, x4, x5, align='pre')
    f2 = theano.function([x1, x2, x3, x4, x5], result, on_unused_input='warn')

    time2 = time.time()
    result = f2(query, scorematrix, query_mask.astype(np.float32),
    def __init__(self, We_initial, char_embedd_table_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        We_inf = theano.shared(We_initial)

        embsize = We_initial.shape[1]
        hidden = params.hidden

        hidden_inf = params.hidden_inf

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        t_t = T.fscalar()

        Wyy0 = np.random.uniform(
            -0.02, 0.02,
            (params.num_labels + 1, params.num_labels)).astype('float32')
        Wyy = theano.shared(Wyy0)

        char_input_var = T.itensor3()

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)
        char_embedd_table_inf = theano.shared(char_embedd_table_initial)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                           axis=2)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=params.num_labels,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'ccctag_BiLSTM_CNN_CRF_num_filters_30_dropout_1_LearningRate_0.01_0.0_400_emb_1_tagversoin_2.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):

            p.set_value(data[idx])

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))

        l_emb_word_a = lasagne.layers.EmbeddingLayer(
            l_in_word_a,
            input_size=We_initial.shape[0],
            output_size=embsize,
            W=We_inf,
            name='inf_word_embedding')

        layer_char_input_a = lasagne.layers.InputLayer(
            shape=(None, None, Max_Char_Length),
            input_var=char_input_var,
            name='char-input')

        layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
        layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
            layer_char_a,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table_inf,
            name='char_embedding')

        layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a,
                                                      pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters
        #_, sent_length, _ = incoming2.output_shape

        # dropout before cnn?
        if params.dropout:
            layer_char_a = lasagne.layers.DropoutLayer(layer_char_a, p=0.5)

        # construct convolution layer
        cnn_layer_a = lasagne.layers.Conv1DLayer(
            layer_char_a,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for max pooling (the pool should span all time steps of the CNN output)
        #_, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a,
                                                     pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a,
                                                    (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word_a = lasagne.layers.concat(
            [output_cnn_layer_a, l_emb_word_a], axis=2)

        if params.dropout:
            l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

        if (params.inf == 0):
            l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a)
            l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a,
                                                      hidden_inf,
                                                      mask_input=l_mask_word_a,
                                                      backwards=True)

            l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a,
                                                       (-1, hidden_inf))
            l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a,
                                                       (-1, hidden_inf))
            concat2_a = lasagne.layers.ConcatLayer(
                [l_reshapef_a, l_reshapeb_a])
        else:
            """
			### unigram
                        l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1))
			#l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 3, 1, pad = 'same')
                        #l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same')
                        #l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1)
                        l_cnn_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same')
                        concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
                        #concat2_a = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2)
                        concat2_a = lasagne.layers.ReshapeLayer(concat2_a ,(-1, hidden))
			"""
            """
			#### unigram + trigram
			l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1))
			l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 3, 1, pad = 'same')
			l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same')
			l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1)
			concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
			concat2_a = lasagne.layers.ReshapeLayer(concat2_a ,(-1, 2*hidden))
			"""
            #### unigram + 5-gram
            l_cnn_input_a = lasagne.layers.DimshuffleLayer(
                l_emb_word_a, (0, 2, 1))
            l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   3,
                                                   1,
                                                   pad='same')
            l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a,
                                                   hidden_inf,
                                                   1,
                                                   1,
                                                   pad='same')
            l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a],
                                                 axis=1)
            concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
            concat2_a = lasagne.layers.ReshapeLayer(concat2_a,
                                                    (-1, 2 * hidden_inf))

        if params.dropout:
            concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

        l_local_a = lasagne.layers.DenseLayer(
            concat2_a,
            num_units=params.num_labels,
            nonlinearity=lasagne.nonlinearities.softmax)

        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        self.a_params = a_params

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input_a: char_input_var
            })
        local_energy = local_energy.reshape((-1, length, params.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbole of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        predy0 = lasagne.layers.get_output(
            l_local_a, {
                l_in_word_a: input_var,
                l_mask_word_a: mask_var,
                layer_char_input_a: char_input_var
            })
        predy_inf = lasagne.layers.get_output(
            l_local_a, {
                l_in_word_a: input_var,
                l_mask_word_a: mask_var,
                layer_char_input_a: char_input_var
            },
            deterministic=True)
        predy_inf = predy_inf.reshape((-1, length, params.num_labels))

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, params.num_labels)
        A = A.reshape((-1, length, params.num_labels))

        predy = predy0.reshape((-1, length, params.num_labels))
        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the ground-truth energy

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)

        predy_f = predy.reshape((-1, params.num_labels))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy_f + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

        #from adam import adam
        #updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a,
                                                   a_params,
                                                   momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function([
                input_var, char_input_var, target_var, mask_var, mask_var1,
                length, t_t
            ], [cost, ce_hinge],
                                            updates=updates_a,
                                            on_unused_input='ignore')
        else:
            self.train_fn = theano.function([
                input_var, char_input_var, target_var, mask_var, mask_var1,
                length, t_t
            ], [cost, entropy_term],
                                            updates=updates_a,
                                            on_unused_input='ignore')

        prediction = T.argmax(predy_inf, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function([
            input_var, char_input_var, target_var, mask_var, mask_var1, length
        ], [corr_train, num_tokens, prediction],
                                       on_unused_input='ignore')
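
The scan above accumulates the energy of a linear-chain CRF: local label scores plus pairwise transition scores from Wyy, whose last row and column are reserved for the start and end symbols. The following standalone NumPy sketch (variable names and shapes are illustrative assumptions, not the original symbols) shows the quantity being accumulated for a single hard label sequence.

import numpy as np

def chain_energy(local_energy, labels, Wyy):
    # local_energy: (seq_len, num_labels), labels: (seq_len,) ints,
    # Wyy: (num_labels + 1, num_labels + 1), extra row/column for start/end
    seq_len = len(labels)
    score = Wyy[-1, labels[0]]                               # start -> y_0
    score += local_energy[np.arange(seq_len), labels].sum()  # local terms
    for t in range(1, seq_len):
        score += Wyy[labels[t - 1], labels[t]]               # y_{t-1} -> y_t
    score += Wyy[labels[-1], -1]                             # y_{T-1} -> end
    return score

rng = np.random.RandomState(0)
print(chain_energy(rng.rand(5, 3), np.array([0, 2, 1, 1, 0]), rng.rand(4, 4)))
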
Example #56
0
    def test_theano_grad(self):
        class AttentionLayer(object):
            def __init__(self, u, mask=None):
                self.u = theano.shared(value=u)
                self.mask = mask

            def get_output_expr(self, input_expr):
                input_expr = input_expr.dimshuffle(0, 2, 1)
                pre_a = T.dot(input_expr, self.u)[:, :, 0]
                if self.mask:
                    pre_a = self.mask * pre_a - \
                            (1 - self.mask) * 3.402823466e+38
                a = T.nnet.softmax(pre_a)[:, :, np.newaxis]
                return T.sum(a * input_expr, axis=1)

        class LogisticRegressionLayer(object):
            def __init__(self, W, b):
                self.W = theano.shared(value=W)
                if b is not None:
                    self.b = theano.shared(value=b[0])

            def get_output_expr(self, input_expr):
                if hasattr(self, 'b'):
                    return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)
                else:
                    return T.nnet.sigmoid(T.dot(input_expr, self.W))

        r = []
        for i in xrange(self.N):
            batch_size = self.rng.random_integers(500)
            x_dim = self.rng.random_integers(3000)
            n_ts = self.rng.random_integers(100)
            x = [
                self.rng.rand(batch_size, x_dim).astype(np.float32)
                for _ in xrange(n_ts)
            ]
            u = self.get_orthogonal_matrix(x_dim, 1)
            lr_dot_W = self.get_orthogonal_matrix(x_dim, 1)
            lr_dot_b = self.rng.rand(1, 1).astype(
                np.float32) if self.rng.randint(2) else None
            true_labels = self.rng.randint(2, size=(batch_size,
                                                    1)).astype(np.float32)
            mask = self.rng.randint(2, size=(batch_size, n_ts)).astype(
                np.float32) if self.rng.randint(2) else None
            device_id = 0

            # Theano model
            state = self.rng.get_state()
            th_x = T.ftensor3()
            th_mask = T.fmatrix() if mask is not None else None

            th_true_labels = T.fmatrix()
            attnt_layer = AttentionLayer(u, th_mask)
            lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b)
            probs = th_x
            for layer in [attnt_layer, lr_layer]:
                probs = layer.get_output_expr(probs)
            loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))

            params = [lr_layer.W, attnt_layer.u, th_x]
            if hasattr(lr_layer, 'b'):
                params.append(lr_layer.b)
            th_grads = T.grad(loss, wrt=params)
            get_theano_grads = theano.function(
                [th_x, th_true_labels] +
                ([th_mask] if mask is not None else []), th_grads)
            th_grads = get_theano_grads(
                *([np.dstack(x), true_labels] +
                  ([mask] if mask is not None else [])))

            # quagga model
            self.rng.set_state(state)
            x = List([Connector(Matrix.from_npa(e), device_id) for e in x])
            u = Connector(Matrix.from_npa(u), device_id)
            lr_dot_W = Connector(Matrix.from_npa(lr_dot_W), device_id)
            lr_dot_b = Connector(
                Matrix.from_npa(lr_dot_b),
                device_id) if lr_dot_b is not None else lr_dot_b
            true_labels = Connector(Matrix.from_npa(true_labels))
            if mask is not None:
                mask = Connector(Matrix.from_npa(mask))

            attnt_block = AttentionBlock(x, u, mask)
            lrdot_block = DotBlock(lr_dot_W, lr_dot_b, attnt_block.output)
            sce_block = SigmoidCeBlock(lrdot_block.output, true_labels)

            x.fprop()
            true_labels.fprop()
            u.fprop()
            lr_dot_W.fprop()
            if lr_dot_b:
                lr_dot_b.fprop()
            attnt_block.fprop()
            lrdot_block.fprop()
            sce_block.fprop()
            sce_block.bprop()
            lrdot_block.bprop()
            attnt_block.bprop()
            q_grads = [
                lr_dot_W.backward_matrix.to_host(),
                u.backward_matrix.to_host(),
                np.dstack([e.backward_matrix.to_host() for e in x])
            ]
            if lr_dot_b:
                q_grads.append(lr_dot_b.backward_matrix.to_host())

            for th_grad, q_grad in izip(th_grads, q_grads):
                r.append(np.allclose(th_grad, q_grad, atol=1.e-7))
                print r[-1]

        self.assertEqual(sum(r), len(r))
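
For reference, the attention layer under test computes a masked softmax over time followed by a weighted sum of the inputs. A small NumPy sketch of that pooling (operating on the already time-major-transposed input, with names chosen for illustration) is:

import numpy as np

def masked_attention(x, u, mask=None):
    # x: (batch, time, dim), u: (dim,), mask: (batch, time) of 0/1 or None
    scores = x.dot(u)                                           # (batch, time)
    if mask is not None:
        scores = mask * scores - (1 - mask) * 3.402823466e+38   # masked -> -inf
    scores = scores - scores.max(axis=1, keepdims=True)         # numerical stability
    a = np.exp(scores)
    a = a / a.sum(axis=1, keepdims=True)                        # softmax over time
    return (a[:, :, None] * x).sum(axis=1)                      # (batch, dim)
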
Example #57
0
def train():
    global logfile_path
    global trainfile
    global train0file
    global test1file

    batch_size = int(256)
    embedding_size = 300
    learning_rate = 0.005
    n_epochs = 20000
    words_num_dim = 1200
    validation_freq = 10
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500
    margin_size = 0.05

    logfile_path = os.path.join(logfile_path, 'LSTM-' + GetNowTime() + '-' \
                   + 'batch_size-' + str(batch_size) + '-' \
                   + 'num_filters-' + str(num_filters) + '-' \
                   + 'embedding_size-' + str(embedding_size) + '-' \
                   + 'n_epochs-' + str(n_epochs) + '-' \
                   + 'freq-' + str(validation_freq) + '-' \
                   + '-log.txt')

    log("New start ...", logfile_path)
    log(str(time.asctime(time.localtime(time.time()))), logfile_path)
    log("batch_size = " + str(batch_size), logfile_path)
    log("filter_sizes = " + str(filter_sizes), logfile_path)
    log("num_filters = " + str(num_filters), logfile_path)
    log("embedding_size = " + str(embedding_size), logfile_path)
    log("learning_rate = " + str(learning_rate), logfile_path)
    log("words_num_dim = " + str(words_num_dim), logfile_path)
    log("n_epochs = " + str(n_epochs), logfile_path)
    log("margin_size = " + str(margin_size), logfile_path)
    log("validation_freq = " + str(validation_freq), logfile_path)
    log("train_1_file = " + str(trainfile.split('/')[-1]), logfile_path)
    log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path)
    log("test_file = " + str(test1file.split('/')[-1]), logfile_path)
    log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path)

    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    train0Dict = load_train0_dict()
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim)
    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0], #row is sequence_len
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        margin_size = margin_size)

    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3], 
        [cost, accuracy], 
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
        }
    )

    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    u1, u2, u3 = T.matrix('u1'), T.matrix('u2'), T.matrix('u3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
        }
    )

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim)
        #print('train_x1, train_x2, train_x3')
        #print(train_x1.shape, train_x2.shape, train_x3.shape)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        log('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc), logfile_path)
        if epoch % validation_freq == 0:
            log('Evaluation ......', logfile_path)
            validation(validate_model, testList, vocab, batch_size, words_num_dim)
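
The LSTM model above is trained with a pairwise ranking objective: the cosine similarity between the question and a correct answer (cos12) should exceed the similarity to a negative answer (cos13) by at least margin_size. Assuming that is what model.cost implements, a minimal NumPy sketch of the hinge cost is:

import numpy as np

def margin_ranking_cost(cos_pos, cos_neg, margin=0.05):
    # cos_pos, cos_neg: (batch,) cosine similarities; mean hinge loss over the batch
    return np.maximum(0.0, margin - cos_pos + cos_neg).mean()

print(margin_ranking_cost(np.array([0.9, 0.4]), np.array([0.2, 0.5])))
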
Example #58
0
    def _prepare_networks(self, n_items):
        ''' Prepares the building blocks of the RNN, but does not compile them:
        self.l_in : input layer
        self.l_mask : mask of the input layer
        self.target : target of the network
        self.l_out : output of the network
        self.cost : cost function
        '''

        self.n_items = n_items

        # Theano tensor for the targets
        input_var = theano.sparse.csr_matrix('input_var')
        self.target = T.ivector('target_output')
        self.exclude = T.fmatrix('excluded_items')
        self.samples = T.ivector('samples')
        self.cluster_samples = T.ivector('cluster_samples')

        # The input is composed of two parts: the one-hot encoding of the movie, and the features of the movie
        self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size,
                                                     self.n_items),
                                              input_var=input_var)

        l_user_rep = SparseLayer(self.l_in,
                                 num_units=self.n_hidden,
                                 nonlinearity=None,
                                 b=None)

        self.user_representation_layer = l_user_rep

        # The user representation is then passed through a linear layer to obtain the right output size
        self.l_out = BlackoutLayer(l_user_rep,
                                   num_units=self.n_items,
                                   num_outputs=self.n_samples,
                                   nonlinearity=None,
                                   W=lasagne.init.GlorotUniform())

        # lasagne.layers.get_output produces a variable for the output of the net
        network_output = lasagne.layers.get_output(self.l_out,
                                                   targets=self.target,
                                                   samples=self.samples)

        # loss function
        self.cost = self._loss(network_output, self.batch_size).mean()
        if self.reg > 0.:
            self.cost += self.reg * lasagne.regularization.regularize_network_params(
                self.l_out, lasagne.regularization.l2)
        elif self.reg < 0.:
            self.cost -= self.reg * lasagne.regularization.regularize_network_params(
                self.l_out, lasagne.regularization.l1)

        # Cluster learning
        self.T_scale = theano.shared(self.effective_scale)
        scaled_softmax = lambda x: lasagne.nonlinearities.softmax(
            x * self.T_scale)

        self.cluster_selection_layer = lasagne.layers.DenseLayer(
            l_user_rep, b=None, num_units=self.n_clusters, nonlinearity=None)
        cluster_selection = lasagne.layers.get_output(
            self.cluster_selection_layer)
        if self.cluster_selection_noise > 0.:
            cluster_selection = cluster_selection + self._srng.normal(
                cluster_selection.shape,
                avg=0.0,
                std=self.cluster_selection_noise)
        cluster_selection = scaled_softmax(cluster_selection)

        self.cluster_repartition = theano.shared(
            (0.1 * np.random.randn(self.n_items, self.n_clusters)).astype(
                theano.config.floatX))
        if self.cluster_type == 'softmax':
            target_and_samples_clusters = scaled_softmax(
                self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :])
        elif self.cluster_type == 'mix':
            target_and_samples_clusters = (
                scaled_softmax(self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :]) +
                T.nnet.sigmoid(self.T_scale * self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :]))
        else:
            target_and_samples_clusters = T.nnet.sigmoid(
                self.T_scale * self.cluster_repartition[
                    T.concatenate([self.target, self.cluster_samples]), :])
        cluster_score = cluster_selection.dot(target_and_samples_clusters.T)

        self.cost_clusters = self._loss(cluster_score, self.batch_size).mean()
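
The cluster-selection branch relies on a temperature-scaled softmax: the logits are multiplied by T_scale before normalisation, so a large scale drives the assignment towards one-hot while a small one keeps it soft. A standalone NumPy sketch of that behaviour (names are illustrative):

import numpy as np

def scaled_softmax(logits, scale):
    # logits: (batch, n_clusters); larger `scale` gives a sharper distribution
    z = logits * scale
    z = z - z.max(axis=1, keepdims=True)   # numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

logits = np.array([[1.0, 0.5, -0.2]])
print(scaled_softmax(logits, 1.0))    # soft assignment
print(scaled_softmax(logits, 10.0))   # nearly one-hot
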
Example #59
0
def main(options):
    print 'Build and compile network'
    input_data = T.ftensor3('input_data')
    input_mask = T.fmatrix('input_mask')
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')

    network = build_network(
        input_data=input_data,
        input_mask=input_mask,
        num_inputs=options['num_inputs'],
        num_inner_units_list=options['num_inner_units_list'],
        num_factor_units_list=options['num_factor_units_list'],
        num_outer_units_list=options['num_outer_units_list'],
        num_outputs=options['num_outputs'],
        gating_nonlinearity=options['gating_nonlinearity'],
        dropout_ratio=options['dropout_ratio'],
        weight_noise=options['weight_noise'],
        use_layer_norm=options['use_layer_norm'],
        peepholes=options['peepholes'],
        learn_init=options['learn_init'],
        grad_clipping=options['grad_clipping'])

    network_params = get_all_params(network, trainable=True)

    if options['reload_model']:
        print('Loading Parameters...')
        pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt = pickle.load(
            open(options['reload_model'], 'rb'))

        print('Applying Parameters...')
        set_model_param_value(network_params, pretrain_network_params_val)
    else:
        pretrain_update_params_val = None
        pretrain_total_batch_cnt = 0

    print 'Build network trainer'
    training_fn, trainer_params = set_network_trainer(
        input_data=input_data,
        input_mask=input_mask,
        target_data=target_data,
        target_mask=target_mask,
        num_outputs=options['num_outputs'],
        network=network,
        updater=options['updater'],
        learning_rate=options['lr'],
        grad_max_norm=options['grad_norm'],
        l2_lambda=options['l2_lambda'],
        load_updater_params=pretrain_update_params_val)

    print 'Build network predictor'
    predict_fn = set_network_predictor(input_data=input_data,
                                       input_mask=input_mask,
                                       target_data=target_data,
                                       target_mask=target_mask,
                                       num_outputs=options['num_outputs'],
                                       network=network)

    print 'Load data stream'
    train_datastream = get_datastream(path=options['data_path'],
                                      which_set='train_si84',
                                      batch_size=options['batch_size'])

    print 'Start training'
    if os.path.exists(options['save_path'] + '_eval_history.npz'):
        evaluation_history = numpy.load(
            options['save_path'] +
            '_eval_history.npz')['eval_history'].tolist()
    else:
        evaluation_history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]]
    early_stop_flag = False
    early_stop_cnt = 0
    total_batch_cnt = 0

    try:
        # for each epoch
        for e_idx in range(options['num_epochs']):
            # for each batch
            for b_idx, data in enumerate(
                    train_datastream.get_epoch_iterator()):
                total_batch_cnt += 1
                if pretrain_total_batch_cnt >= total_batch_cnt:
                    continue

                # get input, target data
                input_data = data[0].astype(floatX)
                input_mask = data[1].astype(floatX)

                # get target data
                target_data = data[2]
                target_mask = data[3].astype(floatX)

                # get output
                train_output = training_fn(input_data, input_mask, target_data,
                                           target_mask)
                train_predict_cost = train_output[0]
                network_grads_norm = train_output[1]

                # show intermediate result
                if total_batch_cnt % options[
                        'train_disp_freq'] == 0 and total_batch_cnt != 0:
                    best_idx = numpy.asarray(evaluation_history)[:, 1,
                                                                 2].argmin()
                    print '============================================================================================'
                    print 'Model Name: ', options['save_path'].split('/')[-1]
                    print '============================================================================================'
                    print 'Epoch: ', str(e_idx), ', Update: ', str(
                        total_batch_cnt)
                    print '--------------------------------------------------------------------------------------------'
                    print 'Prediction Cost: ', str(train_predict_cost)
                    print 'Gradient Norm: ', str(network_grads_norm)
                    print '--------------------------------------------------------------------------------------------'
                    print 'Train NLL: ', str(
                        evaluation_history[-1][0][0]), ', BPC: ', str(
                            evaluation_history[-1][0][1]), ', FER: ', str(
                                evaluation_history[-1][0][2])
                    print 'Valid NLL: ', str(
                        evaluation_history[-1][1][0]), ', BPC: ', str(
                            evaluation_history[-1][1][1]), ', FER: ', str(
                                evaluation_history[-1][1][2])
                    print '--------------------------------------------------------------------------------------------'
                    print 'Best NLL: ', str(
                        evaluation_history[best_idx][1][0]), ', BPC: ', str(
                            evaluation_history[best_idx][1]
                            [1]), ', FER: ', str(
                                evaluation_history[best_idx][1][2])

                # evaluation
                if total_batch_cnt % options[
                        'train_eval_freq'] == 0 and total_batch_cnt != 0:
                    train_eval_datastream = get_datastream(
                        path=options['data_path'],
                        which_set='train_si84',
                        batch_size=options['eval_batch_size'])
                    valid_eval_datastream = get_datastream(
                        path=options['data_path'],
                        which_set='test_dev93',
                        batch_size=options['eval_batch_size'])
                    train_nll, train_bpc, train_fer = network_evaluation(
                        predict_fn, train_eval_datastream)
                    valid_nll, valid_bpc, valid_fer = network_evaluation(
                        predict_fn, valid_eval_datastream)

                    # check over-fitting
                    if valid_fer > numpy.asarray(evaluation_history)[:, 1,
                                                                     2].min():
                        early_stop_cnt += 1.
                    else:
                        early_stop_cnt = 0.
                        best_network_params_vals = get_model_param_values(
                            network_params)
                        pickle.dump(
                            best_network_params_vals,
                            open(options['save_path'] + '_best_model.pkl',
                                 'wb'))

                    if early_stop_cnt > 10:
                        early_stop_flag = True
                        break

                    # save results
                    evaluation_history.append(
                        [[train_nll, train_bpc, train_fer],
                         [valid_nll, valid_bpc, valid_fer]])
                    numpy.savez(options['save_path'] + '_eval_history',
                                eval_history=evaluation_history)

                # save network
                if total_batch_cnt % options[
                        'train_save_freq'] == 0 and total_batch_cnt != 0:
                    cur_network_params_val = get_model_param_values(
                        network_params)
                    cur_trainer_params_val = get_update_params_values(
                        trainer_params)
                    cur_total_batch_cnt = total_batch_cnt
                    pickle.dump([
                        cur_network_params_val, cur_trainer_params_val,
                        cur_total_batch_cnt
                    ], open(options['save_path'] + '_last_model.pkl', 'wb'))

            if early_stop_flag:
                break

    except KeyboardInterrupt:
        print 'Training Interrupted'
        cur_network_params_val = get_model_param_values(network_params)
        cur_trainer_params_val = get_update_params_values(trainer_params)
        cur_total_batch_cnt = total_batch_cnt
        pickle.dump([
            cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt
        ], open(options['save_path'] + '_last_model.pkl', 'wb'))
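
The evaluation bookkeeping above appends one [[train_nll, train_bpc, train_fer], [valid_nll, valid_bpc, valid_fer]] entry per evaluation, so history[:, 1, 2] is the validation-FER column; early stopping fires after more than 10 evaluations without improving it. A compact sketch of that logic with dummy numbers (the helpers here are placeholders, not the original ones):

import numpy as np

history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]]   # dummy first entry
early_stop_cnt = 0
for valid_fer in [0.9, 0.8, 0.85, 0.82, 0.79, 0.81]:
    if valid_fer > np.asarray(history)[:, 1, 2].min():
        early_stop_cnt += 1      # no improvement on validation FER
    else:
        early_stop_cnt = 0       # improved: this is where the best model gets saved
    history.append([[0.0, 0.0, 0.0], [0.0, 0.0, valid_fer]])
    if early_stop_cnt > 10:
        break
best_idx = np.asarray(history)[:, 1, 2].argmin()
print(best_idx)
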
Example #60
0
def orig_model(filters_list, outdim, cost, input_dims = (1, 23, 23), activation="rectify", **kwargs):
    #Emean, Estd, max_mol_size, num_dist_basis, c_len, num_species,
    #    num_interaction_passes, num_hidden_neurons, values_to_predict,cost):

    # used when the path to targets_file is not None
    # sym_coulomb = T.imatrix()
    sym_coulomb = T.ftensor4()
    sym_y = T.fmatrix()
    sym_learn_rate = T.scalar()

    try:
        nonlinearity = getattr(lasagne.nonlinearities, activation)
    except AttributeError as e:
        print(e)
        raise RuntimeError("Activation {} missing in lasagne.nonlinearities.".format(activation))
    
    # layer_input_dims = (None, *input_dims) # (None, 1, 23, 23) if input_dims == (1, 23, 23)
    layer_input_dims = [None]
    layer_input_dims.extend(input_dims)
    
    layers = []
    layers.append(lasagne.layers.InputLayer(layer_input_dims, name="layer_input"))

    for idx, num_filters in enumerate(filters_list):
        layers.append(Conv2DLayer(layers[-1],
                                  num_filters=num_filters,
                                  filter_size=3,
                                  pad="same",
                                  flip_filters=False,
                                  nonlinearity=nonlinearity,
                                  name="layer_conv_{}_1".format(idx)))
        layers.append(Conv2DLayer(layers[-1],
                                  num_filters=num_filters,
                                  filter_size=3,
                                  pad="same",
                                  flip_filters=False,
                                  nonlinearity=nonlinearity,
                                  name="layer_conv_{}_2".format(idx)))
        layers.append(Conv2DLayer(layers[-1],
                                  num_filters=num_filters,
                                  filter_size=3,
                                  pad="same",
                                  flip_filters=False,
                                  nonlinearity=nonlinearity,
                                  name="layer_conv_{}_3".format(idx)))
        layers.append(MaxPool2DLayer(layers[-1],
                                     pool_size=2,
                                     name="layer_maxpool_1"))
    
    layers.append(FlattenLayer(layers[-1]))
    layers.append(DenseLayer(layers[-1], num_units = outdim, nonlinearity=lasagne.nonlinearities.linear))
    l_out = layers[-1] 

    # l_in_Z = lasagne.layers.InputLayer((None, max_mol_size))
    # l_in_D = lasagne.layers.InputLayer((None, max_mol_size, max_mol_size, num_dist_basis))
    # l_mask = MaskLayer(l_in_Z)
    # l_c0 = SwitchLayer(l_in_Z, num_species, c_len, W=lasagne.init.Uniform(1.0/np.sqrt(c_len)))

    # l_cT = RecurrentLayer(l_c0, l_in_D, l_mask, num_passes=num_interaction_passes, num_hidden=num_hidden_neurons)

    # # Compute energy contribution from each atom
    # l_atom1 = lasagne.layers.DenseLayer(l_cT, 15, nonlinearity=lasagne.nonlinearities.tanh, num_leading_axes=2) # outdim (-1, 23, 15)
    # l_atom2 = lasagne.layers.DenseLayer(l_atom1, values_to_predict, nonlinearity=None, num_leading_axes=2) # outdim (-1, 23, values_to_predict)
    # l_atomE = lasagne.layers.ExpressionLayer(l_atom2, lambda x: (x*Estd+Emean)) # Scale and shift by mean and std deviation
    # l_mask = lasagne.layers.ReshapeLayer(l_mask, ([0], [1], 1)) # add an extra dimension so that l_atomE (-1, 23, 16) l_mask "after reshape" (-1, 23, 1) can be multiplied
    # l_out = SumMaskedLayer(l_atomE, l_mask)

    params = lasagne.layers.get_all_params(l_out, trainable=True)
    for p in params:
        logger.debug("%s, %s" % (p, p.get_value().shape))

    # out_train = lasagne.layers.get_output(l_out, {l_in_Z: sym_Z, l_in_D: sym_D}, deterministic=False)
    # out_test = lasagne.layers.get_output(l_out, {l_in_Z: sym_Z, l_in_D: sym_D}, deterministic=True)
    out_train = lasagne.layers.get_output(l_out, {layers[0] : sym_coulomb}, deterministic=False)
    out_test = lasagne.layers.get_output(l_out, {layers[0] : sym_coulomb}, deterministic=True)
    if cost == "mae":
        cost_train = T.mean(T.abs_(out_train - sym_y))
        cost_test = T.mean(T.abs_(out_test - sym_y))
        logger.info("Used MAE cost")
    elif cost == "rmse":
        # note: despite the option name, this is mean squared error (no square root)
        cost_train = T.mean(lasagne.objectives.squared_error(out_train, sym_y))
        cost_test = T.mean(lasagne.objectives.squared_error(out_test, sym_y))
        logger.info("Used MSE cost")
    else:
        raise ValueError("unknown cost function {}".format(cost))

    updates = lasagne.updates.adam(cost_train, params, learning_rate=sym_learn_rate)

    f_train = theano.function(
            inputs = [sym_coulomb, sym_y, sym_learn_rate],
            outputs = cost_train,
            updates = updates
            )

    f_eval_test = theano.function(
            inputs = [sym_coulomb],
            outputs = out_test
            )

    f_test = theano.function(
            inputs = [sym_coulomb, sym_y],
            outputs = cost_test,
            )



    # f_train = theano.function(
    #         inputs = [sym_Z, sym_D, sym_y, sym_learn_rate],
    #         outputs = cost_train,
    #         updates = updates
    #         )

    # f_eval_test = theano.function(
    #         inputs = [sym_Z, sym_D],
    #         outputs = out_test
    #         )

    # f_test = theano.function(
    #         inputs = [sym_Z, sym_D, sym_y],
    #         outputs = cost_test,
    #         )

    return f_train, f_eval_test, f_test, l_out
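
Since orig_model returns compiled Theano functions rather than a model object, a typical call looks like the hedged sketch below (the batch size, filter list, and learning rate are placeholder assumptions matching the default input_dims=(1, 23, 23) and outdim=1, and the module's own imports are assumed to be in place):

import numpy as np

f_train, f_eval_test, f_test, l_out = orig_model(
    filters_list=[16, 32], outdim=1, cost="mae")

X = np.random.rand(8, 1, 23, 23).astype('float32')   # a batch of Coulomb matrices
y = np.random.rand(8, 1).astype('float32')           # regression targets

train_cost = f_train(X, y, 0.001)   # one Adam step at learning rate 1e-3
test_cost = f_test(X, y)            # deterministic evaluation cost
preds = f_eval_test(X)              # predictions, shape (8, 1)
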