Example #1
    def test_train_step_with_backprop(self):
        net = PredCodMLP([3, 2, 2])
        input = np.array([[1, 0, 1]])
        X = add_bias_col(input)
        params = [p.copy() for p in net.params]
        output = np.array([1])

        h = np.maximum(0, X.dot(params[0]))
        h = add_bias_col(h)
        scores = h.dot(params[1])
        loss, dscores = cross_entropy_loss(scores, output)
        #print('pred: ', pred)
        #print('net pred: ', net.predict(input))
        dw2 = h.T.dot(dscores)
        #print('dw2: ', dw2)
        dh = dscores.dot(params[1][:-1, :].T)
        dh[h[:, :-1] <= 0] = 0
        dw1 = X.T.dot(dh)
        configs = [{}, {}]
        params[0], _ = adam(params[0], dw1, configs[0])
        params[1], _ = adam(params[1], dw2, configs[1])

        pc_params, layers = net._PredCodMLP__train_step(
            net.params, input, output, [{}, {}])
        np.testing.assert_array_almost_equal(pc_params[0], params[0])
        np.testing.assert_array_equal(pc_params[1], params[1])
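Note: the single-tensor adam(w, dw, config) helper used here (and again in Examples 3, 4 and 6) is not shown on this page. A minimal sketch of such a helper, assuming cs231n-style config bookkeeping (all defaults below are assumptions), is:

import numpy as np

def adam(w, dw, config=None):
    # Hypothetical single-tensor Adam step; returns the updated tensor and the
    # mutated config dict, matching the adam(param, grad, config) calls above.
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(w))
    config.setdefault('v', np.zeros_like(w))
    config.setdefault('t', 0)

    config['t'] += 1
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dw
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * dw ** 2
    m_hat = config['m'] / (1 - config['beta1'] ** config['t'])
    v_hat = config['v'] / (1 - config['beta2'] ** config['t'])
    next_w = w - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + config['epsilon'])
    return next_w, config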
Example #2
  def build(self, deterministic = False):
    self.model()
    self.predict['pspout'] = lasagne.layers.get_output(self.network['pspout'])
    self.predict['angleout'] = lasagne.layers.get_output(self.network['angleout'])

    self.loss['pspout'] = T.sqrt(lasagne.objectives.squared_error(self.predict['pspout'], self.psp_var).mean())
    self.loss['angleout'] = T.sqrt(lasagne.objectives.squared_error(self.predict['angleout'], self.angler_var).mean())
    self.loss['train'] = self.loss['pspout'] + self.loss['angleout']

    _inputs = [self.input_var, self.psp_var, self.angler_var]
    paras = lasagne.layers.get_all_params(self.network['angleout'], trainable = True)

    print('Building training functions...')
    lrvars = []
    self.layer_rep.append(len(paras))
    for lr, i in zip(self.learning_rate, range(len(self.layer_rep) - 1)):
      lrvars += [theano.shared(np.float32(lr)) for j in range(self.layer_rep[i], self.layer_rep[i + 1])]
    self.updates, self.update_params = adam(self.loss['train'], paras, lrvars)
    self.ftrain = theano.function(_inputs, [], updates = self.updates)

    print('Building validation functions...')
    self.vals['pspout'] = abs((self.predict['pspout'] - self.psp_var) / self.psp_var).mean()
    self.vals['angleout'] = abs((self.predict['angleout'] - self.angler_var) / self.angler_var).mean()
    self.fval = theano.function(_inputs, [self.loss['train']] + 
      [self.vals['pspout'], self.vals['angleout']])
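The adam(self.loss['train'], paras, lrvars) call above expects a helper that takes one shared learning-rate variable per parameter and returns (updates, update_params); its body is not on this page. A rough sketch under that assumption (the moment bookkeeping and bias correction are standard Adam, everything else is a guess):

import numpy as np
import theano
import theano.tensor as T

def adam(loss, params, lrvars, beta1=0.9, beta2=0.999, eps=1e-8):
    # Hypothetical per-parameter-learning-rate Adam: one lrvars[i] per params[i].
    grads = T.grad(loss, params)
    t = theano.shared(np.float32(0.))
    t_new = t + 1.
    updates = [(t, t_new)]
    update_params = []
    for p, g, lr in zip(params, grads, lrvars):
        m = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        v = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        m_new = beta1 * m + (1. - beta1) * g
        v_new = beta2 * v + (1. - beta2) * g ** 2
        step = lr * (m_new / (1. - beta1 ** t_new)) / (T.sqrt(v_new / (1. - beta2 ** t_new)) + eps)
        updates += [(m, m_new), (v, v_new), (p, p - step)]
        update_params += [m, v]
    return updates, update_params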
Example #3
 def __train_step(self,
                  params: List[np.ndarray],
                  input: np.ndarray,
                  output: np.ndarray,
                  optim_configs: List[dict],
                  lr=0.01) -> Tuple[List[np.ndarray], List[np.ndarray]]:
     scores, layers = self.__predict(params, input)
     #layers[-1] = output
     X = add_bias_col(input)
     for t in range(self.num_layers - 1):
         curr_mu = X.dot(params[0])
         curr_err = layers[1] - curr_mu
         if t == self.num_layers - 2:
             N = X.shape[0]
             dW = X.T.dot(curr_err) / N
             params[0], optim_configs[0] = adam(params[0], dW,
                                                optim_configs[0])
         """
         for i in range(1, self.num_layers - 2):
             update_W = t == self.num_layers - 2 - i
             #print(t, i)
             config = optim_configs[i]
             layers[i], params[i], curr_err, optim_configs[i] = self.__update_layer(layers[i], curr_err, params[i], layers[i+1], update_W, config, lr)
         """
         update_W = t == 0
         loss, dscores = cross_entropy_loss(scores, output)
         layers[1], params[1], optim_configs[1] = self.__update_final_layer(
             layers[1], curr_err, params[1], dscores, update_W,
             optim_configs[1], lr)
     return params, layers
Example #4
 def __update_final_layer(self, X:np.ndarray, err: np.ndarray, W: np.ndarray, next_err: np.ndarray, update_W: bool, config=None, lr=0.1) \
     -> Tuple[np.ndarray, np.ndarray, dict]:
     h = add_bias_col(np.maximum(0, X))
     relu_mask = X > 0
     X += -err + relu_mask * next_err.dot(W.T[:, :-1])
     if update_W:
         N = X.shape[0]
         dW = h.T.dot(next_err) / N
         W, config = adam(W, dW, config)
     return X, W, config
Example #5
    def build(self, deterministic=False):
        self.model()
        self.predict["pspout"] = lasagne.layers.get_output(self.network["pspout"], deterministic=deterministic)
        self.predict["pspimg"] = lasagne.layers.get_output(self.network["pspimg"], deterministic=deterministic)
        self.predict["angleout"] = lasagne.layers.get_output(self.network["angleout"], deterministic=deterministic)
        self.predict["angleimg"] = lasagne.layers.get_output(self.network["angleimg"], deterministic=deterministic)
        self.predict["mal"] = lasagne.layers.get_output(self.network["mal"], deterministic=deterministic)

        self.loss["pspout"] = T.sqrt(lasagne.objectives.squared_error(self.predict["pspout"], self.psp_var).mean())
        self.loss["angleout"] = lasagne.objectives.categorical_crossentropy(
            self.predict["angleout"], self.angle_var
        ).mean()
        # self.loss['mal'] = self.predict['mal'].mean()
        self.loss["mal"] = 0.01 * T.sqrt(
            lasagne.objectives.squared_error(T.argmax(self.predict["angleout"]), self.angle_var).mean()
        )

        self.loss["train"] = 0
        _inputs = [self.input_var]
        for i, _opt in enumerate(self.opt):
            if _opt == 1:
                if i < 3:
                    _inputs.append(self.inputs[i])
                self.loss["train"] += self.loss[self.optkey[i]]
                paras = lasagne.layers.get_all_params(self.network[self.optkey[i]], trainable=True)
                print("Loss %s add..." % self.optkey[i])

        if deterministic == False:
            print("Building training functions...")
            lrvars = []
            self.layer_rep.append(len(paras))
            for lr, i in zip(self.learning_rate, range(len(self.layer_rep) - 1)):
                lrvars += [theano.shared(np.float32(lr)) for j in range(self.layer_rep[i], self.layer_rep[i + 1])]
            self.updates, self.update_params = adam(self.loss["train"], paras, lrvars)
            self.ftrain = theano.function(_inputs, [], updates=self.updates)
            print("Building validation functions...")
            self.vals["pspout"] = abs((self.predict["pspout"] - self.psp_var) / self.psp_var).mean()
            self.vals["angleout"] = T.eq(T.argmax(self.predict["angleout"], axis=1), self.angle_var).mean()
            self.vals["mal"] = self.loss["mal"]

            self.fval = theano.function(
                _inputs,
                [self.loss["train"]] + [self.vals[self.optkey[i]] for i in range(len(self.opt)) if self.opt[i] == 1],
            )
        else:
            print("Building middle output functions...")
            for f in self.feat:
                self.ffeat[f] = theano.function([self.input_var], lasagne.layers.get_output(self.network[f]))

            print("Building prediction functions...")
            for key in self.predict:
                if key != "mal":
                    self.fpred[key] = theano.function([self.input_var], self.predict[key])
Example #6
 def __update_layer(self, X: np.ndarray, err: np.ndarray, W: np.ndarray, next_X: np.ndarray, update_W: bool, config=None, lr=.01) \
     -> Tuple[np.ndarray, np.ndarray, np.ndarray, dict]:
     h = add_bias_col(np.maximum(0, X))
     next_mu = h.dot(W)
     next_err = next_X - next_mu
     #print(np.sum(next_err), '\n')
     relu_mask = X > 0
     #print(-err, next_err.dot(W.T[:,:-1]))
     X += -err + relu_mask * next_err.dot(W.T[:, :-1])
     if update_W:
         N = X.shape[0]
         dW = h.T.dot(next_err) / N
         W, config = adam(W, dW, config)
     h = add_bias_col(np.maximum(0, X))
     next_mu = h.dot(W)
     next_err = next_X - next_mu
     return X, W, next_err, config
Example #7
def train(train_x, train_y, val_x, val_y, d, hl, ol, config, lf):

    print("Function Invoked: train")

    epochs = config["epochs"]
    eta = config["learning_rate"]
    alpha = config["weight_decay"]
    init_strategy = config["init_strategy"]
    optimiser = config["optimiser"]
    batch_size = config["batch_size"]
    ac = config["ac"]

    if optimiser == "vgd":
        return vgd.vgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, alpha)

    elif optimiser == "sgd":
        return sgd.sgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, alpha)

    elif optimiser == "mgd":
        return mgd.mgd(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, batch_size, alpha)

    elif optimiser == "nag":
        return nag.nag(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                       epochs, eta, init_strategy, batch_size, alpha)

    elif optimiser == "adam":
        return adam.adam(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                         epochs, eta, init_strategy, batch_size)

    elif optimiser == "rmsprop":
        return rmsprop.rmsprop(train_x, train_y, val_x, val_y, d, hl, ol, ac,
                               lf, epochs, eta, init_strategy, batch_size)

    elif optimiser == "nadam":
        return nadam.nadam(train_x, train_y, val_x, val_y, d, hl, ol, ac, lf,
                           epochs, eta, init_strategy, batch_size)
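For reference, the config dict this dispatcher unpacks would look roughly like the following; the key names are taken from the lookups above, while the values and the lf argument are made-up placeholders:

# Hypothetical configuration and call; values are assumptions.
config = {
    "epochs": 20,
    "learning_rate": 1e-3,
    "weight_decay": 0.0005,
    "init_strategy": "xavier",
    "optimiser": "adam",
    "batch_size": 64,
    "ac": "relu",
}
# result = train(train_x, train_y, val_x, val_y, d, hl, ol, config, lf="cross_entropy")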
Example #8
def evaluate_lenet5(
        learning_rate=0.1,
        n_epochs=200,
        dataset='/home/hudson/ug/tzbq97/deeplearn/data/mnist.pkl.gz',
        nkerns=[20, 50],
        batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples in each minibatch
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected hidden layer (tanh activation)
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected hidden layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    '''grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]'''

    updates = adam(cost,
                   params,
                   learning_rate=0.001,
                   b1=0.9,
                   b2=0.999,
                   e=1e-8,
                   gamma=1 - 1e-8)

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)
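The adam(cost, params, ...) imported by this script returns a Theano updates list and accepts a gamma argument (a slow decay applied to b1), as the call above shows; its body is not included here. A hedged sketch consistent with that signature:

import numpy as np
import theano
import theano.tensor as T

def adam(cost, params, learning_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1 - 1e-8):
    # Hypothetical Adam producing an updates list for theano.function; b1 is decayed
    # by gamma each step, as suggested by the gamma argument in the call above.
    updates = []
    grads = T.grad(cost, params)
    t = theano.shared(np.float32(1.))
    b1_t = b1 * gamma ** (t - 1.)
    for p, g in zip(params, grads):
        m = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        v = theano.shared(np.zeros(p.get_value().shape, dtype=theano.config.floatX))
        m_t = b1_t * m + (1. - b1_t) * g
        v_t = b2 * v + (1. - b2) * g ** 2
        m_hat = m_t / (1. - b1 ** t)
        v_hat = v_t / (1. - b2 ** t)
        updates += [(m, m_t), (v, v_t), (p, p - learning_rate * m_hat / (T.sqrt(v_hat) + e))]
    updates.append((t, t + 1.))
    return updates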
Example #9
inps_sg = inps_net + [target_gradients, activation, latent_gradients, samples]
tparams_net = OrderedDict()
tparams_sg = OrderedDict()

for key, val in tparams.iteritems():
	print key
	# running means and running variances should not be included in the list for which gradients are computed
	if 'rm' in key or 'rv' in key:
		continue
	elif 'sg' in key:
		tparams_sg[key] = val
	else:
		tparams_net[key] = val

print "Setting up optimizers"
f_grad_shared, f_update = adam(lr, tparams_net, grads, inps_net, outs)
# f_grad_shared_sg, f_update_sg = adam(lr, tparams_sg, grads_sg, inps_sg, loss_sg)

# sgd with momentum updates
sgd = SGD(lr=args.learning_rate)
f_update_sg = theano.function(inps_sg, loss_sg, updates=sgd.get_grad_updates(loss_sg, param_sg), on_unused_input='ignore', profile=False)

print "Training"
cost_report = open('./Results/disc/SF/gradcomp_' + code_name + '_' + str(args.batch_size) + '_' + str(args.learning_rate) + '.txt', 'w')
id_order = range(len(trc))

iters = 0
min_cost = 100000.0
epoch = 0
condition = False
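Examples 9, 12 and 15 all call an adam(lr, tparams, grads, inps, outs) that returns two compiled functions, f_grad_shared (forward pass that caches the gradients) and f_update (applies one Adam step), in the style of the Theano LSTM tutorial. A minimal sketch under that assumption; the ups keyword mirrors the updates_bn argument used in Example 15:

import numpy as np
import theano
import theano.tensor as T

def adam(lr, tparams, grads, inps, outs, b1=0.9, b2=0.999, eps=1e-8, ups=None):
    # Shared variables that hold the gradients between the two compiled calls.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    # f_grad_shared computes the outputs (e.g. the cost) and stores the gradients.
    f_grad_shared = theano.function(inps, outs, updates=gsup + (ups or []))

    t = theano.shared(np.float32(0.))
    t_new = t + 1.
    updates = [(t, t_new)]
    for (k, p), g in zip(tparams.items(), gshared):
        m = theano.shared(p.get_value() * 0., name='%s_m' % k)
        v = theano.shared(p.get_value() * 0., name='%s_v' % k)
        m_new = b1 * m + (1. - b1) * g
        v_new = b2 * v + (1. - b2) * g ** 2
        step = lr * (m_new / (1. - b1 ** t_new)) / (T.sqrt(v_new / (1. - b2 ** t_new)) + eps)
        updates += [(m, m_new), (v, v_new), (p, p - step)]
    # f_update applies one Adam step using the cached gradients.
    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')
    return f_grad_shared, f_update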
Example #10
    for i in range(num_of_epochs):
        tmp_train_files = copy.copy(train_files)

        # progress bar (begin)
        sys.stdout.write("Epoch {}: ".format(i + 1))
        sys.stdout.write("[%s]" % (" " * toolbar_width))
        sys.stdout.flush()
        sys.stdout.write("\b" * (toolbar_width + 1))

        # an epoch
        for j in range(train_size // batch_size):
            batch_files = random.sample(tmp_train_files, batch_size)
            for file in batch_files:
                tmp_train_files.remove(file)

            theta, moment_1, moment_2 = adam(batch_files, theta, moment_1,
                                             moment_2)

            # progress bar
            sys.stdout.write("#")
            sys.stdout.flush()

        # progress bar (end)
        sys.stdout.write("] Done.\n")

        loss_list_for_train.append(avg_loss(train_files, theta))
        loss_list_for_valid.append(avg_loss(valid_files, theta))
        accuracy_list_for_train.append(avg_accuracy(train_files, theta))
        accuracy_list_for_valid.append(avg_accuracy(valid_files, theta))

    # plot graphs
    title = "SGD loss on train_data"
Example #11
batch_size = 5000
loss = "ce"
opt = "adam"
lr = 0.001
anneal = False
momentum = 0.1
total_loss_train, total_loss_val = [0] * 4, [0] * 4

for i in range(0, 4):
    weights = [0] * layers
    biases = [0] * layers
    weights, biases = randInit.randomInitializer(weights, biases, inputs_num,
                                                 outputs_num, layers, sizes[i])
    total_loss_train[i], total_loss_val[i], train_weights, train_biases = adam.adam(
        train_input, train_label, val_input, val_label, layers, biases,
        weights, activation, batch_size, loss, epochs, lr, anneal, gamma,
        expt_dir)
    print("value of i is %d" % i)

print(total_loss_train, total_loss_val)

# for i in range(0,4):
#     total_loss_train[i] = total_loss_train[i].reshape(total_loss_train[i].shape[0], 1)
#     total_loss_val[i] = total_loss_val[i].reshape(total_loss_val[i].shape[0], 1)

epoch_count = np.arange(1, epochs + 1)
epoch_count = epoch_count.reshape(epoch_count.shape[0], 1)

plt.figure(1)
plt.plot(epoch_count, total_loss_train[0].T, 'r-', label='50')
plt.plot(epoch_count, total_loss_train[1].T, 'b-', label='100')
Example #12
        grads_sg = T.grad(loss_sg, wrt=sg_params_list)
        grads_net = grad_list_3 + grad_list_2[:2] + grad_list_1[:2]

        # key things
        inps_sg = [semi_synth_grad_1, semi_synth_grad_2, h1, h2]
        required_output_from_graph = [
            grad_list_2[2], grad_list_1[2], out1, out2
        ]

    lr = T.scalar('learning_rate', dtype='float32')

    inps = [img_ids]
    print "Setting up optimizer"

    if train_rou == 'backprop':
        f_grad_shared, f_update = adam(lr, tparams, grads, inps, loss)

    elif train_rou == 'synthetic_gradients':

        # split params for sg modules and network
        tparams_net = OrderedDict()
        tparams_sg = OrderedDict()
        for key, val in tparams.iteritems():
            if 'sg' in key:
                tparams_sg[key] = tparams[key]
            else:
                tparams_net[key] = tparams[key]

        f_grad_shared, f_update = adam(lr, tparams_net, grads_net, inps,
                                       [loss] + required_output_from_graph)
        f_grad_shared_sg, f_update_sg = adam(lr, tparams_sg, grads_sg,
Example #13
 grad = lambda x: H @ x
 # l2 regularization
 lam = 0.1
 f_reg = lambda x: lam * x @ x
 grad_reg = lambda x: 2 * lam * x
 # covariance
 C = 5.0 * np.eye(dim)
 # risk function
 rn = lambda x: x @ x + 0.5 * np.trace(H @ C)
 from adam import adam
 y_best, Y = adam(grad,
                  x0,
                  C,
                  max_iter=1000,
                  batch_size=10,
                  eta=2,
                  beta1=0.9,
                  beta2=0.9,
                  eps=1e-7,
                  func=f,
                  verbose=True)
 # optimize
 x_best, X = adam_cv(grad,
                     x0,
                     C,
                     grad_reg,
                     max_iter=1000,
                     batch_size=10,
                     eta=2,
                     beta1=0.9,
                     beta2=0.9,
Example #14
    def __init__(self, We, params):

        lstm_layers_num = 1
        en_hidden_size = We.shape[1]
        self.eta = params.eta
        self.num_labels = params.num_labels
        self.en_hidden_size = en_hidden_size
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = params.lstm_layers_num
        self._train = None
        self._utter = None
        self.params = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []
        self.hos = []
        self.Cos = []

        encoderInputs = tensor.imatrix()
        decoderInputs, decoderTarget = tensor.imatrices(2)
        encoderMask, TF, decoderMask, decoderInputs0, diagonal = tensor.fmatrices(
            5)

        self.lookuptable = theano.shared(We)

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(self.de_hidden_size + 2 * en_hidden_size,
                                      self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        self.hidden_decode = theano.shared(name="Hidden to Decode",
                                           value=init_xavier_uniform(
                                               2 * en_hidden_size,
                                               self.de_hidden_size),
                                           borrow=True)

        self.hidden_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)

        #self.params += [self.linear, self.de_lookuptable, self.hidden_decode, self.hidden_bias]    #concatenate
        self.params += [
            self.linear, self.linear_bias, self.de_lookuptable,
            self.hidden_decode, self.hidden_bias
        ]  #the initial hidden state of decoder lstm is zeros
        #(max_sent_size, batch_size, hidden_size)
        state_below = self.lookuptable[encoderInputs.flatten()].reshape(
            (encoderInputs.shape[0], encoderInputs.shape[1],
             self.en_hidden_size))
        for _ in range(self.lstm_layers_num):

            enclstm_f = LSTM(self.en_hidden_size)
            enclstm_b = LSTM(self.en_hidden_size, True)
            self.encoder_lstm_layers.append(enclstm_f)  #append
            self.encoder_lstm_layers.append(enclstm_b)  #append
            self.params += enclstm_f.params + enclstm_b.params  #concatenate

            hs_f, Cs_f = enclstm_f.forward(state_below, encoderMask)
            hs_b, Cs_b = enclstm_b.forward(state_below, encoderMask)

            hs = tensor.concatenate([hs_f, hs_b], axis=2)
            Cs = tensor.concatenate([Cs_f, Cs_b], axis=2)
            hs0 = tensor.concatenate([hs_f[-1], hs_b[0]], axis=1)
            Cs0 = tensor.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
            #self.hos += tensor.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
            #self.Cos += tensor.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
            self.hos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            self.Cos += tensor.alloc(
                np.asarray(0., dtype=theano.config.floatX),
                encoderInputs.shape[1], self.de_hidden_size),
            state_below = hs

        Encoder = state_below
        Encoder_shuffle = Encoder.dimshuffle(1, 0, 2)
        ## Encoder_shuffle : B x L x h_e
        ##Encoder_shuffle_re : B x L x h_d
        Encoder_shuffle_re = tensor.dot(
            Encoder_shuffle, self.hidden_decode) + self.hidden_bias[None,
                                                                    None, :]

        state_below = self.de_lookuptable[decoderInputs.flatten()].reshape(
            (decoderInputs.shape[0], decoderInputs.shape[1],
             self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, decoderMask, ho, Co)

        ##### Here we include the representation from the decoder
        decoder_lstm_outputs = tensor.concatenate([state_below, Encoder],
                                                  axis=2)

        ei, di, dt = tensor.imatrices(3)  #place holders
        em, dm, tf, di0, diag = tensor.fmatrices(5)
        #####################################################
        #####################################################
        linear_outputs = tensor.dot(decoder_lstm_outputs,
                                    self.linear) + self.linear_bias[None,
                                                                    None, :]
        softmax_outputs, updates = theano.scan(
            fn=lambda x: tensor.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * tensor.log(pred[tensor.arange(encoderInputs.shape[1]),
                                        y])

        costs, _ = theano.scan(
            fn=_NLL, sequences=[softmax_outputs, decoderTarget, decoderMask])
        loss = costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)

        updates = lasagne.updates.adam(loss, self.params, self.eta)
        #updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

        ###################################################
        #### using the ground truth when training
        ##################################################
        self._train = theano.function(inputs=[ei, em, di, dm, dt],
                                      outputs=[loss, softmax_outputs],
                                      updates=updates,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs: di,
                                          decoderMask: dm,
                                          decoderTarget: dt
                                      })

        #########################################################################
        ### For schedule sampling
        #########################################################################

        ###### always use privous predict as next input
        def _step2(diag_, state_, hs_, Cs_):

            hs, Cs = [], []
            token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
            msk_ = tensor.fill(
                (tensor.zeros_like(token_idxs, dtype="float32")), 1)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, encoderInputs.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = tensor.as_tensor_variable(hs), tensor.as_tensor_variable(
                Cs)
            state_below0 = state_below0.reshape(
                (encoderInputs.shape[1], self.de_hidden_size))

            attn_index = tensor.nonzero(diag_, True)
            attn_value = tensor.nonzero_values(diag_)

            en_context = Encoder_shuffle[:, attn_index[0], :]
            attn_context = Encoder_shuffle_re[:, attn_index[0], :]

            attn_weight = tensor.batched_dot(attn_context, state_below0)
            attn_weight = tensor.nnet.softmax(attn_weight)
            #attn_weight *= (encoderMask.dimshuffle(1,0))

            attn_weight *= (attn_value.dimshuffle('x', 0))
            ##attn_weight = attn_weight/(tensor.sum(attn_weight, axis=1).dimshuffle(0,'x'))
            ####### ctx_ : (b, h)
            ctx_ = tensor.sum(en_context * attn_weight[:, :, None], axis=1)

            state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)
            newpred = tensor.dot(state_below0,
                                 self.linear) + self.linear_bias[None, :]
            state_below = tensor.nnet.softmax(newpred)
            ##### the begin symbol probability is 0
            extra_p = tensor.zeros_like(hs[:, :, 0])
            state_below = tensor.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        hs0, Cs0 = tensor.as_tensor_variable(
            self.hos, name="hs0"), tensor.as_tensor_variable(self.Cos,
                                                             name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[diagonal],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=encoderInputs.shape[0])

        train_predict = train_outputs[0]
        train_costs, _ = theano.scan(
            fn=_NLL, sequences=[train_predict, decoderTarget, decoderMask])

        train_loss = train_costs.sum() / decoderMask.sum() + params.L2 * sum(
            lasagne.regularization.l2(x) for x in self.params)
        from adam import adam
        train_updates = adam(train_loss, self.params, self.eta)
        #train_updates = lasagne.updates.apply_momentum(train_updates, self.params, momentum=0.9)

        self._train2 = theano.function(
            inputs=[ei, em, di0, dm, dt, diag],
            outputs=[train_loss, train_predict],
            updates=train_updates,
            givens={
                encoderInputs: ei,
                encoderMask: em,
                decoderInputs0: di0,
                decoderMask: dm,
                decoderTarget: dt,
                diagonal: diag
            }
            #givens={encoderInputs:ei, encoderMask:em, decoderInputs:di, decoderMask:dm, decoderTarget:dt, TF:tf}
        )

        listof_token_idx = train_predict.argmax(axis=-1)
        self._utter = theano.function(inputs=[ei, em, di0, diag],
                                      outputs=listof_token_idx,
                                      givens={
                                          encoderInputs: ei,
                                          encoderMask: em,
                                          decoderInputs0: di0,
                                          diagonal: diag
                                      })
Example #15
        anneal_rate = 0.00003

    tparams_net = OrderedDict()
    updates_bn = []
    for key, val in tparams.iteritems():
        if ('rmu' in key) or ('rvu' in key):
            continue
        elif 'rm' in key or 'rv' in key:
            updates_bn.append((tparams[key], tparams[key + 'u']))
        else:
            tparams_net[key] = val

    print "Setting up optimizer"
    f_grad_shared, f_update = adam(lr,
                                   tparams_net,
                                   grads,
                                   inps, [cost, xtranorm],
                                   ups=updates_bn)

    print "Training"
    cost_report = open(
        './Results/' + args.latent_type + '/' + args.estimator + '/training_' +
        code_name + '_' + str(args.batch_size) + '_' +
        str(args.learning_rate) + '.txt', 'w')
    id_order = range(len(trc))

    iters = 0
    cur_temp = temperature_init
    min_cost = 100000.0
    epoch = 0
    condition = False
Example #16
def exp_raw(dtype):
  shp = (None, 3, 256, 256)
  input_var = T.tensor4('input_var', dtype = 'float32')
  psp = T.dmatrix("psp")
  network = OrderedDict()
  network['input'] = lasagne.layers.InputLayer(shape = shp, input_var = input_var)
  # network = make_vgg16(network, 'model/vgg16_weights_from_caffe.h5')
  # First conv and segmentation part
  network['conv1_1'] = lasagne.layers.Conv2DLayer(network['input'],
    num_filters = 64, filter_size = (3, 3),nonlinearity = lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform())
  network['conv1_2'] = lasagne.layers.Conv2DLayer(network['conv1_1'],
    num_filters = 64, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_1'] = lasagne.layers.MaxPool2DLayer(network['conv1_2'], pool_size = (2, 2))
  network['norm1_1'] = lasagne.layers.BatchNormLayer(network['pool1_1'])

  network['conv1_3'] = lasagne.layers.Conv2DLayer(network['norm1_1'],
    num_filters = 128, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['conv1_4'] = lasagne.layers.Conv2DLayer(network['conv1_3'],
    num_filters = 128, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_2'] = lasagne.layers.MaxPool2DLayer(network['conv1_4'], pool_size = (2, 2))
  network['norm1_2'] = lasagne.layers.BatchNormLayer(network['pool1_2'])

  network['conv1_5'] = lasagne.layers.Conv2DLayer(network['norm1_2'],
    num_filters = 256, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_3'] = lasagne.layers.MaxPool2DLayer(network['conv1_5'], pool_size = (2, 2))

  network['conv1_6'] = lasagne.layers.Conv2DLayer(network['pool1_3'],
    num_filters = 256, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_4'] = lasagne.layers.MaxPool2DLayer(network['conv1_6'], pool_size = (2, 2))

  # Perspective Transform
  network['norm2'] = lasagne.layers.BatchNormLayer(network['pool1_4'])
  # network['cast'] = CastingLayer(network['norm2'], dtype)
  theano.config.floatX = dtype 
  network['pfc2_1'] = lasagne.layers.DenseLayer(
    lasagne.layers.dropout(network['norm2'], p = 0.05),
    num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
  network['pfc2_2'] = lasagne.layers.DenseLayer(
    lasagne.layers.dropout(network['pfc2_1'], p=0.05),
    num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
  network['pfc2_3'] = lasagne.layers.DenseLayer(
    lasagne.layers.dropout(network['pfc2_2'], p=0.05),
    num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
  # loss target 2
  network['pfc_out'] = lasagne.layers.DenseLayer(
    lasagne.layers.dropout(network['pfc2_3'], p = 0.05),
    num_units = 8, nonlinearity = lasagne.nonlinearities.rectify)
  theano.config.floatX = 'float32'

  predict = lasagne.layers.get_output(network['pfc_out'])
  loss = T.sqrt(lasagne.objectives.squared_error(predict, psp).mean())
  paras = lasagne.layers.get_all_params(network['pfc_out'], trainable = True)
  updates = adam(loss, paras, [theano.shared(np.float32(0.0001)) for i in range(len(paras))])
  ftrain = theano.function([input_var, psp], [loss, predict], updates = updates)

  def get_inputs(meta, batch, path):
    # batchidx = [keys[i] for i in batch]
    input = np.array([read_image(path + 'patch/' + idx + '.jpg', shape = (256, 256))
      for idx in batch]).astype(np.float32)
    seg = np.array([read_image(path + 'pmask/' + idx + '.jpg', shape = (256, 256))
      for idx in batch]).astype(np.float32)
    dat = [meta[key] for key in batch]
    Ps = np.array([np.array(dat[i][0]).flatten()[0 : 8] for i in range(len(batch))])
    for P in Ps:
      P[6 : 8] = (P[6 : 8] + 1e-3) * 1e4
    return input, Ps

  path = '/home/yancz/text_generator/data/real/'
  dat, meta = load_data(path, 10000, False)
  for epoch in range(10):
    loss = 0
    trs = 0
    for batch in iterate_minibatch(dat['train'], 32, len(dat['train'])):
      inputs = get_inputs(meta, batch, path)
      l, valp = ftrain(*inputs)
      log(l)
      print(valp)
      loss += l
      trs += 1
    loss /= trs
    log('loss ' + str(epoch) + ' ' + str(loss))
  return ftrain
Example #17
	def __init__(self,  We_initial, char_embedd_table_initial, params):

		We = theano.shared(We_initial)
                We_inf = theano.shared(We_initial)
        	embsize = We_initial.shape[1]
        	hidden = params.hidden

                input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, 17)).astype('float32')
                self.input_init = theano.shared(input_init)		


		input_var = T.imatrix(name='inputs')
        	target_var = T.imatrix(name='targets')
        	mask_var = T.fmatrix(name='masks')
		mask_var1 = T.fmatrix(name='masks1')
		length = T.iscalar()
		t_t = T.fscalar()		

		Wyy0 = np.random.uniform(-0.02, 0.02, (18, 18)).astype('float32')
                Wyy = theano.shared(Wyy0)

                char_input_var = T.itensor3()

                char_embedd_dim = params.char_embedd_dim
                char_dic_size = len(params.char_dic)
                char_embedd_table = theano.shared(char_embedd_table_initial)
              

                l_in_word = lasagne.layers.InputLayer((None, None))
                l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

		if params.emb ==1:
                        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word,  input_size= We_initial.shape[0] , output_size = embsize, W =We, name='word_embedding')
                else:
                        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

                layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                     input_var=char_input_var, name='char-input')

                layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
                layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table,
                                                             name='char_embedding')

                layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))


                # first get some necessary dimensions or parameters
                conv_window = 3
                num_filters = params.num_filters

                # construct convolution layer
                cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
                                           nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
                # infer the pool size for pooling (pool size should go through all time step of cnn)
                _, _, pool_size = cnn_layer.output_shape

                # construct max pool layer
                pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
                # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
                output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

                # finally, concatenate the two incoming layers together.
                incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)


		l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
        	l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards = True)

        	concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
		
		l_reshape_concat = lasagne.layers.ReshapeLayer(concat,(-1,2*hidden))

		l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units= 17, nonlinearity=lasagne.nonlinearities.linear)

		
		network_params = lasagne.layers.get_all_params(l_local, trainable=True)
                network_params.append(Wyy)

		
		#print len(network_params)
		f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle','r')
		data = pickle.load(f)
		f.close()

		for idx, p in enumerate(network_params):

                        p.set_value(data[idx])



                
	
		
		def inner_function( targets_one_step, mask_one_step,  prev_label, tg_energy):
                        """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """                 
                        new_ta_energy = T.dot(prev_label, Wyy[:-1,:-1])
                        new_ta_energy_t = tg_energy + T.sum(new_ta_energy*targets_one_step, axis =1)
			tg_energy_t = T.switch(mask_one_step, new_ta_energy_t,  tg_energy)

                        return [targets_one_step, tg_energy_t]


		local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input:char_input_var})
		local_energy = local_energy.reshape((-1, length, 17))
                local_energy = local_energy*mask_var[:,:,None]		

		#####################
		# for the end symbol of a sequence
		####################

		end_term = Wyy[:-1,-1]
                local_energy = local_energy + end_term.dimshuffle('x', 'x', 0)*mask_var1[:,:, None]

                predy_init = self.input_init[:,:length,:]

                a_params = [self.input_init]

                predy = T.nnet.softmax(predy_init.reshape((-1, 17)))
                predy = predy.reshape((-1, length, 17))

                prediction = T.argmax(predy_init, axis=2)

                predy = predy*mask_var[:,:,None]
		
		
		targets_shuffled = predy.dimshuffle(1, 0, 2)
                target_time0 = targets_shuffled[0]
		
		masks_shuffled = mask_var.dimshuffle(1, 0)		 

                initial_energy0 = T.dot(target_time0, Wyy[-1,:-1])


                initials = [target_time0, initial_energy0]
                [ _, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
                cost11 = target_energies[-1] + T.sum(T.sum(local_energy*predy, axis=2)*mask_var, axis=1)
	
				
		
		predy_f =  predy.reshape((-1, 17))
		y_f = target_var.flatten()

	
		
		cost = T.mean(-cost11)
                                    
		from adam import adam
                updates_a = adam(cost, a_params, params.eta)
					
		#updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
                #updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

		
                self.inf_fn = theano.function([input_var, char_input_var, mask_var, mask_var1, length], cost, updates = updates_a)


		#corr = T.eq(prediction, target_var)
        	#corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        	#num_tokens = mask_var.sum(dtype=theano.config.floatX)
        	

        	self.eval_fn = theano.function([input_var, char_input_var,  mask_var, mask_var1, length], prediction, on_unused_input='ignore')


                if params.WarmStart:

                                       hidden_inf= params.hidden_inf
                                       char_embedd_table_inf = theano.shared(char_embedd_table_initial)
                                      

                                       l_in_word_a = lasagne.layers.InputLayer((None, None))
                                       l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))

                                       l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a,  input_size= We_initial.shape[0] , output_size = embsize, W = We_inf, name='inf_word_embedding')


                                       layer_char_input_a = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length ),
                                                                                        input_var=char_input_var, name='char-input')

                                       layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
                                       layer_char_embedding_a = lasagne.layers.EmbeddingLayer(layer_char_a, input_size=char_dic_size,
                                                             output_size=char_embedd_dim, W=char_embedd_table_inf,
                                                             name='char_embedding')


                                       layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a, pattern=(0, 2, 1))


                                       # first get some necessary dimensions or parameters
                                       conv_window = 3
                                       num_filters = params.num_filters
                                       #_, sent_length, _ = incoming2.output_shape

                                       # construct convolution layer
                                       cnn_layer_a = lasagne.layers.Conv1DLayer(layer_char_a, num_filters=num_filters, filter_size=conv_window, pad='full',
                                                                     nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
                                       # infer the pool size for pooling (pool size should go through all time step of cnn)
                                       #_, _, pool_size = cnn_layer.output_shape

                                       # construct max pool layer
                                       pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size)
                                       # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
                                       output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1]))

                                       # finally, concatenate the two incoming layers together.
                                       l_emb_word_a = lasagne.layers.concat([output_cnn_layer_a, l_emb_word_a], axis=2)

                                       l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a)
                                       l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a, backwards = True)

                                       l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a ,(-1, hidden_inf))
                                       l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a ,(-1,hidden_inf))
                                       concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])

                                       l_local_a = lasagne.layers.DenseLayer(concat2_a, num_units= 17, nonlinearity=lasagne.nonlinearities.softmax)

                                       predy_inf = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var, layer_char_input_a:char_input_var})
                                       predy_inf = predy_inf.reshape((-1, length, 17))
                   
                                       a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)

                                       f = open('CRF_Inf_NER_.num_filters_50_dropout_1_LearningRate_0.001_1.0_emb_1_inf_0_hidden_200_annealing_0.pickle','r')
                                       data = pickle.load(f)
                                       f.close()


                                       for idx, p in enumerate(a_params):
                                               p.set_value(data[idx])

                                       self.start_fn = theano.function([input_var, char_input_var, mask_var, length], predy_inf, on_unused_input='ignore')