def train_and_test(args, print_config):

    assert args.conv_layer_n == len(args.filter_widths) == len(
        args.nkerns) == (len(args.L2_regs) - 2) == len(args.fold_flags) == len(
            args.ks)

    # \mod{dim, 2^{\sum fold_flags}} == 0
    assert args.embed_dm % (2**sum(args.fold_flags)) == 0
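    # each layer with fold enabled halves the embedding dimension, so
    # embed_dm must be divisible by 2^(number of folding layers)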

    ###################
    # get the data    #
    ###################
    datasets = load_data(args.corpus_path)

    train_set_x, train_set_y = datasets[0]
    dev_set_x, dev_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    word2index = datasets[3]
    index2word = datasets[4]
    pretrained_embeddings = datasets[5]

    n_train_batches = train_set_x.get_value(
        borrow=True).shape[0] / args.batch_size
    n_dev_batches = dev_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size
    n_test_batches = test_set_x.get_value(
        borrow=True).shape[0] / args.dev_test_batch_size
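    # Python 2 integer division: only complete minibatches are counted, so a
    # trailing partial batch is simply skipped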

    train_sent_len = train_set_x.get_value(borrow=True).shape[1]
    possible_labels = set(train_set_y.get_value().tolist())

    if args.use_pretrained_embedding:
        args.embed_dm = pretrained_embeddings.get_value().shape[1]

    ###################################
    # Symbolic variable definition    #
    ###################################
    x = T.imatrix('x')  # the word indices matrix
    y = T.ivector('y')  # the sentiment labels

    batch_index = T.iscalar('batch_index')

    rng = np.random.RandomState(1234)

    ###############################
    # Construction of the network #
    ###############################
    # Layer 1, the embedding layer
    layer1 = WordEmbeddingLayer(
        rng,
        input=x,
        vocab_size=len(word2index),
        embed_dm=args.embed_dm,
        embeddings=(pretrained_embeddings
                    if args.use_pretrained_embedding else None))

    dropout_layers = [layer1]
    layers = [layer1]

    for i in range(args.conv_layer_n):
        fold_flag = args.fold_flags[i]

        # for the dropout layer
        dpl = DropoutLayer(input=dropout_layers[-1].output,
                           rng=rng,
                           dropout_rate=args.dropout_rates[0])
        next_layer_dropout_input = dpl.output
        next_layer_input = layers[-1].output

        # for the conv layer
        filter_shape = (args.nkerns[i], (1 if i == 0 else args.nkerns[i - 1]),
                        1, args.filter_widths[i])

        k = args.ks[i]

        print "For conv layer(%s) %d, filter shape = %r, k = %d, dropout_rate = %f and normalized weight init: %r and fold: %d" % (
            args.conv_activation_unit, i + 2, filter_shape, k,
            args.dropout_rates[i], args.norm_w, fold_flag)

        # we build two layers feeding two separate paths:
        # one for training (with dropout),
        # the other for prediction (the weight-averaged model)

        dropout_conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_dropout_input,
            filter_shape=filter_shape,
            k=k,
            norm_w=args.norm_w,
            fold=fold_flag,
            activation=args.conv_activation_unit)

        # for prediction
        # sharing weight with dropout layer
        conv_layer = ConvFoldingPoolLayer(
            rng,
            input=next_layer_input,
            filter_shape=filter_shape,
            k=k,
            activation=args.conv_activation_unit,
            fold=fold_flag,
            W=dropout_conv_layer.W *
            (1 - args.dropout_rates[i]),  # model averaging
            b=dropout_conv_layer.b)

        dropout_layers.append(dropout_conv_layer)
        layers.append(conv_layer)

    # last, the output layer
    # both dropout and without dropout
    if sum(args.fold_flags) > 0:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm / (2**sum(
            args.fold_flags))
    else:
        n_in = args.nkerns[-1] * args.ks[-1] * args.embed_dm

    print "For output layer, n_in = %d, dropout_rate = %f" % (
        n_in, args.dropout_rates[-1])

    dropout_output_layer = LogisticRegression(
        rng,
        input=dropout_layers[-1].output.flatten(2),
        n_in=n_in,  # already divided by 2^(number of folds) above
        n_out=len(possible_labels)  # number of sentiment levels (e.g. five)
    )

    output_layer = LogisticRegression(
        rng,
        input=layers[-1].output.flatten(2),
        n_in=n_in,
        n_out=len(possible_labels),
        W=dropout_output_layer.W *
        (1 - args.dropout_rates[-1]),  # sharing the parameters, don't forget
        b=dropout_output_layer.b)
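    # Note on the shared weights above: the prediction-path layers reuse the
    # dropout-path weights scaled by (1 - dropout_rate). During training each
    # input unit is present only with probability (1 - dropout_rate), so
    # scaling the weights at prediction time keeps the expected pre-activation
    # the same (the standard weight-scaling approximation to dropout model
    # averaging).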

    dropout_layers.append(dropout_output_layer)
    layers.append(output_layer)

    ###############################
    # Error and cost              #
    ###############################
    # note: the training cost uses the dropout model, the error the averaged model
    dropout_cost = dropout_output_layer.nnl(y)
    errors = output_layer.errors(y)

    def prepare_L2_sqr(param_layers, L2_regs):
        assert len(L2_regs) == len(param_layers)
        return T.sum([
            L2_reg / 2 *
            ((layer.W if hasattr(layer, "W") else layer.embeddings)**2).sum()
            for L2_reg, layer in zip(L2_regs, param_layers)
        ])
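    # each layer contributes (L2_reg / 2) * ||W||^2 to the penalty, where W is
    # the layer's weight matrix (or the embedding matrix for the first layer)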

    L2_sqr = prepare_L2_sqr(dropout_layers, args.L2_regs)
    L2_sqr_no_ebd = prepare_L2_sqr(dropout_layers[1:], args.L2_regs[1:])

    if args.use_L2_reg:
        cost = dropout_cost + L2_sqr
        cost_no_ebd = dropout_cost + L2_sqr_no_ebd
    else:
        cost = dropout_cost
        cost_no_ebd = dropout_cost

    ###############################
    # Parameters to be used       #
    ###############################
    print "Delay embedding learning by %d epochs" % (
        args.embedding_learning_delay_epochs)

    print "param_layers: %r" % dropout_layers
    param_layers = dropout_layers

    ##############################
    # Parameter Update           #
    ##############################
    print "Using AdaDelta with rho = %f and epsilon = %f" % (args.rho,
                                                             args.epsilon)

    params = [param for layer in param_layers for param in layer.params]
    param_shapes = [
        param for layer in param_layers for param in layer.param_shapes
    ]

    param_grads = [T.grad(cost, param) for param in params]

    # AdaDelta parameter update
    # E[g^2]
    # initialized to zero
    egs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Eg:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    # E[\delta x^2], initialized to zero
    exs = [
        theano.shared(value=np.zeros(param_shape, dtype=theano.config.floatX),
                      borrow=True,
                      name="Ex:" + param.name)
        for param_shape, param in zip(param_shapes, params)
    ]

    new_egs = [
        args.rho * eg + (1 - args.rho) * g**2
        for eg, g in zip(egs, param_grads)
    ]

    delta_x = [
        -(T.sqrt(ex + args.epsilon) / T.sqrt(new_eg + args.epsilon)) * g
        for new_eg, ex, g in zip(new_egs, exs, param_grads)
    ]

    new_exs = [
        args.rho * ex + (1 - args.rho) * (dx**2)
        for ex, dx in zip(exs, delta_x)
    ]

    egs_updates = zip(egs, new_egs)
    exs_updates = zip(exs, new_exs)
    param_updates = [(p, p + dx)
                     for dx, g, p in zip(delta_x, param_grads, params)]

    updates = egs_updates + exs_updates + param_updates
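    # The three groups of updates above implement AdaDelta (Zeiler, 2012):
    #   E[g^2]_t   = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
    #   dx_t       = -sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #   E[dx^2]_t  = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
    #   x_{t+1}    = x_t + dx_t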

    # updates WITHOUT embedding
    # exclude the embedding parameter
    egs_updates_no_ebd = zip(egs[1:], new_egs[1:])
    exs_updates_no_ebd = zip(exs[1:], new_exs[1:])
    param_updates_no_ebd = [
        (p, p + dx) for dx, g, p in zip(delta_x, param_grads, params)[1:]
    ]
    updates_no_emb = egs_updates_no_ebd + exs_updates_no_ebd + param_updates_no_ebd
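    # the [1:] slicing assumes the embedding layer is the first entry of
    # `param_layers` and contributes exactly one parameter, so dropping the
    # first element of each list leaves the embedding untouched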

    def make_train_func(cost, updates):
        return theano.function(
            inputs=[batch_index],
            outputs=[cost],
            updates=updates,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })
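    # `givens` substitutes slices of the shared training data (indexed by
    # batch_index) for the symbolic x and y, so minibatches are taken from
    # data that already lives on the device instead of being fed in per call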

    train_model_no_ebd = make_train_func(cost_no_ebd, updates_no_emb)
    train_model = make_train_func(cost, updates)

    def make_error_func(x_val, y_val):
        return theano.function(
            inputs=[],
            outputs=errors,
            givens={
                x: x_val,
                y: y_val
            },
        )

    dev_error = make_error_func(dev_set_x, dev_set_y)

    test_error = make_error_func(test_set_x, test_set_y)

    #############################
    # Debugging purpose code    #
    #############################
    # PARAMETER TUNING NOTE:
    # some demonstration of the gradient vanishing problem

    train_data_at_index = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
    }

    train_data_at_index_with_y = {
        x:
        train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size],
        y:
        train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                    args.batch_size]
    }

    if print_config["nnl"]:
        get_nnl = theano.function(
            inputs=[batch_index],
            outputs=dropout_cost,
            givens={
                x:
                train_set_x[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size],
                y:
                train_set_y[batch_index * args.batch_size:(batch_index + 1) *
                            args.batch_size]
            })

    if print_config["L2_sqr"]:
        get_L2_sqr = theano.function(inputs=[], outputs=L2_sqr)

        get_L2_sqr_no_ebd = theano.function(inputs=[], outputs=L2_sqr_no_ebd)

    if print_config["grad_abs_mean"]:
        print_grads = theano.function(
            inputs=[],
            outputs=[
                theano.printing.Print(param.name)(T.mean(T.abs_(param_grad)))
                for param, param_grad in zip(params, param_grads)
            ],
            givens={
                x: train_set_x,
                y: train_set_y
            })

    activations = [l.output for l in dropout_layers[1:-1]]
    weight_grads = [T.grad(cost, l.W) for l in dropout_layers[1:-1]]

    if print_config["activation_hist"]:
        # turn into 1D array
        get_activations = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_hist"]:
        # turn into 1D array
        get_weight_grads = theano.function(
            inputs=[batch_index],
            outputs=[val.flatten(1) for val in weight_grads],
            givens=train_data_at_index_with_y)

    if print_config["activation_tracking"]:
        # get the mean and variance of activations for each conv layer

        get_activation_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(val) for val in activations],
            givens=train_data_at_index)

        get_activation_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(val) for val in activations],
            givens=train_data_at_index)

    if print_config["weight_grad_tracking"]:
        # get the mean and variance of activations for each conv layer
        get_weight_grad_mean = theano.function(
            inputs=[batch_index],
            outputs=[T.mean(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

        get_weight_grad_std = theano.function(
            inputs=[batch_index],
            outputs=[T.std(g) for g in weight_grads],
            givens=train_data_at_index_with_y)

    # the training loop
    patience = args.patience  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = np.inf
    best_iter = 0

    start_time = time.clock()
    done_looping = False
    epoch = 0

    nnls = []
    L2_sqrs = []

    activation_means = [[] for i in range(args.conv_layer_n)]
    activation_stds = [[] for i in range(args.conv_layer_n)]
    weight_grad_means = [[] for i in range(args.conv_layer_n)]
    weight_grad_stds = [[] for i in range(args.conv_layer_n)]
    activation_hist_data = [[] for i in range(args.conv_layer_n)]
    weight_grad_hist_data = [[] for i in range(args.conv_layer_n)]

    train_errors = []
    dev_errors = []
    try:
        print "validation_frequency = %d" % validation_frequency
        while (epoch < args.n_epochs):
            epoch += 1
            print "At epoch {0}".format(epoch)

            if epoch == (args.embedding_learning_delay_epochs + 1):
                print "########################"
                print "Start training embedding"
                print "########################"

            # shuffle the training data
            train_set_x_data = train_set_x.get_value(borrow=True)
            train_set_y_data = train_set_y.get_value(borrow=True)

            permutation = np.random.permutation(
                train_set_x.get_value(borrow=True).shape[0])

            train_set_x.set_value(train_set_x_data[permutation])
            train_set_y.set_value(train_set_y_data[permutation])
            for minibatch_index in range(n_train_batches):
                if epoch >= (args.embedding_learning_delay_epochs + 1):
                    train_cost = train_model(minibatch_index)
                else:
                    train_cost = train_model_no_ebd(minibatch_index)

                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:

                    # train_error_val = np.mean([train_error(i)
                    #                            for i in range(n_train_batches)])
                    dev_error_val = dev_error()

                    # print "At epoch %d and minibatch %d. \nTrain error %.2f%%\nDev error %.2f%%\n" %(
                    #     epoch,
                    #     minibatch_index,
                    #     train_error_val * 100,
                    #     dev_error_val * 100
                    # )

                    print "At epoch %d and minibatch %d. \nDev error %.2f%%\n" % (
                        epoch, minibatch_index, dev_error_val * 100)

                    # train_errors.append(train_error_val)
                    dev_errors.append(dev_error_val)

                    if dev_error_val < best_validation_loss:
                        best_iter = iter
                        #improve patience if loss improvement is good enough
                        if dev_error_val < best_validation_loss *  \
                           improvement_threshold:
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = dev_error_val

                        test_error_val = test_error()

                        print(('     epoch %i, minibatch %i/%i, test error of'
                               ' best dev error %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_error_val * 100.))

                        print "Dumping model to %s" % (args.model_path)
                        dump_params(params, args.model_path)

                if (minibatch_index + 1) % 50 == 0 or \
                   minibatch_index == n_train_batches - 1:
                    print "%d / %d minibatches completed" % (
                        minibatch_index + 1, n_train_batches)
                    if print_config["nnl"]:
                        print "`nnl` for the past 50 minibatches is %f" % (
                            np.mean(np.array(nnls)))
                        nnls = []
                    if print_config["L2_sqr"]:
                        print "`L2_sqr` for the past 50 minibatches is %f" % (
                            np.mean(np.array(L2_sqrs)))
                        L2_sqrs = []

                ##################
                # Plotting stuff #
                ##################
                if print_config["nnl"]:
                    nnl = get_nnl(minibatch_index)
                    # print "nll for batch %d: %f" %(minibatch_index, nnl)
                    nnls.append(nnl)

                if print_config["L2_sqr"]:
                    if epoch >= (args.embedding_learning_delay_epochs + 1):
                        L2_sqrs.append(get_L2_sqr())
                    else:
                        L2_sqrs.append(get_L2_sqr_no_ebd())

                if print_config["activation_tracking"]:
                    layer_means = get_activation_mean(minibatch_index)
                    layer_stds = get_activation_std(minibatch_index)
                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            activation_means, activation_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["weight_grad_tracking"]:
                    layer_means = get_weight_grad_mean(minibatch_index)
                    layer_stds = get_weight_grad_std(minibatch_index)

                    for layer_ms, layer_ss, layer_m, layer_s in zip(
                            weight_grad_means, weight_grad_stds, layer_means,
                            layer_stds):
                        layer_ms.append(layer_m)
                        layer_ss.append(layer_s)

                if print_config["activation_hist"]:
                    for layer_hist, layer_data in zip(
                            activation_hist_data,
                            get_activations(minibatch_index)):
                        layer_hist += layer_data.tolist()

                if print_config["weight_grad_hist"]:
                    for layer_hist, layer_data in zip(
                            weight_grad_hist_data,
                            get_weight_grads(minibatch_index)):
                        layer_hist += layer_data.tolist()

    except:
        import traceback
        traceback.print_exc(file=sys.stdout)
    finally:
        from plot_util import (plot_hist, plot_track, plot_error_vs_epoch, plt)

        if print_config["activation_tracking"]:
            plot_track(activation_means, activation_stds,
                       "activation_tracking")

        if print_config["weight_grad_tracking"]:
            plot_track(weight_grad_means, weight_grad_stds,
                       "weight_grad_tracking")

        if print_config["activation_hist"]:
            plot_hist(activation_hist_data, "activation_hist")

        if print_config["weight_grad_hist"]:
            plot_hist(weight_grad_hist_data, "weight_grad_hist")

        if print_config["error_vs_epoch"]:
            train_errors = [0] * len(dev_errors)
            ax = plot_error_vs_epoch(
                train_errors,
                dev_errors,
                title=('Best dev score: %f %% '
                       ' at iter %i with test error %f %%') %
                (best_validation_loss * 100., best_iter + 1,
                 test_error_val * 100.))
        if not args.task_signature:
            plt.show()
        else:
            plt.savefig("plots/" + args.task_signature + ".png")

    end_time = time.clock()

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_error_val * 100.))

    # save the result
    with open(args.output, "a") as f:
        f.write("%s\t%f\t%f\n" %
                (args.task_signature, best_validation_loss, test_error_val))

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
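
# A minimal, hypothetical driver for train_and_test. The attribute and
# config-key names below are the ones the function reads above; the concrete
# values, the argparse.Namespace container and the corpus path are
# illustrative assumptions only, not part of the original example.
from argparse import Namespace

args = Namespace(
    corpus_path="data/sst.pkl",            # hypothetical pickled dataset
    batch_size=10, dev_test_batch_size=100,
    use_pretrained_embedding=False, embed_dm=48,
    conv_layer_n=2,
    filter_widths=[7, 5], nkerns=[6, 14], ks=[6, 4],
    fold_flags=[0, 1],
    L2_regs=[1e-4, 3e-4, 3e-4, 1e-4],      # embedding, two conv layers, output
    dropout_rates=[0.2, 0.5, 0.5],
    norm_w=True, conv_activation_unit="tanh", use_L2_reg=True,
    embedding_learning_delay_epochs=4,
    rho=0.95, epsilon=1e-6,
    patience=10000, n_epochs=30,
    model_path="models/dcnn.pkl", task_signature="dcnn-demo",
    output="results.tsv",
)

# all diagnostic printing/plotting switched off
print_config = dict.fromkeys(
    ["nnl", "L2_sqr", "grad_abs_mean",
     "activation_hist", "weight_grad_hist",
     "activation_tracking", "weight_grad_tracking",
     "error_vs_epoch"],
    False)

train_and_test(args, print_config)
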
Example #4
class RNTN(object):
    """
    Recursive Neural Tensor Network architecture
    """
    def __init__(self, x, y, vocab_size, embed_dim, label_n):
        """
        x: theano.tensor.imatrix, (minibatch size, 3)
            the tree matrix of the minibatch
            for each row, (node id, left child id, right child id)

        y: theano.tensor.ivector, (minibatch size,)
            the labels

        vocab_size: int
            vocabulary size, including both the words and phrases
        
        embed_dim: int
            the embedding dimension

        """
        assert x.ndim == 2
        assert y.ndim == 1

        parent_ids = x[:, 0]
        children_ids = x[:, 1:]

        rng = np.random.RandomState(1234)

        self.embedding = theano.shared(
            value=rng.normal(0, 0.05, (vocab_size, embed_dim)),
            name='embedding',
            borrow=True,
        )

        self.rntn_layer = RNTNLayer(rng, embed_dim)

        # Update the embedding by propagating the embeddings from the
        # bottom of each tree upwards, producing a vector for every node.

        def update_embedding(child_indices, my_index, embedding):

            assert child_indices.ndim == 1
            assert my_index.ndim == 0

            # NOTE: not using all() because it's non-differentiable
            return T.switch(
                T.eq(child_indices[0], -1),
                # if the node has no child, return the embedding matrix unchanged
                embedding,
                # otherwise, overwrite this node's row with the RNTN layer output
                T.set_subtensor(
                    embedding[my_index],
                    self.rntn_layer.output(embedding[child_indices[0]],
                                           embedding[child_indices[1]])))

        final_embedding, updates = theano.scan(
            fn=update_embedding,
            sequences=[children_ids, parent_ids],
            # pass the whole matrix and fill in the positions as necessary
            outputs_info=self.embedding,
        )
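        # the scan visits the minibatch rows in order (assumed bottom-up): a
        # leaf row (child id -1) passes the embedding matrix through
        # unchanged, while an internal node overwrites its own row with the
        # RNTN composition of its two children, so later steps can read
        # embeddings written by earlier ones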

        self.update_embedding = theano.function(
            inputs=[x],
            updates=[(self.embedding,
                      T.set_subtensor(self.embedding[parent_ids],
                                      final_embedding[-1][parent_ids]))])

        # the logistic regression layer that predicts the label
        self.logreg_layer = LogisticRegression(
            rng,
            input=final_embedding[-1][parent_ids],
            n_in=embed_dim,
            n_out=label_n)

        cost = self.logreg_layer.nnl(y)

        params = self.logreg_layer.params + self.rntn_layer.params + [
            self.embedding
        ]
        self.params = params

        param_shapes = self.logreg_layer.param_shapes + self.rntn_layer.param_shapes + [
            (vocab_size, embed_dim)
        ]

        grads = [T.grad(cost=cost, wrt=p) for p in params]

        updates = build_adadelta_updates(params,
                                         param_shapes,
                                         grads,
                                         epsilon=0.1)

        # TODO: in this step, forward propagation is done again besides the one in `update_embedding`
        #       this extra computation should be avoided
        self.train = theano.function(inputs=[x, y], updates=updates)
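
# Hypothetical usage sketch for the RNTN class above. The tiny tree, the
# sizes and the labels are invented for illustration; only the
# (node id, left child id, right child id) row layout comes from the
# docstring.
import numpy as np
import theano.tensor as T

x = T.imatrix('x')
y = T.ivector('y')
model = RNTN(x, y, vocab_size=8, embed_dim=6, label_n=5)

# two leaves (ids 0 and 1, children marked -1) followed by their parent
# phrase node (id 4), listed bottom-up as the scan expects
tree = np.array([[0, -1, -1],
                 [1, -1, -1],
                 [4,  0,  1]], dtype=np.int32)
labels = np.array([2, 2, 3], dtype=np.int32)

model.update_embedding(tree)  # cache the node embeddings for this tree
model.train(tree, labels)     # one AdaDelta step on the classification loss
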
Example #6
np_l = LogisticRegression(W, b)

#########################
# THEANO PART
#########################

x_symbol = theano.tensor.dmatrix('x')
y_symbol = theano.tensor.ivector('y')

th_l = TheanoLogisticRegression(rng=np.random.RandomState(1234),
                                input=x_symbol,
                                n_in=10,
                                n_out=5,
                                W=theano.shared(value=W, name="W"),
                                b=theano.shared(value=b, name="b"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=th_l.nnl(y_symbol))

actual = np_l.nnl(x, y)
expected = f1(x, y)

assert_matrix_eq(actual, expected, "nnl")

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=th_l.errors(y_symbol))

actual = np_l.errors(x, y)
expected = f2(x, y)

assert_matrix_eq(actual, expected, "errors")
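
# The two checks above confirm that the Theano implementation reproduces the
# NumPy reference for both the negative log-likelihood and the error rate.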
Example #7
expected = (1, feat_map_n, EMB_DIM / 2, k)
assert out.shape == expected, "%r != %r" % (out.shape, expected)

##### Test Part Three ###############
# LogisticRegressionLayer
#################################

print "############# LogisticRegressionLayer ##############"

l3 = LogisticRegression(
    rng,
    input=l2.output.flatten(2),
    n_in=feat_map_n * k * EMB_DIM / 2,  # we fold once, so divide by 2
    n_out=5  # five sentiment level
)

print "n_in = %d" % (2 * 2 * math.ceil(EMB_DIM / 2.))

y = T.ivector('y')  # the sentence sentiment label

p_y_given_x = theano.function(inputs=[x],
                              outputs=l3.p_y_given_x,
                              mode="DebugMode")

print "p_y_given_x = "
print p_y_given_x(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32))

cost = theano.function(inputs=[x, y], outputs=l3.nnl(y), mode="DebugMode")

print "cost:\n", cost(np.array([[1, 3, 4, 5], [0, 1, 4, 7]], dtype=np.int32),
                      np.array([1, 2], dtype=np.int32))
Example #8
                              W=theano.shared(value=W, name="W"),
                              b=theano.shared(value=b, name="b"))

n_in = filter_shape[0] * k * embed_dm / 2
n_out = 5
W_logreg = np.asarray(np.random.rand(n_in, n_out), dtype=theano.config.floatX)
b_logreg = np.asarray(np.random.rand(n_out), dtype=theano.config.floatX)

layer3 = LogisticRegression(rng=rng,
                            input=layer2.output.flatten(2),
                            n_in=n_in,
                            n_out=n_out,
                            W=theano.shared(value=W_logreg, name="W_logreg"),
                            b=theano.shared(value=b_logreg, name="b_logreg"))

f1 = theano.function(inputs=[x_symbol, y_symbol], outputs=layer3.nnl(y_symbol))

f2 = theano.function(inputs=[x_symbol, y_symbol],
                     outputs=layer3.errors(y_symbol))

f3 = theano.function(inputs=[x_symbol], outputs=layer3.p_y_given_x)

f_el = theano.function(inputs=[x_symbol], outputs=layer1.output)

f_cl = theano.function(inputs=[x_symbol], outputs=layer2.output)

#########################
# NUMPY PART            #
#########################

Example #9
def main():

    print "############# Load Datasets ##############"

    import stanfordSentimentTreebank as sst

    skip_unknown_words = bool(args.get("--skip"))
    shuffle_flag = bool(args.get("--shuffle"))
    datatype = args.get("--datatype")
    if datatype == 5:
        # Fine-grained 5-class
        n_class = 5
    elif datatype == 2:
        # Binary 2-class
        n_class = 2

    # print "skip_unknown_words",skip_unknown_words
    vocab, index2word, datasets, datasets_all_sentences, funcs = sst.load_stanfordSentimentTreebank_dataset(
        normalize=True, skip_unknown_words=skip_unknown_words, datatype=datatype
    )
    train_set, test_set, dev_set = datasets
    train_set_sentences, test_set_sentences, dev_set_sentences = datasets_all_sentences
    get, sentence2ids, ids2sentence = funcs  # load the helper functions
    scores, sentences = zip(*train_set_sentences)
    sentences = [[word for word in sentence.lower().split()] for sentence in sentences]
    vocab_size = len(vocab)

    dev_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in dev_set])
    test_unknown_count = sum([unknown_word_count for score, (ids, unknown_word_count) in test_set])

    train_set = [(score, ids) for score, (ids, unknown_word_count) in train_set]
    test_set = [(score, ids) for score, (ids, unknown_word_count) in test_set]
    dev_set = [(score, ids) for score, (ids, unknown_word_count) in dev_set]

    print "train_size : ", len(train_set)
    print "dev_size   : ", len(dev_set)
    print "test_size  : ", len(test_set)
    print "-" * 30
    print "vocab_size: ", len(vocab)
    print "dev_unknown_words  : ", dev_unknown_count
    print "test_unknown_words : ", test_unknown_count

    print args

    # EMB_DIM = 50
    EMB_DIM = args.get("--emb_size")
    vocab_size = len(vocab)

    feat_map_n_1 = args.get("--feat_map_n_1")
    feat_map_n_final = args.get("--feat_map_n_final")

    height = 1
    width1 = args.get("--width1")
    width2 = args.get("--width2")
    k_top = args.get("--k_top")
    alpha = args.get("--alpha")
    n_epoch = args.get("--n_epoch")
    dropout_rate0 = args.get("--dropout_rate0")
    dropout_rate1 = args.get("--dropout_rate1")
    dropout_rate2 = args.get("--dropout_rate2")
    activation = args.get("--activation")
    learn = args.get("--learn")
    number_of_convolutinal_layer = 2

    pretrain = args.get("--pretrain")
    if pretrain == "word2vec":
        print "*Using word2vec"
        embeddings_W, model = pretrained_embedding.use_word2vec(
            sentences=sentences, index2word=index2word, emb_dim=EMB_DIM
        )
        # initialized in the range -0.5 to 0.5
    elif pretrain == "glove":
        print "*Using glove"
        embeddings_W = pretrained_embedding.use_glove(
            sentences=sentences,
            index2word=index2word,
            emb_dim=EMB_DIM,
            model_file="glove_model/glove_50_iter2900.model",
        )
    else:
        embeddings_W = np.asarray(rng.normal(0, 0.05, size=(vocab_size, EMB_DIM)), dtype=theano.config.floatX)
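        # zero out row 0, presumably reserved for the padding / unknown token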
        embeddings_W[0, :] = 0

    print np.amax(embeddings_W)
    print np.amin(embeddings_W)
    # print "*embeddings"
    print embeddings_W
    # print bool(embeddings)

    # input_x = [1, 3, 4, 5, 0, 22, 4, 5]

    print "############# Model Setting ##############"
    x = T.imatrix("x")
    length_x = T.iscalar("length_x")
    y = T.ivector("y")  # the sentence sentiment label
    embeddings = WordEmbeddingLayer(rng=rng, input=x, vocab_size=vocab_size, embed_dm=EMB_DIM, embeddings=embeddings_W)

    def dropout(X, p=0.5):
        if p > 0:
            retain_prob = 1 - p
            X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
            # X /= retain_prob
        return X
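    # Note: the mask is applied without the inverted-dropout rescale
    # (`X /= retain_prob` is left commented out); instead, the *_no_dropout
    # prediction layers below reuse the same weights scaled by (1 - p),
    # i.e. the classic dropout model-averaging scheme.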

    # number_of_convolutinal_layer = theano.shared(number_of_convolutinal_layer)
    # dynamic_func = theano.function(inputs=[length_x], outputs=number_of_convolutinal_layer * length_x)

    # dynamic_func_test = theano.function(
    #     inputs = [length_x],
    #     outputs = dynamic_func(length_x),
    #     )
    # print dynamic_func(len([1,2,3]))

    l1 = DynamicConvFoldingPoolLayer(
        rng,
        input=dropout(embeddings.output, p=dropout_rate0),
        filter_shape=(feat_map_n_1, 1, height, width1),  # feat_map_n_1 feature maps over 1 input channel, height 1, filter width width1
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=1,
        length_x=length_x,
        activation=activation,
    )
    l1_no_dropout = DynamicConvFoldingPoolLayer(
        rng,
        input=embeddings.output,
        W=l1.W * (1 - dropout_rate0),
        b=l1.b,
        filter_shape=(feat_map_n_1, 1, height, width1),  # feat_map_n_1 feature maps over 1 input channel, height 1, filter width width1
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=1,
        length_x=length_x,
        activation=activation,
    )

    l2 = DynamicConvFoldingPoolLayer(
        rng,
        input=dropout(l1.output, p=dropout_rate1),
        filter_shape=(feat_map_n_final, feat_map_n_1, height, width2),
        # feat_map_n_final feature maps over feat_map_n_1 input maps, height 1, filter width width2
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=2,
        length_x=length_x,
        activation=activation,
    )
    l2_no_dropout = DynamicConvFoldingPoolLayer(
        rng,
        input=l1_no_dropout.output,
        W=l2.W * (1 - dropout_rate1),
        b=l2.b,
        filter_shape=(feat_map_n_final, feat_map_n_1, height, width2),
        # feat_map_n_final feature maps over feat_map_n_1 input maps, height 1, filter width width2
        k_top=k_top,
        number_of_convolutinal_layer=number_of_convolutinal_layer,
        index_of_convolitonal_layer=2,
        length_x=length_x,
        activation=activation,
    )

    # l2_output = theano.function(
    #     inputs = [x,length_x],
    #     outputs = l2.output,
    #     # on_unused_input='ignore'
    # )

    # TODO:
    # check the dimension
    # input: 1 x 1 x 6 x 4
    # out = l2_output(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    # )

    # test = theano.function(
    #     inputs = [x],
    #     outputs = embeddings.output,
    # )

    # print "--input--"
    # print np.array([input_x], dtype = np.int32).shape
    # print "--input embeddings--"
    # a = np.array([input_x], dtype = np.int32)
    # print test(a).shape
    # print "-- output --"
    # print out
    # print out.shape

    # x = T.dscalar("x")
    # b = T.dscalar("b")
    # a = 1
    # f = theano.function(inputs=[x,b], outputs=b * x + a)
    # print f(2,2)

    # expected = (1, feat_map_n, EMB_DIM / 2, k)
    # assert out.shape == expected, "%r != %r" %(out.shape, expected)

    ##### Test Part Three ###############
    # LogisticRegressionLayer
    #################################

    # print "############# LogisticRegressionLayer ##############"

    l_final = LogisticRegression(
        rng,
        input=dropout(l2.output.flatten(2), p=dropout_rate2),
        n_in=feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out=n_class,  # number of sentiment classes (5 fine-grained or 2 binary)
    )

    l_final_no_dropout = LogisticRegression(
        rng,
        input=l2_no_dropout.output.flatten(2),
        W=l_final.W * (1 - dropout_rate2),
        b=l_final.b,
        n_in=feat_map_n_final * k_top * EMB_DIM,
        # n_in = feat_map_n * k * EMB_DIM / 2, # we fold once, so divide by 2
        n_out=n_class,  # number of sentiment classes (5 fine-grained or 2 binary)
    )

    print "n_in : ", feat_map_n_final * k_top * EMB_DIM
    # print "n_in = %d" %(2 * 2 * math.ceil(EMB_DIM / 2.))

    # p_y_given_x = theano.function(
    #     inputs = [x, length_x],
    #     outputs = l_final.p_y_given_x,
    #     allow_input_downcast=True,
    #     # mode = "DebugMode"
    # )

    # print "p_y_given_x = "
    # print p_y_given_x(
    #     np.array([input_x], dtype=np.int32),
    #     len(input_x)
    # )

    # Compiled cost function, kept for quick inspection; the symbolic cost used
    # for training is defined in the learning section below.
    cost_fn = theano.function(
        inputs=[x, length_x, y],
        outputs=l_final.nnl(y),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )

    # print "cost:\n", cost_fn(
    #     np.array([input_x], dtype = np.int32),
    #     len(input_x),
    #     np.array([1], dtype = np.int32)
    # )

    print "############# Learning ##############"
    layers = []
    layers.append(embeddings)
    layers.append(l1)
    layers.append(l2)
    layers.append(l_final)

    cost = l_final.nnl(y)

    params = [p for layer in layers for p in layer.params]
    # flattened to match the flattened params list (currently unused below)
    param_shapes = [shape for layer in layers for shape in layer.param_shapes]
    param_grads = [T.grad(cost, param) for param in params]

    def sgd(cost, params, lr=0.05):
        grads = [T.grad(cost, param) for param in params]
        updates = []
        for p, g in zip(params, grads):
            updates.append([p, p - g * lr])
        return updates
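    # e.g. sgd(cost, params, lr=0.05) yields [(p, p - 0.05 * dC/dp), ...],
    # which theano.function applies as in-place shared-variable updates.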

    from sgd import rmsprop, adagrad, adadelta, adam

    # updates = sgd(cost, l_final.params)

    # print param_grads
    if learn == "sgd":
        updates = sgd(cost, params, lr=0.05)
    elif learn == "adam":
        updates = adam(loss_or_grads=cost, params=params, learning_rate=alpha)
    elif learn == "adagrad":
        updates = adagrad(loss_or_grads=cost, params=params, learning_rate=alpha)
    elif learn == "adadelta":
        updates = adadelta(loss_or_grads=cost, params=params)
    elif learn == "rmsprop":
        updates = rmsprop(loss_or_grads=cost, params=params, learning_rate=alpha)
    else:
        raise ValueError("unsupported --learn option: %r" % learn)

    train = theano.function(inputs=[x, length_x, y], outputs=cost, updates=updates, allow_input_downcast=True)
    # predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)
    predict = theano.function(
        inputs=[x, length_x],
        outputs=T.argmax(l_final_no_dropout.p_y_given_x, axis=1),
        allow_input_downcast=True,
        # mode = "DebugMode"
    )
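    # Example (hypothetical word indices): for a 3-word sentence,
    #   predict(np.array([[1, 3, 4]], dtype=np.int32), 3)
    # returns one predicted class id per row of x.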

    def b(x_data):
        # shorthand: wrap data in an int32 numpy array (Theano expects int32 indices/labels)
        return np.array(x_data, dtype=np.int32)

    def test(test_set):
        # print "############# TEST ##############"
        y_pred = []
        test_set_y = []
        # for train_x, train_y in zip(X_data, Y_data):
        # print test_set
        # Accuracy_count = 0
        for test_y, test_x in test_set:
            sent_len = len(test_x)  # sentence length, taken before wrapping into a batch of one
            test_x = b([test_x])
            p = predict(test_x, sent_len)[0]
            y_pred.append(p)
            test_set_y.append(test_y)

            # if test_y == p:
            #     Accuracy_count += 1

            # print "*predict :",predict(train_x, len(train_x)), train_y
        # Accuracy = float(Accuracy_count) / len(test_set)
        # print "  accuracy : %f" % Accuracy,
        return accuracy_score(test_set_y, y_pred)
        # print classification_report(test_set_y, y_pred)

    # train_set_rand = np.ndarray(train_set)
    train_set_rand = train_set[:]
    train_cost_sum = 0.0
    for epoch in xrange(n_epoch):
        print "== epoch : %d ==" % epoch
        if shuffle_flag:
            np.random.shuffle(train_set_rand)
            # train_set_rand = np.random.permutation(train_set)
        for i, x_y_set in enumerate(train_set_rand):
            train_y, train_x = x_y_set
            sent_len = len(train_x)  # sentence length, taken before wrapping into a batch of one
            train_x = b([train_x])
            train_y = b([train_y])

            train_cost = train(train_x, sent_len, train_y)
            train_cost_sum += train_cost
            if i % 1000 == 0 or i == len(train_set) - 1:
                print "i : (%d/%d)" % (i, len(train_set)),
                print " (cost : %f )" % train_cost

        print "  cost (cumulative) :", train_cost_sum
        print "  train_set : %f" % test(train_set)
        print "  dev_set   : %f" % test(dev_set)
        print "  test_set  : %f" % test(test_set)
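
# A minimal entry point, assuming `args`, `rng` and `srng` are initialised at
# module level elsewhere in this script (e.g. by the option parser and
# theano.tensor.shared_randomstreams.RandomStreams):
#
#     if __name__ == "__main__":
#         main()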