def train_joint_conv_net(w2vFile,
                         dataFile,
                         labelStructureFile,
                         cfswitch,
                         filter_hs,
                         n_epochs=1000,
                         batch_size=50,
                         feature_maps=100,
                         hasmlphidden=False,
                         usefscore=False):
    """
    function: learning and testing sentence level Question Classification Task
            in a joint fashion, ie. adding the loss function of coarse label prediction
            and fine label prediction together.
    :param w2vFile: the path of the word embedding file(pickle file with numpy
            array value, produced by word2vec.py module)
    :param dataFile: the dataset file produced by process_data.py module
    :param labelStructureFile: a file that describes label structure of coarse and fine
            grains. It is produced in produce_data.py in outputlabelstructure()
    "param filter_h: sliding window size.
            *** warning ***
            you cannot just change window size here, if you want to use a different window
            for the experiment. YOU NEED TO RE-PRODUCE A NEW DATASET IN process_data.py
            WITH THE CORRESPONDING WINDOW SIZE.
    :param n_epochs: the number of epochs the training needs to run
    :param batch_size: the size of the mini-batch
    :param feature_maps: how many dimensions you want the abstract sentence
            representation to be
    :param mlphiddensize: the size of the hidden layer in MLP
    :param logFile: the output file of the brief info of each epoch results, basically a
            save for the print out
    :param logTest: keep track of results on test set
    :return: a tuple of best fine grained prediction accuracy and its corresponding
            coarse grained prediction accuracy
    """
    """
    Loading and preparing data
    """
    datasets = load(dataFile)
    clbl_vec, flbl_vec = process_qc.label_structure(labelStructureFile)
    trainDataSetIndex = 0
    testDataSetIndex = 1
    validDataSetIndex = 2
    sentenceIndex = 0
    clblIndex = 1  # coarse label(clbl) index in the dataset structure
    flblIndex = 2  # fine label(flbl) index

    if cfswitch == 'c':
        lblIndex = clblIndex
        label_vec = clbl_vec
    elif cfswitch == 'f':
        lblIndex = flblIndex
        label_vec = flbl_vec
    else:
        print 'wrong arg value in: cfswitch!'
        sys.exit()

    label_size = len(label_vec)

    if hasmlphidden:
        layer_size = [feature_maps * len(filter_hs), 100, label_size]
    else:
        layer_size = [feature_maps * len(filter_hs), label_size]

    # train part
    train_y = shared_store(datasets[trainDataSetIndex][lblIndex])
    train_x = shared_store(datasets[trainDataSetIndex][sentenceIndex])

    # test part
    gold_test_y = datasets[testDataSetIndex][lblIndex]
    test_x = shared_store(datasets[testDataSetIndex][sentenceIndex])

    # valid part
    gold_valid_y = datasets[validDataSetIndex][lblIndex]
    valid_x = shared_store(datasets[validDataSetIndex][sentenceIndex])

    w2v = load(w2vFile)
    img_w = w2v.shape[1]  # the dimension of the word embedding
    img_h = len(datasets[trainDataSetIndex][sentenceIndex][0])  # length of each sentence
    filter_w = img_w  # word embedding dimension
    image_shapes = []
    filter_shapes = []
    for i in xrange(len(filter_hs)):
        image_shapes.append((batch_size, 1, img_h, img_w * filter_hs[i]))
        filter_shapes.append((feature_maps, 1, 1, filter_w * filter_hs[i]))

    pool_size = (img_h, 1)
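    # each filter spans one full window (height 1, width filter_hs[i] words of
    # embeddings), and the (img_h, 1) max-pool collapses all sentence positions,
    # so every feature map contributes exactly one value to the sentence vector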

    train_size = len(datasets[trainDataSetIndex][sentenceIndex])
    print 'number of sentences in training set: ' + str(train_size)
    print 'max sentence length: ' + str(
        len(datasets[trainDataSetIndex][sentenceIndex][0]))
    print 'train data shape: ' + str(
        datasets[trainDataSetIndex][sentenceIndex].shape)
    print 'word embedding dim: ' + str(w2v.shape[1])
    """
    Building model in theano language, less comments here.
    You can refer to Theano web site for more details
    """
    batch_index = T.lvector('hello_batch_index')
    x = T.itensor3('hello_x')
    y = T.ivector('hello_y')
    w2v_shared = theano.shared(value=w2v, name='w2v', borrow=True)
    rng = np.random.RandomState(3435)

    conv_layer_outputs = []
    conv_layers = []
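    # one conv + max-pool layer per filter size; each sentence in x is stored as
    # an (img_h, window) matrix of word indices, one pre-built sliding window per
    # position (hence the docstring warning), so the embedding lookup yields a
    # (batch, 1, img_h, window * img_w) "image" and the slice keeps the first
    # filter_hs[i] words of every window for this filter size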
    for i in xrange(len(filter_hs)):
        input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1],
             x.shape[2] * img_w))[:, :, :, 0:filter_hs[i] * img_w]

        conv_layer = LeNetConvPoolLayer(rng,
                                        input=input,
                                        filter_shape=filter_shapes[i],
                                        poolsize=pool_size,
                                        image_shape=image_shapes[i],
                                        non_linear="relu")

        conv_layers.append(conv_layer)
        conv_layer_outputs.append(conv_layer.output.flatten(2))

    mlp_input = T.concatenate(conv_layer_outputs, 1)

    classifier = MLPDropout(
        rng=rng,
        input=mlp_input,
        layer_sizes=layer_size,  # [feature_maps * len(filter_hs), label_size],
        dropout_rate=0.5,
        activation=Iden)

    params = []
    for conv_layer in conv_layers:
        params += conv_layer.params
    params += classifier.params

    cost = classifier.negative_log_likelihood(y)
    updates = sgd_updates_adadelta(params, cost)

    n_batches = train_x.shape.eval()[0] / batch_size

    train_model = theano.function(
        inputs=[batch_index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_x[batch_index],
            y: train_y[batch_index],
        },
    )
    """
    Building test model
    """
    test_conv_layer_outputs = []
    for i, conv_layer in enumerate(conv_layers):
        test_input = w2v_shared[x.flatten()].reshape(
            (x.shape[0], 1, x.shape[1],
             x.shape[2] * img_w))[:, :, :, 0:filter_hs[i] * img_w]
        test_conv_layer_outputs.append(
            conv_layer.conv_layer_output(test_input,
                                         (test_x.shape.eval()[0], 1, img_h,
                                          img_w * filter_hs[i])).flatten(2))
    test_prediction = classifier.predict(
        T.concatenate(test_conv_layer_outputs, 1))

    # test on test set
    test_model = theano.function(inputs=[],
                                 outputs=test_prediction,
                                 givens={
                                     x: test_x,
                                 })

    # test on valid set
    valid_model = theano.function(inputs=[],
                                  outputs=test_prediction,
                                  givens={
                                      x: valid_x,
                                  })
    """
    Training part
    """
    print 'training....'
    best_valid_ep = 0
    best_valid_acc = 0.
    best_test_ep = 0
    best_test_acc = 0.
    final_acc = 0.
    epoch = 0
    last_acc = 0.

    # create gold value sequences, required by eval.py
    with open('../exp/goldrs', 'w') as writer:
        for lbl in gold_test_y:
            writer.write(str(lbl) + '\n')

    # training loop
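    # model selection: after each epoch both splits are scored; final_acc keeps
    # the best test score seen at epochs where the validation score improves,
    # and that value is what the function returns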
    while epoch < n_epochs:
        epoch += 1
        print '************* epoch ' + str(epoch)
        batch_indexes = range(train_size)
        rng.shuffle(batch_indexes)
        for bchidx in xrange(n_batches):
            random_indexes = batch_indexes[bchidx * batch_size:(bchidx + 1) *
                                           batch_size]
            train_cost = train_model(random_indexes)

        test_y_preds = test_model()
        valid_y_preds = valid_model()
        if usefscore:
            test_acc = eval.fscore(gold_test_y, test_y_preds)
            valid_acc = eval.fscore(gold_valid_y, valid_y_preds)
        else:
            test_acc = eval.accuracy(gold_test_y, test_y_preds)
            valid_acc = eval.accuracy(gold_valid_y, valid_y_preds)
        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            best_valid_ep = epoch
            if final_acc < test_acc:
                final_acc = test_acc
                with open('../exp/predictions', 'w') as writer:
                    for lblidx in test_y_preds:
                        writer.write(str(lblidx) + '\n')
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_test_ep = epoch
            # output predictions

        print 'test accuracy is: ' + str(test_acc)
        print 'valid accuracy is: ' + str(valid_acc)
        print 'current best valid prediction accuracy is: ' + str(
            best_valid_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best final prediction accuracy is: ' + str(
            final_acc) + ' at epoch ' + str(best_valid_ep)
        print 'current best test prediction accuracy is: ' + str(
            best_test_acc) + ' at epoch ' + str(best_test_ep)
        last_acc = test_acc
    # final_acc = last_acc
    return final_acc
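

# Minimal call sketch (not part of the original file): the paths below are
# placeholders, and filter_hs must match the window size the dataset was
# produced with in process_data.py.
if __name__ == '__main__':
    score = train_joint_conv_net(w2vFile='../exp/w2v.pkl',
                                 dataFile='../exp/dataset.pkl',
                                 labelStructureFile='../exp/label_structure',
                                 cfswitch='f',  # fine-grained labels
                                 filter_hs=[2, 3, 4],
                                 n_epochs=50)
    print 'score at best validation epoch: ' + str(score)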
Example #2
import numpy as np
from sklearn.naive_bayes import GaussianNB

# build the raw example / target lists (the last element of each row is the label);
# _examples, dev_set, X, Dev and gold are defined earlier in the original script
examples = []
target = []
for each in _examples:
    examples.append(each[:-1])
    target.append(each[-1])

bow = Feature.feature("bow", examples, dev_set)

example_features = bow.get_incremental_features(examples)

# one-vs-rest: train one binary GaussianNB classifier per class
classes = sorted(set(target))  # fixed order, so classifiers[i] <-> classes[i]
classifiers = []
for each in classes:
    Y = np.array([1 if x == each else 0 for x in target])
    clf = GaussianNB()
    clf.fit(X, Y)
    classifiers.append(clf)

# for every dev example, collect each keyword whose binary classifier fires
pred = [[] for _ in xrange(len(Dev))]
for i, keyword in enumerate(classes):
    clf_pred = classifiers[i].predict(Dev)
    for exampleno, each in enumerate(clf_pred):
        if each == 1:
            pred[exampleno].append(keyword)

import eval

print eval.fscore(gold, pred)

# batch-learning
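
# A self-contained toy run of the same one-vs-rest scheme on synthetic data
# (reusing the numpy / GaussianNB imports from the snippet above; the project's
# Feature and eval modules are not needed), just to illustrate how the per-class
# binary classifiers are combined:
X_toy = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.8]])
y_toy = ['loc', 'num', 'num', 'loc']
X_dev = np.array([[0.2, 0.9], [0.8, 0.2]])

toy_classes = sorted(set(y_toy))
toy_clfs = [GaussianNB().fit(X_toy, np.array([1 if lbl == c else 0 for lbl in y_toy]))
            for c in toy_classes]

# each dev example collects every class whose binary classifier fires
toy_pred = [[] for _ in xrange(len(X_dev))]
for i, c in enumerate(toy_classes):
    for exampleno, hit in enumerate(toy_clfs[i].predict(X_dev)):
        if hit == 1:
            toy_pred[exampleno].append(c)
print toy_pred  # a list of predicted keywords per dev example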