Example #1
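Evaluation only: the saved vocabularies and a previously dumped tagger are loaded, a Theano prediction function is compiled, and token-level accuracy is reported for the corpus given by args.dev_data. Module-level imports (theano, numpy as np, sys, time) and the project helpers io_utils and preprocessor are assumed to be in scope.
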
def test(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt, args.lr)

    """ load vocab """
    print 'Loading vocabularies...\n'
    vocab_word = io_utils.load_data('vocab_word')
    vocab_char = io_utils.load_data('vocab_char')
    vocab_tag = io_utils.load_data('vocab_tag')
    print '\tWord size: %d  Char size: %d' % (vocab_word.size(), vocab_char.size())

    """ load data """
    print '\nLoading data set...\n'
    test_corpus, test_vocab_word, test_vocab_char, test_vocab_tag = io_utils.load_conll(args.dev_data)
    print '\tTest Sentences: %d' % len(test_corpus)

    """ converting into ids """
    print '\nConverting into IDs...\n'
    test_x, test_c, test_b, test_y = preprocessor.convert_into_ids(test_corpus, vocab_word, vocab_char, vocab_tag)

    """ tagger set up """
    tagger = io_utils.load_data(args.load)
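    # The dumped tagger's symbolic input list ends with the learning rate,
    # which is only needed for training; it is dropped below via
    # tagger.input[:-1].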

    dev_f = theano.function(
        inputs=tagger.input[:-1],
        outputs=tagger.result,
        mode='FAST_RUN'
    )

    """ Prediction """
    print '\nPREDICTION START\n'

    print '\tBatch Index: ',
    start = time.time()

    total = 0.0
    correct = 0

    for index in xrange(len(test_x)):
        if index % 100 == 0 and index != 0:
            print index,
            sys.stdout.flush()

        if tagger.name == 'char':
            corrects = dev_f(test_x[index], test_c[index], test_b[index], test_y[index])
        else:
            corrects = dev_f(test_x[index], test_y[index])

        total += len(corrects)
        correct += np.sum(corrects)

    end = time.time()

    print '\n\tTime: %f seconds' % (end - start)
    print '\tAccuracy:%f  Total:%d  Correct:%d' % ((correct / total), total, correct)
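
Both examples round-trip the vocabularies and the tagger object through io_utils.dump_data and io_utils.load_data. The project's actual implementation of these helpers is not shown in these excerpts; the following is only a minimal sketch, assuming a plain gzipped-pickle serialization and an illustrative file-naming scheme:

import gzip
import cPickle as pickle


def dump_data(data, fn):
    # Serialize an arbitrary object (a vocabulary, the tagger, ...) to a
    # gzipped pickle under the given name.
    with gzip.open(fn + '.pkl.gz', 'wb') as gf:
        pickle.dump(data, gf, pickle.HIGHEST_PROTOCOL)


def load_data(fn):
    # Inverse of dump_data: restore the object saved under the given name.
    with gzip.open(fn + '.pkl.gz', 'rb') as gf:
        return pickle.load(gf)
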
Example #2
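Full training loop: vocabularies are built from the training corpus (and extended with the dev corpus, if one is given), word embeddings are optionally initialized from a pre-trained list, Theano training and evaluation functions are compiled, and the tagger is trained for args.epoch epochs with a dev-set evaluation and an optional model dump after each epoch. As above, module-level imports (theano, numpy as np, random, math, sys, time) and the helpers io_utils, preprocessor, load_train_data, load_dev_data and set_model are assumed to be in scope.
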
def train(args):
    print '\nNEURAL POS TAGGER START\n'

    print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list)
    print '\tWORD\t\t\tEmb Dim: %d  Hidden Dim: %d' % (args.w_emb_dim,
                                                       args.w_hidden_dim)
    print '\tCHARACTER\t\tEmb Dim: %d  Hidden Dim: %d' % (args.c_emb_dim,
                                                          args.c_hidden_dim)
    print '\tOPTIMIZATION\t\tMethod: %s  Learning Rate: %f\n' % (args.opt,
                                                                 args.lr)
    """ load data """
    print 'Loading data sets...\n'
    train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = load_train_data(
        args)
    dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, dev_max_char_len = load_dev_data(
        args)
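    # Extend the training vocabularies with items seen only in the dev corpus
    # (new words are added only while the vocabulary-size limit permits).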

    if dev_corpus:
        for w in dev_vocab_word.i2w:
            if args.vocab_size is None or vocab_word.size() < args.vocab_size:
                vocab_word.add_word(w)
        for c in dev_vocab_char.i2w:
            vocab_char.add_word(c)
        for t in dev_vocab_tag.i2w:
            vocab_tag.add_word(t)

    if args.save:
        io_utils.dump_data(vocab_word, 'vocab_word')
        io_utils.dump_data(vocab_char, 'vocab_char')
        io_utils.dump_data(vocab_tag, 'vocab_tag')
    """ load word embeddings """
    init_w_emb = None
    if args.emb_list:
        print '\tLoading pre-trained word embeddings...\n'
        init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list,
                                            vocab_word)
        w_emb_dim = init_w_emb.shape[1]
    else:
        print '\tUse random-initialized word embeddings...\n'
        w_emb_dim = args.w_emb_dim
    """ limit data set """
    train_corpus = train_corpus[:args.data_size]
    train_corpus.sort(key=len)
    """ converting into ids """
    print '\nConverting into IDs...\n'

    tr_x, tr_c, tr_b, tr_y = preprocessor.convert_into_ids(
        train_corpus, vocab_word, vocab_char, vocab_tag)

    if args.dev_data:
        dev_x, dev_c, dev_b, dev_y = preprocessor.convert_into_ids(
            dev_corpus, vocab_word, vocab_char, vocab_tag)
        print '\tTrain Sentences: %d  Dev Sentences: %d' % (len(train_corpus),
                                                            len(dev_corpus))
    else:
        print '\tTrain Sentences: %d' % len(train_corpus)

    print '\tWord size: %d  Char size: %d' % (vocab_word.size(),
                                              vocab_char.size())
    """ tagger set up """
    tagger = set_model(args, init_w_emb, w_emb_dim, vocab_word, vocab_char,
                       vocab_tag)

    train_f = theano.function(inputs=tagger.input,
                              outputs=[tagger.nll, tagger.result],
                              updates=tagger.updates,
                              mode='FAST_RUN')
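    # dev_f reuses the same symbolic inputs minus the last one (the learning
    # rate), since evaluation never applies the optimizer updates.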

    dev_f = theano.function(inputs=tagger.input[:-1],
                            outputs=tagger.result,
                            mode='FAST_RUN')

    def _train():
        print '\nTRAINING START\n'

        for epoch in xrange(args.epoch):
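            # 1/t learning-rate decay: epoch t (1-based) uses args.lr / t.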
            _lr = args.lr / float(epoch + 1)
            indices = range(len(tr_x))
            random.shuffle(indices)

            print '\nEpoch: %d' % (epoch + 1)
            print '\n\tTrain set'
            print '\t\tBatch Index: ',
            start = time.time()

            total = 0.0
            correct = 0
            losses = 0.0

            for i, index in enumerate(indices):
                if i % 100 == 0 and i != 0:
                    print i,
                    sys.stdout.flush()

                if args.model == 'char':
                    loss, corrects = train_f(tr_x[index], tr_c[index],
                                             tr_b[index], tr_y[index], _lr)
                else:
                    loss, corrects = train_f(tr_x[index], tr_y[index], _lr)

                assert not math.isnan(loss), index

                total += len(corrects)
                correct += np.sum(corrects)
                losses += loss

            end = time.time()
            print '\n\t\tTime: %f seconds' % (end - start)
            print '\t\tNegative Log Likelihood: %f' % losses
            print '\t\tAccuracy:%f  Total:%d  Correct:%d' % (
                (correct / total), total, correct)

            if args.save:
                io_utils.dump_data(
                    tagger, 'model-%s.epoch-%d' % (args.model, epoch + 1))

            _dev(dev_f)

    def _dev(_dev_f):
        print '\n\tDev set'
        print '\t\tBatch Index: ',
        start = time.time()

        total = 0.0
        correct = 0

        for index in xrange(len(dev_x)):
            if index % 100 == 0 and index != 0:
                print index,
                sys.stdout.flush()

            if args.model == 'char':
                corrects = _dev_f(dev_x[index], dev_c[index], dev_b[index],
                                  dev_y[index])
            else:
                corrects = _dev_f(dev_x[index], dev_y[index])

            total += len(corrects)
            correct += np.sum(corrects)

        end = time.time()

        print '\n\t\tTime: %f seconds' % (end - start)
        print '\t\tAccuracy:%f  Total:%d  Correct:%d' % (
            (correct / total), total, correct)

    _train()
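
Both train and test consume only an args namespace carrying the attributes referenced above (word_list, emb_list, w_emb_dim, w_hidden_dim, c_emb_dim, c_hidden_dim, opt, lr, epoch, model, data_size, vocab_size, save, load, dev_data). The real command-line front end is not part of these excerpts; the sketch below is a hypothetical argparse driver whose flag names and defaults are illustrative only:

import argparse


def parse_args():
    p = argparse.ArgumentParser(description='neural POS tagger (illustrative driver)')
    p.add_argument('--mode', default='train', choices=['train', 'test'])
    p.add_argument('--model', default='word', choices=['word', 'char'])
    p.add_argument('--train_data', default=None, help='training corpus (CoNLL format)')
    p.add_argument('--dev_data', default=None, help='dev/test corpus (CoNLL format)')
    p.add_argument('--word_list', default=None, help='word list accompanying the pre-trained embeddings')
    p.add_argument('--emb_list', default=None, help='pre-trained word embedding file')
    p.add_argument('--w_emb_dim', type=int, default=100)
    p.add_argument('--w_hidden_dim', type=int, default=100)
    p.add_argument('--c_emb_dim', type=int, default=10)
    p.add_argument('--c_hidden_dim', type=int, default=50)
    p.add_argument('--opt', default='sgd', help='optimization method')
    p.add_argument('--lr', type=float, default=0.075, help='initial learning rate')
    p.add_argument('--epoch', type=int, default=10)
    p.add_argument('--data_size', type=int, default=100000, help='cap on the number of training sentences')
    p.add_argument('--vocab_size', type=int, default=None)
    p.add_argument('--save', action='store_true', help='dump vocabularies and per-epoch models')
    p.add_argument('--load', default=None, help='dumped model to evaluate in test mode')
    return p.parse_args()


if __name__ == '__main__':
    args = parse_args()
    if args.mode == 'train':
        train(args)
    else:
        test(args)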