Example No. 1
def train_cnn_for_el(train_data_file_name,
                     val_data_file_name,
                     num_val_candidates,
                     test_data_file_name,
                     num_test_candidates,
                     full_sentence_len, word_vec_len,
                     all_words,  # first row of all_words should be a non-existing word
                     wid_idx_dict,
                     entity_vecs,
                     gold_as_first_candidate=True,
                     skip_width_loading=40,  # skip width while loading samples
                     n_epochs=25,
                     batch_size=50,
                     filter_hs=def_filter_hs,
                     num_feature_maps=100,
                     lr_decay=0.9,
                     sqr_norm_lim=9,
                     hidden_out_len=50,):
    rng = np.random.RandomState(3435)

    print 'making entity_vecs...', len(entity_vecs)
    # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX),
    #                                    name='entity_vecs', borrow=True)
    shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"),
                                       name='entity_vecs', borrow=True)
    # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32),
    #                                    name='entity_vecs', borrow=True)
    print 'making shared_words...', len(all_words)
    shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX),
                                 name='shared_words', borrow=True)
    print 'done'

    # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading)
    # num_test_batches = test_indices.shape[0] / batch_size
    # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name,
    #                                                                 wid_idx_dict, skip_width_loading)
    val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len,
                                                            sentence_pad_len,
                                                            skip_width=skip_width_loading,
                                                            num_candidates=num_val_candidates)
    num_val_batches = len(val_contexts) / batch_size
    print num_val_batches, 'validation batches'
    print len(val_indices[0]), 'candidates per mention'
    val_contexts = T.cast(to_theano_shared(val_contexts), 'int32')
    val_indices = T.cast(to_theano_shared(val_indices), 'int32')

    test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len,
                                                              sentence_pad_len,
                                                              skip_width=skip_width_loading,
                                                              num_candidates=num_test_candidates)
    num_test_batches = len(test_contexts) / batch_size
    print num_test_batches, 'test batches'
    print len(test_indices[0]), 'candidates per mention'
    test_contexts = T.cast(to_theano_shared(test_contexts), 'int32')
    test_indices = T.cast(to_theano_shared(test_indices), 'int32')

    if gold_as_first_candidate:
        gold_labels = theano.shared(value=np.zeros(batch_size,
                                                   dtype='int32'),
                                    borrow=True)
    else:
        gold_labels = theano.shared(value=np.ones(batch_size,
                                                  dtype='int32'),
                                    borrow=True)

    x = T.imatrix('x')
    entities = T.imatrix('entities')

    sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps,
                                batch_size,
                                hidden_out_len, rng)
    mc = sentence_cnn0.output  # mention contexts
    unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 1e-5)).dimshuffle(0, 'x')

    batch_entity_vecs = shared_entity_vecs[entities]
    entity_vecs_reshaped = batch_entity_vecs.reshape((batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1],
                                                      batch_entity_vecs.shape[2]))

    sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs,
                                      num_entity_rep_feature_maps,
                                      batch_size * num_train_candidates, hidden_out_len, rng)
    entity_reps_train = sentence_cnn1_train.output
    similarities_train = get_entity_context_similarities(unit_mc, entity_reps_train, batch_size, num_train_candidates)
    loss = T.maximum(0, 1 - similarities_train[:, 0] + similarities_train[:, 1]).sum()

    # entity_reps_train = entity_reps_train.reshape((batch_size, num_train_candidates, entity_reps_train.shape[1]))
    # matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu)
    # entity_reps = matcher1.output

    # unit_entity_reps_train = entity_reps_train / T.sqrt(T.maximum(
    #     T.sum(T.sqr(entity_reps_train), 2), 0.0001)).dimshuffle(0, 1, 'x')
    #
    # similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2)

    sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs,
                                    num_entity_rep_feature_maps,
                                    batch_size * num_val_candidates,
                                    hidden_out_len, rng,
                                    hidden_W=sentence_cnn1_train.hiddenW,
                                    hidden_b=sentence_cnn1_train.hiddenb,
                                    conv_Ws=sentence_cnn1_train.convWs,
                                    conv_bs=sentence_cnn1_train.convbs)
    entity_reps_val = sentence_cnn1_val.output
    similarities_val = get_entity_context_similarities(unit_mc, entity_reps_val, batch_size, num_val_candidates)
    correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val, axis=1)))

    # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2)  # / mc_norm

    # params = sentence_cnn0.params + matcher1.params
    params = sentence_cnn0.params + sentence_cnn1_train.params
    grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim)

    index = T.lscalar()

    val_model = theano.function(
        [index],
        correct_rate,
        givens={x: val_contexts[index * batch_size: (index + 1) * batch_size],
                entities: val_indices[index * batch_size: (index + 1) * batch_size]}
    )

    test_model = theano.function(
        [index],
        correct_rate,
        givens={x: test_contexts[index * batch_size: (index + 1) * batch_size],
                entities: test_indices[index * batch_size: (index + 1) * batch_size]}
    )

    train_contexts = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_contexts = T.cast(train_contexts, 'int32')
    train_indices = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_indices = T.cast(train_indices, 'int32')
    train_model = theano.function(
        [index],
        loss,
        updates=grad_updates,
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
                entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )

    # fdebug = theano.function(
    #     [index],
    #     similarities_train,
    #     givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
    #             entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    # )
    fdebug0 = theano.function(
        [index],
        entity_reps_train.sum(axis=1),
        givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )
    fdebug1 = theano.function(
        [index],
        similarities_train,
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
                entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )
    fdebug2 = theano.function(
        [index],
        unit_mc.sum(axis=1),
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size]}
    )
    # print fdebug(0)

    # val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    # print('init val perf %f' % np.mean(val_perfs))

    epoch = 0
    max_val_perf = 0
    test_perf = 0
    print 'training ...'
    # while epoch < n_epochs:
    epoch += 1

    train_part_cnt = 0

    # f_train = open(train_data_file_name, 'rb')
    # for i in xrange(143):
    #     data_load.skip_training_sample(f_train, 50000)
    #     if i % 40 == 0:
    #         print i
    # print 'skipped'
    #
    # f_train = open(train_data_file_name, 'rb')
    # cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
    #                                                                         training_part_size,
    #                                                                         wid_idx_dict,
    #                                                                         sentence_len,
    #                                                                         sentence_pad_len)
    # f_train.close()

    f_debug = open('debug_data.bin', 'rb')
    cur_train_contexts, cur_train_indices = cPickle.load(f_debug)
    f_debug.close()

    # print cur_train_contexts[9 * batch_size: (9 + 1) * batch_size]
    # print cur_train_indices[8 * batch_size: (8 + 1) * batch_size]

    train_contexts.set_value(cur_train_contexts, borrow=True)
    train_indices.set_value(cur_train_indices, borrow=True)

    # entity_index_vecs = fdebug0(8)
    # for entity_index_vec in entity_index_vecs:
    #     print entity_index_vec

    train_part_cnt += 1
    num_train_batches = len(cur_train_contexts) / batch_size
    # print 'num_train_batches', num_train_batches
    mean_loss = 0
    for minibatch_index in xrange(num_train_batches):
        # if minibatch_index == 8:
        #     continue
        # if 6 < minibatch_index < 10:
            # print minibatch_index
            # print sentence_cnn1_train.hiddenb.get_value()
            # print fdebug0(minibatch_index)
        cur_loss = train_model(minibatch_index)
        # if 6 < minibatch_index < 10:
            # print minibatch_index
            # print sentence_cnn1_train.hiddenb.get_value()
            # print fdebug0(minibatch_index)
        print minibatch_index, cur_loss
        mean_loss += cur_loss
        # if 11 > minibatch_index > 8:
        #     print minibatch_index, cur_loss
        # print fdebug(minibatch_index)
        # print minibatch_index, cur_loss
    print 'loss:', mean_loss / num_train_batches
    # print fdebug(0)

    val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    val_perf = np.mean(val_perfs)
    print('epoch %i, training part %i, val perf %f(%f), test perf %f'
          % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf))

    if val_perf > max_val_perf:
        max_val_perf = val_perf
        test_perfs = [test_model(i) for i in xrange(num_test_batches)]
        test_perf = np.mean(test_perfs)
        print('\tepoch %i, training part %i, test_perf %f'
              % (epoch, train_part_cnt, test_perf))
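The training objective in this first example is a pairwise ranking hinge loss over candidate similarities: with the gold entity as the first candidate (gold_as_first_candidate=True) and one sampled negative as the second, each mention contributes max(0, 1 - s_gold + s_neg). A minimal NumPy sketch of that objective, with illustrative names that are not from the repository:

import numpy as np

def ranking_hinge_loss(similarities, margin=1.0):
    """similarities: (batch_size, num_candidates) array of cosine scores,
    gold candidate in column 0, one sampled negative in column 1."""
    s_gold = similarities[:, 0]
    s_neg = similarities[:, 1]
    # hinge: penalize whenever the negative scores within `margin` of the gold
    return np.maximum(0.0, margin - s_gold + s_neg).sum()

# toy batch of 3 mentions, 2 candidates each
scores = np.array([[0.9, 0.2],   # 1 - 0.9 + 0.2 = 0.3
                   [0.5, 0.6],   # negative wins: 1 - 0.5 + 0.6 = 1.1
                   [0.8, 0.1]])  # 1 - 0.8 + 0.1 = 0.3
print(ranking_hinge_loss(scores))  # 1.7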
Example No. 2
def train_cnn_for_el(train_data_file_name,
                     val_data_file_name,
                     num_val_candidates,
                     test_data_file_name,
                     num_test_candidates,
                     sentence_len, word_vec_len,
                     all_words,  # first row of all_words should be a non-existing word
                     wid_idx_dict,
                     entity_vecs,
                     entity_side_cnn=False,
                     gold_as_first_candidate=False,
                     skip_width_loading=40,  # skip width while loading samples
                     n_epochs=25,
                     batch_size=50,
                     filter_hs=def_filter_hs,
                     num_feature_maps=100,
                     lr_decay=0.9,
                     sqr_norm_lim=9,
                     hidden_out_len=50,
                     training_part_size=50000,
                     num_train_candidates=2):
    full_sentence_len = sentence_len + 2 * sentence_pad_len
    rng = np.random.RandomState(3435)

    print 'making entity_vecs...', len(entity_vecs)
    if entity_side_cnn:
        shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"),
                                           name='entity_vecs', borrow=True)
    else:
        shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX),
                                           name='entity_vecs', borrow=True)

    print 'making shared_words...', len(all_words)
    shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX),
                                 name='shared_words', borrow=True)
    print 'done'

    val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len,
                                                            sentence_pad_len,
                                                            skip_width=skip_width_loading,
                                                            num_candidates=num_val_candidates)
    num_val_batches = len(val_contexts) / batch_size
    print num_val_batches, 'validation batches'
    print len(val_indices[0]), 'candidates per mention'
    val_contexts = T.cast(to_theano_shared(val_contexts), 'int32')
    val_indices = T.cast(to_theano_shared(val_indices), 'int32')

    test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len,
                                                              sentence_pad_len,
                                                              skip_width=skip_width_loading,
                                                              num_candidates=num_test_candidates)
    num_test_batches = len(test_contexts) / batch_size
    print num_test_batches, 'test batches'
    print len(test_indices[0]), 'candidates per mention'
    test_contexts = T.cast(to_theano_shared(test_contexts), 'int32')
    test_indices = T.cast(to_theano_shared(test_indices), 'int32')

    if gold_as_first_candidate:
        gold_labels = theano.shared(value=np.zeros(batch_size,
                                                   dtype='int32'),
                                    borrow=True)
    else:
        gold_labels = theano.shared(value=np.ones(batch_size,
                                                  dtype='int32'),
                                    borrow=True)

    x = T.imatrix('x')
    entities = T.imatrix('entities')

    sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps,
                                batch_size,
                                hidden_out_len, rng)
    mc = sentence_cnn0.output  # mention contexts
    unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x')

    batch_entity_vecs = shared_entity_vecs[entities]

    if entity_side_cnn:
        loss, correct_rate, entity_side_params = get_training_variables_with_entity_side_cnn(batch_entity_vecs,
                                                                                             shared_words,
                                                                                             word_vec_len, batch_size,
                                                                                             hidden_out_len, unit_mc,
                                                                                             num_train_candidates,
                                                                                             num_val_candidates,
                                                                                             gold_labels, rng)
    else:
        loss, correct_rate, entity_side_params = get_training_variables_no_entity_side_cnn(batch_entity_vecs,
                                                                                           len(entity_vecs[0]),
                                                                                           hidden_out_len,
                                                                                           unit_mc,
                                                                                           gold_labels, rng)
    # params = matcher0.params + entity_side_params
    # for conv_layer in conv_layers:
    #     params += conv_layer.params

    # params = sentence_cnn0.params + matcher1.params
    # params = sentence_cnn0.params + sentence_cnn1_train.params

    params = sentence_cnn0.params + entity_side_params
    grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim)

    index = T.lscalar()

    val_model = theano.function(
        [index],
        correct_rate,
        givens={x: val_contexts[index * batch_size: (index + 1) * batch_size],
                entities: val_indices[index * batch_size: (index + 1) * batch_size]}
    )

    test_model = theano.function(
        [index],
        correct_rate,
        givens={x: test_contexts[index * batch_size: (index + 1) * batch_size],
                entities: test_indices[index * batch_size: (index + 1) * batch_size]}
    )

    train_contexts = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_contexts = T.cast(train_contexts, 'int32')
    train_indices = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_indices = T.cast(train_indices, 'int32')
    train_model = theano.function(
        [index],
        loss,
        updates=grad_updates,
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
                entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )

    fdebug = theano.function(
        [index],
        batch_entity_vecs,
        givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )
    # print fdebug(0)

    val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    print('init val perf %f' % np.mean(val_perfs))

    epoch = 0
    max_val_perf = 0
    test_perf = 0
    print 'training ...'
    while epoch < n_epochs:
        f_train = open(train_data_file_name, 'rb')
        epoch += 1

        train_part_cnt = 0

        cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
                                                                                training_part_size,
                                                                                wid_idx_dict,
                                                                                sentence_len,
                                                                                sentence_pad_len)

        while len(cur_train_contexts) != 0 and train_part_cnt < 100:
            train_contexts.set_value(cur_train_contexts, borrow=True)
            train_indices.set_value(cur_train_indices, borrow=True)

            train_part_cnt += 1
            num_train_batches = len(cur_train_contexts) / batch_size
            # print 'num_train_batches', num_train_batches
            mean_loss = 0
            for minibatch_index in xrange(num_train_batches):
                cur_loss = train_model(minibatch_index)
                mean_loss += cur_loss
                # print minibatch_index, cur_loss
            print 'loss:', mean_loss / num_train_batches
            # print fdebug(0)

            val_perfs = [val_model(i) for i in xrange(num_val_batches)]
            val_perf = np.mean(val_perfs)
            print('epoch %i, training part %i, val perf %f(%f), test perf %f'
                  % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf))

            if val_perf > max_val_perf:
                max_val_perf = val_perf
                test_perfs = [test_model(i) for i in xrange(num_test_batches)]
                test_perf = np.mean(test_perfs)
                print('\tepoch %i, training part %i, test_perf %f'
                      % (epoch, train_part_cnt, test_perf))

            cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
                                                                                    training_part_size,
                                                                                    wid_idx_dict,
                                                                                    sentence_len,
                                                                                    sentence_pad_len)
        f_train.close()
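Both branches above ultimately score a candidate by the cosine similarity between the mention-context vector from sentence_cnn0 and a candidate entity representation; the unit_mc expression is an L2 normalization with a small floor under the squared norm to avoid division by zero. A rough NumPy equivalent of that scoring step (shapes and names are illustrative assumptions, not the repository's get_entity_context_similarities):

import numpy as np

def cosine_scores(mention_vecs, entity_reps, eps=1e-4):
    """mention_vecs: (batch, d); entity_reps: (batch, num_candidates, d).
    Returns a (batch, num_candidates) array of cosine similarities."""
    # unit-normalize, flooring the squared norms like the Theano expressions
    unit_mc = mention_vecs / np.sqrt(np.maximum((mention_vecs ** 2).sum(1), eps))[:, None]
    unit_er = entity_reps / np.sqrt(np.maximum((entity_reps ** 2).sum(2), eps))[:, :, None]
    # dot product of each mention with each of its candidates
    return (unit_mc[:, None, :] * unit_er).sum(axis=2)

print(cosine_scores(np.random.randn(4, 50), np.random.randn(4, 2, 50)).shape)  # (4, 2)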
Example No. 3
def train_cnn_for_el(
    train_data_file_name,
    val_data_file_name,
    num_val_candidates,
    test_data_file_name,
    num_test_candidates,
    full_sentence_len,
    word_vec_len,
    all_words,  # first row of all_words should be a non-existing word
    wid_idx_dict,
    entity_vecs,
    gold_as_first_candidate=True,
    skip_width_loading=40,  # skip width while loading samples
    n_epochs=25,
    batch_size=50,
    filter_hs=def_filter_hs,
    num_feature_maps=100,
    lr_decay=0.9,
    sqr_norm_lim=9,
    hidden_out_len=50,
):
    rng = np.random.RandomState(3435)

    print 'making entity_vecs...', len(entity_vecs)
    # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX),
    #                                    name='entity_vecs', borrow=True)
    shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs,
                                                        dtype="int32"),
                                       name='entity_vecs',
                                       borrow=True)
    # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32),
    #                                    name='entity_vecs', borrow=True)
    print 'making shared_words...', len(all_words)
    shared_words = theano.shared(value=np.asarray(all_words,
                                                  dtype=theano.config.floatX),
                                 name='shared_words',
                                 borrow=True)
    print 'done'

    # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading)
    # num_test_batches = test_indices.shape[0] / batch_size
    # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name,
    #                                                                 wid_idx_dict, skip_width_loading)
    val_contexts, val_indices = data_load.load_samples_full(
        val_data_file_name,
        wid_idx_dict,
        sentence_len,
        sentence_pad_len,
        skip_width=skip_width_loading,
        num_candidates=num_val_candidates)
    num_val_batches = len(val_contexts) / batch_size
    print num_val_batches, 'validation batches'
    print len(val_indices[0]), 'candidates per mention'
    val_contexts = T.cast(to_theano_shared(val_contexts), 'int32')
    val_indices = T.cast(to_theano_shared(val_indices), 'int32')

    test_contexts, test_indices = data_load.load_samples_full(
        test_data_file_name,
        wid_idx_dict,
        sentence_len,
        sentence_pad_len,
        skip_width=skip_width_loading,
        num_candidates=num_test_candidates)
    num_test_batches = len(test_contexts) / batch_size
    print num_test_batches, 'test batches'
    print len(test_indices[0]), 'candidates per mention'
    test_contexts = T.cast(to_theano_shared(test_contexts), 'int32')
    test_indices = T.cast(to_theano_shared(test_indices), 'int32')

    if gold_as_first_candidate:
        gold_labels = theano.shared(value=np.zeros(batch_size, dtype='int32'),
                                    borrow=True)
    else:
        gold_labels = theano.shared(value=np.ones(batch_size, dtype='int32'),
                                    borrow=True)

    x = T.imatrix('x')
    entities = T.imatrix('entities')

    sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len,
                                word_vec_len, filter_hs, num_feature_maps,
                                batch_size, hidden_out_len, rng)
    mc = sentence_cnn0.output  # mention contexts
    unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 1e-5)).dimshuffle(
        0, 'x')

    batch_entity_vecs = shared_entity_vecs[entities]
    entity_vecs_reshaped = batch_entity_vecs.reshape(
        (batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1],
         batch_entity_vecs.shape[2]))

    sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words,
                                      entity_rep_len, word_vec_len, entity_hs,
                                      num_entity_rep_feature_maps,
                                      batch_size * num_train_candidates,
                                      hidden_out_len, rng)
    entity_reps_train = sentence_cnn1_train.output
    similarities_train = get_entity_context_similarities(
        unit_mc, entity_reps_train, batch_size, num_train_candidates)
    loss = T.maximum(0, 1 - similarities_train[:, 0] +
                     similarities_train[:, 1]).sum()

    # entity_reps_train = entity_reps_train.reshape((batch_size, num_train_candidates, entity_reps_train.shape[1]))
    # matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu)
    # entity_reps = matcher1.output

    # unit_entity_reps_train = entity_reps_train / T.sqrt(T.maximum(
    #     T.sum(T.sqr(entity_reps_train), 2), 0.0001)).dimshuffle(0, 1, 'x')
    #
    # similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2)

    sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped,
                                    shared_words,
                                    entity_rep_len,
                                    word_vec_len,
                                    entity_hs,
                                    num_entity_rep_feature_maps,
                                    batch_size * num_val_candidates,
                                    hidden_out_len,
                                    rng,
                                    hidden_W=sentence_cnn1_train.hiddenW,
                                    hidden_b=sentence_cnn1_train.hiddenb,
                                    conv_Ws=sentence_cnn1_train.convWs,
                                    conv_bs=sentence_cnn1_train.convbs)
    entity_reps_val = sentence_cnn1_val.output
    similarities_val = get_entity_context_similarities(unit_mc,
                                                       entity_reps_val,
                                                       batch_size,
                                                       num_val_candidates)
    correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val,
                                                     axis=1)))

    # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2)  # / mc_norm

    # params = sentence_cnn0.params + matcher1.params
    params = sentence_cnn0.params + sentence_cnn1_train.params
    grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6,
                                        sqr_norm_lim)

    index = T.lscalar()

    val_model = theano.function(
        [index],
        correct_rate,
        givens={
            x: val_contexts[index * batch_size:(index + 1) * batch_size],
            entities: val_indices[index * batch_size:(index + 1) * batch_size]
        })

    test_model = theano.function(
        [index],
        correct_rate,
        givens={
            x: test_contexts[index * batch_size:(index + 1) * batch_size],
            entities: test_indices[index * batch_size:(index + 1) * batch_size]
        })

    train_contexts = theano.shared(value=np.zeros((3, 2)), borrow=True)
    int_train_contexts = T.cast(train_contexts, 'int32')
    train_indices = theano.shared(value=np.zeros((3, 2)), borrow=True)
    int_train_indices = T.cast(train_indices, 'int32')
    train_model = theano.function(
        [index],
        loss,
        updates=grad_updates,
        givens={
            x:
            int_train_contexts[index * batch_size:(index + 1) * batch_size],
            entities:
            int_train_indices[index * batch_size:(index + 1) * batch_size]
        })

    # fdebug = theano.function(
    #     [index],
    #     similarities_train,
    #     givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
    #             entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    # )
    fdebug0 = theano.function(
        [index],
        entity_reps_train.sum(axis=1),
        givens={
            entities:
            int_train_indices[index * batch_size:(index + 1) * batch_size]
        })
    fdebug1 = theano.function(
        [index],
        similarities_train,
        givens={
            x:
            int_train_contexts[index * batch_size:(index + 1) * batch_size],
            entities:
            int_train_indices[index * batch_size:(index + 1) * batch_size]
        })
    fdebug2 = theano.function(
        [index],
        unit_mc.sum(axis=1),
        givens={
            x: int_train_contexts[index * batch_size:(index + 1) * batch_size]
        })
    # print fdebug(0)

    # val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    # print('init val perf %f' % np.mean(val_perfs))

    epoch = 0
    max_val_perf = 0
    test_perf = 0
    print 'training ...'
    # while epoch < n_epochs:
    epoch += 1

    train_part_cnt = 0

    # f_train = open(train_data_file_name, 'rb')
    # for i in xrange(143):
    #     data_load.skip_training_sample(f_train, 50000)
    #     if i % 40 == 0:
    #         print i
    # print 'skipped'
    #
    # f_train = open(train_data_file_name, 'rb')
    # cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
    #                                                                         training_part_size,
    #                                                                         wid_idx_dict,
    #                                                                         sentence_len,
    #                                                                         sentence_pad_len)
    # f_train.close()

    f_debug = open('debug_data.bin', 'rb')
    cur_train_contexts, cur_train_indices = cPickle.load(f_debug)
    f_debug.close()

    # print cur_train_contexts[9 * batch_size: (9 + 1) * batch_size]
    # print cur_train_indices[8 * batch_size: (8 + 1) * batch_size]

    train_contexts.set_value(cur_train_contexts, borrow=True)
    train_indices.set_value(cur_train_indices, borrow=True)

    # entity_index_vecs = fdebug0(8)
    # for entity_index_vec in entity_index_vecs:
    #     print entity_index_vec

    train_part_cnt += 1
    num_train_batches = len(cur_train_contexts) / batch_size
    # print 'num_train_batches', num_train_batches
    mean_loss = 0
    for minibatch_index in xrange(num_train_batches):
        # if minibatch_index == 8:
        #     continue
        # if 6 < minibatch_index < 10:
        # print minibatch_index
        # print sentence_cnn1_train.hiddenb.get_value()
        # print fdebug0(minibatch_index)
        cur_loss = train_model(minibatch_index)
        # if 6 < minibatch_index < 10:
        # print minibatch_index
        # print sentence_cnn1_train.hiddenb.get_value()
        # print fdebug0(minibatch_index)
        print minibatch_index, cur_loss
        mean_loss += cur_loss
        # if 11 > minibatch_index > 8:
        #     print minibatch_index, cur_loss
        # print fdebug(minibatch_index)
        # print minibatch_index, cur_loss
    print 'loss:', mean_loss / num_train_batches
    # print fdebug(0)

    val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    val_perf = np.mean(val_perfs)
    print('epoch %i, training part %i, val perf %f(%f), test perf %f' %
          (epoch, train_part_cnt, val_perf, max_val_perf, test_perf))

    if val_perf > max_val_perf:
        max_val_perf = val_perf
        test_perfs = [test_model(i) for i in xrange(num_test_batches)]
        test_perf = np.mean(test_perfs)
        print('\tepoch %i, training part %i, test_perf %f' %
              (epoch, train_part_cnt, test_perf))
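One idiom shared by all of these variants is worth spelling out: train_contexts is created as a tiny float placeholder, cast to int32, and sliced inside givens, so each training part can later be swapped in with set_value without recompiling the Theano function. A stripped-down sketch of that pattern, where the summed output is only a stand-in for the real loss graph:

import numpy as np
import theano
import theano.tensor as T

batch_size = 50
# placeholder shared variable; real data arrives later via set_value(...)
train_data = theano.shared(np.zeros((3, 2)), borrow=True)
int_train_data = T.cast(train_data, 'int32')

x = T.imatrix('x')
index = T.lscalar()
output = x.sum()  # stand-in for the loss / correct_rate graph
f = theano.function(
    [index], output,
    givens={x: int_train_data[index * batch_size:(index + 1) * batch_size]})

# per training part:
# train_data.set_value(new_contexts_array, borrow=True)
# for i in range(len(new_contexts_array) // batch_size): f(i)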
Example No. 4
def train_cnn_for_el(
        train_data_file_name,
        val_data_file_name,
        num_val_candidates,
        test_data_file_name,
        num_test_candidates,
        sentence_len,
        word_vec_len,
        all_words,  # first row of all_words should be a non-existing word
        wid_idx_dict,
        entity_vecs,
        entity_side_cnn=False,
        gold_as_first_candidate=False,
        skip_width_loading=40,  # skip width while loading samples
        n_epochs=25,
        batch_size=50,
        filter_hs=def_filter_hs,
        num_feature_maps=100,
        lr_decay=0.9,
        sqr_norm_lim=9,
        hidden_out_len=50,
        training_part_size=50000,
        num_train_candidates=2):
    full_sentence_len = sentence_len + 2 * sentence_pad_len
    rng = np.random.RandomState(3435)

    print 'making entity_vecs...', len(entity_vecs)
    if entity_side_cnn:
        shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs,
                                                            dtype="int32"),
                                           name='entity_vecs',
                                           borrow=True)
    else:
        shared_entity_vecs = theano.shared(value=np.asarray(
            entity_vecs, dtype=theano.config.floatX),
                                           name='entity_vecs',
                                           borrow=True)

    print 'making shared_words...', len(all_words)
    shared_words = theano.shared(value=np.asarray(all_words,
                                                  dtype=theano.config.floatX),
                                 name='shared_words',
                                 borrow=True)
    print 'done'

    val_contexts, val_indices = data_load.load_samples_full(
        val_data_file_name,
        wid_idx_dict,
        sentence_len,
        sentence_pad_len,
        skip_width=skip_width_loading,
        num_candidates=num_val_candidates)
    num_val_batches = len(val_contexts) / batch_size
    print num_val_batches, 'validation batches'
    print len(val_indices[0]), 'candidates per mention'
    val_contexts = T.cast(to_theano_shared(val_contexts), 'int32')
    val_indices = T.cast(to_theano_shared(val_indices), 'int32')

    test_contexts, test_indices = data_load.load_samples_full(
        test_data_file_name,
        wid_idx_dict,
        sentence_len,
        sentence_pad_len,
        skip_width=skip_width_loading,
        num_candidates=num_test_candidates)
    num_test_batches = len(test_contexts) / batch_size
    print num_test_batches, 'test batches'
    print len(test_indices[0]), 'candidates per mention'
    test_contexts = T.cast(to_theano_shared(test_contexts), 'int32')
    test_indices = T.cast(to_theano_shared(test_indices), 'int32')

    if gold_as_first_candidate:
        gold_labels = theano.shared(value=np.zeros(batch_size, dtype='int32'),
                                    borrow=True)
    else:
        gold_labels = theano.shared(value=np.ones(batch_size, dtype='int32'),
                                    borrow=True)

    x = T.imatrix('x')
    entities = T.imatrix('entities')

    sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len,
                                word_vec_len, filter_hs, num_feature_maps,
                                batch_size, hidden_out_len, rng)
    mc = sentence_cnn0.output  # mention contexts
    unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(
        0, 'x')

    batch_entity_vecs = shared_entity_vecs[entities]

    if entity_side_cnn:
        loss, correct_rate, entity_side_params = get_training_variables_with_entity_side_cnn(
            batch_entity_vecs, shared_words, word_vec_len, batch_size,
            hidden_out_len, unit_mc, num_train_candidates, num_val_candidates,
            gold_labels, rng)
    else:
        loss, correct_rate, entity_side_params = get_training_variables_no_entity_side_cnn(
            batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, unit_mc,
            gold_labels, rng)
    # params = matcher0.params + entity_side_params
    # for conv_layer in conv_layers:
    #     params += conv_layer.params

    # params = sentence_cnn0.params + matcher1.params
    # params = sentence_cnn0.params + sentence_cnn1_train.params

    params = sentence_cnn0.params + entity_side_params
    grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6,
                                        sqr_norm_lim)

    index = T.lscalar()

    val_model = theano.function(
        [index],
        correct_rate,
        givens={
            x: val_contexts[index * batch_size:(index + 1) * batch_size],
            entities: val_indices[index * batch_size:(index + 1) * batch_size]
        })

    test_model = theano.function(
        [index],
        correct_rate,
        givens={
            x: test_contexts[index * batch_size:(index + 1) * batch_size],
            entities: test_indices[index * batch_size:(index + 1) * batch_size]
        })

    train_contexts = theano.shared(value=np.zeros((3, 2)), borrow=True)
    int_train_contexts = T.cast(train_contexts, 'int32')
    train_indices = theano.shared(value=np.zeros((3, 2)), borrow=True)
    int_train_indices = T.cast(train_indices, 'int32')
    train_model = theano.function(
        [index],
        loss,
        updates=grad_updates,
        givens={
            x:
            int_train_contexts[index * batch_size:(index + 1) * batch_size],
            entities:
            int_train_indices[index * batch_size:(index + 1) * batch_size]
        })

    fdebug = theano.function(
        [index],
        batch_entity_vecs,
        givens={
            entities:
            int_train_indices[index * batch_size:(index + 1) * batch_size]
        })
    # print fdebug(0)

    val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    print('init val perf %f' % np.mean(val_perfs))

    epoch = 0
    max_val_perf = 0
    test_perf = 0
    print 'training ...'
    while epoch < n_epochs:
        f_train = open(train_data_file_name, 'rb')
        epoch += 1

        train_part_cnt = 0

        cur_train_contexts, cur_train_indices = data_load.load_training_samples(
            f_train, training_part_size, wid_idx_dict, sentence_len,
            sentence_pad_len)

        while len(cur_train_contexts) != 0 and train_part_cnt < 100:
            train_contexts.set_value(cur_train_contexts, borrow=True)
            train_indices.set_value(cur_train_indices, borrow=True)

            train_part_cnt += 1
            num_train_batches = len(cur_train_contexts) / batch_size
            # print 'num_train_batches', num_train_batches
            mean_loss = 0
            for minibatch_index in xrange(num_train_batches):
                cur_loss = train_model(minibatch_index)
                mean_loss += cur_loss
                # print minibatch_index, cur_loss
            print 'loss:', mean_loss / num_train_batches
            # print fdebug(0)

            val_perfs = [val_model(i) for i in xrange(num_val_batches)]
            val_perf = np.mean(val_perfs)
            print('epoch %i, training part %i, val perf %f(%f), test perf %f' %
                  (epoch, train_part_cnt, val_perf, max_val_perf, test_perf))

            if val_perf > max_val_perf:
                max_val_perf = val_perf
                test_perfs = [test_model(i) for i in xrange(num_test_batches)]
                test_perf = np.mean(test_perfs)
                print('\tepoch %i, training part %i, test_perf %f' %
                      (epoch, train_part_cnt, test_perf))

            cur_train_contexts, cur_train_indices = data_load.load_training_samples(
                f_train, training_part_size, wid_idx_dict, sentence_len,
                sentence_pad_len)
        f_train.close()
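sgd_updates_adadelta itself is not shown in these snippets. Assuming it follows the standard Adadelta rule (Zeiler, 2012), with lr_decay playing the role of the decay constant rho, a single parameter update looks roughly like this NumPy sketch:

import numpy as np

def adadelta_step(param, grad, acc_grad, acc_delta, rho=0.9, eps=1e-6):
    """One Adadelta update; acc_grad and acc_delta are running averages of
    squared gradients and squared steps, kept per parameter across calls."""
    acc_grad = rho * acc_grad + (1 - rho) * grad ** 2
    step = -np.sqrt(acc_delta + eps) / np.sqrt(acc_grad + eps) * grad
    acc_delta = rho * acc_delta + (1 - rho) * step ** 2
    return param + step, acc_grad, acc_delta

If the implementation follows the common Theano recipe of the same name, sqr_norm_lim additionally caps the column L2 norms of 2-D weight matrices after each step at sqrt(sqr_norm_lim), which would explain the default of 9 (a norm limit of 3).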
Example No. 5
def train_cnn_for_el(train_data_file_name,
                     val_data_file_name,
                     num_val_candidates,
                     test_data_file_name,
                     num_test_candidates,
                     img_h, img_w,
                     all_words,  # first row of all_words should be a non-existing word
                     wid_idx_dict,
                     entity_vecs,
                     gold_as_first_candidate=False,
                     skip_width_loading=40,  # skip width while loading samples
                     n_epochs=25,
                     batch_size=50,
                     filter_hs=def_filter_hs,
                     num_feature_maps=100,
                     conv_non_linear="relu",
                     lr_decay=0.9,
                     sqr_norm_lim=9,
                     hidden_out_len=50,):
    rng = np.random.RandomState(3435)

    x = T.imatrix('x')
    # es = T.imatrix('es')
    # es_test = T.imatrix('es_test')
    entities = T.imatrix('entities')

    print 'making entity_vecs...', len(entity_vecs)
    shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX),
                                       name='entity_vecs', borrow=True)
    # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32),
    #                                    name='entity_vecs', borrow=True)
    print 'making shared_words...', len(all_words)
    shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX),
                                 name='shared_words', borrow=True)
    print 'done'

    # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading)
    # num_test_batches = test_indices.shape[0] / batch_size
    # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name,
    #                                                                 wid_idx_dict, skip_width_loading)
    val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len,
                                                            sentence_pad_len,
                                                            skip_width=skip_width_loading,
                                                            num_candidates=num_val_candidates)
    num_val_batches = len(val_contexts) / batch_size
    print num_val_batches, 'validation batches'
    print len(val_indices[0]), 'candidates per mention'

    if gold_as_first_candidate:
        gold_labels = theano.shared(value=np.zeros(batch_size,
                                                   dtype='int32'),
                                    borrow=True)
    else:
        gold_labels = theano.shared(value=np.ones(batch_size,
                                                  dtype='int32'),
                                    borrow=True)

    val_contexts = T.cast(to_theano_shared(val_contexts), 'int32')
    val_indices = T.cast(to_theano_shared(val_indices), 'int32')

    filter_shapes = []
    pool_sizes = []
    filter_w = img_w
    for filter_h in filter_hs:
        filter_shapes.append((num_feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    layer0_input = shared_words[x.flatten()].reshape((x.shape[0], 1, x.shape[1], shared_words.shape[1]))
    conv_layers = []
    layer1_inputs = []
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w),
                                        filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)

    layer1_input = T.concatenate(layer1_inputs, 1)
    matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs),
                           hidden_out_len, relu)
    mc = matcher0.output  # mention contexts

    unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x')

    batch_entity_vecs = shared_entity_vecs[entities]
    matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu)
    entity_reps = matcher1.output
    # entity_reps = batch_entity_vecs

    unit_entity_reps = entity_reps / T.sqrt(T.maximum(T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x')

    similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2)
    correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1)))

    loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum()

    # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2)  # / mc_norm

    params = matcher0.params + matcher1.params
    # params = matcher0.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim)

    index = T.lscalar()

    # test_model = theano.function(
    #     [index],
    #     error_rate,
    #     givens={x: test_contexts[index * batch_size: (index + 1) * batch_size],
    #             es: test_indices[index * batch_size: (index + 1) * batch_size]}
    # )

    val_model = theano.function(
        [index],
        correct_rate,
        givens={x: val_contexts[index * batch_size: (index + 1) * batch_size],
                entities: val_indices[index * batch_size: (index + 1) * batch_size]}
    )

    train_contexts = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_contexts = T.cast(train_contexts, 'int32')
    train_indices = theano.shared(
        value=np.zeros((3, 2)),
        borrow=True)
    int_train_indices = T.cast(train_indices, 'int32')
    train_model = theano.function(
        [index],
        loss,
        updates=grad_updates,
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
                entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )

    fdebug = theano.function(
        [index],
        similarities,
        givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size],
                entities: int_train_indices[index * batch_size: (index + 1) * batch_size]}
    )
    # print fdebug(0)

    val_perfs = [val_model(i) for i in xrange(num_val_batches)]
    print('init val perf %f' % np.mean(val_perfs))

    print 'training ...'
    f_train = open(train_data_file_name, 'rb')
    epoch = 0
    while epoch < n_epochs:
        epoch += 1

        train_part_cnt = 0
        # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part(
        #     f_train, wid_idx_dict, 50000)
        cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
                                                                                training_part_size,
                                                                                wid_idx_dict,
                                                                                sentence_len,
                                                                                sentence_pad_len)
        while len(cur_train_contexts) != 0:
            train_contexts.set_value(cur_train_contexts, borrow=True)
            train_indices.set_value(cur_train_indices, borrow=True)
            # print fdebug(0)

            train_part_cnt += 1
            num_train_batches = len(cur_train_contexts) / batch_size
            # print 'num_train_batches', num_train_batches
            mean_loss = 0
            for minibatch_index in xrange(num_train_batches):
                cur_loss = train_model(minibatch_index)
                mean_loss += cur_loss
                # if (minibatch_index + 1) % (num_train_batches / 3) == 0:  # show some progress
                #     print minibatch_index, num_train_batches
            print 'loss:', mean_loss / num_train_batches
            # print fdebug(0)

            val_perfs = [val_model(i) for i in xrange(num_val_batches)]
            val_perf = np.mean(val_perfs)
            print('epoch %i, training part %i, val perf %f'
                  % (epoch, train_part_cnt, val_perf))
            cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
                                                                                    training_part_size,
                                                                                    wid_idx_dict,
                                                                                    sentence_len,
                                                                                    sentence_pad_len)
            # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part(
            #     f_train, wid_idx_dict, 50000)

    f_train.close()
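Example No. 5 builds the sentence encoder inline rather than through SentenceCNN: for each filter height filter_h, num_feature_maps filters of shape (filter_h, img_w) are convolved over the (img_h, img_w) word-embedding image and max-pooled over the remaining img_h - filter_h + 1 positions, so every filter contributes exactly one feature; the pooled outputs for all filter heights are concatenated and fed to the hidden matcher layer. A small NumPy sketch of that max-over-time pooling, with random stand-ins for the embeddings and filter banks:

import numpy as np

def max_over_time_features(embeddings, filters_by_height):
    """embeddings: (img_h, img_w) word-vector matrix for one sentence.
    filters_by_height: dict mapping filter height h to a
    (num_feature_maps, h, img_w) filter bank.
    Returns the concatenated max-pooled feature vector."""
    pooled = []
    for h, filters in sorted(filters_by_height.items()):
        num_maps = filters.shape[0]
        # valid cross-correlation over sentence positions, full width
        windows = np.stack([embeddings[i:i + h].ravel()
                            for i in range(embeddings.shape[0] - h + 1)])
        responses = windows.dot(filters.reshape(num_maps, -1).T)  # (positions, num_maps)
        pooled.append(responses.max(axis=0))  # max over time: one value per map
    return np.concatenate(pooled)

emb = np.random.randn(40, 100)                   # img_h=40 words, img_w=100 dims
banks = {h: np.random.randn(100, h, 100) for h in (3, 4, 5)}
print(max_over_time_features(emb, banks).shape)  # (300,) = 100 maps * 3 heights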