# Exemplo n.º 1 (Example 1, score: 0)
def run_training():
    """Train the Neural Tensor Network on the configured dataset.

    Loads the (e1, R, e2) training triples, the entity/relation
    vocabularies and the pretrained word embeddings from
    ``params.data_path``, builds one (batch, label) placeholder pair per
    relation, then runs the training loop, checkpointing every
    ``params.save_per_iter`` iterations.
    """
    print("Begin!")
    # Raw (e1, R, e2) string triples for the entire training set.
    print("Load training data...")
    raw_triples = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities = ntn_input.load_entities(params.data_path)
    relations = ntn_input.load_relations(params.data_path)
    # Same triples with entities/relations replaced by vocabulary indices.
    indexed_triples = data_to_indexed(raw_triples, entities, relations)
    print("Load embeddings...")
    init_word_embeds, entity_to_wordvec = ntn_input.load_init_embeds(params.data_path)
    # init_word_embeds: (67447, 100); entity_to_wordvec: 38696 entries,
    # each a list of word indices, e.g. [[45792], [19154, 50004], ...]

    n_entities = len(entities)
    n_relations = len(relations)

    n_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    with tf.Graph().as_default():
        print(f"Starting to build graph {datetime.datetime.now()}")
        # One placeholder pair per relation: int32 index triples and their
        # float32 labels.
        batch_placeholders = []
        label_placeholders = []
        for r in range(n_relations):
            batch_placeholders.append(
                tf.placeholder(tf.int32, shape=(None, 3), name=f'batch_{r}'))
            label_placeholders.append(
                tf.placeholder(tf.float32, shape=(None, 1), name=f'label_{r}'))

        # Which of e1 or e2 to corrupt?
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))
        inference = ntn.inference(
            batch_placeholders, corrupt_placeholder, init_word_embeds,
            entity_to_wordvec, n_entities, n_relations, slice_size,
            batch_size, False, label_placeholders)
        # inference shape: [2, r*batch_predictions]

        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)

        # Create a session for running Ops on the Graph.
        print('1    Here------->>>>>>>> run init <<<<<<<<------------')
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        print('2    Here------->>>>>>>> run init <<<<<<<<------------')
        sess.run(init)
        print('run init')
        saver = tf.train.Saver(tf.trainable_variables())

        for step in range(1, n_iters):
            print(f"Starting iter {step} {datetime.datetime.now()}")
            # data batch: [batch*10, 4]; relation batches: [n_relations, batch*10, 3]
            data_batch = get_batch(batch_size, indexed_triples, n_entities, corrupt_size)
            relation_batches = split_batch(data_batch, n_relations)

            # Periodically checkpoint the trainable variables.
            if step % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(step) + '.sess')

            feed_dict = fill_feed_dict(relation_batches, params.train_both,
                                       batch_placeholders, label_placeholders,
                                       corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
            print(f'loss_{step}: ', loss_value)
# Exemplo n.º 2 (Example 2, score: 0)
def run_training():
    """Train the Neural Tensor Network on the configured dataset.

    Loads the (e1, R, e2) training triples, the entity/relation
    vocabularies and the pretrained word embeddings from
    ``params.data_path``, builds one (batch, label) placeholder pair per
    relation, then runs ``params.num_iter`` training iterations, saving a
    checkpoint every ``params.save_per_iter`` iterations.
    """
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        # One placeholder pair per relation: int32 index triples and their
        # float32 labels.
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
                              for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
                              for i in range(num_relations)]

        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # Which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                  entity_to_wordvec, num_entities, num_relations, slice_size,
                                  batch_size, False, label_placeholders)
        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)

        # Create a session for running Ops on the Graph.
        # FIX: these statements were tab-indented while the rest of the file
        # uses spaces, which raises TabError under Python 3; re-indented with
        # spaces inside the graph context.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        # FIX: tf.initialize_all_variables() was deprecated and removed in
        # TF 1.x; use the replacement already used elsewhere in this file.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter " + str(i) + " " + str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size)
            relation_batches = split_batch(data_batch, num_relations)

            # Periodically checkpoint the trainable variables.
            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(i) + '.sess')

            feed_dict = fill_feed_dict(relation_batches, params.train_both,
                                       batch_placeholders, label_placeholders,
                                       corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
# Exemplo n.º 3 (Example 3, score: 0)
def run_training():
    """Train the Neural Tensor Network on the configured dataset.

    Loads the (e1, R, e2) training triples, the entity/relation
    vocabularies and the pretrained word embeddings from
    ``params.data_path``, builds one (batch, label) placeholder pair per
    relation, then runs ``params.num_iter`` training iterations, saving a
    checkpoint every ``params.save_per_iter`` iterations.
    """
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        # One placeholder pair per relation: int32 index triples and their
        # float32 labels.
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
                              for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
                              for i in range(num_relations)]

        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1))  # Which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds,
                                  entity_to_wordvec, num_entities, num_relations, slice_size,
                                  batch_size, False, label_placeholders)
        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)

        # Create a session for running Ops on the Graph.
        # FIX: these statements were tab-indented while the rest of the file
        # uses spaces, which raises TabError under Python 3; re-indented with
        # spaces inside the graph context.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        # FIX: tf.initialize_all_variables() was deprecated and removed in
        # TF 1.x; use the replacement already used elsewhere in this file.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter " + str(i) + " " + str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size)
            relation_batches = split_batch(data_batch, num_relations)

            # Periodically checkpoint the trainable variables.
            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path + "/" + params.data_name + str(i) + '.sess')

            feed_dict = fill_feed_dict(relation_batches, params.train_both,
                                       batch_placeholders, label_placeholders,
                                       corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
# Exemplo n.º 4 (Example 4, score: 0)
def run_training():
    """Train, validate and test the Neural Tensor Network.

    Loads the train/dev/test triples, the entity/relation vocabularies and
    the pretrained word embeddings, builds one (batch, label) placeholder
    pair per relation, trains for ``params.epoches`` epochs (validating on
    the dev set after each epoch), and finally evaluates on the test set.
    """
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    # shape of raw training data: (112581, 3)
    raw_training_data = ntn_input.load_training_data(params.data_path)
    raw_dev_data = ntn_input.load_dev_data(params.data_path)
    raw_test_data = ntn_input.load_test_data(params.data_path)

    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    num_entities = len(entities_list)  # entity: 38696
    num_relations = len(relations_list)  # relations: 11
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list,
                                            relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list,
                                       relations_list)
    indexed_test_data = data_to_indexed(raw_test_data, entities_list,
                                        relations_list)

    print("Load embeddings...")
    # shape of word embeds: 67447, 100; number of entities: 38696
    (init_word_embeds,
     entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    num_epoches = params.epoches
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    n_iterations_per_epoch = len(indexed_training_data) // batch_size
    n_iterations_validation = len(indexed_dev_data) // batch_size
    n_iterations_evaluation = len(indexed_test_data) // batch_size
    print("# of iteration/epoch", n_iterations_per_epoch)
    print("# of iteration/validation", n_iterations_validation)
    print("# of iteration/evaluation", n_iterations_evaluation)

    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        batch_placeholders = [
            tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
            for i in range(num_relations)
        ]
        label_placeholders = [
            tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
            for i in range(num_relations)
        ]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=1)
        train_inference = ntn.inference(batch_placeholders,
                                        corrupt_placeholder, init_word_embeds,
                                        entity_to_wordvec, num_entities,
                                        num_relations, slice_size, batch_size,
                                        False, label_placeholders)
        test_inference = ntn.inference(batch_placeholders, corrupt_placeholder,
                                       init_word_embeds, entity_to_wordvec,
                                       num_entities, num_relations, slice_size,
                                       batch_size, True, label_placeholders)
        train_loss = ntn.loss(train_inference, params.regularization)
        training = ntn.training(train_loss, params.learning_rate)
        # FIX: build the evaluation ops ONCE here.  The original called
        # ntn.eval(...) inside sess.run on every iteration, which adds new
        # ops to the TF1 graph each step and grows memory without bound.
        train_eval = ntn.eval(train_inference)
        test_eval = ntn.eval(test_inference)

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())

        # training
        for i in range(1, num_epoches):
            print("Starting iter " + str(i) + " " +
                  str(datetime.datetime.now()))
            for j in range(1, n_iterations_per_epoch + 1):
                data_train_batch = get_train_batch(batch_size,
                                                   indexed_training_data,
                                                   num_entities, corrupt_size)
                relation_train_batches = split_train_batch(
                    data_train_batch, num_relations)
                feed_dict_training = fill_feed_dict(relation_train_batches,
                                                    params.train_both,
                                                    batch_placeholders,
                                                    label_placeholders,
                                                    corrupt_placeholder)
                # NOTE(review): the message says "loss" but prints the eval
                # value; train_loss_value holds the actual loss.
                _, train_loss_value, train_eval_value = sess.run(
                    [training, train_loss, train_eval],
                    feed_dict=feed_dict_training)
                print("Iter {}, batch {}, Training data loss = {}".format(
                    i, j, train_eval_value))
            # Periodically checkpoint the trainable variables.
            if i % params.save_per_iter == 0:
                saver.save(
                    sess, params.output_path + "/" + params.data_name +
                    str(i) + '.sess')
                print("Model saved at iter {}".format(i))

            # At the end of each epoch, test the dev data
            for j in range(1, n_iterations_validation + 1):
                data_dev_batch = get_test_batch(batch_size, indexed_dev_data)
                relation_dev_batches = split_test_batch(
                    data_dev_batch, num_relations)
                feed_dict_dev = fill_feed_dict(relation_dev_batches,
                                               params.train_both,
                                               batch_placeholders,
                                               label_placeholders,
                                               corrupt_placeholder)
                dev_eval_value = sess.run(test_eval, feed_dict=feed_dict_dev)
                print("Iter {}, batch {}, Dev data loss = {}".format(
                    i, j, dev_eval_value))

        # testing
        # FIX: off-by-one — the original range(1, n_iterations_evaluation)
        # skipped the last test batch, inconsistent with the "+ 1" used in
        # the training and validation loops above.
        for j in range(1, n_iterations_evaluation + 1):
            data_test_batch = get_test_batch(batch_size, indexed_test_data)
            relation_test_batches = split_test_batch(data_test_batch,
                                                     num_relations)
            feed_dict_testing = fill_feed_dict(relation_test_batches,
                                               params.train_both,
                                               batch_placeholders,
                                               label_placeholders,
                                               corrupt_placeholder)
            test_eval_value = sess.run(test_eval, feed_dict=feed_dict_testing)
            print("Final Test Accuracy = {}".format(test_eval_value))
# Exemplo n.º 5 (Example 5, score: 0)
def prepare_data(corrupt_samples):
    """Build per-relation entity-embedding batches for training and dev.

    For every relation, collects the word embeddings of the (e1, e2) pairs
    that appear in the training and dev sets, pads them to equal length via
    ``fill_entity``, then appends ``corrupt_samples`` negative samples per
    positive (the same e1 paired with a random embedding).

    Args:
        corrupt_samples: number of corrupted (negative) samples to generate
            per positive example.

    Returns:
        Tuple ``(e1, e2, labels_train, t1, t2, labels_dev, num_relations)``
        where e1/e2 (train) and t1/t2 (dev) map relation index -> list of
        embeddings, and the label arrays have one row of ``num_relations``
        repeated labels per example.
    """
    raw_training_data = ntn_input.load_training_data(ntn_input.data_path)
    raw_dev_data = ntn_input.load_dev_data(ntn_input.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(ntn_input.data_path)
    relations_list = ntn_input.load_relations(ntn_input.data_path)
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list,
                                            relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list,
                                       relations_list)
    print("Load embeddings...")
    (init_word_embeds,
     entity_to_wordvec) = ntn_input.load_init_embeds(ntn_input.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    # FIX: pre-initialize one list per relation.  The original relied on a
    # bare ``except`` that created the empty lists but silently dropped the
    # first triple seen for each relation (and would KeyError later for any
    # relation absent from the data).
    e1 = {r: [] for r in range(num_relations)}
    e2 = {r: [] for r in range(num_relations)}
    t1 = {r: [] for r in range(num_relations)}
    t2 = {r: [] for r in range(num_relations)}

    for triple in indexed_training_data:
        e1[triple[1]].append(init_word_embeds[triple[0]])
        e2[triple[1]].append(init_word_embeds[triple[2]])

    max_len_e1 = max(len(v) for v in e1.values())
    labels_train = [1] * max_len_e1
    e1, e2 = fill_entity(e1, e2, max_len_e1)

    # Negative sampling: pair each training e1 with a random embedding.
    for i in range(max_len_e1):
        for _ in range(corrupt_samples):
            # FIX: use num_relations instead of the magic constant 11.
            for k in range(num_relations):
                e1[k].append(init_word_embeds[indexed_training_data[i][0]])
                e2[k].append(init_word_embeds[random.randrange(
                    0, len(init_word_embeds))])
        labels_train.append(0)

    for triple in indexed_dev_data:
        t1[triple[1]].append(init_word_embeds[triple[0]])
        t2[triple[1]].append(init_word_embeds[triple[2]])

    max_len_t1 = max(len(v) for v in t1.values())
    labels_dev = [1] * max_len_t1

    t1, t2 = fill_entity(t1, t2, max_len_t1)

    for i in range(max_len_t1):
        for _ in range(corrupt_samples):
            for k in range(num_relations):
                t1[k].append(init_word_embeds[indexed_dev_data[i][0]])
                t2[k].append(init_word_embeds[random.randrange(
                    0, len(init_word_embeds))])
        labels_dev.append(0)

    labels_train, labels_dev = np.array(labels_train), np.array(labels_dev)
    new_lab_train, new_lab_dev = [], []

    # Repeat each label once per relation.
    for lab in labels_train:
        new_lab_train.append([lab] * num_relations)

    # FIX: the original iterated labels_train here, so the dev labels were
    # a copy of the training labels (wrong values AND wrong length).
    for lab in labels_dev:
        new_lab_dev.append([lab] * num_relations)

    return e1, e2, np.array(new_lab_train), t1, t2, np.array(
        new_lab_dev), num_relations