Example #1
def run_evaluation():
    print(params.output_path)
    print(tf.train.latest_checkpoint(params.output_path, 'checkpoint'))
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    test_data = ntn_input.load_test_data(params.data_path)
    test_data = data_to_indexed(test_data, entities_list, relations_list)
    batch_size = len(test_data)
    
    num_entities = len(entities_list)
    num_relations = len(relations_list)

    slice_size = params.slice_size
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    batches, labels = data_to_relation_sets(test_data, num_relations)

    
    with tf.Graph().as_default():
        sess = tf.Session()
        batch_placeholders = [tf.placeholder(tf.float32, shape=(None, 3)) for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1)) for i in range(num_relations)]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1)) 
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds, entity_to_wordvec, \
                num_entities, num_relations, slice_size, batch_size, True, label_placeholders) 
        eval_correct = ntn.eval(inference)
        saver = tf.train.Saver()

        saver.restore(sess, params.output_path+'around100/Wordnet70.sess')
        #init = tf.initialize_all_variables()
        #sess.run(init)
        print(do_eval(sess, eval_correct, batch_placeholders, label_placeholders, corrupt_placeholder, batches, labels, batch_size))
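These snippets call a few helpers that are not shown. As a rough orientation, here is a minimal sketch of data_to_indexed, assuming entities_list and relations_list are plain Python lists of strings and each raw triple is an (e1, R, e2) string tuple (the helper in the original repository may differ in details):

def data_to_indexed(data, entities, relations):
    # map each (e1, R, e2) string triple to (entity_index, relation_index, entity_index)
    entity_to_index = {entities[i]: i for i in range(len(entities))}
    relation_to_index = {relations[i]: i for i in range(len(relations))}
    indexed_data = [(entity_to_index[e1], relation_to_index[r], entity_to_index[e2])
                    for (e1, r, e2) in data]
    return indexed_data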
Example #2
def getPredictions():
    best_thresholds = getThresholds()
    test_data = ntn_input.load_test_data(params.data_path)
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    slice_size = params.slice_size
    batch_size = params.batch_size
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    batch_placeholder = tf.placeholder(tf.float32, shape=(4, batch_size))
    corrupt_placeholder = tf.placeholder(tf.bool, shape=(1)) #Which of e1 or e2 to corrupt?
    predictions_list = ntn.inference(batch_placeholder, corrupt_placeholder, init_word_embeds, entity_to_wordvec, num_entities, num_relations, slice_size, batch_size)

    # The element-wise comparisons below assume predictions_list has been
    # materialized as a numpy array (e.g. via sess.run); tf tensors do not
    # support item assignment, so predictions is built with numpy.
    predictions = np.zeros((test_data.shape[0], 1))
    for i in range(test_data.shape[0]):
        # get relation
        rel = test_data[i, 1]

        # get labels based on predictions
        if predictions_list[i, 0] <= best_thresholds[rel, 0]:
            predictions[i, 0] = 1
        else:
            predictions[i, 0] = -1

    return predictions
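For reference, the per-relation thresholding applied above can be written compactly once the scores are numpy arrays. A minimal sketch with hypothetical array inputs (test_scores, test_relations, best_thresholds are assumed here, not part of the original code):

import numpy as np

def classify_with_thresholds(test_scores, test_relations, best_thresholds):
    # test_scores: (N,) model scores, test_relations: (N,) relation index per triple,
    # best_thresholds: (num_relations,) per-relation cut-offs.
    # A triple is predicted true (+1) when its score is <= its relation's threshold.
    return np.where(test_scores <= best_thresholds[test_relations], 1, -1)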
Example #3
def run_training():
    print("Begin!")
    #python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    #python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)
    # init_word_embeds size (67447,100), entity_to_wordvec size (38696)
    # entity_to_wordvec -> [[45792], [50003], [19154, 50004], [11403], [7456, 6932], [47896, 50004], [24589], [50005, 50006, 50004], [6551], [12288]]

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    with tf.Graph().as_default():
        print("Starting to build graph "+str(datetime.datetime.now()))
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_'+str(i)) for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_'+str(i)) for i in range(num_relations)]

        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1)) #Which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds, entity_to_wordvec, \
                num_entities, num_relations, slice_size, batch_size, False, label_placeholders)
        # [2,r*batch_predictions]

        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)

        # Create a session for running Ops on the Graph.
        print('1    Here------->>>>>>>> run init <<<<<<<<------------')
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        print('2    Here------->>>>>>>> run init <<<<<<<<------------')
        sess.run(init)
        print('run init')
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter "+str(i)+" "+str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size) # [batch*10,4]
            relation_batches = split_batch(data_batch, num_relations) # [num_relations,batch*10,3]

            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path+"/"+params.data_name+str(i)+'.sess')

            feed_dict = fill_feed_dict(relation_batches, params.train_both, batch_placeholders, label_placeholders, corrupt_placeholder)

            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
            print('loss_'+str(i)+': ',loss_value)
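get_batch and split_batch are not shown; judging from the shape comments ([batch*10,4] and [num_relations,batch*10,3]), they sample training triples, attach corrupt_size random corrupted entities, and regroup the result by relation. A minimal sketch under those assumptions:

import random

def get_batch(batch_size, data, num_entities, corrupt_size):
    # sample batch_size true triples and pair each with corrupt_size random entities,
    # yielding 4-tuples (e1, r, e2, corrupt_entity): shape [batch_size * corrupt_size, 4]
    indices = random.sample(range(len(data)), batch_size)
    return [(data[i][0], data[i][1], data[i][2], random.randint(0, num_entities - 1))
            for i in indices for _ in range(corrupt_size)]

def split_batch(data_batch, num_relations):
    # regroup the 4-tuples by relation as (e1, e2, corrupt_entity) triples:
    # shape [num_relations, batch_size * corrupt_size, 3]
    batches = [[] for _ in range(num_relations)]
    for e1, r, e2, e3 in data_batch:
        batches[r].append((e1, e2, e3))
    return batches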
Example #4
def getThresholds():
    dev_data = ntn_input.load_dev_data(params.data_path)
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    slice_size = params.slice_size
    batch_size = params.batch_size
    (init_word_embeds,
     entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    batch_placeholder = tf.placeholder(tf.float32, shape=(4, batch_size))
    corrupt_placeholder = tf.placeholder(
        tf.bool, shape=(1))  # Which of e1 or e2 to corrupt?
    predictions_list = ntn.inference(batch_placeholder, corrupt_placeholder,
                                     init_word_embeds, entity_to_wordvec,
                                     num_entities, num_relations, slice_size,
                                     batch_size)

    # The grid search below works element-wise, so it assumes predictions_list,
    # dev_data and dev_labels have been materialized as numpy arrays (e.g. via
    # sess.run); dev_labels holds the +1/-1 gold labels for dev_data.
    min_score = np.min(predictions_list)
    max_score = np.max(predictions_list)

    # initialize thresholds and accuracies
    best_thresholds = np.zeros([num_relations, 1])
    best_accuracies = np.zeros([num_relations, 1])

    for i in range(num_relations):
        best_thresholds[i, :] = min_score
        best_accuracies[i, :] = -1

    score = min_score
    increment = 0.01

    while score <= max_score:
        # iterate through relations to find the best per-relation threshold
        for i in range(num_relations):
            current_relation_list = (dev_data[:, 1] == i)
            predictions = (predictions_list[current_relation_list, 0] <=
                           score) * 2 - 1
            accuracy = np.mean(
                predictions == dev_labels[current_relation_list, 0])

            # update threshold and accuracy
            if accuracy > best_accuracies[i, 0]:
                best_accuracies[i, 0] = accuracy
                best_thresholds[i, 0] = score

        score += increment

    # store threshold values
    return best_thresholds
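Because the grid search is element-wise, it is often easier to run it on scores that have already been pulled out of the graph. A self-contained numpy sketch of the same search (the array names here are hypothetical):

import numpy as np

def search_thresholds(dev_scores, dev_relations, dev_labels, num_relations, increment=0.01):
    # dev_scores: (N,) model scores, dev_relations: (N,) relation ids,
    # dev_labels: (N,) gold labels in {+1, -1}
    best_thresholds = np.full(num_relations, dev_scores.min())
    best_accuracies = np.full(num_relations, -1.0)
    score = dev_scores.min()
    while score <= dev_scores.max():
        predictions = np.where(dev_scores <= score, 1, -1)
        for r in range(num_relations):
            mask = dev_relations == r
            if not mask.any():
                continue
            accuracy = np.mean(predictions[mask] == dev_labels[mask])
            if accuracy > best_accuracies[r]:
                best_accuracies[r] = accuracy
                best_thresholds[r] = score
        score += increment
    return best_thresholds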
Example #5
def run_training():
    print("Begin!")
    #python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    raw_training_data = ntn_input.load_training_data(params.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    #python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list, relations_list)
    print("Load embeddings...")
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    num_iters = params.num_iter
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    with tf.Graph().as_default():
        print("Starting to build graph "+str(datetime.datetime.now()))
        batch_placeholders = [tf.placeholder(tf.int32, shape=(None, 3), name='batch_'+str(i)) for i in range(num_relations)]
        label_placeholders = [tf.placeholder(tf.float32, shape=(None, 1), name='label_'+str(i)) for i in range(num_relations)]

        corrupt_placeholder = tf.placeholder(tf.bool, shape=(1)) #Which of e1 or e2 to corrupt?
        inference = ntn.inference(batch_placeholders, corrupt_placeholder, init_word_embeds, entity_to_wordvec, \
                num_entities, num_relations, slice_size, batch_size, False, label_placeholders)
        loss = ntn.loss(inference, params.regularization)
        training = ntn.training(loss, params.learning_rate)

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())
        for i in range(1, num_iters):
            print("Starting iter "+str(i)+" "+str(datetime.datetime.now()))
            data_batch = get_batch(batch_size, indexed_training_data, num_entities, corrupt_size)
            relation_batches = split_batch(data_batch, num_relations)

            if i % params.save_per_iter == 0:
                saver.save(sess, params.output_path+"/"+params.data_name+str(i)+'.sess')

            feed_dict = fill_feed_dict(relation_batches, params.train_both, batch_placeholders, label_placeholders, corrupt_placeholder)
            _, loss_value = sess.run([training, loss], feed_dict=feed_dict)
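fill_feed_dict is referenced but not defined in these snippets. A minimal sketch of one plausible implementation, assuming each relation batch is a list of (e1, e2, corrupt_entity) triples and the label placeholders are simply filled with a constant column (the exact label handling in the original code may differ):

import random

def fill_feed_dict(relation_batches, train_both, batch_placeholders,
                   label_placeholders, corrupt_placeholder):
    # randomly decide whether e1 or e2 is corrupted when train_both is enabled
    feed_dict = {corrupt_placeholder: [train_both and random.random() > 0.5]}
    for i in range(len(batch_placeholders)):
        feed_dict[batch_placeholders[i]] = relation_batches[i]
        feed_dict[label_placeholders[i]] = [[0.0]] * len(relation_batches[i])
    return feed_dict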
Example #6
def getThresholds():
    dev_data = ntn_input.load_dev_data(params.data_path)
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    slice_size = params.slice_size
    batch_size = params.batch_size
    (init_word_embeds, entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    batch_placeholder = tf.placeholder(tf.float32, shape=(4, batch_size))
    corrupt_placeholder = tf.placeholder(tf.bool, shape=(1)) #Which of e1 or e2 to corrupt?
    predictions_list = ntn.inference(batch_placeholder, corrupt_placeholder, init_word_embeds, entity_to_wordvec, num_entities, num_relations, slice_size, batch_size)

    # The element-wise search below assumes predictions_list, dev_data and
    # dev_labels (+1/-1 gold labels) are numpy arrays (e.g. obtained via sess.run).
    min_score = np.min(predictions_list)
    max_score = np.max(predictions_list)

    # initialize thresholds and accuracies
    best_thresholds = np.zeros([num_relations, 1])
    best_accuracies = np.zeros([num_relations, 1])

    for i in range(num_relations):
        best_thresholds[i, :] = min_score
        best_accuracies[i, :] = -1

    score = min_score
    increment = 0.01

    while score <= max_score:
        # iterate through relations to find the best per-relation threshold
        for i in range(num_relations):
            current_relation_list = (dev_data[:, 1] == i)
            predictions = (predictions_list[current_relation_list, 0] <= score) * 2 - 1
            accuracy = np.mean(predictions == dev_labels[current_relation_list, 0])

            # update threshold and accuracy
            if accuracy > best_accuracies[i, 0]:
                best_accuracies[i, 0] = accuracy
                best_thresholds[i, 0] = score

        score += increment

    # store threshold values
    return best_thresholds
Example #7
def run_training():
    print("Begin!")
    # python list of (e1, R, e2) for entire training set in string form
    print("Load training data...")
    # shape of raw training data: (112581, 3)
    raw_training_data = ntn_input.load_training_data(params.data_path)
    raw_dev_data = ntn_input.load_dev_data(params.data_path)
    raw_test_data = ntn_input.load_test_data(params.data_path)

    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(params.data_path)
    relations_list = ntn_input.load_relations(params.data_path)
    num_entities = len(entities_list)  # entity: 38696
    num_relations = len(relations_list)  # relations: 11
    # python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list,
                                            relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list,
                                       relations_list)
    indexed_test_data = data_to_indexed(raw_test_data, entities_list,
                                        relations_list)

    print("Load embeddings...")
    # shape of word embeds: 67447, 100; number of entities: 38696
    (init_word_embeds,
     entity_to_wordvec) = ntn_input.load_init_embeds(params.data_path)

    num_epoches = params.epoches
    batch_size = params.batch_size
    corrupt_size = params.corrupt_size
    slice_size = params.slice_size

    n_iterations_per_epoch = len(indexed_training_data) // batch_size
    n_iterations_validation = len(indexed_dev_data) // batch_size
    n_iterations_evaluation = len(indexed_test_data) // batch_size
    print("# of iteration/epoch", n_iterations_per_epoch)
    print("# of iteration/validation", n_iterations_validation)
    print("# of iteration/evaluation", n_iterations_evaluation)

    with tf.Graph().as_default():
        print("Starting to build graph " + str(datetime.datetime.now()))
        batch_placeholders = [
            tf.placeholder(tf.int32, shape=(None, 3), name='batch_' + str(i))
            for i in range(num_relations)
        ]
        label_placeholders = [
            tf.placeholder(tf.float32, shape=(None, 1), name='label_' + str(i))
            for i in range(num_relations)
        ]
        corrupt_placeholder = tf.placeholder(tf.bool, shape=1)
        train_inference = ntn.inference(batch_placeholders,
                                        corrupt_placeholder, init_word_embeds,
                                        entity_to_wordvec, num_entities,
                                        num_relations, slice_size, batch_size,
                                        False, label_placeholders)
        test_inference = ntn.inference(batch_placeholders, corrupt_placeholder,
                                       init_word_embeds, entity_to_wordvec,
                                       num_entities, num_relations, slice_size,
                                       batch_size, True, label_placeholders)
        train_loss = ntn.loss(train_inference, params.regularization)
        training = ntn.training(train_loss, params.learning_rate)
        # Build the eval ops once, outside the loops; calling ntn.eval() inside
        # sess.run would add new ops to the graph on every iteration.
        train_eval = ntn.eval(train_inference)
        test_eval = ntn.eval(test_inference)

        # Create a session for running Ops on the Graph.
        sess = tf.Session()

        # Run the Op to initialize the variables.
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = tf.train.Saver(tf.trainable_variables())

        # training
        for i in range(1, num_epoches):
            print("Starting iter " + str(i) + " " +
                  str(datetime.datetime.now()))
            for j in range(1, n_iterations_per_epoch + 1):
                data_train_batch = get_train_batch(batch_size,
                                                   indexed_training_data,
                                                   num_entities, corrupt_size)
                relation_train_batches = split_train_batch(
                    data_train_batch, num_relations)
                feed_dict_training = fill_feed_dict(relation_train_batches,
                                                    params.train_both,
                                                    batch_placeholders,
                                                    label_placeholders,
                                                    corrupt_placeholder)
                _, train_loss_value, train_eval_value = sess.run(
                    [training, train_loss, train_eval],
                    feed_dict=feed_dict_training)
                print("Iter {}, batch {}, training loss = {}, eval = {}".format(
                    i, j, train_loss_value, train_eval_value))
            if i % params.save_per_iter == 0:
                saver.save(
                    sess, params.output_path + "/" + params.data_name +
                    str(i) + '.sess')
                print("Model saved at iter {}".format(i))

            # At the end of each epoch, test the dev data
            for j in range(1, n_iterations_validation + 1):
                data_dev_batch = get_test_batch(batch_size, indexed_dev_data)
                relation_dev_batches = split_test_batch(
                    data_dev_batch, num_relations)
                feed_dict_dev = fill_feed_dict(relation_dev_batches,
                                               params.train_both,
                                               batch_placeholders,
                                               label_placeholders,
                                               corrupt_placeholder)
                dev_eval_value = sess.run(test_eval,
                                          feed_dict=feed_dict_dev)
                print("Iter {}, batch {}, dev eval = {}".format(
                    i, j, dev_eval_value))

        # testing
        for j in range(1, n_iterations_evaluation + 1):
            data_test_batch = get_test_batch(batch_size, indexed_test_data)
            relation_test_batches = split_test_batch(data_test_batch,
                                                     num_relations)
            feed_dict_testing = fill_feed_dict(relation_test_batches,
                                               params.train_both,
                                               batch_placeholders,
                                               label_placeholders,
                                               corrupt_placeholder)
            test_eval_value = sess.run(test_eval,
                                       feed_dict=feed_dict_testing)
            print("Final Test Accuracy = {}".format(test_eval_value))
Example #8
def prepare_data(corrupt_samples):
    raw_training_data = ntn_input.load_training_data(ntn_input.data_path)
    raw_dev_data = ntn_input.load_dev_data(ntn_input.data_path)
    print("Load entities and relations...")
    entities_list = ntn_input.load_entities(ntn_input.data_path)
    relations_list = ntn_input.load_relations(ntn_input.data_path)
    #python list of (e1, R, e2) for entire training set in index form
    indexed_training_data = data_to_indexed(raw_training_data, entities_list,
                                            relations_list)
    indexed_dev_data = data_to_indexed(raw_dev_data, entities_list,
                                       relations_list)
    print("Load embeddings...")
    (init_word_embeds,
     entity_to_wordvec) = ntn_input.load_init_embeds(ntn_input.data_path)

    num_entities = len(entities_list)
    num_relations = len(relations_list)

    e1, e2, labels_train, labels_dev, t1, t2 = {}, {}, [], [], {}, {}

    for i in indexed_training_data:
        # group the word vectors of e1 and e2 by relation index i[1];
        # the original try/except silently dropped the first triple of each relation
        e1.setdefault(i[1], []).append(init_word_embeds[i[0]])
        e2.setdefault(i[1], []).append(init_word_embeds[i[2]])

    max_len_e1 = max([len(e1[i]) for i in e1])
    labels_train = [1] * max_len_e1
    e1, e2 = fill_entity(e1, e2, max_len_e1)
    for i in range(max_len_e1):
        for j in range(corrupt_samples):
            for k in range(11):
                e1[k].append(init_word_embeds[indexed_training_data[i][0]])
                e2[k].append(init_word_embeds[random.randrange(
                    0, len(init_word_embeds))])
        labels_train.append(0)

    for i in indexed_dev_data:
        t1.setdefault(i[1], []).append(init_word_embeds[i[0]])
        t2.setdefault(i[1], []).append(init_word_embeds[i[2]])

    max_len_t1 = max([len(t1[i]) for i in t1])
    labels_dev = [1] * max_len_t1

    t1, t2 = fill_entity(t1, t2, max_len_t1)

    for i in range(max_len_t1):
        for j in range(corrupt_samples):
            for k in range(11):
                t1[k].append(init_word_embeds[indexed_dev_data[i][0]])
                t2[k].append(init_word_embeds[random.randrange(
                    0, len(init_word_embeds))])
        labels_dev.append(0)

    labels_train, labels_dev = np.array(labels_train), np.array(labels_dev)
    new_lab_train, new_lab_dev = [], []

    for i in labels_train:
        new_lab_train.append([i] * 11)

    for j in labels_dev:
        new_lab_dev.append([j] * 11)

    return e1, e2, np.array(new_lab_train), t1, t2, np.array(
        new_lab_dev), num_relations
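fill_entity is also left undefined; from its use it pads every relation's entity-vector lists up to the same length. One plausible sketch (padding by repeating randomly chosen existing entries is an assumption; the original helper may pad differently):

import random

def fill_entity(e1, e2, max_len):
    # pad each relation's paired lists of entity vectors to max_len
    for k in e1:
        while len(e1[k]) < max_len:
            idx = random.randrange(len(e1[k]))
            e1[k].append(e1[k][idx])
            e2[k].append(e2[k][idx])
    return e1, e2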