def main():
    """Train a tuple-factorisation model on FB15K-237 and evaluate it.

    Hyper-parameters default to the values set below and can be overridden
    on the command line with ``name__value`` tokens (e.g. ``rank__10``).
    Prints mean prediction scores plus MRR / HITS@10 on (a prefix of) the
    validation set, and writes the same results to ``eval_file_name``.
    """
    print("Starting main...")  # fix: message previously read "Staring main..."

    # ---- default parameters -------------------------------------------------
    interactions = [(3, (0, 1)), (4, (1, 2)), (5, (2, 0))]
    thresh_mentions = 6
    neg = 2    # negative samples per positive
    rank = 5   # embedding dimensionality
    # n_train = 287044
    n_iter = 20
    # choose a dot product to use; can be:
    #   multilinear
    #   multilinear_square_product
    #   generalised_multilinear_dot_product
    dot_product = generalised_multilinear_dot_product
    minibatch_size = 1000
    learning_rate = 0.001
    l2 = 0.0  # NOTE(review): parsed from the command line but never used below
    eval_file_name = 'output.txt'

    # ---- command-line overrides (format: name__value) -----------------------
    # NOTE(review): parameter names are matched by substring, which is fragile
    # if one parameter name ever contains another — confirm callers pass only
    # the exact names handled below.
    for x in sys.argv:
        if len(x.split("__")) > 1:
            arg = x.split("__")[1]
            if "interactions" in x:
                interactions = arg
                print("Interactions not defined yet")  # TODO
            elif "n_iter" in x:
                n_iter = int(arg)  # was int(x.split("__")[1]); arg is the same value
            elif "thresh_mentions" in x:
                thresh_mentions = int(arg)
            elif "rank" in x:
                rank = int(arg)
            elif "n_train" in x:
                # NOTE(review): effectively dead — n_train is overwritten by
                # len(train) after loading, since the truncation is commented out.
                n_train = int(arg)
            elif "neg" in x:
                neg = int(arg)
            elif "minibatch" in x:
                minibatch_size = int(arg)
            elif "learning_rate" in x:
                learning_rate = float(arg)
            elif "L2" in x:
                l2 = float(arg)
            elif "eval_file_name" in x:
                eval_file_name = str(arg)
    print("Done parsing input arguments.")

    # ---- load data & dictionaries -------------------------------------------
    # The 'train' call builds String2Int/Int2String; they are reused for the
    # other two splits so ids stay consistent.
    train, String2Int, Int2String = dr.load_FB15K237_FB(
        'train', None, None, interactions, thresh_mentions)
    valid, String2Int, Int2String = dr.load_FB15K237_FB(
        'valid', String2Int, Int2String, interactions)
    test, String2Int, Int2String = dr.load_FB15K237_FB(
        'test', String2Int, Int2String, interactions)
    # in case not the full train data should be used
    # train = train[:n_train]
    n_train = len(train)
    values = [1.0 for x in range(n_train)]  # target value 1.0 per positive tuple
    # sample some random negative validation tuples (ids drawn from [0, 1000))
    n_test = len(valid)
    valid_negative = np.random.randint(0, 1000, [n_test, 3 + len(interactions)])
    print("Done loading data.")

    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1
    # initialise the norm scalers, one per tuple position
    norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1

    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
    scoring = lf.generalised_multilinear_dot_product_scorer

    # factorize the train tuples, obtain all model parameters
    # print("Starting factorisation...")
    params = lf.factorize_tuples((train, values), rank,
                                 minibatch_size=minibatch_size, emb0=emb0,
                                 n_iter=n_iter, negative_prop=neg,
                                 loss_type="logistic", tf_optim=optim,
                                 scoring=scoring, norm_scalers=norm_scalers)

    # define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])
    # define prediction ops for train and valid set
    pred_train = dot_product(params, inputs_train)
    pred_valid = dot_product(params, inputs_valid)
    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}

    print("Start evaluation...")
    # obtain predictions for train, valid and negative validation set
    with tf.Session() as sess:
        # NOTE(review): initialize_all_variables() is the deprecated TF<1.0
        # spelling of global_variables_initializer(); kept for compatibility
        # with whatever TF version this project pins.
        sess.run(tf.initialize_all_variables())
        print("Generating predictions... training data")
        prediction_values = sigmoid(sess.run([pred_train], feed_dict=feed1)[0])
        print("Generating predictions... validation data (negative)")
        prediction_values2 = sigmoid(sess.run([pred_valid], feed_dict=feed2)[0])
        print("Generating predictions... validation data (positive)")
        prediction_values3 = sigmoid(sess.run([pred_valid], feed_dict=feed3)[0])

        # evaluation: avg prediction among the different tuple sets
        m1 = np.mean(prediction_values)
        m2 = np.mean(prediction_values2)
        m3 = np.mean(prediction_values3)
        print("Avg Pos Train Prediction", m1)
        print("Avg Random Neg Prediction", m2)
        print("Avg Pos Test Prediction", m3)

        # do batch ranking evaluation, compute MRR and HITS@10 on valid set.
        h = 20  # only the first h validation tuples are ranked
        # h = len(valid)
        # params[0][0,:] = np.zeros([params[0].shape[1]])
        MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h],
                                      interactions, 2, String2Int, Int2String,
                                      params, dot_product)
        print("MRR:", MRR)
        print("HITS@10:", H10)

    print("Writing results to file " + eval_file_name)
    with open(eval_file_name, 'w') as f:
        # record the exact command line alongside the metrics
        for x in sys.argv:
            f.write(x + "\n")
        f.write("---------\n")
        f.write("MRR: " + str(MRR) + "\n")
        f.write("HITS@10: " + str(H10) + "\n")
        f.write("Mean train score:" + str(m1) + "\n")
        f.write("Mean val score (neg):" + str(m2) + "\n")
        f.write("Mean val score (pos):" + str(m3) + "\n")
    print("EOF.")
# NOTE(review): orphaned module-level fragment — it repeats the parameter
# setup and data loading performed inside main(), and it references
# `interactions`, which is never assigned at module scope before this point
# (NameError if this code executes at import time). Looks like leftover from
# a bad merge/concatenation; confirm whether this block should be removed.
thresh_mentions = 5
neg = 2 #negative samples per positive
rank = 5 #embedding dimensionality
n_train = 287044
n_iter = 20
#choose a dot product to use.
# can be
#multilinear
#multilinear_square_product
#generalised_multilinear_dot_product
dot_product = generalised_multilinear_dot_product
# load data & dictionaries
train, String2Int, Int2String = dr.load_FB15K237_FB('train', None, None, interactions, thresh_mentions)
valid, String2Int, Int2String = dr.load_FB15K237_FB('valid', String2Int, Int2String, interactions)
test, String2Int, Int2String = dr.load_FB15K237_FB('test', String2Int, Int2String, interactions)
# in case not the full train data should be used
train = train[:n_train]
values = [1.0 for x in range(n_train)]
# sample some random negative validation tuples
n_test = len(valid)
valid_negative = np.random.randint(0, 1000 ,[n_test,3+len(interactions)])
def main(): print("Staring main...") #define default parameters interactions = [(3,(0,1)), (4,(1,2)), (5,(2,0))] thresh_mentions = 6 neg = 2 #negative samples per positive rank = 5 #embedding dimensionality #n_train = 287044 n_iter = 20 #choose a dot product to use. # can be #multilinear #multilinear_square_product #generalised_multilinear_dot_product dot_product = generalised_multilinear_dot_product minibatch_size=1000 learning_rate = 0.001 l2 = 0.0 eval_file_name = 'output.txt' # use input arguments to define parameters (i.e. not using default above) for x in sys.argv: if len( x.split("__") ) > 1: arg = x.split("__")[1] if "interactions" in x: interactions = arg print("Interactions not defined yet") #TODO elif "n_iter" in x: n_iter = int(x.split("__")[1]) elif "thresh_mentions" in x: thresh_mentions = int(arg) elif "rank" in x: rank = int(arg) elif "n_train" in x: n_train = int(arg) elif "neg" in x: neg = int(arg) elif "minibatch" in x: minibatch_size = int(arg) elif "learning_rate" in x: learning_rate = float(arg) elif "L2" in x: l2 = float(arg) elif "eval_file_name" in x: eval_file_name = str(arg) print("Done parsing input arguments.") # load data & dictionaries train, String2Int, Int2String = dr.load_FB15K237_FB('train', None, None, interactions, thresh_mentions) valid, String2Int, Int2String = dr.load_FB15K237_FB('valid', String2Int, Int2String, interactions) test, String2Int, Int2String = dr.load_FB15K237_FB('test', String2Int, Int2String, interactions) # in case not the full train data should be used #train = train[:n_train] n_train = len(train) values = [1.0 for x in range(n_train)] # sample some random negative validation tuples n_test = len(valid) valid_negative = np.random.randint(0, 1000 ,[n_test,3+len(interactions)]) print("Done loading data.") # initialise embeddings, reserve one extra entry (the first) for unknown n_emb = len(Int2String.keys()) + 1 emb0 = np.random.normal(size=(n_emb, rank)) * 0.1 # initialise the norm scalers norm_scalers = 
np.random.normal(size = [3+len(interactions)]) * 0.1 # set other factorization inputs optim = tf.train.AdamOptimizer(learning_rate=learning_rate) scoring = lf.generalised_multilinear_dot_product_scorer # factorize the train tuples, obtain all model parameters # print("Starting factorisation...") params = lf.factorize_tuples((train, values), rank, minibatch_size=minibatch_size, emb0=emb0, n_iter=n_iter, negative_prop = neg, loss_type = "logistic", tf_optim=optim, scoring=scoring, norm_scalers = norm_scalers) #define two placeholders to feed in train and valid data for evaluation inputs_train = tf.placeholder("int32", [n_train, len(train[0])]) inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])]) # define prediction ops for train and valid set pred_train = dot_product(params, inputs_train) pred_valid = dot_product(params, inputs_valid) # define the data feeders feed1 = {inputs_train: train} feed2 = {inputs_valid: valid_negative} feed3 = {inputs_valid: valid} print("Start evaluation...") # obtain predictions for train, valid and negative validation set with tf.Session() as sess: sess.run(tf.initialize_all_variables()) print("Generating predictions... training data") prediction_values = sigmoid( sess.run([pred_train], feed_dict=feed1)[0] ) print("Generating predictions... validation data (negative)") prediction_values2 = sigmoid( sess.run([pred_valid], feed_dict=feed2)[0] ) print("Generating predictions... validation data (positive)") prediction_values3 = sigmoid( sess.run([pred_valid], feed_dict=feed3)[0] ) # evaluation: avg prediction among the different tuple sets m1 = np.mean(prediction_values) m2 = np.mean(prediction_values2) m3 = np.mean(prediction_values3) print ("Avg Pos Train Prediction", m1) print ("Avg Random Neg Prediction", m2) print ("Avg Pos Test Prediction", m3) # do batch ranking evaluation, compute MRR and HITS@10 on valid set. 
h = 20 #h=len(valid) #params[0][0,:] = np.zeros([params[0].shape[1]]) MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h], interactions, 2, String2Int, Int2String, params, dot_product) print ("MRR:", MRR) print ("HITS@10:", H10) print("Writing results to file " + eval_file_name) with open(eval_file_name, 'w') as f: for x in sys.argv: f.write(x+"\n") f.write("---------\n") f.write("MRR: "+ str(MRR)+"\n") f.write("HITS@10: "+ str(H10)+"\n") f.write("Mean train score:" + str(m1)+"\n") f.write("Mean val score (neg):" + str(m2)+"\n") f.write("Mean val score (pos):" + str(m3)+"\n") print("EOF.")
# Module-level experiment configuration and data loading.
# Higher-order interactions to model; the commented alternative disables them.
interactions = [(3, (0, 1)), (4, (1, 2))]
#interactions = []
thresh_mentions = 5   # threshold passed to the train-split loader
neg = 2               # negative samples per positive
rank = 5              # embedding dimensionality
n_train = 287044      # number of training tuples kept after truncation
n_iter = 20
# Scoring function; alternatives:
#   multilinear
#   multilinear_square_product
#   generalised_multilinear_dot_product
dot_product = generalised_multilinear_dot_product

# Load the three splits; the 'train' call builds the string<->int
# dictionaries, which are then reused for 'valid' and 'test'.
train, String2Int, Int2String = dr.load_FB15K237_FB(
    'train', None, None, interactions, thresh_mentions)
valid, String2Int, Int2String = dr.load_FB15K237_FB(
    'valid', String2Int, Int2String, interactions)
test, String2Int, Int2String = dr.load_FB15K237_FB(
    'test', String2Int, Int2String, interactions)

# in case not the full train data should be used
train = train[:n_train]
values = [1.0] * n_train  # one target value of 1.0 per kept tuple

# sample some random negative validation tuples with ids drawn from [0, 1000)
n_test = len(valid)
valid_negative = np.random.randint(0, 1000, size=[n_test, 3 + len(interactions)])