def main():
    """Factorize the training tuples, score them and write a small report.

    The function runs these steps in order:

    1. Set default hyper-parameters and apply overrides found in
       ``sys.argv``. An override is written ``<name>__<value>``, for example
       ``rank__10`` or ``eval_file_name__run1.txt``. Recognised names are
       ``n_iter``, ``thresh_mentions``, ``rank``, ``n_train``, ``neg``,
       ``minibatch``, ``learning_rate``, ``L2`` and ``eval_file_name``.
    2. Load the train/valid/test tuples with ``dr.load_FB15K237_FB``.
    3. Call ``lf.factorize_tuples`` on the training tuples.
    4. Print the mean sigmoid score of the training tuples, of random tuples
       and of the validation tuples.
    5. Call ``ranking_evaluation`` on the first 20 validation tuples and
       write MRR, HITS@10 and the mean scores to ``eval_file_name``.

    :return: Nothing
    """
    print("Staring main...")

    # define default parameters
    interactions = [(3, (0, 1)), (4, (1, 2)), (5, (2, 0))]
    # choose a dot product to use; can be multilinear,
    # multilinear_square_product or generalised_multilinear_dot_product
    dot_product = generalised_multilinear_dot_product
    settings = {
        "n_iter": 20,
        "thresh_mentions": 6,
        "rank": 5,  # embedding dimensionality
        "n_train": None,
        "neg": 2,  # negative samples per positive
        "minibatch": 1000,
        "learning_rate": 0.001,
        "L2": 0.0,
        "eval_file_name": 'output.txt',
    }
    # Converters, listed in the order in which option names are tried.
    converters = (
        ("n_iter", int),
        ("thresh_mentions", int),
        ("rank", int),
        ("n_train", int),
        ("neg", int),
        ("minibatch", int),
        ("learning_rate", float),
        ("L2", float),
        ("eval_file_name", str),
    )

    # use input arguments to define parameters (i.e. not using default above)
    # sys.argv[0] is the script path, not an option, so it is skipped.
    for x in sys.argv[1:]:
        # Split once: the value keeps any further "__" it contains.
        key, separator, arg = x.partition("__")
        if not separator:
            continue
        # Option names are looked up in the key part only, so a value that
        # happens to contain another option's name cannot hijack the match.
        if "interactions" in key:
            # Parsing of interactions is not implemented; the default list is
            # kept so that len(interactions) below stays meaningful.
            print("Interactions not defined yet")  #TODO
            continue
        for name, convert in converters:
            if name in key:
                settings[name] = convert(arg)
                break
    print("Done parsing input arguments.")

    # NOTE(review): "n_train" and "L2" are parsed (and validated) above but
    # nothing below consumes them; n_train is recomputed from the data.
    thresh_mentions = settings["thresh_mentions"]
    neg = settings["neg"]
    rank = settings["rank"]
    n_iter = settings["n_iter"]
    minibatch_size = settings["minibatch"]
    learning_rate = settings["learning_rate"]
    eval_file_name = settings["eval_file_name"]

    # load data & dictionaries
    train, String2Int, Int2String = dr.load_FB15K237_FB(
        'train', None, None, interactions, thresh_mentions)
    valid, String2Int, Int2String = dr.load_FB15K237_FB(
        'valid', String2Int, Int2String, interactions)
    # The test tuples themselves are not used below, but the call returns the
    # dictionaries that n_emb is computed from, so it must stay.
    _, String2Int, Int2String = dr.load_FB15K237_FB(
        'test', String2Int, Int2String, interactions)

    # every training tuple is a positive example with target value 1.0
    n_train = len(train)
    values = [1.0] * n_train

    # sample some random negative validation tuples
    # NOTE(review): the upper bound 1000 is hard-coded and unrelated to n_emb.
    n_test = len(valid)
    valid_negative = np.random.randint(0, 1000,
                                       [n_test, 3 + len(interactions)])
    print("Done loading data.")

    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1

    # initialise the norm scalers
    norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1

    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
    scoring = lf.generalised_multilinear_dot_product_scorer

    # factorize the train tuples, obtain all model parameters
    params = lf.factorize_tuples((train, values), rank,
                                 minibatch_size=minibatch_size,
                                 emb0=emb0,
                                 n_iter=n_iter,
                                 negative_prop=neg,
                                 loss_type="logistic",
                                 tf_optim=optim,
                                 scoring=scoring,
                                 norm_scalers=norm_scalers)

    # define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set
    pred_train = dot_product(params, inputs_train)
    pred_valid = dot_product(params, inputs_valid)

    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}

    print("Start evaluation...")
    # obtain predictions for train, valid and negative validation set
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        print("Generating predictions... training data")
        prediction_values = sigmoid(
            sess.run([pred_train], feed_dict=feed1)[0])
        print("Generating predictions... validation data (negative)")
        prediction_values2 = sigmoid(
            sess.run([pred_valid], feed_dict=feed2)[0])
        print("Generating predictions... validation data (positive)")
        prediction_values3 = sigmoid(
            sess.run([pred_valid], feed_dict=feed3)[0])

    # evaluation: avg prediction among the different tuple sets
    m1 = np.mean(prediction_values)
    m2 = np.mean(prediction_values2)
    m3 = np.mean(prediction_values3)
    print("Avg Pos Train Prediction", m1)
    print("Avg Random Neg Prediction", m2)
    # m3 is computed on the validation tuples despite the "Test" label.
    print("Avg Pos Test Prediction", m3)

    # do batch ranking evaluation, compute MRR and HITS@10 on valid set.
    # Only the first h validation tuples are ranked (h=len(valid) for all).
    h = 20
    MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h],
                                  interactions, 2, String2Int, Int2String,
                                  params, dot_product)
    print("MRR:", MRR)
    print("HITS@10:", H10)

    print("Writing results to file " + eval_file_name)
    # The report starts with the raw command line, then the metrics.
    report = [x + "\n" for x in sys.argv]
    report += [
        "---------\n",
        "MRR: " + str(MRR) + "\n",
        "HITS@10: " + str(H10) + "\n",
        "Mean train score:" + str(m1) + "\n",
        "Mean val score (neg):" + str(m2) + "\n",
        "Mean val score (pos):" + str(m3) + "\n",
    ]
    with open(eval_file_name, 'w') as f:
        f.writelines(report)
    print("EOF.")
def test_tuples_factorization_rectangular_matrix(demo=False):
    """
    Compare factorize_tuples with a truncated SVD on a small dense matrix.

    A random noisy matrix is reconstructed twice: once from its rank-limited
    SVD, and once from embeddings learned by factorize_tuples on the list of
    all its (row, column) entries. The two reconstructions are asserted to
    differ by less than 1e-3 in norm.

    :param demo: True for demo mode where explanations are printed in the
        standard output. Otherwise a test is run.
    :return: Nothing
    """
    from naga.factorix.learn_factorization import factorize_tuples
    from scipy.sparse.linalg import svds

    # Problem dimensions and settings
    n_rows, n_cols = 7, 6
    true_rank = 4  # size of the embeddings that generate the data
    model_rank = 4  # embedding size for the model
    noise_level = 1
    oracle_init = False  # do we initialize at the exact solution?

    # Observations: noise plus a low-rank product of random factors
    row_factors = np.random.randn(n_rows, true_rank)
    col_factors = np.random.randn(n_cols, true_rank)
    observation_noise = np.random.randn(n_rows, n_cols) * noise_level
    y_mat = observation_noise + np.dot(row_factors, col_factors.transpose())

    # Reference solution: truncated SVD. The diagonal matrix holds the square
    # roots of the singular values, so it is squared again when recombining.
    left_vecs, sing_vals, right_vecs_t = svds(y_mat, model_rank)
    right_vecs = right_vecs_t.transpose()
    sqrt_sigma = np.diag(np.sqrt(sing_vals))
    x_mat_est1 = np.dot(np.dot(left_vecs, np.square(sqrt_sigma)),
                        right_vecs_t)
    if demo:
        print('We obtained a first exact solution by Singular Value Decomposition')
        print('The difference between the observation matrix and the estimated solution is:')
        print(np.linalg.norm(x_mat_est1 - y_mat))
        print()

    # #### sgd solution ####
    # One tuple per matrix cell; column ids are shifted by n_rows so rows and
    # columns index disjoint slices of a single embedding table.
    cells = [(i, j) for i in range(n_rows) for j in range(n_cols)]
    indices = [[i, n_rows + j] for i, j in cells]
    values = [y_mat[i, j] for i, j in cells]

    # Initial embeddings
    if not oracle_init:
        # random initialization
        emb0 = np.random.normal(size=(n_rows + n_cols, model_rank)) * 0.1
    else:
        # start from the SVD factors, split into two halves of two columns
        u_re = np.dot(left_vecs[:, :2], sqrt_sigma[:2, :2])
        u_im = np.dot(left_vecs[:, 2:], sqrt_sigma[2:, 2:])
        v_re = np.dot(right_vecs[:, :2], sqrt_sigma[:2, :2])
        v_im = np.dot(right_vecs[:, 2:], sqrt_sigma[2:, 2:])
        emb0_u = 1.0 * (np.concatenate([u_im, -u_re], axis=1)
                        + np.concatenate([u_re, u_im], axis=1))
        emb0_v = np.concatenate([v_re, v_im], axis=1)
        emb0 = np.concatenate([emb0_u, emb0_v], axis=0)
    x_mat_init = np.dot(emb0[:n_rows], emb0[n_rows:].T)

    # choose an optimizer (Adam seems to be the most reliable)
    optim = tf.train.AdamOptimizer(learning_rate=0.1)

    # choose a scoring function (for rectangular matrices, the standard or
    # the complex dot products work the same)
    def scoring(inputs):
        return hermitian_tuple_scorer(inputs,
                                      rank=model_rank,
                                      n_emb=n_rows + n_cols,
                                      emb0=emb0,
                                      symmetry_coef=(1.0, 1.0),
                                      learn_symmetry_coef=True)

    # optimization (we specify optional parameters, but the values by
    # default could work as well)
    u2, coefs = factorize_tuples((indices, values), model_rank,
                                 emb0=emb0,
                                 n_iter=300,
                                 tf_optim=optim,
                                 scoring=scoring)

    # recover the matrix based on the hermitian dot product of the
    # embeddings, weighting its two parts by the learned coefficients
    row_emb = u2[:n_rows, :]
    col_emb_t = u2[n_rows:, :].T
    x_mat_est2_cplx = hermitian_dot(row_emb, col_emb_t)
    x_mat_est2 = (x_mat_est2_cplx[0] * coefs[0]
                  + x_mat_est2_cplx[1] * coefs[1])

    if demo:
        print(x_mat_est2.shape, x_mat_init)
        print('We computed an estimator by minimizing the square loss on the tuples extracted from the matrix')
        print('The difference between the initial solution and the estimated solution is:')
        print(np.linalg.norm(x_mat_est2 - x_mat_init))
        print('The difference between the observations and the estimated solution is:')
        print(np.linalg.norm(y_mat - x_mat_est2))
        print('The difference between the exact solution and the estimated solution is:')
        print(np.linalg.norm(x_mat_est1 - x_mat_est2))
        print('Symmetry coefficients: ', coefs)

    gap = np.linalg.norm(x_mat_est1 - x_mat_est2)
    assert gap < 1e-3
def test_tuples_factorization_rectangular_matrix(demo=False):
    """
    Compare factorize_tuples with a truncated SVD on a small dense matrix.

    A random noisy matrix is reconstructed twice: once from its rank-limited
    SVD, and once from embeddings learned by factorize_tuples on the list of
    all its (row, column) entries. The two reconstructions are asserted to
    differ by less than 1e-3 in norm.

    :param demo: True for demo mode where explanations are printed in the
        standard output. Otherwise a test is run.
    :return: Nothing
    """
    from naga.factorix.learn_factorization import factorize_tuples
    from scipy.sparse.linalg import svds

    # Problem dimensions and settings
    n_rows, n_cols = 7, 6
    true_rank = 4  # size of the embeddings that generate the data
    model_rank = 4  # embedding size for the model
    noise_level = 1
    oracle_init = False  # do we initialize at the exact solution?

    # Observations: noise plus a low-rank product of random factors
    row_factors = np.random.randn(n_rows, true_rank)
    col_factors = np.random.randn(n_cols, true_rank)
    observation_noise = np.random.randn(n_rows, n_cols) * noise_level
    y_mat = observation_noise + np.dot(row_factors, col_factors.transpose())

    # Reference solution: truncated SVD. The diagonal matrix holds the square
    # roots of the singular values, so it is squared again when recombining.
    left_vecs, sing_vals, right_vecs_t = svds(y_mat, model_rank)
    right_vecs = right_vecs_t.transpose()
    sqrt_sigma = np.diag(np.sqrt(sing_vals))
    x_mat_est1 = np.dot(np.dot(left_vecs, np.square(sqrt_sigma)),
                        right_vecs_t)
    if demo:
        print(
            'We obtained a first exact solution by Singular Value Decomposition'
        )
        print(
            'The difference between the observation matrix and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est1 - y_mat))
        print()

    # #### sgd solution ####
    # One tuple per matrix cell; column ids are shifted by n_rows so rows and
    # columns index disjoint slices of a single embedding table.
    cells = [(i, j) for i in range(n_rows) for j in range(n_cols)]
    indices = [[i, n_rows + j] for i, j in cells]
    values = [y_mat[i, j] for i, j in cells]

    # Initial embeddings
    if not oracle_init:
        # random initialization
        emb0 = np.random.normal(size=(n_rows + n_cols, model_rank)) * 0.1
    else:
        # start from the SVD factors, split into two halves of two columns
        u_re = np.dot(left_vecs[:, :2], sqrt_sigma[:2, :2])
        u_im = np.dot(left_vecs[:, 2:], sqrt_sigma[2:, 2:])
        v_re = np.dot(right_vecs[:, :2], sqrt_sigma[:2, :2])
        v_im = np.dot(right_vecs[:, 2:], sqrt_sigma[2:, 2:])
        emb0_u = 1.0 * (np.concatenate([u_im, -u_re], axis=1)
                        + np.concatenate([u_re, u_im], axis=1))
        emb0_v = np.concatenate([v_re, v_im], axis=1)
        emb0 = np.concatenate([emb0_u, emb0_v], axis=0)
    x_mat_init = np.dot(emb0[:n_rows], emb0[n_rows:].T)

    # choose an optimizer (Adam seems to be the most reliable)
    optim = tf.train.AdamOptimizer(learning_rate=0.1)

    # choose a scoring function (for rectangular matrices, the standard or
    # the complex dot products work the same)
    def scoring(inputs):
        return hermitian_tuple_scorer(inputs,
                                      rank=model_rank,
                                      n_emb=n_rows + n_cols,
                                      emb0=emb0,
                                      symmetry_coef=(1.0, 1.0),
                                      learn_symmetry_coef=True)

    # optimization (we specify optional parameters, but the values by
    # default could work as well)
    u2, coefs = factorize_tuples((indices, values), model_rank,
                                 emb0=emb0,
                                 n_iter=300,
                                 tf_optim=optim,
                                 scoring=scoring)

    # recover the matrix based on the hermitian dot product of the
    # embeddings, weighting its two parts by the learned coefficients
    row_emb = u2[:n_rows, :]
    col_emb_t = u2[n_rows:, :].T
    x_mat_est2_cplx = hermitian_dot(row_emb, col_emb_t)
    x_mat_est2 = (x_mat_est2_cplx[0] * coefs[0]
                  + x_mat_est2_cplx[1] * coefs[1])

    if demo:
        print(x_mat_est2.shape, x_mat_init)
        print(
            'We computed an estimator by minimizing the square loss on the tuples extracted from the matrix'
        )
        print(
            'The difference between the initial solution and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est2 - x_mat_init))
        print(
            'The difference between the observations and the estimated solution is:'
        )
        print(np.linalg.norm(y_mat - x_mat_est2))
        print(
            'The difference between the exact solution and the estimated solution is:'
        )
        print(np.linalg.norm(x_mat_est1 - x_mat_est2))
        print('Symmetry coefficients: ', coefs)

    gap = np.linalg.norm(x_mat_est1 - x_mat_est2)
    assert gap < 1e-3
def main():
    """Factorize the training tuples, score them and write a small report.

    The function runs these steps in order:

    1. Set default hyper-parameters and apply overrides found in
       ``sys.argv``. An override is written ``<name>__<value>``, for example
       ``rank__10`` or ``eval_file_name__run1.txt``. Recognised names are
       ``n_iter``, ``thresh_mentions``, ``rank``, ``n_train``, ``neg``,
       ``minibatch``, ``learning_rate``, ``L2`` and ``eval_file_name``.
    2. Load the train/valid/test tuples with ``dr.load_FB15K237_FB``.
    3. Call ``lf.factorize_tuples`` on the training tuples.
    4. Print the mean sigmoid score of the training tuples, of random tuples
       and of the validation tuples.
    5. Call ``ranking_evaluation`` on the first 20 validation tuples and
       write MRR, HITS@10 and the mean scores to ``eval_file_name``.

    :return: Nothing
    """
    print("Staring main...")

    # define default parameters
    interactions = [(3, (0, 1)), (4, (1, 2)), (5, (2, 0))]
    # choose a dot product to use; can be multilinear,
    # multilinear_square_product or generalised_multilinear_dot_product
    dot_product = generalised_multilinear_dot_product
    settings = {
        "n_iter": 20,
        "thresh_mentions": 6,
        "rank": 5,  # embedding dimensionality
        "n_train": None,
        "neg": 2,  # negative samples per positive
        "minibatch": 1000,
        "learning_rate": 0.001,
        "L2": 0.0,
        "eval_file_name": 'output.txt',
    }
    # Converters, listed in the order in which option names are tried.
    converters = (
        ("n_iter", int),
        ("thresh_mentions", int),
        ("rank", int),
        ("n_train", int),
        ("neg", int),
        ("minibatch", int),
        ("learning_rate", float),
        ("L2", float),
        ("eval_file_name", str),
    )

    # use input arguments to define parameters (i.e. not using default above)
    # sys.argv[0] is the script path, not an option, so it is skipped.
    for x in sys.argv[1:]:
        # Split once: the value keeps any further "__" it contains.
        key, separator, arg = x.partition("__")
        if not separator:
            continue
        # Option names are looked up in the key part only, so a value that
        # happens to contain another option's name cannot hijack the match.
        if "interactions" in key:
            # Parsing of interactions is not implemented; the default list is
            # kept so that len(interactions) below stays meaningful.
            print("Interactions not defined yet")  #TODO
            continue
        for name, convert in converters:
            if name in key:
                settings[name] = convert(arg)
                break
    print("Done parsing input arguments.")

    # NOTE(review): "n_train" and "L2" are parsed (and validated) above but
    # nothing below consumes them; n_train is recomputed from the data.
    thresh_mentions = settings["thresh_mentions"]
    neg = settings["neg"]
    rank = settings["rank"]
    n_iter = settings["n_iter"]
    minibatch_size = settings["minibatch"]
    learning_rate = settings["learning_rate"]
    eval_file_name = settings["eval_file_name"]

    # load data & dictionaries
    train, String2Int, Int2String = dr.load_FB15K237_FB(
        'train', None, None, interactions, thresh_mentions)
    valid, String2Int, Int2String = dr.load_FB15K237_FB(
        'valid', String2Int, Int2String, interactions)
    # The test tuples themselves are not used below, but the call returns the
    # dictionaries that n_emb is computed from, so it must stay.
    _, String2Int, Int2String = dr.load_FB15K237_FB(
        'test', String2Int, Int2String, interactions)

    # every training tuple is a positive example with target value 1.0
    n_train = len(train)
    values = [1.0] * n_train

    # sample some random negative validation tuples
    # NOTE(review): the upper bound 1000 is hard-coded and unrelated to n_emb.
    n_test = len(valid)
    valid_negative = np.random.randint(0, 1000,
                                       [n_test, 3 + len(interactions)])
    print("Done loading data.")

    # initialise embeddings, reserve one extra entry (the first) for unknown
    n_emb = len(Int2String.keys()) + 1
    emb0 = np.random.normal(size=(n_emb, rank)) * 0.1

    # initialise the norm scalers
    norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1

    # set other factorization inputs
    optim = tf.train.AdamOptimizer(learning_rate=learning_rate)
    scoring = lf.generalised_multilinear_dot_product_scorer

    # factorize the train tuples, obtain all model parameters
    params = lf.factorize_tuples((train, values), rank,
                                 minibatch_size=minibatch_size,
                                 emb0=emb0,
                                 n_iter=n_iter,
                                 negative_prop=neg,
                                 loss_type="logistic",
                                 tf_optim=optim,
                                 scoring=scoring,
                                 norm_scalers=norm_scalers)

    # define two placeholders to feed in train and valid data for evaluation
    inputs_train = tf.placeholder("int32", [n_train, len(train[0])])
    inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])])

    # define prediction ops for train and valid set
    pred_train = dot_product(params, inputs_train)
    pred_valid = dot_product(params, inputs_valid)

    # define the data feeders
    feed1 = {inputs_train: train}
    feed2 = {inputs_valid: valid_negative}
    feed3 = {inputs_valid: valid}

    print("Start evaluation...")
    # obtain predictions for train, valid and negative validation set
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        print("Generating predictions... training data")
        prediction_values = sigmoid(
            sess.run([pred_train], feed_dict=feed1)[0])
        print("Generating predictions... validation data (negative)")
        prediction_values2 = sigmoid(
            sess.run([pred_valid], feed_dict=feed2)[0])
        print("Generating predictions... validation data (positive)")
        prediction_values3 = sigmoid(
            sess.run([pred_valid], feed_dict=feed3)[0])

    # evaluation: avg prediction among the different tuple sets
    m1 = np.mean(prediction_values)
    m2 = np.mean(prediction_values2)
    m3 = np.mean(prediction_values3)
    print("Avg Pos Train Prediction", m1)
    print("Avg Random Neg Prediction", m2)
    # m3 is computed on the validation tuples despite the "Test" label.
    print("Avg Pos Test Prediction", m3)

    # do batch ranking evaluation, compute MRR and HITS@10 on valid set.
    # Only the first h validation tuples are ranked (h=len(valid) for all).
    h = 20
    MRR, H10 = ranking_evaluation(valid[:h], prediction_values3[:h],
                                  interactions, 2, String2Int, Int2String,
                                  params, dot_product)
    print("MRR:", MRR)
    print("HITS@10:", H10)

    print("Writing results to file " + eval_file_name)
    # The report starts with the raw command line, then the metrics.
    report = [x + "\n" for x in sys.argv]
    report += [
        "---------\n",
        "MRR: " + str(MRR) + "\n",
        "HITS@10: " + str(H10) + "\n",
        "Mean train score:" + str(m1) + "\n",
        "Mean val score (neg):" + str(m2) + "\n",
        "Mean val score (pos):" + str(m3) + "\n",
    ]
    with open(eval_file_name, 'w') as f:
        f.writelines(report)
    print("EOF.")
# Embedding table: one row per known symbol plus one extra entry (the first)
# reserved for unknown symbols; small random starting values.
n_emb = 1 + len(Int2String.keys())
emb0 = 0.1 * np.random.normal(size=(n_emb, rank))

# Norm scalers: a vector with 3 + len(interactions) small random entries.
n_scalers = 3 + len(interactions)
norm_scalers = 0.1 * np.random.normal(size=[n_scalers])

# Remaining factorization inputs.
optim = tf.train.AdamOptimizer(learning_rate=0.001)
scoring = lf.generalised_multilinear_dot_product_scorer
minibatch_size = 1000

# Factorize the train tuples, obtain all model parameters.
factorization_options = dict(
    minibatch_size=minibatch_size,
    emb0=emb0,
    n_iter=n_iter,
    negative_prop=neg,
    loss_type="logistic",
    tf_optim=optim,
    scoring=scoring,
    norm_scalers=norm_scalers,
)
params = lf.factorize_tuples((train, values), rank, **factorization_options)

# Placeholders used to feed train and valid tuples at evaluation time; both
# take their second dimension from the length of the first train tuple.
tuple_width = len(train[0])
inputs_train = tf.placeholder("int32", [n_train, tuple_width])
inputs_valid = tf.placeholder("int32", [len(valid), tuple_width])

# Prediction ops for the train and valid placeholders.
pred_train = dot_product(params, inputs_train)
pred_valid = dot_product(params, inputs_valid)

# Data feeders: the valid placeholder is fed either the random negative
# tuples (feed2) or the real validation tuples (feed3).
feed1 = {inputs_train: train}
feed2 = {inputs_valid: valid_negative}
feed3 = {inputs_valid: valid}
emb0 = np.random.normal(size=(n_emb, rank)) * 0.1 # initialise the norm scalers norm_scalers = np.random.normal(size=[3 + len(interactions)]) * 0.1 # set other factorization inputs optim = tf.train.AdamOptimizer(learning_rate=0.001) scoring = lf.generalised_multilinear_dot_product_scorer minibatch_size = 1000 # factorize the train tuples, obtain all model parameters params = lf.factorize_tuples((train, values), rank, minibatch_size=minibatch_size, emb0=emb0, n_iter=n_iter, negative_prop=neg, loss_type="logistic", tf_optim=optim, scoring=scoring, norm_scalers=norm_scalers) #define two placeholders to feed in train and valid data for evaluation inputs_train = tf.placeholder("int32", [n_train, len(train[0])]) inputs_valid = tf.placeholder("int32", [len(valid), len(train[0])]) # define prediction ops for train and valid set pred_train = dot_product(params, inputs_train) pred_valid = dot_product(params, inputs_valid) # define the data feeders feed1 = {inputs_train: train}