def split_mupb(v, pc, pmf, i, limit):
    """
    Splits the original problem v, with its pmf, into two subproblems:
    one from 0 to i - 1 and one from i + 1 to len(v).
    """
    pmf.split_in(pmf.get_quarter(2), pmf.get_median_block())
    blocks = pmf.get_blocks()
    half_size = len(blocks) // 2
    blocks1 = blocks[0:half_size]
    blocks2 = blocks[half_size:len(blocks)]
    translate_blocks(blocks1)
    translate_blocks(blocks2)
    # sizes of the two subproblems, derived from the block boundaries
    n1 = sum((block.end - block.start) for block in blocks1)
    n2 = len(v) - n1
    v1 = v[0:n1]
    v2 = v[n1:len(v)]
    pmf1 = PMF(n1, blocks1)
    pmf2 = PMF(n2, blocks2)
    sol1, eval1 = None, 0
    sol2, eval2 = None, 0
    if len(v1) > 0:
        [sol1, eval1] = mupb(v1, pc, pmf1, limit)
        limit -= eval1
    if len(v2) > 0:
        [sol2, eval2] = mupb(v2, pc, pmf2, limit)
    return [min(min_with_none(sol1, sol2), v[i]), eval1 + eval2]
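# split_mupb relies on a min_with_none helper that is not shown in this
# section. A minimal sketch of its assumed behaviour: take the minimum of
# two candidate solutions, where None marks a half that was not solved.
def min_with_none(a, b):
    if a is None:
        return b
    if b is None:
        return a
    return min(a, b)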
def main():
    # PARSE COMMANDLINE ARGUMENTS
    N = int(sys.argv[1])    # NUMBER OF STEPS
    M = int(sys.argv[2])    # NUMBER OF WALKS
    mode = sys.argv[3]

    # INITIALIZE STEP SAMPLER
    sampler = StepSampler(fetchPRNG(mode, 7))

    # INITIALIZE PMF TO ACCUMULATE ENDPOINTS
    pmf = PMF()

    # MC SIMULATION TO SAMPLE ENDPOINTS
    for m in range(M):
        rw = RandomWalker1D(0.0, sampler.discreteSymmetric)
        rw.walk(N)
        pmf.add(rw.x)

    chiSquareTest((N, M), pmf)
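# chiSquareTest is not shown in this section. A sketch of the statistic it
# presumably computes (names here are hypothetical): compare the observed
# endpoint counts against exact endpoint probabilities p(x, N) over M walks.
def chi_square_stat(counts, M, N, p):
    # counts maps each endpoint x to its observed frequency
    return sum((c - M * p(x, N)) ** 2 / (M * p(x, N))
               for x, c in counts.items())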
def upb(v, pc, pmf=None, limit=None):
    """
    U-Curve Probabilistic Bisection

    Receives a vector that describes an approximately U-shaped curve and
    returns the minimum element of this vector.
    """
    n = len(v)
    total_s = time()
    update_time = 0
    nof_updates = 0
    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)
    evaluations = 0
    if limit is None:
        # limit = n
        limit = 1 + 2 * int_log2(n) * int_log2(n)
    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    while (first_qt != median and median != third_qt) and limit > 0:
        evaluations += 3
        direction = select_side(v, median)
        if direction == 0:
            # tie: move toward the wider half of the interval
            if median - first_qt > third_qt - median:
                direction = -1
            else:
                direction = 1
        alpha = pmf.get_quarter_mass(2)
        pmf.update(pc, median, alpha, direction)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1
    return [v[median], evaluations]
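# A toy demo of upb. This is a sketch: it assumes the PMF, select_side and
# int_log2 helpers from this module are available, and that pc is the
# probability that the comparison oracle answers correctly.
v = [25, 16, 9, 4, 1, 0, 1, 4, 9, 16, 25]   # samples of a U-shaped curve
best, cost = upb(v, pc=0.8)
print("minimum found:", best, "evaluations spent:", cost)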
def main():
    # PARSE COMMANDLINE ARGUMENTS
    N = int(sys.argv[1])    # NUMBER OF STEPS
    M = int(sys.argv[2])    # NUMBER OF WALKS
    mode = sys.argv[3]

    # INITIALIZE STEP SAMPLER
    sampler = StepSampler(fetchPRNG(mode))

    # INITIALIZE PMF TO ACCUMULATE ENDPOINTS
    pmf = PMF()

    # MC SIMULATION TO SAMPLE ENDPOINTS
    for m in range(M):
        rw = RandomWalker1D(0.0, sampler.discreteSymmetric)
        rw.walk(N)
        pmf.add(rw.x)

    # WRITE SIMULATION SUMMARY TO STDOUT
    print("# (x) (f(x)) (p(x)) (W(x,N)) (p(x,N))")
    for (k, v) in sorted(pmf.pmf.items()):
        print(k, v, float(v) / pmf.n, W(k, N), p(k, N))
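# One plausible reading of W(x, N) and p(x, N) for the symmetric +/- 1 walk
# summarized above (an assumption, since their definitions are not shown):
# W counts the N-step paths ending at x, and p normalizes by the 2**N paths.
from math import comb

def endpoint_prob(x, N):
    # P(S_N = x) = C(N, (N + x) / 2) / 2**N, zero off the reachable lattice
    if (N + x) % 2 != 0 or abs(x) > N:
        return 0.0
    return comb(N, (N + x) // 2) / 2.0 ** N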
# (tail of an ensemble-voting helper; its enclosing def is not shown)
    ensemble = []
    for item, votes in enumerate(item_vote):
        insort(ensemble, (votes, item))
    # highest-voted items first
    return [(item, votes) for votes, item in ensemble[-topk:][::-1]]


if __name__ == "__main__":
    DATASET = 'fake'
    if DATASET == 'fake':
        (ratings, u, i) = fake_ratings()
        ratings = np.array(ratings).astype(float)
        MF_list = []
        for lambdar in [0.0, 0.5, 1.0]:
            pmf = PMF(ratings, latent_d=5, regularization_strength=lambdar)
            pmf.gradient_descent(ratings)
            MF_list.append(pmf)
    else:
        filename = 'ml-100k/u.data'
        ratings = read_ratings(filename)
        ratings = np.array(ratings).astype(float)
        MF_list = nise(ratings)
    ens = Majority(MF_list)
    uid = 50.0  # generate a test user
    nitems = MF_list[0].num_items
    user_vector = np.zeros((1, nitems), dtype=float)
    for line in ratings:
        if line[0] == uid:
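# For reference, the insort-based top-k selection at the start of this
# snippet can be written with heapq.nlargest; a minimal sketch assuming
# item_vote is a sequence of vote counts indexed by item.
import heapq

def top_k_items(item_vote, topk):
    best = heapq.nlargest(topk, ((votes, item)
                                 for item, votes in enumerate(item_vote)))
    return [(item, votes) for votes, item in best]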
from load_data import load_rating_data, split_rating_dat
from pmf import PMF
import matplotlib.pyplot as plt

if __name__ == '__main__':
    file_path = r"PMF\data\ml-1m\ratings.dat"
    pmf = PMF()
    pmf.set_params({
        "num_feature": 10,
        "max_epoch": 50,
        "num_batch": 50,
        "batch_size": 1000,
        "epsilon": 0.01,
        "_lambda": 0.1
    })
    ratings_data = load_rating_data(file_path)
    train, test = split_rating_dat(ratings_data, size=0.2)
    pmf.fit(train, test)

    # Check performance by plotting train and test errors
    plt.plot(range(pmf.max_epoch), pmf.rmse_train, marker='o',
             label='Training Data')
    plt.plot(range(pmf.max_epoch), pmf.rmse_test, marker='v',
             label='Test Data')
    plt.title('The MovieLens Dataset Learning Curve')
    plt.xlabel('Number of Epochs')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()
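# The RMSE metric plotted above, as a standalone helper for reference;
# pmf.fit presumably records the same quantity once per epoch.
import numpy as np

def rmse(pred, truth):
    pred = np.asarray(pred, dtype=float)
    truth = np.asarray(truth, dtype=float)
    return float(np.sqrt(np.mean((pred - truth) ** 2)))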
def main():
    # build the model
    print("Rank of embedding is %d" % rank)
    if model_type == "pmf":
        print("Building Graph for Probabilistic MF")
        model = PMF(num_users, num_items, rank, batch_size,
                    learning_rate, lambda_)
    elif model_type == "robust_gaussian":
        print("Building Graph for Robust Gaussian MF")
        model = RobustGaussianMF(num_users, num_items, rank, batch_size,
                                 learning_rate, lambda_)
    elif model_type == "robust_poisson":
        print("Building Graph for Robust Poisson MF")
        model = RobustPoissonMF(num_users, num_items, rank, batch_size,
                                learning_rate, lambda_)
    model.build_graph()
    print("Model Built!")

    with tf.Session() as sess:
        # train the model
        print("Optimizing the Model")
        sess.run(tf.global_variables_initializer())
        for epoch_idx in range(n_epochs):
            loss_tracker = 0.
            for batch_idx in range(n_batches):
                X_user_batch = train_data[batch_idx * batch_size:
                                          (batch_idx + 1) * batch_size, 0]
                X_item_batch = train_data[batch_idx * batch_size:
                                          (batch_idx + 1) * batch_size, 1]
                Y_batch = train_data[batch_idx * batch_size:
                                     (batch_idx + 1) * batch_size, 2]
                # perform update
                loss_batch, _ = sess.run(
                    [model.loss, model.optimizer],
                    feed_dict={
                        model.X_user: X_user_batch,
                        model.X_item: X_item_batch,
                        model.Y: Y_batch
                    })
                loss_tracker += loss_batch
            if (epoch_idx + 1) % 10 == 0:
                print("Epoch %d. Obj: %.3f" % (epoch_idx + 1, loss_tracker))
                print("Epoch %d. Beta: %.3f" % (epoch_idx + 1, model.beta.eval()))
                if model_type == "robust_gaussian":
                    print("Epoch %d. std: %.3f" % (epoch_idx + 1, model.sigma.eval()))
        print("Optimization Finished!")
        np.savetxt(datafile + "U_dim_%d.csv" % rank, model.U.eval(), delimiter=',')
        np.savetxt(datafile + "V_dim_%d.csv" % rank, model.V.eval(), delimiter=',')

        # test the model
        test_X_user = tf.convert_to_tensor(test_data[:, 0])
        test_X_item = tf.convert_to_tensor(test_data[:, 1])
        test_Y = tf.to_float(tf.convert_to_tensor(test_data[:, 2]))
        if model_type in ("pmf", "robust_gaussian"):
            test_user_embed = tf.nn.embedding_lookup(model.U, test_X_user,
                                                     name="X_user_embed")
            test_item_embed = tf.nn.embedding_lookup(model.V, test_X_item,
                                                     name="X_item_embed")
            test_pred = tf.reduce_sum(test_user_embed * test_item_embed, 1)
            # clip predictions to 1 - 5 for MovieLens ratings
            test_pred = tf.clip_by_value(test_pred, 1, 5)
            test_error = tf.reduce_mean(tf.squared_difference(test_pred, test_Y))
            print("Avg. Square Error per Point of Test Data is %.3f"
                  % test_error.eval())
        if model_type == "robust_poisson":
            # compute avg. log likelihood under a categorical distribution
            score_mat = tf.exp(tf.matmul(model.U, tf.transpose(model.V)))
            score_mat /= tf.expand_dims(tf.reduce_sum(score_mat, 1), 1)
            index = tf.transpose(tf.concat([[test_X_user], [test_X_item]], 0))
            log_categorical = tf.reduce_sum(test_Y * tf.log(
                tf.gather_nd(score_mat, index))) / tf.reduce_sum(test_Y)
            print("Avg. Log Categorical Likelihood per Point on Test Data is %.3f"
                  % log_categorical.eval())
            # compute avg. log likelihood under a Poisson distribution
            test_user_embed = tf.nn.embedding_lookup(model.U, test_X_user,
                                                     name="X_user_embed")
            test_item_embed = tf.nn.embedding_lookup(model.V, test_X_item,
                                                     name="X_item_embed")
            test_pred = tf.reduce_sum(test_user_embed * test_item_embed, 1)
            test_Poissons = tf.contrib.distributions.Poisson(rate=tf.exp(test_pred))
            log_poisson = tf.reduce_mean(test_Poissons.log_prob(test_Y))
            print("Avg. Log Poisson Likelihood per Point on Test Data is %.3f"
                  % log_poisson.eval())
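# A sketch of the objective the PMF graph above presumably minimizes, in
# the same TF 1.x style. The internals of model.build_graph are not shown,
# so this is an assumption: squared error on the observed ratings plus an
# L2 penalty on both factor matrices.
def pmf_loss(U, V, X_user, X_item, Y, lambda_):
    u = tf.nn.embedding_lookup(U, X_user)
    v = tf.nn.embedding_lookup(V, X_item)
    pred = tf.reduce_sum(u * v, 1)
    return (tf.reduce_sum(tf.squared_difference(pred, Y))
            + lambda_ * (tf.nn.l2_loss(U) + tf.nn.l2_loss(V)))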
def mupb(v, pc, pmf=None, limit=None):
    """
    Mid-neighbour U-Curve Probabilistic Bisection

    Receives a vector that describes an approximately U-shaped curve and
    returns the minimum element of this vector.
    """
    n = len(v)
    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)
    evaluations = 0
    if limit is None:
        # limit = 1 + 2 * int_log2(n) * int_log2(n)
        limit = n
    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    while (first_qt != median or median != third_qt) and limit > 0:
        evaluations += 3
        # sign of the difference between the third- and first-quartile values
        d = float(v[third_qt] - v[first_qt])
        if abs(d) < 1e-8:
            d = 0
        else:
            d = d / abs(d)
        if d == 0:
            if v[median] < v[first_qt]:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                if (pmf.get_quarter(1) != pmf.get_quarter(2) and
                        pmf.get_quarter(3) != pmf.get_quarter(2)):
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            else:
                if median - first_qt > third_qt - median:
                    pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                else:
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            # [result, sub_eval] = split_mupb(v, pc, pmf, median, limit)
            # return [result, evaluations + sub_eval]
        else:
            if d > 0:
                pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            else:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1
    return [v[median], evaluations]
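# A toy demo of mupb, under the same assumptions as the upb demo earlier:
# PMF comes from this module and pc is the assumed oracle-correctness
# probability of the probabilistic bisection update.
v = [9, 7, 4, 2, 1, 3, 6, 8, 10]   # approximately U-shaped
best, cost = mupb(v, pc=0.8)
print("minimum found:", best, "evaluations spent:", cost)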
num_items = cut_data_len(alldata, 'asin')

fp = open("log.txt", "a")
fp.write("dataset:" + "Musical_Instruments_5" + "\n")
fp.write("ratio:" + str(ratio) + "\n")
fp.write("latent_factor:" + str(latent_size) + "\n")
fp.write("learning_rate:" + str(learning_rate) + "\n")

for lambda_value in lambda_value_list:
    lambda_U = lambda_value[0]
    lambda_V = lambda_value[1]
    # initialization
    pmf_model = PMF(U=None, V=None, lambda_U=lambda_U, lambda_V=lambda_V,
                    latent_size=latent_size, momentum=0.8,
                    learning_rate=learning_rate, iterations=iterations)
    s = ('parameters are: ratio={:f}, reg_u={:f}, reg_v={:f}, '
         'latent_size={:d}, learning_rate={:f}, iterations={:d}')
    print(s.format(ratio, lambda_U, lambda_V, latent_size,
                   learning_rate, iterations))
    U = None
    V = None
    fp.write("=============================== Lambda Value =============" +
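# lambda_value_list is not defined in this snippet; a hypothetical grid of
# (lambda_U, lambda_V) pairs that would drive the loop above:
lambda_value_list = [(0.01, 0.01), (0.1, 0.1), (1.0, 1.0)]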
# -*- coding: utf-8 -*-
import numpy as np
import load_data
from pmf import PMF
import matplotlib.pyplot as plt

if __name__ == "__main__":
    file_path = 'data/u.data'
    data, n_users, n_movies = load_data.load_data(file_path)
    train_data, test_data = load_data.train_test_split(data)
    n_epoches = 25
    pmf = PMF(n_feat=20, lr=0.005, lam_u=0.1, lam_v=0.1, n_epoches=n_epoches)
    pmf.set_num(n_users, n_movies)
    train_rmse, test_rmse = pmf.fit(train_data, test_data)

    # Plot the RMSE curves
    plt.plot(range(n_epoches), train_rmse, marker='o', label='Training Data')
    plt.plot(range(n_epoches), test_rmse, marker='v', label='Test Data')
    plt.title('PMF in movielens RMSE curve')
    plt.xlabel('epoch')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()

    # Compute the top-5 recommended movie candidates for user 2
    top_k = pmf.top_k(2, 5)
    print("Recommendations for user 2:")
    for (i, r) in top_k:
        print("(movie: {}, score: {})".format(int(i), round(r, 2)))
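# A plausible sketch of what pmf.top_k(user_id, k) computes; the U and V
# attribute names (learned user and item factor matrices) are assumptions.
def top_k_sketch(pmf, user_id, k):
    scores = pmf.U[user_id] @ pmf.V.T       # predicted score for every movie
    best = np.argsort(scores)[::-1][:k]     # indices of the k highest scores
    return [(i, scores[i]) for i in best]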
ratings = np.asarray(list(data[2]))
print('len(movies)', len(movies))
print('len(users)', len(users))
print('len(ratings)', len(ratings))
print('max(data[0])', max(data[0]))
print('max(data[1])', max(data[1]))

# hyperparameters
alpha = 0.005
lambda_u = 0.001
lambda_v = 0.001
batch_size = 10
num_iterations = 150
num_features = 30

pmf = PMF(num_features, max(data[1]) + 1, max(data[0]) + 1)
pmf.train(users, movies, ratings, alpha, lambda_u, lambda_v,
          batch_size, num_iterations)

path_dev_data = 'C:\\Users\\chlee\\PycharmProjects\\ML_Text_Mining_HW2\\hw2-handout\\data\\dev.csv'
import pandas as pd

dev_data = pd.read_csv(path_dev_data, header=None, delimiter=',')
dev_data = np.asarray(dev_data)
dev_movies = dev_data[:, 0]
dev_users = dev_data[:, 1]
# ratings = np.asarray(list(dev_data[2]))
for i in range(len(dev_movies)):
    predicted = pmf.predict(dev_users[i], dev_movies[i])
    print(predicted)
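# The raw dot-product predictions above are unbounded; mirroring the 1-5
# clipping used in the TensorFlow snippet earlier in this section, they can
# optionally be clamped to the rating scale before reporting.
for i in range(len(dev_movies)):
    predicted = np.clip(pmf.predict(dev_users[i], dev_movies[i]), 1.0, 5.0)
    print(predicted)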
def mupb(v, pc, pmf=None, limit=None):
    """
    Mid-neighbour U-Curve Probabilistic Bisection

    Receives a vector that describes an approximately U-shaped curve and
    returns the minimum element of this vector.
    """
    n = len(v)
    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)
    evaluations = 0
    if limit is None:
        # limit = 1 + 2 * int_log2(n) * int_log2(n)
        limit = n
    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    while (first_qt != median or median != third_qt) and limit > 0:
        evaluations += 3
        # sign of the difference between the third- and first-quartile values
        d = float(v[third_qt] - v[first_qt])
        if abs(d) < 1e-8:
            d = 0
        else:
            d = d / abs(d)
        if d == 0:
            if v[median] < v[first_qt]:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                if (pmf.get_quarter(1) != pmf.get_quarter(2) and
                        pmf.get_quarter(3) != pmf.get_quarter(2)):
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            else:
                if median - first_qt > third_qt - median:
                    pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                else:
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            # [result, sub_eval] = split_mupb(v, pc, pmf, median, limit)
            # return [result, evaluations + sub_eval]
        else:
            if d > 0:
                pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            else:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1
    return [v[median], evaluations]
def upb(v, pc, pmf=None, limit=None):
    """
    U-Curve Probabilistic Bisection

    Receives a vector that describes an approximately U-shaped curve and
    returns the minimum element of this vector.
    """
    n = len(v)
    total_s = time()
    update_time = 0
    nof_updates = 0
    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)
    evaluations = 0
    if limit is None:
        # limit = n
        limit = 1 + 2 * int_log2(n) * int_log2(n)
    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    while (first_qt != median and median != third_qt) and limit > 0:
        evaluations += 3
        direction = select_side(v, median)
        if direction == 0:
            # tie: move toward the wider half of the interval
            if median - first_qt > third_qt - median:
                direction = -1
            else:
                direction = 1
        alpha = pmf.get_quarter_mass(2)
        pmf.update(pc, median, alpha, direction)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1
    return [v[median], evaluations]