Пример #1
0
def split_mupb(v, pc, pmf, i, limit):
    """Solve the original problem ``v`` as two sub-problems and merge them.

    ``pmf.split_in`` is called with the second quartile and the median block
    of ``pmf``. The resulting blocks are divided into a lower and an upper
    half; each half, together with the matching slice of ``v``, is handed to
    ``mupb``. The budget given to the second call is reduced by the number of
    evaluations the first call reported.

    Returns a two-element list: the minimum of both partial answers and
    ``v[i]``, followed by the summed evaluation counts of both ``mupb`` calls.
    """
    split_point = pmf.get_quarter(2)
    pmf.split_in(split_point, pmf.get_median_block())

    all_blocks = pmf.get_blocks()
    middle = len(all_blocks) // 2
    lower_blocks, upper_blocks = all_blocks[:middle], all_blocks[middle:]
    for group in (lower_blocks, upper_blocks):
        translate_blocks(group)

    # The lower sub-problem holds as many elements as its blocks span;
    # whatever remains of v belongs to the upper sub-problem.
    lower_len = sum(blk.end - blk.start for blk in lower_blocks)
    upper_len = len(v) - lower_len
    lower_v, upper_v = v[:lower_len], v[lower_len:]

    lower_pmf = PMF(lower_len, lower_blocks)
    upper_pmf = PMF(upper_len, upper_blocks)

    lower_best, lower_evals = None, 0
    upper_best, upper_evals = None, 0
    if len(lower_v) > 0:
        lower_best, lower_evals = mupb(lower_v, pc, lower_pmf, limit)
        limit -= lower_evals
    if len(upper_v) > 0:
        upper_best, upper_evals = mupb(upper_v, pc, upper_pmf, limit)

    best = min(min_with_none(lower_best, upper_best), v[i])
    return [best, lower_evals + upper_evals]
Пример #2
0
def main():
    """Run a Monte Carlo random-walk experiment and chi-square test it.

    Reads three command-line arguments: the number of steps per walk, the
    number of walks, and the PRNG mode handed to ``fetchPRNG``. The endpoint
    of every walk is accumulated in a ``PMF`` which is finally passed to
    ``chiSquareTest``.
    """
    # Command-line arguments: steps per walk, number of walks, PRNG mode
    n_steps = int(sys.argv[1])
    n_walks = int(sys.argv[2])
    prng_mode = sys.argv[3]

    # Step sampler backed by the requested PRNG
    sampler = StepSampler(fetchPRNG(prng_mode, 7))

    # Accumulator for the walk endpoints
    pmf = PMF()

    # Monte Carlo simulation: one fresh walker per sample
    for _ in range(n_walks):
        walker = RandomWalker1D(0.0, sampler.discreteSymmetric)
        walker.walk(n_steps)
        pmf.add(walker.x)

    chiSquareTest((n_steps, n_walks), pmf)
Пример #3
0
def upb(v, pc, pmf=None, limit=None):
    """U-Curve Probabilistic Bisection.

    Searches ``v``, a sequence describing an approximately u-shaped curve,
    for its minimum by repeatedly updating a probability mass function around
    its median until two adjacent quartiles coincide or the iteration budget
    is exhausted.

    Args:
        v: Indexable sequence of curve values.
        pc: Passed unchanged to ``pmf.update`` (presumably the confidence of
            a direction query; confirm against ``PMF``).
        pmf: Probability mass function to start from. When omitted a fresh
            ``PMF(len(v))`` is created.
        limit: Maximum number of iterations. Defaults to
            ``1 + 2 * int_log2(n) ** 2`` with ``n = len(v)``.

    Returns:
        A two-element list ``[v[median], evaluations]``; ``evaluations`` grows
        by 3 for every iteration performed.
    """
    n = len(v)

    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)

    evaluations = 0
    if limit is None:
        log_n = int_log2(n)
        limit = 1 + 2 * log_n * log_n

    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    # Quartiles are integer indices: compare them by value. Identity (`is`)
    # only happens to work for CPython's cached small integers.
    while first_qt != median and median != third_qt and limit > 0:
        evaluations += 3

        direction = select_side(v, median)
        if direction == 0:
            # No preferred side: move towards the narrower half
            if median - first_qt > third_qt - median:
                direction = -1
            else:
                direction = 1

        alpha = pmf.get_quarter_mass(2)
        pmf.update(pc, median, alpha, direction)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1

    return [v[median], evaluations]
Пример #4
0
def main():
    """Simulate random walks and print the endpoint distribution.

    Reads three command-line arguments: the number of steps per walk, the
    number of walks, and the PRNG mode handed to ``fetchPRNG``. After the
    simulation one line per distinct endpoint is written to stdout, holding
    the endpoint, its count, its relative frequency and the values of
    ``W(x, N)`` and ``p(x, N)``.
    """
    # PARSE COMMANDLINE ARGUMENTS
    N = int(sys.argv[1])  # NUMBER OF STEPS
    M = int(sys.argv[2])  # NUMBER OF WALKS
    mode = sys.argv[3]

    # INITIALIZE STEP SAMPLER
    sampler = StepSampler(fetchPRNG(mode))

    # INITIALIZE PMF TO ACCUMULATE ENDPOINTS
    pmf = PMF()

    # MC SIMULATION TO SAMPLE ENDPOINTS
    for _ in range(M):
        rw = RandomWalker1D(0.0, sampler.discreteSymmetric)
        rw.walk(N)
        pmf.add(rw.x)

    # WRITE SIMULATION SUMMARY TO STDOUT
    # Single-argument print(...) parses on both Python 2 and Python 3; the
    # %s formatting reproduces the space-separated str() output of the
    # former Python 2 print statement.
    print("# (x) (f(x)) (p(x)) (W(x,N)) (p(x,N))")
    for (k, v) in sorted(pmf.pmf.items()):
        print("%s %s %s %s %s" % (k, v, float(v) / pmf.n, W(k, N), p(k, N)))
Пример #5
0
        ensemble = []
        for item, votes in enumerate(item_vote):
            insort(ensemble, (votes, item))
        return [(item, votes) for votes, item in ensemble[-topk:][::-1]]


if __name__ == "__main__":

    DATASET = 'fake'

    if DATASET == 'fake':
        (ratings, u, i) = fake_ratings()
        ratings = np.array(ratings).astype(float)
        MF_list = []
        for lambdar in [0.0, 0.5, 1.0]:
            pmf = PMF(ratings, latent_d=5, regularization_strength=lambdar)
            pmf.gradient_descent(ratings)
            MF_list.append(pmf)
    else:
        filename = 'ml-100k/u.data'
        ratings = read_ratings(filename)
        ratings = np.array(ratings).astype(float)
        MF_list = nise(ratings)

    ens = Majority(MF_list)
    uid = 50.0
    # generate test user
    nitems = MF_list[0].num_items
    user_vector = np.zeros((1, nitems), dtype=float)
    for line in ratings:
        if line[0] == uid:
Пример #6
0
from load_data import load_rating_data, split_rating_dat
from pmf import PMF
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Train PMF on the ratings file, then plot both learning curves.
    file_path = r"PMF\data\ml-1m\ratings.dat"

    pmf = PMF()
    hyper_params = {
        "num_feature": 10,
        "max_epoch": 50,
        "num_batch": 50,
        "batch_size": 1000,
        "epsilon": 0.01,
        "_lambda": 0.1
    }
    pmf.set_params(hyper_params)

    ratings_data = load_rating_data(file_path)
    train, test = split_rating_dat(ratings_data, size=0.2)
    pmf.fit(train, test)

    # Check performance by plotting train and test errors, one curve each
    curves = (
        (pmf.rmse_train, 'o', 'Training Data'),
        (pmf.rmse_test, 'v', 'Test Data'),
    )
    for errors, marker, label in curves:
        plt.plot(range(pmf.max_epoch), errors, marker=marker, label=label)

    plt.title('The MovieLens Dataset Learning Curve')
    plt.xlabel('Number of Epochs')
    plt.ylabel('RMSE')
Пример #7
0
def main():
    """Build, train and evaluate the model selected by ``model_type``.

    All configuration (``model_type``, ``rank``, ``num_users``, ``num_items``,
    ``batch_size``, ``learning_rate``, ``lambda_``, ``n_epochs``,
    ``n_batches``, ``train_data``, ``test_data``, ``datafile``) is read from
    names defined outside this function.

    After training, the ``U`` and ``V`` factors are written as CSV files whose
    names start with ``datafile``. The squared-error evaluation runs for
    ``"pmf"`` and ``"robust_gaussian"`` only; ``"robust_poisson"`` gets the two
    log-likelihood evaluations.

    Raises:
        ValueError: if ``model_type`` is none of ``"pmf"``,
            ``"robust_gaussian"`` or ``"robust_poisson"``.
    """
    # build the model
    print("Rank of embedding is %d" % rank)
    if model_type == "pmf":
        print("Building Graph for Probabilisic MF")
        model = PMF(num_users, num_items, rank, batch_size, learning_rate,
                    lambda_)
    elif model_type == "robust_gaussian":
        print("Building Graph for Robust Gaussian MF")
        model = RobustGaussianMF(num_users, num_items, rank, batch_size,
                                 learning_rate, lambda_)
    elif model_type == "robust_poisson":
        print("Building Graph for Robust Poisson MF")
        model = RobustPoissonMF(num_users, num_items, rank, batch_size,
                                learning_rate, lambda_)
    else:
        # Previously this fell through to a NameError on `model`
        raise ValueError("Unknown model_type: %r" % (model_type,))
    model.build_graph()
    print("Model Built!")

    with tf.Session() as sess:
        # train the model
        print("Optimizing the Model")
        sess.run(tf.global_variables_initializer())
        for epoch_idx in xrange(n_epochs):
            loss_tracker = 0.
            for batch_idx in xrange(n_batches):
                # Row range of the current mini-batch
                lo = batch_idx * batch_size
                hi = (batch_idx + 1) * batch_size
                X_user_batch = train_data[lo:hi, 0]
                X_item_batch = train_data[lo:hi, 1]
                Y_batch = train_data[lo:hi, 2]
                # perform update
                loss_batch, _ = sess.run(
                    [model.loss, model.optimizer],
                    feed_dict={
                        model.X_user: X_user_batch,
                        model.X_item: X_item_batch,
                        model.Y: Y_batch
                    })
                loss_tracker += loss_batch
            if (epoch_idx + 1) % 10 == 0:
                print("Epoch %d. Obj: %.3f" % (epoch_idx + 1, loss_tracker))
                print("Epoch %d. Beta: %.3f" % (epoch_idx + 1,
                                                model.beta.eval()))
                if model_type == "robust_gaussian":
                    print("Epoch %d. std: %.3f" % (epoch_idx + 1,
                                                   model.sigma.eval()))
        print("Optimizaiton Finished!")

        np.savetxt(datafile + "U_dim_%d.csv" % rank,
                   model.U.eval(),
                   delimiter=',')
        np.savetxt(datafile + "V_dim_%d.csv" % rank,
                   model.V.eval(),
                   delimiter=',')

        # test the model
        test_X_user = tf.convert_to_tensor(test_data[:, 0])
        test_X_item = tf.convert_to_tensor(test_data[:, 1])
        test_Y = tf.to_float(tf.convert_to_tensor(test_data[:, 2]))

        # Membership test: the former `== "pmf" or "robust_gaussian"` was
        # always true because a non-empty string is truthy.
        if model_type in ("pmf", "robust_gaussian"):
            test_user_embed = tf.nn.embedding_lookup(model.U,
                                                     test_X_user,
                                                     name="X_user_embed")
            test_item_embed = tf.nn.embedding_lookup(model.V,
                                                     test_X_item,
                                                     name="X_item_embed")
            test_pred = tf.reduce_sum(test_user_embed * test_item_embed, 1)
            # clip prediction to 1 - 5 for movielens prediction
            test_pred = tf.clip_by_value(test_pred, 1, 5)
            test_error = tf.reduce_mean(
                tf.squared_difference(test_pred, test_Y))
            print("Avg. Square Error per Point of Test Data is %.3f" %
                  test_error.eval())

        if model_type == "robust_poisson":
            # compute avg. log likelihood under categorical distribution
            score_mat = tf.exp(tf.matmul(model.U, tf.transpose(model.V)))
            score_mat /= tf.expand_dims(tf.reduce_sum(score_mat, 1), 1)
            index = tf.transpose(tf.concat([[test_X_user], [test_X_item]], 0))
            log_catogorical = tf.reduce_sum(test_Y * tf.log(
                tf.gather_nd(score_mat, index))) / tf.reduce_sum(test_Y)
            print("Avg. Log Catogerical Likelihood per Point on Test Data is %.3f" %
                  log_catogorical.eval())

            # compute avg. log likelihood under Poisson distribution
            test_user_embed = tf.nn.embedding_lookup(model.U,
                                                     test_X_user,
                                                     name="X_user_embed")
            test_item_embed = tf.nn.embedding_lookup(model.V,
                                                     test_X_item,
                                                     name="X_item_embed")
            test_pred = tf.reduce_sum(test_user_embed * test_item_embed, 1)
            test_Poissons = tf.contrib.distributions.Poisson(
                rate=tf.exp(test_pred))
            log_poisson = tf.reduce_mean(test_Poissons.log_prob(test_Y))
            print("Avg. Log Poisson Likelihood per Point on Test Data is %.3f" %
                  log_poisson.eval())
Пример #8
0
def mupb(v, pc, pmf=None, limit=None):
    """Mid-neighbour U-Curve Probabilistic Bisection.

    Searches ``v``, a sequence describing an approximately u-shaped curve,
    for its minimum. Each iteration compares the curve at the first and third
    quartile of the probability mass function and updates the PMF at one (or,
    on a tie with a lower median, possibly both) of those quartiles. The
    search ends when all three quartiles coincide or the budget is used up.

    Args:
        v: Indexable sequence of curve values.
        pc: Passed unchanged to ``pmf.update`` (presumably the confidence of
            a comparison; confirm against ``PMF``).
        pmf: Probability mass function to start from. When omitted a fresh
            ``PMF(len(v))`` is created.
        limit: Maximum number of iterations. Defaults to ``len(v)``.

    Returns:
        A two-element list ``[v[median], evaluations]``; ``evaluations`` grows
        by 3 for every iteration performed.
    """
    n = len(v)

    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)

    evaluations = 0
    if limit is None:
        limit = n

    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    # Compare the integer quartiles by value, not identity. The parentheses
    # matter: without them `and` binds tighter than `or`, so the budget was
    # ignored whenever first_qt differed from median.
    while (first_qt != median or median != third_qt) and limit > 0:
        evaluations += 3

        diff = float(v[third_qt] - v[first_qt])
        if abs(diff) < 1e-8:
            # Both quartiles are (numerically) level
            if v[median] < v[first_qt]:
                # The minimum lies between the quartiles: shrink both sides,
                # the right one only if the first update left room for it.
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                if (pmf.get_quarter(1) != pmf.get_quarter(2) and
                        pmf.get_quarter(3) != pmf.get_quarter(2)):
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            elif median - first_qt > third_qt - median:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
            else:
                pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
        elif diff > 0:
            # Curve is higher at the third quartile
            pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
        else:
            pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)

        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1

    return [v[median], evaluations]
Пример #9
0
    num_items = cut_data_len(alldata, 'asin')

    fp = open("log.txt", "a")
    fp.write("dataset:" + "Musical_Instruments_5" + "\n")
    fp.write("ratio:" + str(ratio) + "\n")
    fp.write("latent_factor:" + str(latent_size) + "\n")
    fp.write("learning_rate:" + str(learning_rate) + "\n")

    for lambda_value in lambda_value_list:
        lambda_U = lambda_value[0]
        lambda_V = lambda_value[1]
        # initialization
        pmf_model = PMF(U=None,
                        V=None,
                        lambda_U=lambda_U,
                        lambda_V=lambda_V,
                        latent_size=latent_size,
                        momentum=0.8,
                        learning_rate=learning_rate,
                        iterations=iterations)

        s = (
            'parameters are:ratio={:f}, reg_u={:f}, reg_v={:f}, latent_size={:d},'
            + 'learning_rate={:f}, iterations={:d}')
        print(
            s.format(ratio, lambda_U, lambda_V, latent_size, learning_rate,
                     iterations))

        U = None
        V = None

        fp.write("=============================== Lambda Value =============" +
Пример #10
0
# -- coding: utf-8 --
import numpy as np
import load_data
from pmf import PMF
import matplotlib.pyplot as plt

if __name__ == "__main__":
    # Load the ratings and split them into training and test sets
    file_path = 'data/u.data'
    data, n_users, n_movies = load_data.load_data(file_path)
    train_data, test_data = load_data.train_test_split(data)

    # Configure and train the model
    n_epoches = 25
    pmf = PMF(n_feat=20, lr=0.005, lam_u=0.1, lam_v=0.1, n_epoches=n_epoches)
    pmf.set_num(n_users, n_movies)
    train_rmse, test_rmse = pmf.fit(train_data, test_data)

    # Plot the RMSE curves
    curves = (
        (train_rmse, 'o', 'Training Data'),
        (test_rmse, 'v', 'Test Data'),
    )
    for rmse_values, marker, label in curves:
        plt.plot(range(n_epoches), rmse_values, marker=marker, label=label)
    plt.title('PMF in movielens RMSE curve')
    plt.xlabel('epoch')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()

    # Compute recommended movie candidates via pmf.top_k(2, 5)
    # (presumably user 2 and k=5; confirm against PMF.top_k)
    recommendations = pmf.top_k(2, 5)
    print("用户2的推荐序列如下")
    for movie_id, score in recommendations:
        print("(电影:{}, 推荐分值:{})".format(int(movie_id), round(score, 2)))
Пример #11
0
ratings = np.asarray(list(data[2]))

# Quick sanity output about the loaded training data
print('len(movies)', len(movies))
print('len(users)', len(users))
print('len(ratings)', len(ratings))

print('max(data[0])', max(data[0]))
print('max(data[1])', max(data[1]))

# Training hyper-parameters
alpha = 0.005
lambda_u = 0.001
lambda_v = 0.001
batch_size = 10
num_iterations = 150
num_features = 30
# Both sizes are "largest id + 1". The second one used to be written as
# max(data[0] + 1), which only works when data[0] supports broadcasting.
pmf = PMF(num_features, max(data[1]) + 1, max(data[0]) + 1)
pmf.train(users, movies, ratings, alpha, lambda_u, lambda_v, batch_size,
          num_iterations)

path_dev_data = 'C:\\Users\\chlee\\PycharmProjects\\ML_Text_Mining_HW2\\hw2-handout\\data\\dev.csv'
import pandas as pd
dev_data = pd.read_csv(path_dev_data, header=None, delimiter=',')
dev_data = np.asarray(dev_data)

# Column 0 holds the movie id, column 1 the user id
dev_movies = dev_data[:, 0]
dev_users = dev_data[:, 1]

# Print one prediction per (user, movie) pair of the dev set
for dev_user, dev_movie in zip(dev_users, dev_movies):
    predicted = pmf.predict(dev_user, dev_movie)
    print(predicted)
Пример #12
0
def mupb(v, pc, pmf=None, limit=None):
    """Mid-neighbour U-Curve Probabilistic Bisection.

    Searches ``v``, a sequence describing an approximately u-shaped curve,
    for its minimum. Each iteration compares the curve at the first and third
    quartile of the probability mass function and updates the PMF at one (or,
    on a tie with a lower median, possibly both) of those quartiles. The
    search ends when all three quartiles coincide or the budget is used up.

    Args:
        v: Indexable sequence of curve values.
        pc: Passed unchanged to ``pmf.update`` (presumably the confidence of
            a comparison; confirm against ``PMF``).
        pmf: Probability mass function to start from. When omitted a fresh
            ``PMF(len(v))`` is created.
        limit: Maximum number of iterations. Defaults to ``len(v)``.

    Returns:
        A two-element list ``[v[median], evaluations]``; ``evaluations`` grows
        by 3 for every iteration performed.
    """
    n = len(v)

    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)

    evaluations = 0
    if limit is None:
        limit = n

    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    # Compare the integer quartiles by value, not identity. The parentheses
    # matter: without them `and` binds tighter than `or`, so the budget was
    # ignored whenever first_qt differed from median.
    while (first_qt != median or median != third_qt) and limit > 0:
        evaluations += 3

        diff = float(v[third_qt] - v[first_qt])
        if abs(diff) < 1e-8:
            # Both quartiles are (numerically) level
            if v[median] < v[first_qt]:
                # The minimum lies between the quartiles: shrink both sides,
                # the right one only if the first update left room for it.
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
                if (pmf.get_quarter(1) != pmf.get_quarter(2) and
                        pmf.get_quarter(3) != pmf.get_quarter(2)):
                    pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
            elif median - first_qt > third_qt - median:
                pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)
            else:
                pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
        elif diff > 0:
            # Curve is higher at the third quartile
            pmf.update(pc, third_qt, pmf.get_quarter_mass(3), -1)
        else:
            pmf.update(pc, first_qt, pmf.get_quarter_mass(1), 1)

        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1

    return [v[median], evaluations]
Пример #13
0
def upb(v, pc, pmf=None, limit=None):
    """U-Curve Probabilistic Bisection.

    Searches ``v``, a sequence describing an approximately u-shaped curve,
    for its minimum by repeatedly updating a probability mass function around
    its median until two adjacent quartiles coincide or the iteration budget
    is exhausted.

    Args:
        v: Indexable sequence of curve values.
        pc: Passed unchanged to ``pmf.update`` (presumably the confidence of
            a direction query; confirm against ``PMF``).
        pmf: Probability mass function to start from. When omitted a fresh
            ``PMF(len(v))`` is created.
        limit: Maximum number of iterations. Defaults to
            ``1 + 2 * int_log2(n) ** 2`` with ``n = len(v)``.

    Returns:
        A two-element list ``[v[median], evaluations]``; ``evaluations`` grows
        by 3 for every iteration performed.
    """
    n = len(v)

    # Probability Mass Function
    if pmf is None:
        pmf = PMF(n)

    evaluations = 0
    if limit is None:
        log_n = int_log2(n)
        limit = 1 + 2 * log_n * log_n

    first_qt = pmf.get_quarter(1)
    median = pmf.get_quarter(2)
    third_qt = pmf.get_quarter(3)
    # Quartiles are integer indices: compare them by value. Identity (`is`)
    # only happens to work for CPython's cached small integers.
    while first_qt != median and median != third_qt and limit > 0:
        evaluations += 3

        direction = select_side(v, median)
        if direction == 0:
            # No preferred side: move towards the narrower half
            if median - first_qt > third_qt - median:
                direction = -1
            else:
                direction = 1

        alpha = pmf.get_quarter_mass(2)
        pmf.update(pc, median, alpha, direction)
        first_qt = pmf.get_quarter(1)
        median = pmf.get_quarter(2)
        third_qt = pmf.get_quarter(3)
        limit -= 1

    return [v[median], evaluations]