Exemplo n.º 1
0
def train(reg):
    """Cross-validate a matrix-factorization model on a Netflix ratings sample.

    Loads rating triples through ``get_netflix_data`` and runs shuffled
    5-fold cross validation over them. The truthiness of ``reg`` selects
    one of two modes:

    * ``reg`` truthy  -- fit one model per fold and print the mean
      validation loss across folds.
    * ``reg`` falsy   -- for every fold, fit one model per candidate in
      ``reg_vals``; afterwards print the mean loss of each candidate and
      the candidate with the lowest mean loss.

    Args:
        reg: Regularization penalty to evaluate. Any falsy value
            (``None``, ``0``) switches to the search mode.

    Returns:
        None. All results are reported with ``print``.
    """
    logdir = 'logs/mf/numpy'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=1000000)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:,0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:,1]))))

    method = 'als'
    reg_vals = [0.01, 0.1, 1, 10]
    n_splits = 5
    n_features = 15

    # Decide the mode exactly once. The search loop below must not rebind
    # `reg`: doing so made every fold after the first fall into the
    # single-penalty branch and left most of `loss_path` at zero.
    search_mode = not reg
    if search_mode:
        print("Finding optimal regularization penalty.")
    else:
        print("Performing cross validation with reg: {}.".format(reg))

    def fit_and_score(train_indices, test_indices):
        """Fit a fresh model on the training triples; return validation loss."""
        # NOTE(review): no regularization value is passed to MF here, so every
        # model is constructed with identical arguments regardless of `reg`
        # or the candidate being searched -- confirm how MF should receive it.
        model = MF(n_users, n_items, n_features, method=method)
        model.fit(train_indices, verbose=1)
        acc, loss = model.predict(test_indices)
        print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
        return loss

    mean_loss = 0.0
    # loss_path[i, k] holds the validation loss of candidate i on fold k.
    loss_path = np.zeros((len(reg_vals), n_splits))
    kf = KFold(n_splits=n_splits, shuffle=True)
    for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
        print("Fold {}".format(k))
        train_rows, test_rows = rating_indices[train_index], rating_indices[test_index]
        # The model consumes the three columns as separate arrays.
        train_indices = (train_rows[:,0], train_rows[:,1], train_rows[:,2])
        test_indices = (test_rows[:,0], test_rows[:,1], test_rows[:,2])
        if search_mode:
            for i, reg_val in enumerate(reg_vals):
                print("lambda: {}".format(reg_val))
                loss_path[i, k] = fit_and_score(train_indices, test_indices)
        else:
            loss = fit_and_score(train_indices, test_indices)
            # Running mean over the folds seen so far.
            mean_loss = (mean_loss*k + loss) / (k+1)

    if search_mode:
        loss_means = np.mean(loss_path, axis=1)
        print(loss_means)
        best_reg = reg_vals[np.argmin(loss_means)]
        best_loss = np.amin(loss_means)
        print("best lambda: {} - loss: {}".format(best_reg, best_loss))
    else:
        print("mean loss: {:.4f}".format(mean_loss))
    print("Successfully finished training MF. See logs directory.")
Exemplo n.º 2
0
# Train a matrix-factorization model on `rating_matrix` and print the top-10
# predicted items for users among the first 1000 indexes who have at most 20
# entries in `userid2itemindexes`.

# Reverse lookups: matrix index -> original id.
index2userid = {y: x for x, y in userid2index.items()}
index2itemid = {y: x for x, y in itemid2index.items()}

# Coordinates of every non-zero entry of the rating matrix.
nonzero_row, nonzero_col = rating_matrix.nonzero()
# Materialised as a list so the pairs can be traversed more than once on
# Python 3 as well (where zip() is a one-shot iterator).
inds = list(zip(nonzero_row.tolist(), nonzero_col.tolist()))

import sys

# The MF implementation lives outside the package path; this must run
# before the import below.
sys.path.append('../tpmrec/')

from mf import MF

# NOTE(review): the positional arguments 10, 0.0001, 0.01 are presumably the
# latent dimension, learning rate and regularization -- confirm against
# tpmrec/mf.py.
mf = MF(rating_matrix, inds, 10, 0.0001, 0.01)

mf.train(10)

# Compute the prediction once: the model is not retrained inside the loop, so
# calling predict() per user only repeated the same work.
# NOTE(review): assumes predict() is deterministic after training -- confirm.
pr = mf.predict()

for userindex in range(1000):
    userid = index2userid[userindex]
    rated = userid2itemindexes[userid]
    # Skip users with more than 20 entries.
    if len(rated) > 20:
        continue
    user_predict = pr[userindex, :]
    # Indexes of the 10 highest predicted scores, best first.
    top_item_indexes = np.argsort(user_predict)[::-1][:10]
    # Single-argument print(...) with %-formatting emits the same text as the
    # former Python 2 print statements and also parses on Python 3.
    print("userid =  %s" % (userid,))
    # NOTE(review): values of userid2itemindexes are used directly as keys of
    # itemid2name; the name suggests they may be indexes rather than ids --
    # verify against where the mapping is built.
    for itemid in rated:
        print("%s %s" % (itemid, itemid2name[itemid]))
    print("recommend item")
    for itemindex in top_item_indexes:
        itemid = index2itemid[itemindex]
        print("%s %s" % (itemid, itemid2name[itemid]))