def save_factorized():
    svd = SVD(learning_rate=0.001,
              regularization=0.005,
              n_epochs=10000,
              n_factors=15,
              min_rating=1,
              max_rating=5)
    svd.fit(X=rating)
    print("finish computing factorization")
    saveFileToPickle('user.pkl', svd.pu)
    saveFileToPickle('book.pkl', svd.qi)
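The saveFileToPickle helper is not shown in this listing; a minimal sketch of what it could look like, with the name and argument order taken from the calls above and the implementation assumed:

import pickle

def saveFileToPickle(path, obj):
    # Assumed implementation: serialize the factor matrix to disk with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)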
Example #2
def train_model(df):
    train = df.sample(frac=0.8, random_state=7)
    val = df.drop(train.index.tolist()).sample(frac=1.0, random_state=8)

    svd = SVD(learning_rate=0.1, regularization=0.005, n_epochs=10,
              n_factors=10, min_rating=1, max_rating=10)

    svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

    with open('svd_model', 'wb') as outfile:
        pickle.dump(svd, outfile)
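A possible follow-up, not part of the original example: reload the pickled model and score new user/item pairs (the ids below are placeholders).

import pickle

import pandas as pd

with open('svd_model', 'rb') as infile:
    svd = pickle.load(infile)

# The 'u_id'/'i_id' column names follow the funk-svd convention used above.
new_pairs = pd.DataFrame({'u_id': [1, 2], 'i_id': [10, 20]})
print(svd.predict(new_pairs))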
Example #3
def main():
    # Hyperparameters
    m = 5  # Number of bootstrap resamples

    irt_iters = 30
    irt_lr = 0.009

    svd_lr = 0.01
    svd_reg = 0.10
    svd_k = 50
    svd_iters = 500

    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_private_test_csv("../data")

    val_svd = {'u_id': val_data['user_id'],
               'i_id': val_data['question_id'],
               'rating': val_data['is_correct']}
    test_svd = {'u_id': test_data['user_id'],
                'i_id': test_data['question_id'],
                'rating': test_data['is_correct']}

    svd_train_resamples = generate_resamples(train_data, m)
    irt_train_resamples = generate_resamples(train_data, m)

    svd_test_pred, irt_test_pred = [], []
    for i in range(m):
        curr_irt, curr_svd = irt_train_resamples[i], svd_train_resamples[i]

        # Train 2-PL IRT
        theta, a, beta, train_acc, val_acc, train_log_likes, val_log_likes, final = \
            irt(curr_irt, val_data, irt_lr, irt_iters)

        irt_test_pred.append(irt_predict(test_data, theta, a, beta)[0])

        # Train Funk SVD
        curr_svd = {'u_id': curr_svd['user_id'], 'i_id': curr_svd['question_id'],
                    'rating': curr_svd['is_correct']}

        svd = SVD(learning_rate=svd_lr, regularization=svd_reg, n_epochs=svd_iters,
                  n_factors=svd_k, min_rating=0, max_rating=1)
        svd.fit(X=pd.DataFrame(curr_svd), X_val=pd.DataFrame(val_svd),
                early_stopping=False, shuffle=False)

        svd_test_pred.append(svd.predict(test_svd))

    test_avg = np.sum(irt_test_pred + svd_test_pred, axis=0) / (2 * m)

    binary_pred = [0 if x < 0.5 else 1 for x in test_avg]
    test_data['is_correct'] = binary_pred
    save_private_test_csv(test_data)
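The generate_resamples helper is defined elsewhere in this project; a plausible sketch, assuming each data dictionary holds parallel sequences keyed by 'user_id', 'question_id', and 'is_correct':

import numpy as np

def generate_resamples(data, m):
    # Assumed implementation: m bootstrap resamples drawn with replacement.
    n = len(data['user_id'])
    resamples = []
    for _ in range(m):
        idx = np.random.choice(n, size=n, replace=True)
        resamples.append({key: [values[i] for i in idx]
                          for key, values in data.items()})
    return resamples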
Example #4
def svd(data, svd_data, lr=0.01, reg=0.1, k=10, iters=500):
    train_data, val_data = data['train_data'], data['val_data']
    train_svd, val_svd = svd_data['train_svd'], svd_data['val_svd']

    svd = SVD(learning_rate=lr,
              regularization=reg,
              n_epochs=iters,
              n_factors=k,
              min_rating=0,
              max_rating=1)
    svd.fit(X=pd.DataFrame(train_svd),
            X_val=pd.DataFrame(val_svd),
            early_stopping=False,
            shuffle=False)

    # Train Accuracy
    pred = svd.predict(train_svd)
    train_acc = evaluate(train_data, pred)

    # Validate Accuracy
    pred = svd.predict(val_svd)
    val_acc = evaluate(val_data, pred)

    return train_acc, val_acc
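The evaluate helper is also project-specific and not shown here; a minimal sketch, assuming it thresholds the predicted correctness probabilities at 0.5 and reports accuracy against the is_correct labels:

import numpy as np

def evaluate(data, predictions):
    # Assumed implementation: binarize the predictions and compare with the labels.
    binary = np.array(predictions) >= 0.5
    return float(np.mean(binary == np.array(data['is_correct'])))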
Example #5
            # Otherwise, register movie_id as a new entry
            movie_id.append(row['i_id'])
            movie_count.append(1)
            movie_total_rating.append(row['rating'])
print('Total movie count:', len(movie_id))

# Funk SVD for item representation
train = data[data['u_id'].isin(train_id)]
test = data[data['u_id'].isin(test_id)]
svd = SVD(learning_rate=1e-3,
          regularization=0.005,
          n_epochs=200,
          n_factors=128,
          min_rating=0,
          max_rating=5)
# Fit on the train split and use the held-out test split for early stopping.
svd.fit(X=train, X_val=test, early_stopping=True, shuffle=False)
item_matrix = svd.qi


def get_feature(input_id):
    # Look up the feature vector for the given movie_id.
    movie_index = np.where(np.asarray(movie_id) == input_id)
    return item_matrix[movie_index]


def action_mapping(input_id):
    '''
    convert input movie id to index
    :param input_id: movie id
    :return: index of movie.
    '''
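The body of action_mapping is cut off in this listing; one plausible completion, assumed rather than taken from the original source:

def action_mapping(input_id):
    # Assumed implementation: position of the movie id in the movie_id list built above.
    return movie_id.index(input_id)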
Example #6
def main():
    train_data = load_train_csv("../data")
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    train_svd = {
        'u_id': train_data['user_id'],
        'i_id': train_data['question_id'],
        'rating': train_data['is_correct']
    }
    val_svd = {
        'u_id': val_data['user_id'],
        'i_id': val_data['question_id'],
        'rating': val_data['is_correct']
    }
    test_svd = {
        'u_id': test_data['user_id'],
        'i_id': test_data['question_id'],
        'rating': test_data['is_correct']
    }

    data = {"train_data": train_data, "val_data": val_data}
    svd_data = {"train_svd": train_svd, "val_svd": val_svd}

    lrs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    regs = [0.001, 0.01, 0.05, 0.1, 0.5, 1]
    ks = [1, 5, 10, 20, 50, 100]
    iters = [10, 50, 100, 500, 1000, 2000]

    lr_train_results = []
    lr_val_results = []
    reg_train_results = []
    reg_val_results = []
    ks_train_results = []
    ks_val_results = []
    iters_train_results = []
    iters_val_results = []

    for lr in lrs:
        train_result, val_result = svd(data, svd_data, lr=lr)
        lr_train_results.append(train_result)
        lr_val_results.append(val_result)

    for reg in regs:
        train_result, val_result = svd(data, svd_data, reg=reg)
        reg_train_results.append(train_result)
        reg_val_results.append(val_result)

    for k in ks:
        train_result, val_result = svd(data, svd_data, k=k)
        ks_train_results.append(train_result)
        ks_val_results.append(val_result)

    for n_iters in iters:
        train_result, val_result = svd(data, svd_data, iters=n_iters)
        iters_train_results.append(train_result)
        iters_val_results.append(val_result)

    best_lr = lrs[lr_val_results.index(max(lr_val_results))]
    print("Best learning rate: ", best_lr)
    best_reg = regs[reg_val_results.index(max(reg_val_results))]
    print("Best regularization value: ", best_reg)
    best_k = ks[ks_val_results.index(max(ks_val_results))]
    print("Best k: ", best_k)
    best_iter = iters[iters_val_results.index(max(iters_val_results))]
    print("Best iterations: ", best_iter)

    plot(lrs, lr_train_results, lr_val_results, "Learning Rates")
    plot(regs, reg_train_results, reg_val_results, "Regularization Values")
    plot(ks, ks_train_results, ks_val_results, "K-Values")
    plot(iters, iters_train_results, iters_val_results, "Iterations")

    final_svd = SVD(learning_rate=best_lr,
                    regularization=best_reg,
                    n_epochs=best_iter,
                    n_factors=best_k,
                    min_rating=0,
                    max_rating=1)
    final_svd.fit(X=pd.DataFrame(train_svd),
                  X_val=pd.DataFrame(val_svd),
                  early_stopping=False,
                  shuffle=False)

    # Train Accuracy
    pred = final_svd.predict(train_svd)
    train_acc = evaluate(train_data, pred)
    print("Final Train Accuracy: ", train_acc)

    # Validate Accuracy
    pred = final_svd.predict(val_svd)
    val_acc = evaluate(val_data, pred)
    print("Final Validation Accuracy: ", val_acc)

    # Test Accuracy
    pred = final_svd.predict(test_svd)
    test_acc = evaluate(test_data, pred)
    print("Final Test Accuracy: ", test_acc)
Example #7
import pandas as pd
import numpy as np

from funk_svd.dataset import fetch_ml20m_ratings
from funk_svd import SVD

from sklearn.metrics import mean_absolute_error

df = fetch_ml20m_ratings()

train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(learning_rate=0.001,
          regularization=0.005,
          n_epochs=100,
          n_factors=15,
          min_rating=1,
          max_rating=5)

svd.fit(X=train, X_val=val, early_stopping=True, shuffle=False)

pred = svd.predict(test)
mae = mean_absolute_error(test["rating"], pred)

print("Test MAE: {:.2f}".format(mae))
Example #8
class SVD_Explainable_2(Strategy):

    def __init__(self, data_items, ratings_train, ratings_validation):
        self.name = 'SVD_Explainable_2'
        self.data_items = data_items
        self.ratings_train = ratings_train
        self.ratings_validation = ratings_validation
        self.explanations_matrix = pd.read_csv('explanation_matrix_user_based.csv')
        self.svd = SVD(self.explanations_matrix, learning_rate=0.005, regularization=0.005,
                       n_epochs=1000, n_factors=15, min_rating=1, max_rating=2, lambda_=0.000)
        self.svd.fit(X=ratings_train, X_val=ratings_validation, early_stopping=True, shuffle=False)
        # self.predicted_matrix = pd.DataFrame(self.mf.full_matrix(), index=data_items.index, columns=data_items.columns)

    def get_users_of_project(self,project):
        users_of_project = self.data_items[project]
        users_of_project = users_of_project[users_of_project > 0].index.values
        return users_of_project

    def get_user_projects(self, user_id):
        known_user_likes = self.data_items.loc[user_id]
        known_user_likes = known_user_likes[known_user_likes > 0].index.values
        return known_user_likes

    def calc_explanation_score_user_based(self,user_id,project,cf_user_user):
        k=50
        similar_users = cf_user_user.find_k_similar_users(user_id, k=k).index
        user_liked_project = self.get_users_of_project(project)
        return len(np.intersect1d(similar_users, user_liked_project))/len(similar_users)

    def calc_explanation_score_item_based(self,user_id,project,cf_item_item):
        k=10
        similar_projects = cf_item_item.get_k_similar_projects(project, k=k)
        known_user_projects = self.get_user_projects(user_id)
        return len(np.intersect1d(similar_projects, known_user_projects))/len(similar_projects)

    def get_explanations_matrix(self):
        i = 0
        # cf_item_item = CFItemItem(self.data_items)
        cf_user_user = CFUserUser(self.data_items)
        # Float-initialized so the fractional explanation scores are stored as-is.
        explanation_matrix = pd.DataFrame(0.0, columns=self.data_items.columns, index=self.data_items.index)
        print(explanation_matrix.shape)
        for user_id in explanation_matrix.index:
            print(i)
            i += 1
            for project in explanation_matrix.columns:
                # Single .loc call so the assignment writes to the DataFrame itself.
                explanation_matrix.loc[user_id, project] = \
                    self.calc_explanation_score_user_based(user_id, project, cf_user_user)
        return explanation_matrix

    def get_recommendations(self, user_index, known_user_projects, k, ip_address):
        projects_predicted_ratings = \
            [[project, self.svd.predict_pair(user_index, project, clip=False)]
             for project in self.data_items.columns
             if project not in known_user_projects]

        # projects_predicted_ratings = \
        #     [[project, self.predicted_matrix.loc[user_index][project]]
        #      for project in self.data_items.columns
        #      if project not in known_user_projects]
        projects_predicted_ratings = sorted(projects_predicted_ratings, key=lambda i: i[1], reverse=True)
        self.projects_predicted_ratings = projects_predicted_ratings
        self.user = user_index
        projects_predicted_ratings = [i[0] for i in projects_predicted_ratings]
        projects_predicted_ratings = self.remove_non_active_projects(projects_predicted_ratings)
        # projects_predicted_ratings = self.remove_unreachable_projects(projects_predicted_ratings, ip_address)
        return projects_predicted_ratings[:k]

    @staticmethod
    def remove_non_active_projects(recommended_projects):
        from Recommender import non_active_projects
        return [project for project in recommended_projects if project not in non_active_projects['project'].values]

    @staticmethod
    def remove_unreachable_projects(recommended_projects, ip_address):
        user_loc = get_user_loc(ip_address)
        return [project for project in recommended_projects if is_project_reachable_to_user(user_loc, project)]

    def get_highest_online_project(self):
        from Recommender import is_online_project, recommend_default_online
        online_similar_projects = list(filter(lambda x: is_online_project(x[0]), self.projects_predicted_ratings))
        if len(online_similar_projects) == 0:
            return recommend_default_online(self.user)
        return online_similar_projects[0][0]
Example #9
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD

from sklearn.metrics import mean_absolute_error

df = fetch_ml_ratings(variant='100k')

train = df.sample(frac=0.8, random_state=7)
val = df.drop(train.index.tolist()).sample(frac=0.5, random_state=8)
test = df.drop(train.index.tolist()).drop(val.index.tolist())

svd = SVD(lr=0.001,
          reg=0.005,
          n_epochs=100,
          n_factors=15,
          early_stopping=True,
          shuffle=False,
          min_rating=1,
          max_rating=5)

svd.fit(X=train, X_val=val)

pred = svd.predict(test)
mae = mean_absolute_error(test['rating'], pred)

print(f'Test MAE: {mae:.2f}')
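A possible extension, not part of the original example: report RMSE alongside MAE on the same held-out predictions.

import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(test['rating'], pred))
print(f'Test RMSE: {rmse:.2f}')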