Example No. 1
     algo = SVDpp()
     name = "SVD++"
 elif klass.__name__ == 'PMF':
     algo = SVD(biased=False)
     name = "PMF"
 # elif klass.__name__ == 'NMF':
 #     algo = NMF()
 #     name="NMF"
 elif klass.__name__ == 'SlopeOne':
     algo = SlopeOne()
     name = "SlopeOne"
 elif klass.__name__ == 'KNNBasic':
     # User-based collaborative filtering
     algo = KNNBasic(k=40,
                     sim_options={
                         'name': 'cosine',
                         'user_based': True
                     })
     name = "CFBaseUser"
 # NOTE: this condition duplicates the branch above and can never match;
 # the item-based variant presumably keyed on a different marker class.
 elif klass.__name__ == 'KNNBasic':
     # Item-based collaborative filtering
     algo = KNNBasic(k=40,
                     sim_options={
                         'name': 'cosine',
                         'user_based': False
                     })
     name = "CFBaseItem"
 elif klass.__name__ == 'KNNWithMeans':
     algo = KNNWithMeans()
     name = "KNNWithMeans"
 elif klass.__name__ == 'KNNBaseline':
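# A sketch of a mapping-based factory that avoids the duplicated-condition
# problem noted above (names mirror the branches; imports added for
# self-containment -- this is illustrative, not the original dispatch):
from surprise import SVD, SVDpp, SlopeOne, KNNBasic, KNNWithMeans

ALGO_FACTORIES = {
    'SVD++': lambda: SVDpp(),
    'PMF': lambda: SVD(biased=False),
    'SlopeOne': lambda: SlopeOne(),
    'CFBaseUser': lambda: KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True}),
    'CFBaseItem': lambda: KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': False}),
    'KNNWithMeans': lambda: KNNWithMeans(),
}

def make_algo(name):
    # Returns (algorithm instance, display name); raises KeyError for unknown names.
    return ALGO_FACTORIES[name](), name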
Example No. 2
        print("[Fold %d], UserItemAvg" % (f+1))
        print("==========================================================================")
        UserItemAvg_preds = BaselineOnly().fit(trainset).test(testset)
        errors, ttest_results, pairwise_results, mannwhitneyu_results = evaluate_for_each_group(UserItemAvg_preds, U1, U2, U3, U4, beyms, ms)
        UserItemAvg_errors_per_fold.append(errors)
        UserItemAvg_predictions.extend(UserItemAvg_preds)
        print("Mean Absolute Error: " + str(errors))
        print("t-Test: " + str(ttest_results))
        print("ANOVA: " + str(pairwise_results["ANOVA"]))
        print(pairwise_results["TukeyHSD"])
        print("Mann-Whiteney-U Test: " + str(mannwhitneyu_results))

        print("==========================================================================")
        print("[Fold %d], UserKNN" % (f+1))
        print("==========================================================================")
        UserKNN_preds = KNNBasic(k=40, sim_options={"name": "cosine"}).fit(trainset).test(testset)
        errors, ttest_results, pairwise_results, mannwhitneyu_results = evaluate_for_each_group(UserKNN_preds, U1, U2, U3, U4, beyms, ms)
        UserKNN_errors_per_fold.append(errors)
        UserKNN_predictions.extend(UserKNN_preds)
        print("Mean Absolute Error: " + str(errors))
        print("t-Test: " + str(ttest_results))
        print("ANOVA: " + str(pairwise_results["ANOVA"]))
        print(pairwise_results["TukeyHSD"])
        print("Mann-Whiteney-U Test: " + str(mannwhitneyu_results))

        print("==========================================================================")
        print("[Fold %d], UserKNNAvg" % (f+1))
        print("==========================================================================")
        UserKNNAvg_preds = KNNWithMeans(k=40, sim_options={"name": "cosine"}).fit(trainset).test(testset)
        errors, ttest_results, pairwise_results, mannwhitneyu_results = evaluate_for_each_group(UserKNNAvg_preds, U1, U2, U3, U4, beyms, ms)
        UserKNNAvg_errors_per_fold.append(errors)
Example No. 3
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import cross_validate
# This script mainly tests different parameters of KNN and SVD to find the best results

data = Dataset.load_builtin('ml-100k')

# Iterate over k in KNN to find the optimal MAE score
i = 3
while i != 20:
    print("K = ", i)
    # Use the famous KNN algorithm (item-based)
    algo = KNNBasic(k=i, sim_options={'user_based': False})

    # Run 5-fold cross-validation and print results.
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    i += 1

# Iterate over n_factors in SVD to find the optimal MAE score

i = [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]
for index in i:
    print("n_factor = ", index)
    # Use the famous SVD algorithm
    algo = SVD(n_factors=index)

    # Run 5-fold cross-validation and print results.
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
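# The same sweeps can be written with Surprise's built-in GridSearchCV
# (a sketch; the n_factors range mirrors the list above, and the KNN k
# sweep can be expressed the same way):
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)
print(gs.best_score['mae'], gs.best_params['mae'])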
Example No. 4
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):

    algo = 'Item-based KNN'
    sim_options = {'user_based': False}
    algorithm = KNNBasic(sim_options=sim_options)

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None)  # pageview
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine,
                                          autocommit=False,
                                          autoflush=False))

    # Read the ratings from the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()


    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    # Only rating_scale is used by load_from_df; line_format and sep are ignored.
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()


    # algorithm = eval(algo + "()")  # set the algorithm dynamically

    algorithm.fit(trainset)  # train() was removed in Surprise >= 1.1; use fit()

    items = pd.read_sql('SELECT distinct id FROM items;', con = engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])


    predicted_ratings = []
    for item in prediction_items:
        # Prediction is a namedtuple: (uid, iid, r_ui, est, details)
        pred = algorithm.predict(user_id, item)
        predicted_ratings.append(pred.est)

    predictions['item_id'] = prediction_items
    predictions['user_id'] = user_id  # scalar broadcasts to every row
    predictions['prediction'] = predicted_ratings


    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)


    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']

    df_pred = predictions[['item_id']].T

    df_pred.columns = cols

    df_pred['id'] = user_id

    df_pred = df_pred[['id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
                       'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']]
    df_pred['id'] = df_pred['id'].astype(int)

    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()


    df_num_ratings = test_prediction
    df_num_ratings = df_num_ratings.head(n=20)

    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'}, inplace=True)

    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append', index=False)
    session.commit()


    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']

    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id

    df_num_ratings_transpose = df_num_ratings_transpose[['id', 'num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                                                         'num_6', 'num_7', 'num_8', 'num_9', 'num_10']]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)

    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine, if_exists='append', index=False)
    session.commit()
Example No. 5
train_data = data.build_full_trainset()

# Load test data
file_path = "../data/test.csv"
reader = Reader(line_format='user item rating',
                rating_scale=(0, 10),
                sep=',',
                skip_lines=1)
print("Loading test data from file...")
data = Dataset.load_from_file(file_path, reader=reader)
test = data.build_full_trainset()
test_data = test.build_testset()

# User-based CF
user_sim_options = {'name': 'cosine', 'user_based': True}
user_knn = KNNBasic(sim_options=user_sim_options)
#user_knn = KNNBasic(min_k=15,sim_options=user_sim_options)
#user_knn = KNNBasic(k=30,sim_options=user_sim_options)
print("Fitting KNN user model to training set")
user_knn.fit(train_data)

print("Making user-based predictions on the test set...")
user_predictions = user_knn.test(test_data)

print("User-based stats ...")
RMSE = accuracy.rmse(user_predictions, verbose=False)
print("RMSE: " + str(RMSE))
precisions, recalls = precision_recall_at_k(user_predictions, 15, threshold=7)
# Note: these are sums over all users; divide by len(precisions) / len(recalls)
# to get the usual mean precision and recall.
precision = sum(precisions.values())
recall = sum(recalls.values())
total = precision + recall
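# The helper precision_recall_at_k is not shown above; a minimal sketch in the
# style of the Surprise FAQ (assuming the same signature and per-user dicts):
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # Map each user to a list of (estimated rating, true rating) pairs
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls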
Example No. 6
import os

app = Flask(__name__)
CORS(app)
print("initiated the app! With a name of:", __name__)

cache = TTLCache(maxsize=100, ttl=300)


@app.route('/api', methods=['GET'])
def isActive():
    return jsonify(status="service is online", api_version="0.1")


algo = KNNBasic()


@app.route('/train', methods=['GET'])
def train_model():
    histories = requests.get(
        'https://whispering-refuge-67560.herokuapp.com/api/histories')
    history_data = json.loads(histories.content.decode('utf-8'))
    data_train = pd.DataFrame.from_dict(history_data, orient='columns')
    # drop() returns a new frame, so the original un-assigned call was a no-op;
    # selecting the needed columns directly makes it unnecessary.
    data_train = data_train[['tid', 'gid', 'rating']]

    sim_options = {'name': 'cosine', 'user_based': False}

    global algo
Example No. 7
def run_rec(dataset, num_rec=20):
    r_cols = ['user_id', 'item_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv(
        'ml-100k/sample.txt',
        sep=' ',
        names=['user_id', 'item_id', 'rating', 'unix_timestamp'])
    # ratings = pd.read_csv('ml-100k/ua.base.txt', sep='\t', names=r_cols)
    # ratings = pd.DataFrame(dataset, columns=['user_id', 'item_id', 'rating'])

    train_data = ratings.to_numpy()
    n_rows, n_cols = train_data.shape

    # normalized_data = train_data.copy()
    normalized_data = np.ndarray((n_rows, n_cols), dtype=object)
    for r in range(n_rows):
        normalized_data[r, 0] = train_data[r, 0]
        normalized_data[r, 1] = train_data[r, 1]
        normalized_data[r, 2] = float(train_data[r, 2])
        normalized_data[r, 3] = train_data[r, 3]

    # User mean
    # users = train_data[:, 0]
    # n_users = int(np.max(train_data[:, 0]))
    # mean_rating_matrix = np.zeros((n_users + 1,))
    # for u in range(1, n_users + 1):
    #     indices = np.where(users == u)[0].astype(np.int32)
    #     temp_ratings = train_data[indices, 2]
    #     # temp_ratings = [float(temp) for temp in train_data[indices, 2]]
    #     mean_rating_matrix[u] = np.mean(temp_ratings) if indices.size > 0 else 0
    #     normalized_data[indices, 2] = temp_ratings - mean_rating_matrix[u]

    # Item mean
    items = train_data[:, 1]
    n_items = int(np.max(train_data[:, 1]))
    mean_rating_matrix = np.zeros((n_items + 1, ))
    for i in range(1, n_items + 1):
        indices = np.where(items == i)[0].astype(np.int32)
        temp_ratings = train_data[indices, 2]
        # temp_ratings = [float(temp) for temp in train_data[indices, 2]]
        mean_rating_matrix[i] = np.mean(
            temp_ratings) if indices.size > 0 else 0
        normalized_data[indices, 2] = temp_ratings - mean_rating_matrix[i]

    new_ratings = pd.DataFrame(
        normalized_data,
        columns=['user_id', 'item_id', 'rating', 'unix_timestamp'])

    reader = Reader()

    data = Dataset.load_from_df(new_ratings[['user_id', 'item_id', 'rating']],
                                reader)
    trainset = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(sim_options=sim_options)

    algo.fit(trainset)
    print(algo.sim)

    item_raw_id = 1
    item_inner_id = algo.trainset.to_inner_iid(item_raw_id)
    # print(item_inner_id)

    item_neighbors_inner_ids = algo.get_neighbors(item_inner_id, k=num_rec)
    # for inner_id in item_neighbors_inner_ids:
    #     print(inner_id)

    item_neighbors_raw_ids = (algo.trainset.to_raw_iid(inner_id)
                              for inner_id in item_neighbors_inner_ids)
    print('Start')
    for raw_id in item_neighbors_raw_ids:
        print(raw_id)

    print('Done')
Example No. 8
logging.basicConfig(filename='benchmark.log', level=logging.DEBUG)

reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(
    grid_new[['user_name', 'game_genres', 'scaled_score']], reader)

benchmark = []
# Iterate over all algorithms
for algorithm in [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering()
]:

    # Perform cross validation
    results = cross_validate(algorithm,
                             data,
                             measures=['RMSE'],
                             cv=3,
                             verbose=False)

    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
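    # (Truncated in the source; the usual continuation -- the same pattern as in
    # Example No. 16 below -- tags the row with the algorithm name:)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                               index=['Algorithm']))
    benchmark.append(tmp)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')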
Example No. 9
################################################# Algorithm Selection #######################################
print(
    'Select the model: \n\n 1-SVD \n 2-KNN \n 3-Cosine Similarity \n 4-Pearson Similarity'
)
algo_select = input("Enter number 1-4: ")
algo_select = int(algo_select)

if algo_select == 1:
    # Using the famous SVD algorithm to build the model
    print('\nModel is SVD ')
    algo = SVD()

elif algo_select == 2:
    # Use KNNBasic algorithm to build the model
    print('\nModel is KNN ')
    algo = KNNBasic()

elif algo_select == 3:
    # use Cosine similarity measure to estimate rating
    print('\nModel is Cosine Similarity ')
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute similarities between items
    }
    algo = KNNBasic(sim_options=sim_options)

else:
    # use Pearson similarity measure to estimate rating
    print('\nModel is Pearson Similarity ')
    sim_options = {
        'name': 'pearson_baseline',
Example No. 10
def train_model():
    # Send request to Nodejs server for authentication.
    if not is_good_request(request):
        return abort(400)

    # Extract data from request.
    data = request.get_json()
    # Note: unpacking data.values() relies on the JSON object's key order.
    dataset, data_header, model_name, params, train_type, save_on_server, save_on_local = data.values()

    # if not is_header_valid(data_header):
    #     return jsonify({'message': '[ERROR] Incorrect dataset format.'})

    # Use the data uploaded or data on server.
    df = pd.DataFrame(dataset, columns=data_header) if dataset else pd.read_csv('./data/data-new.csv', header=0)
    try:
        train_set, test_set = build_train_test(df, Reader(), full=train_type == 'full')
    except ValueError:
        return jsonify({'error': 'Incorrect dataset format.'})

    if model_name == 'insvd':
        n_factors, n_epochs, lr_all, reg_all, random_state = params.values()
        # Parse data types.
        n_factors = int(n_factors)
        n_epochs = int(n_epochs)
        lr_all = float(lr_all)
        reg_all = float(reg_all)
        random_state = int(random_state)
        model = InSVD(n_factors=n_factors, n_epochs=n_epochs,
                      lr_all=lr_all, reg_all=reg_all, random_state=random_state)
    else:
        k, sim_options, random_state = params.values()
        model = KNNBasic(k=int(k), sim_options={'name': sim_options, 'user_based': False},
                         random_state=int(random_state))

    # Fitting and testing.
    model.fit(train_set)
    predictions = model.test(test_set)

    # Add suffix if not save on server.
    model_path = f'./model/{model_name}' if save_on_server else f'./model/{model_name}-temp'

    # Save.
    dump.dump(model_path, algo=model, predictions=predictions)
    model_info = {
        'rmse': rmse(predictions),
        'mae': mae(predictions),
    }

    # Zip the trained model.
    try:
        zip_obj = ZipFile(f'{model_path}.zip', 'w')
        zip_obj.write(model_path)
        zip_obj.close()
    except FileNotFoundError:
        return abort(404)

    @after_this_request
    def remove_dump_files(response):
        # If not save model on server, delete model dump file.
        if not save_on_server:
            os.remove(model_path)

        # Always delete the .zip file.
        os.remove(f'{model_path}.zip')

        return response

    if save_on_local:
        with open(f'{model_path}.zip', 'rb') as f:
            model_zip = f.read()

        resp = Response(model_zip)
        resp.headers['X-Model-Info'] = json.dumps(model_info)
        resp.headers['Content-Type'] = 'application/zip'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s;' % 'model.zip'

        return resp
        # return Response(model_zip, headers={
        #     'X-Info': json.dumps(model_info),
        #     'Content-Type': 'application/zip',
        #     'Content-Disposition': 'attachment; filename=%s;' % 'model.zip',
        # })

        # return send_from_directory('./model', model_file), 200

    return jsonify(model_info)
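# A minimal client-side sketch for exercising this route (the URL and payload
# values are hypothetical; key order must match the data.values() unpacking above):
import requests

payload = {
    'dataset': None,  # fall back to ./data/data-new.csv on the server
    'data_header': ['user', 'item', 'rating'],
    'model_name': 'knn',
    'params': {'k': '40', 'sim_options': 'cosine', 'random_state': '0'},
    'train_type': 'full',
    'save_on_server': True,
    'save_on_local': False,
}
resp = requests.post('http://localhost:5000/train', json=payload)
print(resp.json())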
Example No. 11

# Load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp',
                sep='\t',
                skip_lines=0)
# print(reader)
data = Dataset.load_from_file(file_path, reader=reader)

# Split the data into 3 folds (legacy API, removed in Surprise >= 1.1)
data.split(n_folds=3, shuffle=False)

# Item-based collaborative filtering algorithm
algo = KNNBasic(sim_options={'user_based': False})
# Using a specific value of k:
# algo = KNNBasic(k=20, sim_options={'name': 'MSD', 'user_based': True})

# To use MSD in the item-based algorithm:
# algo = KNNBasic(sim_options={'name': 'MSD', 'user_based': False})
# To use cosine in the item-based algorithm:
# algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
# To use Pearson in the item-based algorithm:
# algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': False})

# Print the results
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
# def printItem():
#     print()
#     pt(perf)
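# data.split() and evaluate() were removed in Surprise >= 1.1; an equivalent
# run with the current model_selection API might look like this (a sketch,
# assuming the same restaurant_ratings.txt data loaded above):
from surprise.model_selection import cross_validate

cross_validate(KNNBasic(sim_options={'user_based': False}), data,
               measures=['RMSE', 'MAE'], cv=3, verbose=True)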
Example No. 12
from surprise import Dataset
from surprise import KNNBasic
from collections import defaultdict
import os, io

data = Dataset.load_builtin("ml-100k")
trainingSet = data.build_full_trainset()

sim_options = {'name': 'cosine', 'user_based': False}

knn = KNNBasic(sim_options=sim_options)
knn.fit(trainingSet)

testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)


def get_top3_recommendations(predictions, topN=3):

    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))

    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs


def read_item_names():
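    """Read the u.item file from the ml-100k dataset and return two mappings,
    raw id -> movie name and movie name -> raw id.

    (The body is truncated in the source; this is a sketch of the usual
    Surprise-docs helper, assuming the ml-100k u.item layout and that
    get_dataset_dir is available from surprise.)
    """
    from surprise import get_dataset_dir

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid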
Example No. 13
def generate_sim_matrix(trainSet, sim_metric, is_user=True):
    sim_options = {'name': sim_metric, 'user_based': is_user}
    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    return simsMatrix
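# Usage sketch (assuming the ml-100k builtin dataset; metric names follow
# Surprise's sim_options: 'cosine', 'msd', 'pearson', 'pearson_baseline'):
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
train_set = data.build_full_trainset()
item_sims = generate_sim_matrix(train_set, 'cosine', is_user=False)
print(item_sims.shape)  # (n_items, n_items)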
Example No. 14
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
import os
# Load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=3)

for k in range(10, 100, 10):
    print("K is " + str(k))
    algo = KNNBasic(k=k, sim_options={'name': 'MSD', 'user_based': True})
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
Example No. 15
    data = ml.loadMovieLensLatestSmall()
    print("\nComputing movie popularity ranks so we can measure novelty later...")
    rankings = ml.getPopularityRanks()
    return (ml, data, rankings)

np.random.seed(0)
random.seed(0)

# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)

# User-based KNN
UserKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN, "User KNN")

# Item-based KNN
ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN, "Item KNN")

# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")

# Fight!
evaluator.Evaluate(False)

evaluator.SampleTopNRecs(ml)
Example No. 16
    fig, ax = plt.subplots(figsize=(10, 5))
    plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    ax.plot(x, fit_time, marker='o', label="Fit Time")
    ax.plot(x, test_time, marker='o', label="Test Time")

    # Chart setup
    plt.title("Model Time Evaluation", fontsize=12)
    plt.xlabel("Cross Validation Folds", fontsize=10)
    plt.ylabel("Times in seconds", fontsize=10)
    plt.legend()
    plt.show()

benchmark = []
# Iterate over all algorithms -- each fold serves once as the test set, with the remaining k-1 folds used for training
#for algorithm in [SVD(), KNNBasic()]:
algorithm = KNNBasic()
# Perform cross validation
results = cross_validate(algorithm, data, measures=['RMSE', 'MAE', "MSE"], cv=3, verbose=False)

# Get results & append algorithm name
tmp = pd.DataFrame.from_dict(results).mean(axis=0)
tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
benchmark.append(tmp)
plot_Errors(results)
#plot_times(results)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

benchmark[0]

kf = KFold(n_splits=3)
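# A sketch of driving the KFold splitter declared above by hand (assumes the
# same `data` and `algorithm` objects from earlier in the file):
from surprise import accuracy

for trainset, testset in kf.split(data):
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)
    accuracy.rmse(predictions, verbose=True)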
Example No. 17
    return (ml, data, rankings)


np.random.seed(0)
random.seed(0)

# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()

# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)

# User-based KNN
UserKNN = KNNBasic(sim_options={
    'name': 'pearson_baseline',
    'user_based': True,
    'shrinkage': 0
})
evaluator.AddAlgorithm(UserKNN, "User KNN")

# Item-based KNN
#ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
#evaluator.AddAlgorithm(ItemKNN, "Item KNN")

# Just make random recommendations
#Random = NormalPredictor()
#evaluator.AddAlgorithm(Random, "Random")

# Fight!
evaluator.Evaluate(True)
Example No. 18
def run():
    for algorithm in [SVD(), KNNBasic()]:
        prec_rec_test(algorithm)
    for algorithm in [SVD(), KNNBasic()]:
        plot_PRF(algorithm)
Example No. 19
 def __calc_sim_matrix(self):
     algo = KNNBasic(sim_options=self.sim_options)
     algo.fit(self.trainset)
     self.similarity_matrix = algo.compute_similarities()
Example No. 20
    def train_surprise(self,
                       model_type,
                       trainset,
                       testset,
                       k_recommend,
                       sql_db,
                       k_fold,
                       knowledge,
                       model_name,
                       result_name,
                       system_eval=False):

        knn_user_based = self.config['SURPRISE_KNN'].getboolean(
            'knn_user_based')
        knn_similarity = self.config['SURPRISE_KNN']['knn_similarity']
        sim_options = {'name': knn_similarity, 'user_based': knn_user_based}
        verbose_switch = self.config['DEFAULT'].getboolean('verbose_switch')
        # Select which model to use
        if (model_type == "svd"):
            # Get configuration values
            svd_grid_search = self.config['SURPRISE_SVD'].getboolean(
                'svd_grid_search')
            svd_grid_metric = self.config['SURPRISE_SVD']['svd_grid_metric']
            svd_n_factors = int(self.config['SURPRISE_SVD']['svd_n_factors'])
            svd_n_epochs = int(self.config['SURPRISE_SVD']['svd_n_epochs'])
            svd_biased = self.config['SURPRISE_SVD'].getboolean('svd_biased')
            svd_init_mean = float(self.config['SURPRISE_SVD']['svd_init_mean'])
            svd_init_std_dev = float(
                self.config['SURPRISE_SVD']['svd_init_std_dev'])
            svd_lr_all = float(self.config['SURPRISE_SVD']['svd_lr_all'])
            svd_reg_all = float(self.config['SURPRISE_SVD']['svd_reg_all'])

            if (self.common_functions.validate_available_sql_data(
                    'svd_params', sql_db) == True):
                results = pd.read_sql_query('select * from svd_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "svd")]
                if (real_results.empty == False):
                    svd_n_factors = int(real_results.iloc[0]['svd_n_factors'])
                    svd_n_epochs = int(real_results.iloc[0]['svd_n_epochs'])
                    svd_init_std_dev = float(
                        real_results.iloc[0]['svd_init_std_dev'])
                    svd_lr_all = float(real_results.iloc[0]['svd_lr_all'])
                    svd_reg_all = float(real_results.iloc[0]['svd_reg_all'])

            algo = SVD(n_factors=svd_n_factors,
                       n_epochs=svd_n_epochs,
                       biased=svd_biased,
                       init_mean=svd_init_mean,
                       init_std_dev=svd_init_std_dev,
                       lr_all=svd_lr_all,
                       reg_all=svd_reg_all,
                       verbose=verbose_switch)

        elif (model_type == "SVDpp"):
            # Get configuration values
            svdpp_grid_search = self.config['SURPRISE_SVDPP'].getboolean(
                'svdpp_grid_search')
            svdpp_grid_metric = self.config['SURPRISE_SVDPP'][
                'svdpp_grid_metric']
            svdpp_n_factors = int(
                self.config['SURPRISE_SVDPP']['svdpp_n_factors'])
            svdpp_n_epochs = int(
                self.config['SURPRISE_SVDPP']['svdpp_n_epochs'])
            svdpp_init_mean = float(
                self.config['SURPRISE_SVDPP']['svdpp_init_mean'])
            svdpp_init_std_dev = float(
                self.config['SURPRISE_SVDPP']['svdpp_init_std_dev'])
            svdpp_lr_all = float(self.config['SURPRISE_SVDPP']['svdpp_lr_all'])
            svdpp_reg_all = float(
                self.config['SURPRISE_SVDPP']['svdpp_reg_all'])

            if (self.common_functions.validate_available_sql_data(
                    'svdpp_params', sql_db) == True):
                results = pd.read_sql_query('select * from svdpp_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "svdpp")]
                if (real_results.empty == False):
                    svdpp_n_factors = int(
                        real_results.iloc[0]['svdpp_n_factors'])
                    svdpp_n_epochs = int(
                        real_results.iloc[0]['svdpp_n_epochs'])
                    svdpp_init_std_dev = float(
                        real_results.iloc[0]['svdpp_init_std_dev'])
                    svdpp_lr_all = float(real_results.iloc[0]['svdpp_lr_all'])
                    svdpp_reg_all = float(
                        real_results.iloc[0]['svdpp_reg_all'])

            algo = SVDpp(n_factors=svdpp_n_factors,
                         n_epochs=svdpp_n_epochs,
                         init_mean=svdpp_init_mean,
                         init_std_dev=svdpp_init_std_dev,
                         lr_all=svdpp_lr_all,
                         reg_all=svdpp_reg_all,
                         verbose=verbose_switch)

        elif (model_type == "NMF"):
            # Get configuration values
            nmf_grid_search = self.config['SURPRISE_NMF'].getboolean(
                'nmf_grid_search')
            nmf_grid_metric = self.config['SURPRISE_NMF']['nmf_grid_metric']
            nmf_n_factors = int(self.config['SURPRISE_NMF']['nmf_n_factors'])
            nmf_n_epochs = int(self.config['SURPRISE_NMF']['nmf_n_epochs'])
            nmf_biased = self.config['SURPRISE_NMF'].getboolean('nmf_biased')
            nmf_reg_pu = float(self.config['SURPRISE_NMF']['nmf_reg_pu'])
            nmf_reg_qi = float(self.config['SURPRISE_NMF']['nmf_reg_qi'])
            nmf_reg_bu = float(self.config['SURPRISE_NMF']['nmf_reg_bu'])
            nmf_reg_bi = float(self.config['SURPRISE_NMF']['nmf_reg_bi'])
            nmf_lr_bu = float(self.config['SURPRISE_NMF']['nmf_lr_bu'])
            nmf_lr_bi = float(self.config['SURPRISE_NMF']['nmf_lr_bi'])
            nmf_init_low = float(self.config['SURPRISE_NMF']['nmf_init_low'])
            nmf_init_high = int(self.config['SURPRISE_NMF']['nmf_init_high'])

            if (self.common_functions.validate_available_sql_data(
                    'nmf_params', sql_db) == True):
                results = pd.read_sql_query('select * from nmf_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "nmf")]
                if (real_results.empty == False):
                    nmf_n_factors = int(real_results.iloc[0]['nmf_n_factors'])
                    nmf_n_epochs = int(real_results.iloc[0]['nmf_n_epochs'])
                    nmf_reg_pu = float(real_results.iloc[0]['nmf_reg_pu'])
                    nmf_reg_qi = float(real_results.iloc[0]['nmf_reg_qi'])
                    nmf_init_low = float(real_results.iloc[0]['nmf_init_low'])

            algo = NMF(n_factors=nmf_n_factors,
                       n_epochs=nmf_n_epochs,
                       biased=nmf_biased,
                       reg_pu=nmf_reg_pu,
                       reg_qi=nmf_reg_qi,
                       reg_bu=nmf_reg_bu,
                       reg_bi=nmf_reg_bi,
                       lr_bu=nmf_lr_bu,
                       lr_bi=nmf_lr_bi,
                       init_low=nmf_init_low,
                       init_high=nmf_init_high,
                       verbose=verbose_switch)

        elif (model_type == "NormalPredictor"):
            algo = NormalPredictor()

        elif (model_type == "BaselineOnly"):
            algo = BaselineOnly(verbose=verbose_switch)

        elif (model_type == "KNNBasic"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnbasic_params', sql_db) == True):
                results = pd.read_sql_query('select * from knnbasic_params;',
                                            sql_db,
                                            index_col='index')
                real_results = results[(results["knowledge"] == knowledge)
                                       & (results["algorithm"] == "knnbasic")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNBasic(k=knn_k,
                            min_k=knn_min_k,
                            sim_options=sim_options,
                            verbose=verbose_switch)

        elif (model_type == "KNNWithMeans"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnwithmeans_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnwithmeans_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "knnwithmeans")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNWithMeans(k=knn_k,
                                min_k=knn_min_k,
                                sim_options=sim_options,
                                verbose=verbose_switch)

        elif (model_type == "KNNWithZScore"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnwithzscore_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnwithzscore_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "knnwithzscore")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNWithZScore(k=knn_k,
                                 min_k=knn_min_k,
                                 sim_options=sim_options,
                                 verbose=verbose_switch)

        elif (model_type == "KNNBaseline"):
            # Get configuration values
            knn_k = int(self.config['SURPRISE_KNN']['knn_k'])
            knn_min_k = int(self.config['SURPRISE_KNN']['knn_min_k'])
            knn_grid_search = self.config['SURPRISE_KNN'].getboolean(
                'knn_grid_search')
            knn_grid_metric = self.config['SURPRISE_KNN']['knn_grid_metric']

            if (self.common_functions.validate_available_sql_data(
                    'knnbaseline_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from knnbaseline_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) &
                                       (results["algorithm"] == "knnbaseline")]
                if (real_results.empty == False):
                    knn_k = int(real_results.iloc[0]['knn_k'])
                    knn_min_k = int(real_results.iloc[0]['knn_min_k'])

            algo = KNNBaseline(k=knn_k,
                               min_k=knn_min_k,
                               sim_options=sim_options,
                               verbose=verbose_switch)

        elif (model_type == "SlopeOne"):
            algo = SlopeOne()

        elif (model_type == "CoClustering"):
            # Get configuration values
            cc_grid_search = self.config['SURPRISE_COCLUSTERING'].getboolean(
                'cc_grid_search')
            cc_grid_metric = self.config['SURPRISE_COCLUSTERING'][
                'cc_grid_metric']
            cc_n_cltr_u = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_u'])
            cc_n_cltr_i = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_cltr_i'])
            cc_n_epochs = int(
                self.config['SURPRISE_COCLUSTERING']['cc_n_epochs'])

            if (self.common_functions.validate_available_sql_data(
                    'coclustering_params', sql_db) == True):
                results = pd.read_sql_query(
                    'select * from coclustering_params;',
                    sql_db,
                    index_col='index')
                real_results = results[(results["knowledge"] == knowledge) & (
                    results["algorithm"] == "coclustering")]
                if (real_results.empty == False):
                    cc_n_cltr_u = int(real_results.iloc[0]['cc_n_cltr_u'])
                    cc_n_cltr_i = int(real_results.iloc[0]['cc_n_cltr_i'])
                    cc_n_epochs = int(real_results.iloc[0]['cc_n_epochs'])

            algo = CoClustering(n_cltr_u=cc_n_cltr_u,
                                n_cltr_i=cc_n_cltr_i,
                                n_epochs=cc_n_epochs,
                                verbose=verbose_switch)
        else:
            return {
                "status": False,
                "result": "Defined model_type does not exist"
            }

        st = default_timer()
        print("STARTING to train model: " + str(model_name))
        algo.fit(trainset)
        train_model_runtime = default_timer() - st
        # Store the processing time in the database
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_training",
            description="Time for model to be trained on dataset")

        # Save the model
        # Create the directory if it doesn't exist
        if (os.path.isdir(self.models_path + model_name) == False):
            try:
                os.makedirs(self.models_path + model_name)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    return {"status": False, "result": e}
        # Store the model on the file system
        #file_name =  self.models_path+model_name+"/model"
        #dump.dump(file_name, algo=algo)

        st = default_timer()
        print("STARTING to generate predictions with the trained model: " +
              str(model_name))
        predictions = algo.test(testset)
        runtime = default_timer() - st

        print(
            "Total runtime for generating the Surprise predictions:",
            round(runtime, 2))
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_generate_recommendations",
            description="Time for predictions to be generated using the model")

        # Save predictions for hybridization
        # Create the directory if it doesn't exist
        if (os.path.isdir(self.models_path + model_name + "/predictions/" +
                          str(k_fold)) == False):
            try:
                os.makedirs(self.models_path + model_name + "/predictions/" +
                            str(k_fold))
            except OSError as e:
                if e.errno != errno.EEXIST:
                    return {"status": False, "result": e}

        # Store the predictions for hybridization
        eval_result = pd.DataFrame(
            columns=['user_id', 'item_id', 'r_ui', 'est'])
        for uid, iid, true_r, est, _ in predictions:
            # DataFrame.append is deprecated in recent pandas; pd.concat is the
            # modern equivalent, kept as append here to match the original.
            eval_result = eval_result.append(
                {
                    'user_id': uid,
                    'item_id': iid,
                    'r_ui': true_r,
                    'est': est
                },
                ignore_index=True)
        eval_result.to_csv(path_or_buf=self.models_path + model_name +
                           "/predictions/" + str(k_fold) + "/predictions.csv",
                           encoding='latin1',
                           sep=';',  # to_csv expects a str separator in Python 3
                           index=False)

        # ---------------------------

        if (system_eval == False):
            # Process and evaluate the model's recommendations
            st = default_timer()
            print("STARTING to evaluate recommendations with model: " +
                  str(model_name))
            process_evaluate_result = self.evaluation.surprise_process_evaluate(
                predictions,
                knowledge,
                model_name,
                result_name,
                train_model_runtime,
                k_recommend,
                sql_db,
                k_fold,
                is_surprise=True)
            # Store the processing time in the database
            self.common_functions.save_process_time(
                st,
                event=str(model_name) + "_evaluate_model",
                description="Time for model to be evaluated in test dataset")
            if (process_evaluate_result["status"] == True):
                del (process_evaluate_result)
                return {"status": True, "result": ""}
            else:
                del (process_evaluate_result)
                return {
                    "status": False,
                    "result":
                    "content_explicit could not be executed correctly"
                }
        else:
            print("decide what to do")
            #result_model.save(self.models_path+model)

        return {"status": True, "result": ""}
Example No. 21
        recall = recalls[uid]

        # F - Score = 2 * Precision * Recall / (Precision + Recall)
        f1scores[uid] = 2 * precision * recall / (precision + recall) if (
            precision + recall) else 0

    return precisions, recalls, f1scores


# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Run 5-fold cross-validation and print results.
for algo in [
        KNNBaseline(verbose=False),
        KNNBasic(verbose=False),
        KNNWithMeans(verbose=False),
        KNNWithZScore(verbose=False)
]:
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

print("\n-----------------------------------------------------\n")

sim_options = {
    'name': 'cosine',
    'user_based': False  # compute similarities between items
}

# Calculate precision and recall
for algo in [
        KNNBaseline(verbose=False),
Example No. 22
                sep=',',
                rating_scale=(0.5, 5),
                skip_lines=1)
data = Dataset.load_from_file('./ml-latest-parsed.csv', reader=reader)

trainset = data.build_full_trainset()

testset = trainset.build_anti_testset()

#trainset, testset = train_test_split(data, test_size=.3)

# We'll use the famous SVD algorithm.
print("Creating Model")
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 2}
algo = KNNBasic(k=40, min_k=2, sim_options=sim_options)

# Note: this overwrites the KNNBasic model above; only SVD++ is actually used.
algo = SVDpp()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset, verbose=False)

rid_to_name, name_to_rid = read_item_names()

top_n = get_top_n(predictions, n=8)

user_predictions_table = open("recommendations2.csv", "w+")
user_predictions_table.write("userId,movieId,movieName,prediction,trueValue\n")
user_predictions_readable = open("recommendations_readable2.txt", "w+")
writer = csv.writer(user_predictions_table)
Example No. 23
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf
import os
from surprise import Reader
# Load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=3)
algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
Example No. 24
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
import os
#load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=3)
algo = KNNBasic(sim_options={
    'name': 'MSD',
    'user_based': True
})
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
Example No. 25
def simpleUserCFGive(id):
    testSubject = str(id)
    k = 10

    # Load our data set and compute the user similarity matrix
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()

    trainSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True}

    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Get top N similar users to our test subject
    # (Alternate approach would be to select users up to some similarity threshold - try it!)
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = []
    for innerID, score in enumerate(similarityRow):
        if (innerID != testUserInnerID):
            similarUsers.append((innerID, score))

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Get the stuff they rated, and add up ratings for each item, weighted by user similarity
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        watched[itemID] = 1

    # Get top-rated items from similar users:
    s = "\n" + str(id)
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(),
                                    key=itemgetter(1),
                                    reverse=True):
        if itemID not in watched:
            movieID = trainSet.to_raw_iid(itemID)
            s += "," + ml.getMovieName(int(movieID))
            pos += 1
            if (pos > 10):
                break
    # Rewrite the cache file, dropping any previous line for this user id
    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "r") as f:
        alld = f.readlines()
    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "w") as f1:
        for r1 in alld:
            print(r1)
            u = r1.find(",")
            if r1[0:u] != str(id):
                f1.write(r1)
        f1.write(s)
    print("\nDone")
Example No. 26
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the MovieLens 1M dataset  UserID::MovieID::Rating::Timestamp
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.5)

# Configure the algorithm. k = number of neighbors. name = similarity measure. user_based = user- or item-based filtering.

print("Using the KNNBasic algorithm with 50 neighbors")
print("Similarity measure: Pearson")
# Note: 'verbose' is a constructor argument, not a sim_options key.
algoritmo = KNNBasic(k=50, sim_options={'name': 'pearson', 'user_based': True}, verbose=True)

algoritmo.fit(trainset)

# Select the user and the movie to analyze
# User 49: between 18 and 24 years old, a programmer living in Houston, Texas
uid = str(49)
# Movie seen and rated: Negotiator, The (1998)::Action|Thriller. Rating: 4
iid = str(2058)  # raw item id

# get a prediction for specific users and items.
print("Predição de avaliação: ")
pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True)

# run the trained model against the testset
test_pred = algoritmo.test(testset)
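# Evaluate the full test run with Surprise's accuracy helpers (a small sketch;
# `accuracy` is already imported at the top of this example):
print("RMSE:", accuracy.rmse(test_pred, verbose=False))
print("MAE:", accuracy.mae(test_pred, verbose=False))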
Example No. 27
    for playlist_name in playlist_neighbors_name:
        print(playlist_name, name_id_dic[playlist_name])


playlist_recommend_main()

file_path = os.path.expanduser('neteasy_playlist_recommend_data.csv')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# The old 5-fold split is now expressed as cv=5 below
# music_data.split(n_folds=5)

# userCF - default
# itemCF - KNNBasic(sim_options={"user_based": False})
algo = KNNBasic(sim_options={"user_based": False})
perf = cross_validate(algo,
                      music_data,
                      measures=['RMSE', 'MAE'],
                      cv=5,
                      verbose=True)
print(perf)
# Regression evaluation metrics: MSE (mean squared error), RMSE (root mean squared error), MAE (mean absolute error)

# Fit on the full trainset before querying neighbors; cross_validate alone
# leaves the model fit only on the last evaluated fold.
algo.fit(music_data.build_full_trainset())

# print(algo.get_neighbors(algo.trainset.to_inner_uid('2150055953'), 10))

print(algo.get_neighbors(algo.trainset.to_inner_iid("424262401"), 3))
Example No. 28
    def computeNovelCf(userid):
        testSubject = userid
        k = 10

        # Load our data set and compute the user similarity matrix
        ml = NovelLens()
        data = ml.loadNovelLensLatestSmall()

        trainSet = data.build_full_trainset()

        sim_options = {'name': 'cosine', 'user_based': True}

        model = KNNBasic(sim_options=sim_options)
        model.fit(trainSet)
        simsMatrix = model.compute_similarities()

        # Get top N similar users to our test subject
        # (Alternate approach would be to select users up to some similarity threshold - try it!)
        testUserInnerID = trainSet.to_inner_uid(testSubject)
        similarityRow = simsMatrix[testUserInnerID]

        similarUsers = []
        for innerID, score in enumerate(similarityRow):
            if (innerID != testUserInnerID):
                similarUsers.append((innerID, score))

        kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

        # Get the stuff they rated, and add up ratings for each item, weighted by user similarity
        candidates = defaultdict(float)
        for similarUser in kNeighbors:
            innerID = similarUser[0]
            userSimilarityScore = similarUser[1]
            theirRatings = trainSet.ur[innerID]
            # Keep this loop nested under the neighbor loop; in the source it had
            # slipped out a level, so only the last neighbor's ratings were counted.
            for rating in theirRatings:
                candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

        # Build a dictionary of stuff the user has already seen
        watched = {}
        for itemID, rating in trainSet.ur[testUserInnerID]:
            watched[itemID] = 1

        # Get top-rated items from similar users:
        pos = 0
        noveldatapro = []
        novels = []
        for itemID, ratingSum in sorted(candidates.items(),
                                        key=itemgetter(1),
                                        reverse=True):

            if itemID not in watched:
                novelID = trainSet.to_raw_iid(itemID)
                noveldatapro.append(novelID)
                print(ml.getNovelName(int(novelID)), ratingSum)
                novels.append(ml.getNovelName(int(novelID)))
                pos += 1
                if (pos > 9):
                    print("The top 10 novels for the user: " + testSubject)
                    print(noveldatapro)
                    break
        return novels
Example No. 29
from time import time

# Create an object of the MovieLens class
ml = MovieLens()
# Run the function to load the dataset as data
data = ml.loadMovieLensDataset()

#Build a full trainset to be used for calculating the similarity using KNN algorithm
trainSet = data.build_full_trainset()

# Define a similarity measure to estimate ratings; here we use cosine similarity.
# Since we need item-based similarities, 'user_based' is False.

sim_options = {'name': 'cosine', 'user_based': False}

model = KNNBasic(sim_options=sim_options)
t0 = time()
#to train the algorithm on the trainSet
model.fit(trainSet)

#To generate the similarity matrix
similarityMatrix = model.compute_similarities()

testUser = '******'

#convert the raw ID of the user we want the predictions for into inner ID that can be used by the surprise library
user_inner_id = trainSet.to_inner_uid(testUser)

#Get the default dict of list of ratings for the items the user has already rated
check_user_ratings = trainSet.ur[user_inner_id]
Example No. 30
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
import os

# Load data from a file
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)

data.split(n_folds=5)
algo = KNNBasic(sim_options={'user_based': True})
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)