def final_model(data):
    """Train and pickle the collaborative-filtering recommendation model for repeat customers.

    Args:
        data -- a dataframe containing user id, item id, and rating columns, in that order.
    """
    # Create a user-ratings Surprise dataset for fitting the model
    user_ratings_matrix = surprise_df(data)

    # Splits dataset into train and test datasets to generate predictions
    train_set, test_set = train_test_split(user_ratings_matrix,
                                           test_size=0.2,
                                           random_state=19)

    # Best params determined using GridSearchCV
    params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])

    svdpp.fit(train_set)
    predictions = svdpp.test(test_set)

    # Use Surprise's dump wrapper to pickle the model
    dump.dump('repeat_customer_model',
              predictions=predictions,
              algo=svdpp,
              verbose=0)
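
The dumped file can be restored later with the same wrapper; a minimal sketch, assuming the file name used above:

from surprise import dump

# dump.load returns the (predictions, algo) pair saved by dump.dump
predictions, svdpp = dump.load('repeat_customer_model')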
Example #2
def top_ten_df(df):
    '''
    inputs:
    df (pandas DataFrame): the dataframe to train on. NOTE: use f.df_samp_unique_vals() to get a smaller DF if you don't have enough memory to run the full DF.

    outputs:
    top_ten_df (pandas DataFrame): a dataframe with the top ten predictions for every user in the original dataframe.
    '''

    data = f.read_data_surprise(df)

    # First train an SVD++ algorithm on the entire dataset
    trainset = data.build_full_trainset()
    algo = SVDpp()  # tuned alternative: n_epochs=18, lr_all=0.01, reg_all=0.175
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()  # NOTE: this is the most memory-hungry step
    predictions = algo.test(testset)

    # Create a dictionary of predictions
    top_n = f.get_top_n(predictions, n=10)

    # Turn the dictionary into a DataFrame
    top_ten_df = pd.DataFrame(top_n)

    return top_ten_df
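
The helper f.get_top_n is assumed here to follow the standard Surprise FAQ recipe; a minimal sketch of such a helper, in case your f module lacks it:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Group the (item, estimated rating) pairs by user...
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # ...then keep only the n highest-rated items per user.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n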
Example #3
    def __init__(self, dataset_path, books_path, model_path, algo='SVD'):
        """Init the recommendation engine given a Spark context and a dataset
        """
        self.ratings_path = dataset_path
        self.model_path = model_path
        self.books_path = books_path
        if algo == 'SVD':
            self.algo = algo
            self.SVD = SVDpp()
            (bk, data, rankings) = self.loadBookData()
            self.bk = bk
            self.rankings = rankings
            self.dataset = EvaluationData(data, rankings)

        # ALS algorithm part
        self.spark = SparkSession\
                    .builder\
                    .appName("ReadMore")\
                    .config("spark.executor.cores", '4')\
                    .getOrCreate()

        self.bookID_to_name = self.loadBookNames()
        self.als = ALS(maxIter=5,
                       regParam=0.01,
                       userCol="userId",
                       itemCol="bookId",
                       ratingCol="rating",
                       coldStartStrategy="drop")
        self.ratings = self.loadDataFrame(dataset_path)
        self.train_ALSmodel()
Example #4
        def svd(user_id, area):
            algo = SVDpp(n_factors=100, n_epochs=15)
            # 3. Load the saved, trained model
            file_name = os.path.expanduser('./dump')
            # dump.dump(file_name, algo=algo)  # train once, then comment this line out
            _, algo = dump.load(file_name)

            Area = pd.read_csv('./area.csv')  # columns: { product id (training data), area, productID }

            # nowarea = "C"
            # user = str("A2CX7LUOHB2NDG")  # example user id
            neww = Area[Area['area'] == area]['productID'].tolist()  # product ids for the given area
            predictions = [
                algo.predict(str(user_id), str(productID))
                for productID in neww
            ]  # predict a rating for each product

            # Sort predictions by estimated rating, highest first
            predictions.sort(key=lambda pred: pred.est, reverse=True)
            top_product_id = [int(pred.iid) for pred in predictions[:5]]
            return top_product_id
Example #5
def time_location_model(df):
    """
        Shows the performance of model based on just bias
    """
    lower = df['date_dist_rating'].min()
    upper = df['date_dist_rating'].max()
    df = df.drop(columns=["rating", "dist_rating", "date_rating"], axis=1)

    reader = Reader(rating_scale=(lower, upper))  #TODO figure out

    data = surprise.dataset.Dataset.load_from_df(df=df, reader=reader)

    ts = data.build_full_trainset()
    dusers = ts._raw2inner_id_users  # raw-to-inner user id mapping (private API)
    ditems = ts._raw2inner_id_items  # raw-to-inner item id mapping (private API)

    trainset, testset = train_test_split(data)

    algo = SVDpp()
    algo.fit(trainset)

    # testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
Example #6
    def __init__(self, k=5):
        if not isinstance(k, int) or k <= 0:
            raise ValueError("Parameter k should be a positive integer.")
        self.data = None
        self.k = k
        self.algo = SVDpp(n_factors=self.k)
        self.predictions = pd.DataFrame()
Example #7
def trainSVD_surprise3D(
        training_data,
        colorlabels,
        plot=True,
        savefig="figures/"):  #colorlabels, sizelabels, plot=True, savefig=True
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=10, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu
    if plot:
        fig = plt.figure(figsize=(8, 8))
        # ax = fig.add_subplot(1,1,1)
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_zlabel('Third', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        scatter = ax.scatter(
            U[:, 0], U[:, 1], U[:, 2], c=colorlabels, s=10, alpha=0.7
        )  #explore labeling colors with features like demographics, age
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties_3D")
        plt.show()
Example #8
def svdPP(data):  # SVD++ algorithm
    print("\nTraining SVDPP model..\n")
    global x_test, y_test, testlen, trainlen, model_params, x_train, y_train, X, Y, avg_rat, cold_itm
    p1, p2, p3 = [
        model_params[1]['n_epochs'], model_params[1]['lr_all'],
        model_params[1]['reg_all']
    ]
    svdModel = SVDpp(n_epochs=p1, lr_all=p2, reg_all=p3)
    svdModel.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")
    test = [(x_test[i][0], x_test[i][1], y_test[i]) for i in range(testlen)]
    #train_=[(x_train[i][0],x_train[i][1],y_train[i]) for i in range(trainlen)]
    #total_=[(X[i][0],X[i][1],Y[i]) for i in range(trainlen+testlen)]
    predict = svdModel.test(test)

    #trainset, testset = t_t_s(data, test_size=.25)
    svdModel_1 = SVDpp()
    svdModel_1.fit(data.build_full_trainset())
    predict1 = svdModel_1.test(test)
    #predict_train = svdModel_1.test(train_)
    #predict_tot = svdModel_1.test(total_)
    usrA = [int(i[0]) - 1 for i in predict]  # 0-based user ids
    itmA = [int(i[1]) - 1 for i in predict]  # 0-based item ids
    res = [i[3] for i in predict]    # estimated ratings, tuned model
    res1 = [i[3] for i in predict1]  # estimated ratings, default model
    # For cold-start items, fall back to the user's average rating
    for i in range(testlen):
        if itmA[i] in cold_itm:
            res[i] = avg_rat[usrA[i]]
            res1[i] = avg_rat[usrA[i]]
    #restrain=[i[3] for i in predict_train]
    print("\nPrediction done..\n")
    return [res, res1, svdModel, svdModel_1]  #,restrain, predict_tot
Example #9
class RecommenderSVDpp(Recommender):
    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super(RecommenderSVDpp, self).__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        # get dataset 
        new_user_id, full_dataset = self.recommendation_dataset.get_dataset_with_extended_user(watched)
        inner_user_id = full_dataset.to_inner_uid(new_user_id)

        # After extending the dataset with the new user, we need to retrain
        # the model on the full dataset.
        self.algorithm.fit(full_dataset)

        # watched movies
        watched = {full_dataset.to_inner_iid(key): value for key,value in watched.items()}

        # Calculate for all similar user, predictions
        test_items = [
            self.algorithm.predict(new_user_id, full_dataset.to_raw_iid(i))
            for i in range(0, full_dataset.n_items)
            if i not in watched
        ]

        topn_items = [i[0] for i in get_top_n(test_items, n=k, minimum_rating=1.0)[new_user_id]]
        return self.movies.get_movie_by_movie_ids(topn_items)
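
A hypothetical usage sketch for RecommenderSVDpp; the rds variable and the movie ids below are illustrative assumptions, not part of the original snippet:

# rds: a RecommendationDataSet built elsewhere (assumed)
rec = RecommenderSVDpp(rds)
watched = {1: 5.0, 50: 4.0}  # {raw movie id: rating} for the new user
top_movies = rec.get_recommendation(watched, k=20)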
Example #10
def predict():
	global top_n
	global user_id
	print("--predict start--------------------------------")

	# dataset import
	rating_data = pd.DataFrame(get_default_ratings())

	reader = Reader(rating_scale=(0, 5))
	data = Dataset.load_from_df(df=rating_data, reader=reader)

	trainset_2, testset_2 = train_test_split(data, test_size=0.3)

	# print("--test2--------------------------------")


	algo = SVDpp()
	predictions = algo.fit(trainset_2).test(testset_2)

	# print("--test1--------------------------------")

	top_n = get_top_n(predictions, n=10)

	print("--predict end--------------------------------")
Example #11
def test_SVDpp_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # AMAN
    algo = SVDpp(n_factors=1,
                 n_epochs=1,
                 aman=False,
                 missing_val=0,
                 downweight=.001)
    rmse_aman = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_aman

    # Mean - centered
    algo = SVDpp(n_factors=1, n_epochs=1, mean_centered=False)
    rmse_mean_centered = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_mean_centered

    # biased
    algo = SVDpp(n_factors=1, n_epochs=1, biased=False)
    rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_biased

    # The rest is OK but just takes too long for now...
    """
def fit_model(mlr_df):
    algo = SVDpp()
    # Object to parse the data
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(mlr_df[['userId', 'id', 'rating']], reader)
    trainset = data.build_full_trainset()
    PREDICTOR = algo.fit(trainset)
    return PREDICTOR
Example #13
def SVDpp_calculation(data, trainset, testset, time, cv):
    # NOTE: `time` here is the time module, passed in by the caller
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'], cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start

    return elapsed, cross_validate_svdpp_dict
Example #14
def svdpp(trainset, testset):
    # Matrix factorization - SVD++
    print("\n" + "-" * 5 + " SVD++ algorithm using surprise package " +
          "-" * 5)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
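
A minimal sketch of how this helper might be called, assuming data is a loaded Surprise Dataset:

from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)
rmse, mae, predictions = svdpp(trainset, testset)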
Example #15
class TrainModel:

    # def __init__(self, method='als', n_epochs=20, sim_option='pearson_baseline'):
    #
    #     self.algo = KNNBasic(bsl_options={'method': method,'n_epochs': n_epochs},
    #                          sim_options={'name': sim_option, 'user_based': False})
    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        trainset = data.build_full_trainset()
        return trainset

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        self.algo.fit(
            self.read_from_df(dataframe, user_col, item_col, rating_col))

    def dump_model(self, predictions):
        saved_ent = dump.dump(self.filename,
                              algo=self.algo,
                              predictions=predictions)
        return saved_ent

    def load_model(self):
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self,
                      user_id,
                      dataframe,
                      user_col,
                      item_col,
                      rating_col,
                      n=2):
        data = Dataset.load_from_df(
            dataframe[[user_col, item_col, rating_col]], self.reader)
        testset = data.build_full_trainset().build_anti_testset()
        predictions = self.algo.test(testset)
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        return predictions, top_n[:n]

    def get_user_pred_stable(self, user_id, predictions, n=2):
        top_n = dict()
        for uid, iid, _, est, _ in predictions:
            if uid == user_id: top_n[iid] = est
        top_n = sorted(top_n.items(), key=lambda kv: kv[1], reverse=True)
        # top_nn = {k: top_n[k] for k in top_n.keys()[0][:n]}
        return top_n[:n]
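
A hypothetical usage sketch for TrainModel, assuming ratings_df has columns 'user', 'item', and 'rating' already scaled to the (0, 1) range expected by the Reader above:

tm = TrainModel()
tm.train_mod(ratings_df, 'user', 'item', 'rating')

# Rank unseen items for one user, then persist the fitted model
predictions, top2 = tm.get_user_pred('u42', ratings_df, 'user', 'item', 'rating')
tm.dump_model(predictions)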
Example #16
    def create_model(self):
        n = 1000000
        raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
        reader = Reader()
        data = Dataset.load_from_df(raw_data, reader)
        data.split(n_folds=5)  # unused here; the model is fit on the full trainset
        svdpp = SVDpp()
        trainset = data.build_full_trainset()
        svdpp.fit(trainset)
        filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svdpp.sav"
        joblib.dump(svdpp, filename)
Example #17
def svdpp(dataset):
    start = time.time()
    algo = SVDpp()
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('SVD++ took (minutes):', (end - start) / 60)
    return acc
Example #18
class SvdPP(RecommenderBase):

    """
        SVDpp algorithm.
        Actually woring bad, just a draft
    """

    def __init__(self, URM):

        print('train set built')
        # double check if training set is built fine for sgd
        # for u, i, r in self.trainset.all_ratings():
        #     a = 1

    def fit(self, urm, n_factors=20, n_epochs=20, lr_all=0.007, reg_all=0.02, init_mean=0,
            init_std_dev=0.1, verbose=True):
        # create the training set
        r, c = urm.nonzero()
        ones = np.ones(len(r), dtype=np.int32)
        d = np.vstack((r, c, ones)).transpose()
        df = pd.DataFrame(d)
        df.columns = ['userID', 'itemID', 'rating']
        reader = Reader()
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        self.trainset = data.build_full_trainset()

        # fit
        self.algo = SVDpp(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, init_mean=init_mean,
                          init_std_dev=init_std_dev, verbose=verbose)
        self.algo.fit(self.trainset)

    def recommend(self, userid, N=10, urm=None, filter_already_liked=True, with_scores=True, items_to_exclude=[]):
        if len(items_to_exclude) > 1:
            raise NotImplementedError('Items to exclude functionality is not implemented yet')

        # Predict a score for every track (p[3] is the estimated rating)
        r = np.array([self.algo.predict(userid, i)[3] for i in range(d.N_TRACKS)])

        if filter_already_liked:
            if urm is None:
                raise ValueError('Please provide a URM in order to filter items already liked')
            else:
                r[urm.getrow(userid).nonzero()[1]] = 0

        l = [userid]
        ind = np.argpartition(r, -N)[-N:]  # indices of the N highest scores
        for i in ind:
            if with_scores:
                l.append((i, r[i]))
            else:
                l.append(i)
        return l
Example #19
    def train(self, data):
        ratings_dict = {'itemID': data[:,1],
                        'userID': data[:,0],
                        'rating': data[:,2]}
        df = pd.DataFrame(ratings_dict)

        reader = Reader(rating_scale=(0, 1))

        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader).build_full_trainset()
        # self.algo = KNNBasic(verbose=False)
        self.algo = SVDpp(verbose=True)
        self.algo.fit(data)
Example #20
def model(train_set, test_set):
    params = {'n_factors': 3, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    svdpp = SVDpp(n_factors=params['n_factors'],
                  n_epochs=params['n_epochs'],
                  lr_all=params['lr_all'],
                  reg_all=params['reg_all'])
    svdpp.fit(train_set)

    predictions = svdpp.test(test_set)
    rmse = accuracy.rmse(predictions, verbose=False)

    return predictions, rmse
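
The hard-coded params above look like the result of an offline hyperparameter search; a minimal sketch of how such a search could be run with Surprise's GridSearchCV (grid values are illustrative, and data is assumed to be a loaded Dataset):

from surprise import SVDpp
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [3, 10], 'n_epochs': [20, 50],
              'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
gs.fit(data)  # note: fit takes a Dataset, not a trainset
print(gs.best_params['rmse'])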
Example #21
    def __init__(self):
        self.sales_list_df = self.getSalesList()
        self.product_df = self.ProductList()
        self.lower_rating = self.sales_list_df['sum_quantity'].min()
        self.upper_rating = self.sales_list_df['sum_quantity'].max()
        self.data = self.LoadDataset()
        self.train_set, self.test_set = train_test_split(self.data,
                                                         test_size=0.20)
        self.algo = SVDpp()
        self.algo.fit(self.train_set)
        pred = self.algo.test(self.test_set)
        # Test score
        score = accuracy.rmse(pred)
Example #22
def test_SVDpp_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
    """
Example #23
def test_SVDpp_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
    """
Example #24
def trainSVD_surprise(
        training_data,
        colorlabels,
        plot=True,
        simplify=False,
        savefig="figures/"):  #colorlabels, sizelabels, plot=True, savefig=True
    # algo = SVD(n_factors=4, n_epochs=1000, biased=True)
    # algo = SVD(n_factors=20, n_epochs=500, biased=False)
    algo = SVDpp(n_factors=3, n_epochs=1000)
    algo.fit(training_data)
    U = algo.pu
    if plot:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(1, 1, 1)
        ax.set_xlabel('First', fontsize=15)
        ax.set_ylabel('Second', fontsize=15)
        ax.set_title('Reduced SVD', fontsize=20)
        scatter = ax.scatter(
            U[:, 0], U[:, 1], c=colorlabels, s=10, alpha=0.7
        )  #explore labeling colors with features like demographics, age
        ax.grid()
        cbar = fig.colorbar(scatter, ax=ax)
        cbar.set_label("state")
        if savefig:
            plt.savefig(savefig + "svd_counties")
        plt.show()

    if simplify:
        U = U.transpose()
        A = np.linalg.svd(U)[0]
        U_proj = np.dot(A[:, :2].transpose(), U)
        # Rescale dimensions
        U_proj /= U_proj.std(axis=1).reshape(2, 1)
        if plot:
            fig = plt.figure(figsize=(8, 8))
            ax = fig.add_subplot(1, 1, 1)
            ax.set_xlabel('First', fontsize=15)
            ax.set_ylabel('Second', fontsize=15)
            ax.set_title('Reduced SVD', fontsize=20)
            scatter = ax.scatter(U_proj[0], U_proj[1], c=colorlabels, s=10)
            ax.grid()
            cbar = fig.colorbar(scatter, ax=ax)
            cbar.set_label("state")
            if savefig:
                plt.savefig(savefig + "svd_counties_simplified")
            plt.show()
        return U_proj

    return U
Example #25
def RecommendPredictions():
    ## Load train and test data into Dataframes
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None,
                           dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)

    reader = Reader(rating_scale=(1, 10))

    print "Load train set...."
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print "Initiate Training ....."
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    algo.train(trainset)  # train() is the pre-1.0 Surprise API; newer versions use fit()

    ## Predictions for test set with ground truth present
    print " Load test set..."
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None,
                          dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    testset = dataTest.build_full_trainset().build_testset()

    print "Start predictions"
    predictions = algo.test(testset)

    try:
        os.remove("data_source/predictions_results_svdpp.csv")
    except OSError:
        pass

    print "Saving Prediction results in File"
    resultFile = open("data_source/predictions_results_svdpp.csv", "a")
    csv_writer = csv.writer(resultFile)

    for item in predictions:
        predictionTuple = (item.uid, item.iid, item.r_ui, item.est)
        csv_writer.writerow(predictionTuple)

    resultFile.close()

    ## TODO: predictions for a test set with random products present

    #rmse = accuracy.rmse(predictions, verbose=True)
Example #26
def svd_pp():
    print('SVD++ algorithm...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding setup so the file can be read without errors.
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = SVDpp()

    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
Example #27
def batchrunSVDpp(data, al, folds):
    '''
    define a function to run batches of data
    Args:
        data: data file name as a string.
        al: algorithm name as a string.
        folds: number of folds for cross-validation, integer.
    Returns:
        None
    '''

    #load the data with given data format
    print "load data..."
    data = Dataset.load_from_file(path + data, reader=reader)

    #split the data into x folds for cross-validation.
    print "Split data...."
    data.split(n_folds=folds)

    # We'll use the famous SVDpp algorithm.

    if al == 'SVDpp':
        algo = SVDpp()
    elif al == 'Base':
        algo = BaselineOnly(bsl_options=bsl_options)

    # Evaluate performances of the algorithm on the dataset.
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

    print_perf(perf)
Example #28
def check_for_args():
    args = sys.argv
    for arg in args:
        if (arg == 'SVD'):
            alg_list.append(SVD())
        elif (arg == 'SVDpp'):
            alg_list.append(SVDpp())
        elif (arg == 'SlopeOne'):
            alg_list.append(SlopeOne())
        elif (arg == 'NMF'):
            alg_list.append(NMF())
        elif (arg == 'NormalPredictor'):
            alg_list.append(NormalPredictor())
        elif (arg == 'KNNBaseline'):
            alg_list.append(KNNBaseline())
        elif (arg == 'KNNBasic'):
            alg_list.append(KNNBasic())
        elif (arg == 'KNNWithMeans'):
            alg_list.append(KNNWithMeans())
        elif (arg == 'KNNWithZScore'):
            alg_list.append(KNNWithZScore())
        elif (arg == 'BaselineOnly'):
            alg_list.append(BaselineOnly())
        elif (arg == 'CoClustering'):
            alg_list.append(CoClustering())

    return alg_list
Example #29
def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Example #30
def batchRunBaseline(data, al, folds):
	'''
	define a function to run batches of data
	Args:
		data: data file name as a string.
		al: algorithm name as a string.
		folds: number of folds for cross-validation, integer.
	Returns:
		None
	'''

	#load the data with given data format
	print "load data..."
	data = Dataset.load_from_file(path + data, reader=reader)

	#split the data into x folds for cross-validation.
	print "Split data...."
	data.split(n_folds=folds)

	bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }

	if al == 'SVDpp':
		algo = SVDpp()