示例#1
0
def test_dump_nothing():
    """Dumping with no arguments must load back as a pair of ``None`` values."""
    with tempfile.NamedTemporaryFile() as scratch:
        path = scratch.name
        dump.dump(path)
        loaded_predictions, loaded_algo = dump.load(path)
        assert loaded_predictions is None
        assert loaded_algo is None
示例#2
0
def test_dump(u1_ml100k):
    """Round-trip a fitted algorithm and its predictions through ``dump``.

    A ``BaselineOnly`` model is fitted on the first predefined fold and its
    test-set predictions are dumped together with the model.  After loading,
    both the stored predictions and the predictions recomputed by the
    reloaded algorithm must equal the ones computed before dumping.
    """
    random.seed(0)

    folds = PredefinedKFold().split(u1_ml100k)
    trainset, testset = next(folds)

    baseline = BaselineOnly()
    baseline.fit(trainset)
    expected = baseline.test(testset)

    with tempfile.NamedTemporaryFile() as scratch:
        path = scratch.name
        dump.dump(path, expected, baseline)
        reloaded_predictions, reloaded_algo = dump.load(path)

        recomputed = reloaded_algo.test(testset)
        assert expected == reloaded_predictions
        assert expected == recomputed
示例#3
0
import pandas as pd
from surprise import Reader, Dataset, accuracy, dump, SVD
import surprise
import pickle
from surprise.model_selection import cross_validate, GridSearchCV, KFold

# NOTE(review): "data.pickle" must hold an object KFold.split() accepts
# (presumably a surprise Dataset) -- verify against the script that wrote it.
data = pd.read_pickle("data.pickle")

algo = surprise.SVD(n_factors=15,
                    n_epochs=10,
                    lr_all=0.03,
                    reg_all=0.04,
                    verbose=True)
kf = KFold(n_splits=3)

for trainset, testset in kf.split(data):
    # Persist the current fold's test set.  The path is the same for every
    # fold, so only the last fold's test set is left on disk after the loop.
    # The context manager closes the file even if pickling fails.
    with open("testset.pickle", "wb") as pickle_out:
        pickle.dump(testset, pickle_out)

    algo.fit(trainset)
    # The model file is likewise overwritten on every fold.  `None` (not
    # `False`) is the value dump() uses for "no predictions stored".
    dump.dump('saved svd modelV12', predictions=None, algo=algo, verbose=0)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
示例#4
0
 def backup(self, filepath):
     """Dump ``self.algorithm`` (no predictions) to ``filepath`` with verbose=1."""
     dump.dump(
         filepath,
         predictions=None,
         algo=self.algorithm,
         verbose=1,
     )
# One (train, test) file pair per predefined fold, numbered 1 to 5.
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)

pkf = PredefinedKFold()

#kf = KFold(n_splits=5)

algorithm = SVD()

# The same SVD instance is refitted on every fold; each fitted state is dumped
# to its own numbered file together with that fold's predictions.
for i, (train, test) in enumerate(pkf.split(data), start=1):
    print(f'Fitting model {i}')
    algorithm.fit(train)
    print(f'Testing model {i}')
    predictions = algorithm.test(test)

    print(f'Saving model {i}')
    # The call is made for its side effect only; its result is not kept.
    dump.dump(f'../svd_data/model{i}.model', predictions, algorithm)

# for trainset, testset in kf.split(data):
#     algorithm.fit(trainset)
#     predictions = algorithm.test(testset)
#
#     accuracy.rmse(predictions)
#
#     print(algorithm.pu)

示例#6
0
train = train_raw.build_full_trainset()

model.fit(train)
gc.collect()
'''
train_pred = model.test(train.build_testset())
val_raw = Dataset.load_from_df(df_val[['User Number', 'Movie Number', 'Rating']], reader)
val = val_raw.build_full_trainset()
val_pred = model.test(val.build_testset())

print('Train RMSE:', accuracy.rmse(train_pred))
print('Val RMSE:', accuracy.rmse(val_pred))
'''

if save_model:

    print('Saving model...')
    # The second positional parameter of dump.dump() is `predictions`, so the
    # fitted model has to be passed by keyword to be stored as the algorithm.
    dump.dump(os.path.join('models', 'surprise_model'), algo=model)

if submit:

    print('Saving submission...')
    df_qual = pd.read_csv(os.path.join('data', 'mu_qual.csv'))

    # One estimated rating per row of the qualifying set, in file order.
    pred = [
        model.predict(row['User Number'], row['Movie Number']).est
        for _, row in df_qual.iterrows()
    ]

    save_submission(model_name, pred, ordering)
示例#7
0
  pred.append(algo.predict(str(row[1]), str(row[2]), r_ui=row[4]).est)
# Compute the RMSE on the 1-5 rating scale.
true_scores_1_5 = temp_test['score[1,5]'].tolist()
rmse_test = np.sqrt(mean_squared_error(true_scores_1_5, pred))
print('rmse on test scale[1,5]:', rmse_test)
# Round the predictions.
pred_round = np.round(pred)
# Map the 1-5 values back to the original data scale (cast to int first).
pred_score = [rescale1_5(int(p)) for p in pred_round]
from sklearn.metrics import mean_squared_error
# Compute the RMSE on the original scale.
rmse = np.sqrt(mean_squared_error(test_score, pred_score))
print('rmse on test scale 0-100:', rmse)
dump('svd-250.model', algo=algo, verbose=1)
del algo
all_end = time.perf_counter()
elapsed = all_end - all_begin
print('Running time: %s Seconds' % elapsed)

import time
all_begin = time.perf_counter()
# 基模型
def svd(train=[], input_csv=False):
    print('begin svd')
    begin = time.perf_counter()
    # 告诉文本阅读器,文本的格式是怎么样的
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1,rating_scale=(1,5))
    # 从csv中加载数据
    if input_csv is True:
        # 指定文件所在路径
# Inner-join `user` and `rating` on user_id.
df = pd.merge(user, rating, on='user_id', how='inner')
#df.drop(['user_id', 'Age'], axis=1, inplace=True)
df.head()

reader = Reader(rating_scale=(0, 5))
ratings_frame = df[['user_id', 'route_id', 'rating']]
data = Dataset.load_from_df(ratings_frame, reader)

train, test = train_test_split(data, test_size=.2)

# First model: user-based KNNBaseline, MSD similarity, k=30.
sim_options = dict(name='msd', min_support=5, user_based=True)
base1 = KNNBaseline(k=30, sim_options=sim_options)
base1.fit(train)
base1_preds = base1.test(test)
accuracy.rmse(base1_preds)

# Second model: user-based KNNBaseline, cosine similarity, k=2.
sim_options1 = dict(name='cosine', min_support=5, user_based=True)
base13 = KNNBaseline(k=2, sim_options=sim_options1)
base13.fit(train)
base13_preds = base13.test(test)
acc = accuracy.rmse(base13_preds)

# Store the second model together with its test predictions.
dump.dump('KNNFinal_Model', predictions=base13_preds, algo=base13)
示例#9
0
    print('Data loaded: %s' % loaddata_time, file=original_stdout)

# Time the train/test split.
start_splitdata = time.time()
trainset, testset = model_selection.train_test_split(data)
splitdata_time = time.time() - start_splitdata
if args.verbose:
    print('Train test set created: %s' % splitdata_time, file=original_stdout)

# NOTE(review): eval() runs whatever expression `model_class` contains.  The
# origin of that string is outside this excerpt -- confirm it is never built
# from untrusted input.
model = eval(model_class)()
fit_time = fit(model, trainset)
if args.verbose:
    print('Fit time: %s' % fit_time, file=original_stdout)
# Scoring is skipped entirely in fit-only mode.
if not fit_only:
    test_measures, test_time = score(model, testset, ['RMSE', 'MAE'])
    if args.verbose:
        print('Test time: %s' % test_time, file=original_stdout)
        print('RMSE: %s' % test_measures.get('RMSE'), file=original_stdout)
        print('MAE: %s' % test_measures.get('MAE'), file=original_stdout)

# Persist the fitted model only when a target filename was given.
if filename:
    dump(filename, algo=model)
    if args.verbose:
        print('Model saved: %s' % filename, file=original_stdout)

# Restore the real stdout before emitting the final summary line.
sys.stdout = original_stdout
# NOTE(review): `test_time` and `test_measures` are assigned above only when
# `fit_only` is false.  Unless they are initialised outside this excerpt, this
# print raises NameError in fit-only mode -- confirm.
print(fit_time,
      test_time,
      test_measures.get('RMSE'),
      test_measures.get('MAE'),
      file=original_stdout)
示例#10
0
 def save(self, filename):
     """Dump ``self.algo`` to ``filename``."""
     fitted_algo = self.algo
     dump(filename, algo=fitted_algo)
示例#11
0
print("end of metadata")
#Collaborative Filtering
start = time.time()
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
print(ratings.head(5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# NOTE(review): data.split(), evaluate() and svd.train() look like the legacy
# Surprise API (newer releases use cross_validate() and fit()) -- confirm the
# installed version before touching these calls.
data.split(n_folds=5)
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])
end = time.time()
elapsed = end - start
print(elapsed)
trainset = data.build_full_trainset()
svd.train(trainset)
# The algorithm must be passed by keyword: the second positional parameter of
# dump.dump() is `predictions`.
dump.dump('./dump_file', algo=svd)
# dump.load() returns a (predictions, algo) pair; keep only the algorithm so
# that `svd.predict` below is called on the model and not on a tuple.
_, svd = dump.load('./dump_file')
# print(ratings[ratings['userId'] == 554])
end = time.time()
elapsed = end - start
print(elapsed)
# quit()
print(svd.predict(554, 509, 4))
print(type(svd))


#end Collaborative Filtering
def convert_int(x):
    try:
        return int(x)
    except:
示例#12
0
 def dump_model(self, predictions):
     """Dump ``self.algo`` and ``predictions`` to ``self.filename``.

     Returns whatever ``dump.dump`` returns.
     """
     return dump.dump(self.filename, predictions=predictions, algo=self.algo)
示例#13
0
               n_factors=param['n_factors'])

    # Run 5-fold cross-validation and print results
    results = cross_validate(algo,
                             data,
                             measures=['RMSE', 'MAE'],
                             cv=5,
                             n_jobs=-1,
                             verbose=True)

    avg_rmse = 0
    rmse = results['test_rmse']
    for r in rmse:
        avg_rmse += r
    avg_rmse /= len(rmse)

    if avg_rmse < best_RMSE:
        best_RMSE = avg_rmse
        best_param = param

# In[ ]:

# Report the best average RMSE and the parameter set that produced it.
print("Best RMSE: %s" % best_RMSE)
print("Best params: %s" % best_param)

# Refit on every available rating before saving.
# NOTE(review): `algo` is whatever the search loop above assigned last, which
# is not necessarily the model built from `best_param` -- confirm that the
# intended model is being refitted and saved.
trainset = data.build_full_trainset()
algo.fit(trainset)

# save the trained SVD model
dump.dump('../.tmp/svd', algo=algo)
示例#14
0
def main():
    """Parse the CLI arguments, train a model and dump it to two locations.

    Steps, in order: read the ``training`` section of ``parameters.json``
    (falling back to an empty dict), log those parameters on the run returned
    by ``Run.get_context()`` and on its parent, fetch or register the dataset,
    train and evaluate through ``train_model`` / ``get_model_metrics``, then
    dump the model under the ``--step_output`` directory and under
    ``outputs``.

    Raises:
        Exception: if no ``--dataset_name`` was supplied.
    """
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="diabetes_model.pkl",
    )
    # Every remaining option is a plain string without a default; they are
    # registered in the order listed here.
    string_options = (
        ("--step_output", "output for passing data to next step"),
        ("--dataset_version", "dataset version"),
        ("--data_file_path", "data file path, if specified,\
               a new version of the dataset will be registered"),
        ("--caller_run_id", "caller run id, for example ADF pipeline run id"),
        ("--dataset_name", "Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"),
    )
    for flag, description in string_options:
        parser.add_argument(flag, type=str, help=description)

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name, step_output_path = args.model_name, args.step_output
    dataset_version = args.dataset_version
    data_file_path, dataset_name = args.data_file_path, args.dataset_name

    run = Run.get_context()

    def log_on_run_and_parent(values):
        # Mirror every key/value pair on both the step run and its parent.
        for key, value in values.items():
            run.log(key, value)
            run.parent.log(key, value)

    print("Getting training parameters")

    # Training parameters live under the "training" key of the parameters file.
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    print(f"Parameters: {train_args}")
    log_on_run_and_parent(train_args)

    # A dataset name is mandatory; bail out before touching the workspace.
    if not dataset_name:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    workspace = run.experiment.workspace
    if data_file_path == 'none':
        # The literal string 'none' means "use the already registered dataset".
        dataset = Dataset.get_by_name(workspace, dataset_name, dataset_version)
    else:
        dataset = register_dataset(workspace,
                                   dataset_name,
                                   os.environ.get("DATASTORE_NAME"),
                                   data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split, train, evaluate.
    data = split_data(dataset.to_pandas_dataframe())
    model = train_model(data, train_args)
    log_on_run_and_parent(get_model_metrics(model, data))

    # First the hand-off directory for the next step, then the run's own
    # "outputs" directory for history.
    for target_dir in (step_output_path, 'outputs'):
        os.makedirs(target_dir, exist_ok=True)
        dump.dump(os.path.join(target_dir, model_name), algo=model)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
示例#15
0
# nnmodel.add(Dense(1, input_dim=num_models))
# nnmodel.fit(blend_pred, valset)
#

#real_blend_pred = nnmodel.predict(blend_pred)
#
# blend_rmse = RMSE(blend_pred, valset)
#
# save_submission(model_name, real_blend_pred, ordering)

#'''

if save_model:

    print('Saving model...')
    # The second positional parameter of dump.dump() is `predictions`, so the
    # fitted model has to be passed by keyword to be stored as the algorithm.
    dump.dump(os.path.join('models', model_name), algo=model)

if submit:

    # Save probe and qual predictions for blending
    print('Saving submission...')
    df_qual = pd.read_csv(os.path.join('data', 'mu_qual.csv'))

    # One estimated rating per row of the qualifying set, in file order.
    pred = [
        model.predict(row['User Number'], row['Movie Number']).est
        for _, row in df_qual.iterrows()
    ]

    save_submission(model_name + "_qual", pred, ordering)

    df_qual = pd.read_csv(os.path.join('data', 'mu_probe.csv'))
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump


data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.  The same path variable is used for both
# calls so the file written is always the file read back.
file_name = os.path.join(os.getcwd(), 'dump_file')
# file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
# No predictions were dumped, so the first element of the loaded pair is
# discarded instead of overwriting `predictions`, which the assert needs.
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
示例#17
0
 def save_mode(self, file_path):
     """Dump ``self.algo`` to ``file_path``."""
     fitted_algo = self.algo
     dump.dump(file_path, algo=fitted_algo)
示例#18
0
                                        reader).build_full_trainset()

    print('Training model ' + str(i) + '...')
    algo = SVD(verbose=True,
               n_factors=factors,
               n_epochs=epochs,
               reg_bu=user_reg,
               reg_bi=game_reg,
               reg_pu=user_vec_reg,
               reg_qi=game_vec_reg)
    algo.fit(train_data)

    print('Done Training Model ' + str(i))
    print('Saving model...')
    # Predictions on the training data are computed before the model is
    # dumped and deleted just below.
    predictions = algo.test(train_data.build_testset(), verbose=False)
    dump.dump(file_name=model_path, algo=algo, verbose=True)
    del algo
    print('Updating weights...')
    # Estimated and true ratings, filled in prediction order.
    est_vec = np.zeros(len(predictions))
    true_vec = np.zeros(len(predictions))
    count = 0
    # Per prediction, element [1] of the train_users[user][game] entry.
    subset_inds = []
    for user, game, r_ui, est, _ in predictions:
        est_vec[count] = est
        true_vec[count] = r_ui
        subset_inds.append(train_users[user][game][1])
        count += 1

    # calculate loss
    loss = np.square(np.subtract(est_vec, true_vec))
    # cv=3, n_jobs = -2)
    cv=JumpStartKFolds(large_data=large_data,
                       n_splits=3,
                       random_state=3,
                       shuffle=True),
    n_jobs=-2,
    refit=True)

gs.fit(small_data)

best_rmse = gs.best_score['rmse']
print("\nBest RMSE score:\n{}".format(best_rmse))
print(algo.__name__)
print(gs.best_params['rmse'])

# Previous best score; a missing file means no record has been stored yet.
try:
    with open("rmse_to_beat.pickle", "rb") as record_file:
        rmse_to_beat = pickle.load(record_file)
except FileNotFoundError:
    rmse_to_beat = 99999

if best_rmse <= rmse_to_beat:
    print("A new record")
    # NOTE(review): `algo` exposes `__name__`, so it looks like a class rather
    # than a fitted estimator -- confirm this is the object meant to be dumped.
    dump.dump("bestAlgo.pickle", algo=algo)

    with open("rmse_to_beat.pickle", "wb") as record_file:
        pickle.dump(best_rmse, record_file)
    # NOTE(review): the value written after "MAE:" is gs.best_params['mae']
    # (a parameter set), not an MAE score -- confirm the intended log layout.
    with open("rmse_records.txt", "a") as logFile:
        logFile.write("\n{}\nMAE: {}\nRMSE: {}\n{}\n".format(
            algo.__name__, gs.best_params['mae'], gs.best_params['rmse'],
            best_rmse))
import pandas as pd
from surprise import Reader, Dataset, accuracy, dump, SVD
import surprise
import pickle
from surprise.model_selection import cross_validate, GridSearchCV, KFold, train_test_split

data = pd.read_pickle("data.pickle")
test = pd.read_pickle("test.pickle")
kf = KFold()

trainset, testset = train_test_split(data, test_size=.80)

# dump.load() returns a (predictions, algo) pair; keep only the algorithm so
# that the object dumped again below is the model itself, not the whole pair.
_, algo = dump.load('saved svd modelV12')
model = algo.fit(data.build_full_trainset())
# NOTE: `testset` is a split of the same `data` the model was just fitted on
# in full, so the metrics below are computed on ratings seen during training.
predictions = algo.test(testset)
dump.dump('Complete SVD v1.12', predictions=None, algo=algo, verbose=0)
accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)
accuracy.fcp(predictions, verbose=True)
accuracy.mse(predictions, verbose=True)
示例#21
0
 def save(self, filename):
   """Dump ``self.algo`` to ``filename``."""
   fitted_algo = self.algo
   dump(filename, algo=fitted_algo)
示例#22
0
def random_search_all_RS(datasets):
    """Train or load every recommender in ``RS`` and export per-user error features.

    For each dataset name and each entry of the module-level ``RS`` list, the
    estimator is loaded from its dump file when that file exists; otherwise it
    is built from ``rs["algo"](**rs["params"])``, trained on the dataset's
    train file and dumped.  Per-user RMSE, squared-error variance and NDCG are
    then computed on the train set and on the validation set, merged on
    ``userId``, and written to two CSV files per dataset (train-time columns
    and ``_val_set`` columns).

    NOTE(review): despite the function name, no hyper-parameter search is
    visible here -- each estimator is built once from ``rs["params"]``.

    Args:
        datasets: iterable of dataset names; each is used to build the
            ``./created_data/<name>_*.csv`` input paths.
    """

    for dataset_name in datasets:
        reader = Reader(line_format='user item rating timestamp', sep=',')
        reader_no_timestamp = Reader(line_format='user item rating', sep=',')

        # One merged feature frame per recommender, joined after the loop.
        user_train_time_meta_features = []
        for rs in RS:
            # The three input sets are (re)loaded for every recommender
            # because they are deleted further down to release memory.
            train = Dataset.load_from_file("./created_data/" + dataset_name +
                                           "_train.csv",
                                           reader=reader)
            trainset = train.build_full_trainset()
            testset = trainset.build_testset()
            negative_sampling = Dataset.load_from_file(
                "./created_data/" + dataset_name +
                "_train_negative_sample.csv",
                reader=reader_no_timestamp)
            negative_sampling_trainset = negative_sampling.build_full_trainset(
            )
            testset_ns = negative_sampling_trainset.build_testset()
            del (negative_sampling_trainset)
            del (negative_sampling)
            validation = Dataset.load_from_file(
                "./created_data/" + dataset_name + "_validation_set.csv",
                reader=reader)
            validation = validation.build_full_trainset()
            validation = validation.build_testset()

            #Memory error for 16GB machine or float division error for lastfm
            # NOTE(review): this skip happens only after all three datasets
            # above have been loaded for the skipped recommender.
            if ("KNN" in rs["name"]
                    and dataset_name in datasets_knn_mem_error):
                continue

            file_name = os.path.expanduser(
                './created_data/trained_RS/dump_file_' + dataset_name + '_' +
                rs["name"])
            if os.path.exists(file_name):
                print("Loading " + rs["name"] + " for " + dataset_name)
                _, estimator = dump.load(file_name)

            else:
                print("Training " + rs["name"] + " on " + dataset_name)
                estimator = rs["algo"](**rs["params"])
                # NOTE(review): `.train()` looks like the legacy Surprise
                # spelling of `.fit()` -- confirm the installed version.
                estimator.train(trainset)
                #dump estimator to file
                file_name = os.path.expanduser(
                    './created_data/trained_RS/dump_file_' + dataset_name +
                    '_' + rs["name"])
                dump.dump(file_name, algo=estimator)

            del (train)
            del (trainset)

            ## ERROR-FEATURES USING TRAIN SET
            preds = estimator.test(testset)
            del (testset)
            predictions_df = pd.DataFrame(preds,
                                          columns=[
                                              "userId", "movieId", "rating",
                                              "prediction", "details"
                                          ])

            # Calculating user MAE
            # predictions_df["error"] = abs(predictions_df["prediction"]-predictions_df["rating"])
            # avg_errors = predictions_df.groupby("userId")["error"].mean().rename("MAE_"+rs["name"]).to_frame().reset_index()
            # avg_errors_var = predictions_df.groupby("userId")["error"].var().rename("MAE_VAR_"+rs["name"]).to_frame().reset_index()
            # avg_errors = avg_errors.merge(avg_errors_var,on="userId")

            # Calculating user MSE and RMSE
            predictions_df["squared_error"] = (
                predictions_df["prediction"] - predictions_df["rating"]) * (
                    predictions_df["prediction"] - predictions_df["rating"])
            avg_squared_errors = predictions_df.groupby(
                "userId")["squared_error"].mean().rename(
                    "MSE_" + rs["name"]).to_frame().reset_index()
            avg_squared_errors_var = predictions_df.groupby(
                "userId")["squared_error"].var().rename(
                    "MSE_VAR_" + rs["name"]).to_frame().reset_index()
            # RMSE is the square root of the per-user MSE; the lambda binds
            # the recommender name and `math` as default arguments.
            avg_squared_errors["RMSE_" +
                               rs["name"]] = avg_squared_errors.apply(
                                   lambda r, rs=rs["name"], math=math: math.
                                   sqrt(r["MSE_" + rs]),
                                   axis=1)
            # avg_errors = avg_errors.merge(avg_squared_errors.merge(avg_squared_errors_var,on="userId"), on="userId")
            avg_errors = avg_squared_errors.merge(avg_squared_errors_var,
                                                  on="userId")[[
                                                      "RMSE_" + rs["name"],
                                                      "MSE_VAR_" + rs["name"],
                                                      "userId"
                                                  ]]

            # Calculating user NDCG
            # Predictions on the negative samples are saved to CSV and then
            # appended to the train predictions before ranking.
            preds_ns = estimator.test(testset_ns)
            del (testset_ns)
            predictions_ns_df = pd.DataFrame(preds_ns,
                                             columns=[
                                                 "userId", "movieId", "rating",
                                                 "prediction", "details"
                                             ])
            predictions_ns_df.to_csv(
                "./created_data/l2r/predictions_train_ns_" + dataset_name +
                "_" + rs["name"] + ".csv",
                index=False)
            predictions_df = pd.concat([predictions_df, predictions_ns_df])
            del (predictions_ns_df)
            predictions_with_relevance = remove_dataset_bias(predictions_df,
                                                             has_ns=True)
            del (predictions_df)
            scores = predictions_with_relevance.groupby("userId").agg(
                lambda r, f=calculate_ndcg_score: f(r, "prediction"))
            # Keep only the first aggregated column and name it after the
            # recommender.
            scores = scores[[scores.columns[0]
                             ]].rename(index=str,
                                       columns={
                                           scores.columns[0]:
                                           "NDCG_" + rs["name"]
                                       }).reset_index()

            # # Calculating RR
            # scores_rr = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_reciprocal_rank: f(r,"prediction"))
            # scores_rr = scores_rr[[scores_rr.columns[0]]].rename(index=str,columns={scores_rr.columns[0]:"RR_"+rs["name"]}).reset_index()
            # scores = scores.merge(scores_rr,on="userId")

            # # Calculating user MAP
            # scores_map = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_ap_score: f(r,"prediction"))
            # scores_map = scores_map[[scores_map.columns[0]]].rename(index=str,columns={scores.columns[0]:"AP_"}).reset_index()
            # scores_map.columns = ["userId","AP_"+rs["name"]]
            # scores = scores.merge(scores_map,on="userId")

            # # Calculating user Precision
            # for k in [5,10,20]:

            # 	scores_precision = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_precision_score,s=k: f(r,"prediction",s))
            # 	scores_precision = scores_precision[[scores_precision.columns[0]]].rename(index=str,columns={scores_precision.columns[0]:"Precision@"+str(k)+"_"+rs["name"]}).reset_index()

            # 	scores = scores.merge(scores_precision,on="userId")

            user_wise_train_errors = avg_errors.merge(scores, on="userId")

            ## ERROR-FEATURES USING VALIDATION SET
            preds = estimator.test(validation)
            del (validation)
            del (estimator)
            predictions_df = pd.DataFrame(preds,
                                          columns=[
                                              "userId", "movieId", "rating",
                                              "prediction", "details"
                                          ])

            # # Calculating user MAE
            # predictions_df["error"] = abs(predictions_df["prediction"]-predictions_df["rating"])
            # avg_errors = predictions_df.groupby("userId")["error"].mean().rename("MAE_"+rs["name"]+"_val_set").to_frame().reset_index()
            # avg_errors_var = predictions_df.groupby("userId")["error"].var().rename("MAE_VAR_"+rs["name"]+"_val_set").to_frame().reset_index()
            # avg_errors = avg_errors.merge(avg_errors_var,on="userId")

            # Calculating user MSE and RMSE
            predictions_df["squared_error"] = (
                predictions_df["prediction"] - predictions_df["rating"]) * (
                    predictions_df["prediction"] - predictions_df["rating"])
            avg_squared_errors = predictions_df.groupby(
                "userId")["squared_error"].mean().rename(
                    "MSE_" + rs["name"] + "_val_set").to_frame().reset_index()
            avg_squared_errors_var = predictions_df.groupby(
                "userId")["squared_error"].var().rename(
                    "MSE_VAR_" + rs["name"] +
                    "_val_set").to_frame().reset_index()
            avg_squared_errors["RMSE_" + rs["name"] +
                               "_val_set"] = avg_squared_errors.apply(
                                   lambda r, rs=rs["name"], math=math: math.
                                   sqrt(r["MSE_" + rs + "_val_set"]),
                                   axis=1)
            # The train-set `avg_errors` frame is inner-joined with the
            # validation errors, then only the validation columns are kept.
            avg_errors = avg_errors.merge(
                avg_squared_errors.merge(avg_squared_errors_var, on="userId"),
                on="userId")[[
                    "RMSE_" + rs["name"] + "_val_set",
                    "MSE_VAR_" + rs["name"] + "_val_set", "userId"
                ]]

            # Calculating user NDCG
            # NOTE(review): `preds_ns` still holds the predictions made on the
            # *train* negative samples above; they are reused here for the
            # validation NDCG -- confirm this is intended.
            predictions_ns_df = pd.DataFrame(preds_ns,
                                             columns=[
                                                 "userId", "movieId", "rating",
                                                 "prediction", "details"
                                             ])
            predictions_df = pd.concat([predictions_df, predictions_ns_df])
            del (predictions_ns_df)
            predictions_with_relevance = remove_dataset_bias(predictions_df,
                                                             has_ns=True)
            del (predictions_df)
            scores = predictions_with_relevance.groupby("userId").agg(
                lambda r, f=calculate_ndcg_score: f(r, "prediction"))
            scores = scores[[scores.columns[0]
                             ]].rename(index=str,
                                       columns={
                                           scores.columns[0]:
                                           "NDCG_" + rs["name"] + "_val_set"
                                       }).reset_index()

            # Calculating RR
            # scores_rr = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_reciprocal_rank: f(r,"prediction"))
            # scores_rr = scores_rr[[scores_rr.columns[0]]].rename(index=str,columns={scores_rr.columns[0]:"RR_"+rs["name"]+"_val_set"}).reset_index()
            # scores = scores.merge(scores_rr,on="userId")

            # # Calculating user MAP
            # scores_map = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_ap_score: f(r,"prediction"))
            # scores_map = scores_map[[scores_map.columns[0]]].rename(index=str,columns={scores.columns[0]:"AP_"}).reset_index()
            # scores_map.columns = ["userId","AP_"+rs["name"]+"_val_set"]

            # scores = scores.merge(scores_map,on="userId")

            # # Calculating user Precision
            # for k in[5,10,20]:

            # 	scores_precision = predictions_with_relevance.groupby("userId").agg(lambda r,f = calculate_precision_score,s=k: f(r,"prediction",s))
            # 	scores_precision = scores_precision[[scores_precision.columns[0]]].rename(index=str,columns={scores_precision.columns[0]:"Precision@"+str(k)+"_"+rs["name"]+"_val_set"}).reset_index()

            # 	scores = scores.merge(scores_precision,on="userId")

            user_wise_train_errors_val_set = avg_errors.merge(scores,
                                                              on="userId")

            # Left join: users without validation features keep their
            # train-time features, and the gaps are filled with 0.0 below.
            all_error_features = user_wise_train_errors.merge(
                user_wise_train_errors_val_set, on="userId", how="left")
            # Some variances are missing because a user may have only one
            # rating in the validation set.
            all_error_features = all_error_features.fillna(0.0)
            # Sanity check: both feature frames must have the same row count.
            assert user_wise_train_errors.userId.shape[
                0] == user_wise_train_errors_val_set.shape[0]

            # NOTE(review): the frame appended here holds the train-time and
            # the validation columns; they are only separated when the CSV
            # files are written below.
            user_train_time_meta_features.append(all_error_features)

        # Join the per-recommender frames into one wide frame per dataset.
        all_features = reduce(lambda x, y: x.merge(y, on="userId"),
                              user_train_time_meta_features)
        # NOTE(review): "userId" itself does not contain "val_set", so it is
        # selected twice in this first export -- confirm the duplicate column
        # is acceptable to downstream readers.
        all_features[["userId"] +
                     [c for c in all_features if "val_set" not in c]].to_csv(
                         "./created_data/tmp/h2_" + dataset_name +
                         "_user_train_time_features.csv",
                         header=True,
                         index=False)
        all_features[["userId"] +
                     [c for c in all_features if "val_set" in c]].to_csv(
                         "./created_data/tmp/h2_" + dataset_name +
                         "_user_train_time_features_val_set.csv",
                         header=True,
                         index=False)
示例#23
0
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump


data = Dataset.load_builtin('ml-100k')
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

# Reference predictions, produced by the algorithm that is about to be dumped.
predictions = algo.test(trainset.build_testset())

# Write the algorithm to disk, then read it back from the same path.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
loaded = dump.load(file_name)
loaded_algo = loaded[1]

# The reloaded algorithm must reproduce the reference predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
示例#24
0
from surprise import SVD
from surprise import Dataset, Reader
from surprise.dump import dump, load
import os

if os.path.exists("/pfs/out/model"):
    # If we have model saved by previous training, load it
    _, algo = load("/pfs/out/model")
    reader = Reader(line_format='user item rating timestamp', sep=' ')

    # Train model with each new committed train data
    # NOTE(review): every algo.fit() call receives only the current file's
    # trainset -- confirm that fitting per file, rather than on the combined
    # data, is what is intended.
    for dirpath, dirs, files in os.walk("/pfs/training"):
        for filename in files:
            # Join with the loop variable; the previous `file` name was never
            # defined and raised NameError on the first file found.
            filepath = os.path.join(dirpath, filename)
            data = Dataset.load_from_file(
                filepath, reader=reader).build_full_trainset()
            algo.fit(data)
else:
    # If it's initial run, train with existing dataset

    # Load the movielens-100k dataset (download it if needed),
    data = Dataset.load_builtin('ml-100k').build_full_trainset()

    # We'll use the famous SVD algorithm.
    algo = SVD()
    algo.fit(data)

# In both case, save trained model
dump("/pfs/out/model", algo=algo)
示例#25
0
 def _save(self, model, path):
     """Dump ``model`` to ``path`` through ``dump.dump``.

     NOTE(review): ``model`` is passed as the second positional argument.  If
     ``dump`` is ``surprise.dump``, that slot is ``predictions`` rather than
     ``algo`` -- confirm against the matching loader before relying on it.
     """
     dump.dump(path, model)
示例#26
0
def serialize_algo(algo, fname):
    """Persist ``algo`` into the file named ``fname`` via ``dump.dump``."""
    destination = fname
    dump.dump(destination, algo=algo)
    def handle(self, *args, **options):
        """Rebuild the recommender artefacts selected by ``options["which"]``.

        Recognised values are ``"books"``, ``"movies"`` (content-based TF-IDF
        similarity) and ``"books_rated"``, ``"movies_rated"`` (Surprise SVD
        plus a fastai collaborative-filtering learner). Results are stored as
        attributes on ``rec_models``; the rated variants also write model
        files under ``rcsystem/static``. Any other value does nothing.
        """
        if options["which"] == "books":
            # Alternative CountVectorizer-based similarity, kept for reference:
            # rec_models.books_count_vec = CountVectorizer(stop_words='english')
            # rec_models.books_count_matrix = rec_models.books_count_vec.fit_transform(rec_models.books_dataset['soup'])
            # rec_models.books_cosine_sim = cosine_similarity(rec_models.books_count_matrix, rec_models.books_count_matrix)

            rec_models.books_dataset = pd.read_csv("rcsystem/static/books_dataset.csv")
            rec_models.books_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
            rec_models.books_tfidf_matrix = rec_models.books_tf.fit_transform(rec_models.books_dataset['soup'])
            rec_models.books_cosine_sim = cosine_similarity(rec_models.books_tfidf_matrix, rec_models.books_tfidf_matrix)

            self.stdout.write(self.style.SUCCESS('books updated'))

        if options["which"] == "books_rated":
            # surprise model
            rec_models.books_ratings = pd.read_csv("rcsystem/static/books_ratings.csv")
            reader = Reader()
            data = Dataset.load_from_df(rec_models.books_ratings[['user_id', 'book_id', 'rating']], reader)
            kf = KFold(n_splits=5)
            svd = SVD()

            # Report the RMSE on each of the 5 folds before the final fit.
            for trainset, testset in kf.split(data):
                svd.fit(trainset)
                predictions = svd.test(testset)
                accuracy.rmse(predictions, verbose=True)

            # The persisted model is fitted on the full dataset.
            trainset = data.build_full_trainset()
            svd.fit(trainset)

            dump.dump("rcsystem/static/user_based_book.dump", algo=svd)
            rec_models.user_based_book_algo = svd

            # fastai model
            rec_models.books_ratings_title = pd.read_csv("rcsystem/static/books_ratings_with_title.csv")
            # NOTE(review): the loaders are built from `books_ratings` with
            # item_name='title' while `books_ratings_title` is loaded just
            # above and never used -- confirm which frame is intended.
            rec_models.books_dls = CollabDataLoaders.from_df(rec_models.books_ratings, item_name='title', seed=1)
            rec_models.books_collab_filtering = collab_learner(rec_models.books_dls, y_range=(0.5, 5.5))
            rec_models.books_collab_filtering.model_dir = "."
            rec_models.books_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
            rec_models.books_collab_filtering.save("rcsystem/static/books_collab_filtering")

            self.stdout.write(self.style.SUCCESS('books collaborative filtering updated'))

        if options["which"] == "movies":
            # Alternative CountVectorizer-based similarity, kept for reference:
            # rec_models.movies_count_vec = CountVectorizer(stop_words='english')
            # rec_models.movies_count_matrix = rec_models.movies_count_vec.fit_transform(rec_models.movies_dataset['soup'][:10000])
            # rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_count_matrix, rec_models.movies_count_matrix)

            rec_models.movies_dataset = pd.read_csv("rcsystem/static/movies_dataset.csv")
            rec_models.movies_tf = TfidfVectorizer(analyzer="word", ngram_range=(1, 2), min_df=0, stop_words='english')
            rec_models.movies_tfidf_matrix = rec_models.movies_tf.fit_transform(rec_models.movies_dataset['soup'])
            # Fixed: the similarity must be computed from the freshly built
            # TF-IDF matrix (as in the "books" branch); the original passed
            # `movies_cosine_sim` to itself.
            rec_models.movies_cosine_sim = cosine_similarity(rec_models.movies_tfidf_matrix, rec_models.movies_tfidf_matrix)

            self.stdout.write(self.style.SUCCESS('movies updated'))

        if options["which"] == "movies_rated":
            # surprise model
            # Fixed: movie ratings are stored in `movies_ratings`; the
            # original assigned them to `books_ratings`, overwriting the book
            # data, while the fastai step below reads `movies_ratings`.
            rec_models.movies_ratings = pd.read_csv("rcsystem/static/movies_ratings.csv")
            reader = Reader()
            data = Dataset.load_from_df(rec_models.movies_ratings[['user_id', 'movie_id', 'rating']], reader)
            kf = KFold(n_splits=5)
            svd = SVD()

            # Report the RMSE on each of the 5 folds before the final fit.
            for trainset, testset in kf.split(data):
                svd.fit(trainset)
                predictions = svd.test(testset)
                accuracy.rmse(predictions, verbose=True)

            # The persisted model is fitted on the full dataset.
            trainset = data.build_full_trainset()
            svd.fit(trainset)

            dump.dump("rcsystem/static/user_based_movie.dump", algo=svd)
            rec_models.user_based_movie_algo = svd

            # fastai model
            rec_models.movies_ratings_title = pd.read_csv("rcsystem/static/movies_ratings_with_title.csv")
            # NOTE(review): as in the books branch, `movies_ratings_title` is
            # loaded but the loaders use `movies_ratings` -- confirm.
            rec_models.movies_dls = CollabDataLoaders.from_df(rec_models.movies_ratings, item_name='title', seed=1)
            rec_models.movies_collab_filtering = collab_learner(rec_models.movies_dls, y_range=(0.5, 5.5))
            rec_models.movies_collab_filtering.model_dir = "."
            rec_models.movies_collab_filtering.fine_tune(1, wd=0.1)  # could be more epochs
            rec_models.movies_collab_filtering.save("rcsystem/static/movies_collab_filtering")

            self.stdout.write(self.style.SUCCESS('movies collaborative filtering updated'))
示例#28
0
import os 
import pandas as pd
# Train and dump one KNNBasic model per similarity mode:
# True -> user-to-user, False -> item-to-item.
userORItemBoolean = [True,False]
for x in userORItemBoolean:
    sim_options = {'name': 'cosine',
                'user_based': x
                }

    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    movieID_to_name, name_to_movieID, movieWithNameAndGenre = ml.read_item_names()

    trainSet = data.build_full_trainset()
    model = KNNBasic(sim_options=sim_options)
    # Fixed: `fit` trains the model and returns the algorithm itself, not a
    # list of predictions, so its result is no longer stored as `predictions`
    # and dumped in the predictions slot.
    model.fit(trainSet)
    # Maybe can put all the prediction in the dump file so we can use in the future
    file_tail_name = "_user_to_user" if x else "_item_to_item"
    file_name = os.path.expanduser('~/dump_file_100k_KNNBasic' + file_tail_name)
    dump.dump(file_name, algo=model)


# NOTE(review): this path is hard-coded and is not the `~`-based location
# written by the loop above -- confirm the dump file is really expected here.
predictions_knnBasic, algo_knnBasic = dump.load('C:/Users/182381j/Desktop/dump_file_100k_KNNBasic_item_to_item')
# The similarity matrix comes from the algorithm slot of the dump; the
# original called it on the (misnamed) predictions slot.
simsMatrix = algo_knnBasic.compute_similarities()
print(simsMatrix)


示例#29
0
 def save_mode(self, file_path):
     """Dump this object's ``algo`` and ``predictions`` to ``file_path``."""
     payload = {'algo': self.algo, 'predictions': self.predictions}
     dump.dump(file_path, **payload)
示例#30
0
 def save(predictions, algo):
     """Dump ``predictions`` and ``algo`` to a fixed path, then print 'Saved'."""
     destination = './app/modelo/dump_svd'
     dump.dump(destination, predictions, algo)
     print('Saved')
    del df

    # Configure algorithm
    algo = SVD()
    # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

    trainingSet = data.build_full_trainset()
    algo.fit(trainingSet)

    user_watched_list = [x['movie_id'] for x in user_data]

    return algo, user_watched_list


# Script entry point: build a model for one user and persist both the model
# and that user's watched list under models/.
if __name__ == "__main__":
    import os
    # Pick the import path from the working directory: inside
    # "data_processing" the module is imported directly, otherwise through
    # the data_processing package.
    if os.getcwd().endswith("data_processing"):
        from get_user_ratings import get_user_data
    else:
        from data_processing.get_user_ratings import get_user_data

    # Load ratings data
    df = pd.read_csv('data/training_data.csv')

    # Only the first element of get_user_data's result is used here.
    user_data = get_user_data("samlearner")[0]
    algo, user_watched_list = build_model(df, user_data)

    # Persist the fitted algorithm without predictions.
    dump("models/mini_model.pkl", predictions=None, algo=algo, verbose=1)
    # Despite the .txt extension, this file holds a binary pickle.
    with open("models/user_watched.txt", "wb") as fp:
        pickle.dump(user_watched_list, fp)
algorithm. The SVD algorithm is trained on a dataset and then serialized. It is
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump

# Full trainset built from every rating of the built-in ml-100k dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# Train an SVD with its default configuration.
algo = SVD()
algo.fit(trainset)

# Predictions of the 'original' (never serialized) algorithm.
original_testset = trainset.build_testset()
predictions = algo.test(original_testset)

# Serialize the algorithm to the home directory and read it back.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# Check that the reloaded algorithm yields the very same predictions.
reloaded_testset = trainset.build_testset()
predictions_loaded_algo = loaded_algo.test(reloaded_testset)
assert predictions == predictions_loaded_algo
print('Predictions are the same')
示例#33
0
def train_full(data, algo):
    """Fit ``algo`` on the full trainset of ``data`` and dump it to disk.

    The fitted algorithm is written to the fixed path '../.tmp/svd'.
    """
    full_trainset = data.build_full_trainset()
    algo.fit(full_trainset)
    # Persist the trained model.
    destination = '../.tmp/svd'
    dump.dump(destination, algo=algo)
示例#34
0
    def train_collaborative_filtering(self, grid_search=False, gs_params=None):
        """Train an SVD model on visit/page rankings and dump it under state/.

        Builds a (visitId, pageId, ranking) table from ``self.analytics_df``,
        saves it to 'state/visit_user_ranking.csv', cross-validates and fits
        an SVD, prints the test-set RMSE, writes the fitted algorithm to a
        timestamped 'state/algo_*' file and removes the older dumps.

        Args:
            grid_search: when true, use the estimator with the best RMSE from
                a ``GridSearchCV`` over SVD instead of fixed hyper-parameters.
            gs_params: parameter grid for the search; a built-in grid is used
                when it is falsy. Ignored when ``grid_search`` is false.
        """
        # Derive one ranking per visit from page views and time on site.
        analytics_df_SVD = self.analytics_df.copy()
        analytics_df_SVD['ranking'] = analytics_df_SVD[[
            'totals.pageviews', 'totals.timeOnSite'
        ]].apply(lambda x: self.__generate_ranking(x), axis=1)

        # Explode the page list of each visit into one row per visited page.
        # SECURITY NOTE: eval() executes whatever code the 'pages_visited'
        # strings contain. If this column can come from an untrusted source,
        # parse it with ast.literal_eval instead.
        analytics_df_SVD = (
            analytics_df_SVD['pages_visited']
            .apply(lambda x: pd.Series(eval(x)))
            .stack()
            .reset_index(level=1, drop=True)
            .to_frame('pageId')
            .join(analytics_df_SVD[['visitId', 'ranking']], how='left')
        )
        analytics_df_SVD = analytics_df_SVD.dropna()
        analytics_df_SVD = analytics_df_SVD[['visitId', 'ranking', 'pageId']]
        analytics_df_SVD['pageId'] = analytics_df_SVD['pageId'].apply(int)

        # Saves Matrix for later use
        analytics_df_SVD.to_csv('state/visit_user_ranking.csv')

        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 4))

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            analytics_df_SVD[['visitId', 'pageId', 'ranking']], reader)

        trainset, testset = train_test_split(data, test_size=.1)

        if grid_search:
            # Search for the best hyper-parameters, using the default grid
            # unless the caller supplied one.
            if not gs_params:
                param_grid = {
                    'n_factors': [110, 120, 140, 160],
                    'n_epochs': [90, 100, 110],
                    'lr_all': [0.001, 0.003, 0.005, 0.008],
                    'reg_all': [0.08, 0.1, 0.15],
                }
            else:
                param_grid = gs_params
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(data)
            algo = gs.best_estimator['rmse']
            print(gs.best_score['rmse'])
            print(gs.best_params['rmse'])
        else:
            # Fixed hyper-parameters used when no search is requested.
            algo = SVD(n_factors=110, n_epochs=110, lr_all=0.008, reg_all=0.15)

        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)

        algo.fit(trainset)
        test_pred = algo.test(testset)
        print("SVD : Test Set")
        accuracy.rmse(test_pred, verbose=True)

        # Dump algorithm
        print('Saving trained algo...', end=" ")
        # Collect the existing dumps first so they can be removed afterwards.
        algo_list = glob.glob('state/algo_*')
        file_name = 'state/algo_' + datetime.datetime.now().strftime(
            "%Y_%B_%d__%Ih%M%p")
        dump.dump(file_name, algo=algo)
        new_dump_path = os.path.abspath(file_name)
        for stale_path in algo_list:
            # Fixed: the file name only has minute resolution, so a second run
            # within the same minute reuses an existing name; the original
            # then deleted the dump it had just written.
            if os.path.abspath(stale_path) != new_dump_path:
                os.remove(stale_path)
        print('Done.')