def test_rmse():
    """Train ALS on the MovieLens ratings file and print the test-set RMSE.

    Steps:
      (1) load the movies and ratings schemas from local JSON files
      (2) make sure the gzipped ratings file is present in HDFS
      (3) read the ratings into a DataFrame and turn it into an RDD of
          [(userId, itemId, rating)]
      (4) randomly split the RDD into training (60%) and test (40%)
      (5) train an ALS model with rank 3, predict the test pairs, and print
          the RMSE together with the time reported by Timer for each stage

    Nothing is asserted; the function only prints and returns None.

    Raises:
        subprocess.CalledProcessError: if creating the HDFS directory or
            uploading the ratings file fails.
    """
    # imported locally so that this function does not rely on a module-level
    # import of subprocess
    import subprocess

    # TODO: revise so that it will take user's inputs instead of hardcoded values
    ratings_file = "movielens_10m_ratings.json.gz"
    # The same fully qualified URIs are used for the upload and for the read.
    # A relative HDFS path (e.g. "datasets") is resolved against the user's
    # HDFS home directory, which is not the absolute /datasets directory the
    # DataFrame is read from.
    hdfs_dir = "hdfs://localhost:9000/datasets"
    hdfs_file = "%s/%s" % (hdfs_dir, ratings_file)

    movies_schema = None
    ratings_schema = None

    # load the schemas
    # NOTE(review): the schema files are named "20m" while the ratings file is
    # "10m"; presumably both datasets share one schema -- confirm.
    # NOTE(review): movies_schema is loaded but not used below.
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create the hdfs directory; -p makes this a no-op when it already exists
    subprocess.check_call(["hdfs", "dfs", "-mkdir", "-p", hdfs_dir])

    # load the json file into the hdfs directory unless it is already there,
    # so that re-running the test neither fails nor re-uploads the file
    if subprocess.call(["hdfs", "dfs", "-test", "-e", hdfs_file]) != 0:
        subprocess.check_call(["hdfs", "dfs", "-put", ratings_file, hdfs_file])

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(hdfs_file, schema=ratings_schema)

    # explicitly repartition RDD after loading so that more tasks can run on
    # it in parallel
    # by default, defaultMinPartitions == defaultParallelism == estimated # of
    # cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    # (go through .rdd explicitly: DataFrame.map is not available on every
    # Spark version, DataFrame.rdd is)
    ratingsRDD = ratingsDF.rdd.map(
        lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    # print(<single string>) produces the same output on Python 2 and 3
    print("ALS.train(trainingRDD, rank=3): %s seconds" % t.secs)

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(
            testRDD.map(lambda x: (x[0], x[1]))).cache()
    print("testPredRDD: %s seconds" % t.secs)

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse: %s seconds" % t.secs)
    print("testRmse %s" % (testRmse,))
def test_simple_rmse():
    """
    Test RMSE as follows:
    (1) train the ALS model with a subset of 15 values
    (2) predict a subset of 15 values using the trained model
    (3) calculate RMSE or how accurately the prediction is
        in comparison to the known values

    Values used to train the ALS model are based on a fictitious world where
    5 users rate 4 items whether they like or dislike an item. If the user
    liked the item, he will provide a rating of 1; otherwise, if the user
    disliked the item, he will provide a rating of -1. No rating means that
    the user has not rated the item. This data will be formatted in an RDD of
    [(userId, itemId, rating)]. Splitting these 15 values into training,
    validation, and test dataset is randomly selected.

                    itemId
                    0    1    2    3
        userId  0   1   -1    1    1
                1        1   -1   -1
                2   1    1   -1
                3  -1         1
                4   1    1        -1

    0: (0, 0, 1)
    1: (0, 1, -1)
    2: (0, 2, 1)
    3: (0, 3, 1)
    4: (1, 1, 1)
    5: (1, 2, -1)
    6: (1, 3, -1)
    7: (2, 0, 1)
    8: (2, 1, 1)
    9: (2, 1, -1)
    10: (3, 0, -1)
    11: (3, 2, 1)
    12: (4, 0, 1)
    13: (4, 1, 1)
    14: (4, 3, -1)

    NOTE(review): entry 9 rates the same (user, item) pair as entry 8 with the
    opposite sign, and one matrix cell cannot hold two values. It was
    presumably meant to be a different item for user 2 (the matrix above
    shows it under item 2) -- confirm before changing the data; the arrays in
    the code still use (2, 1, -1).

    Nothing is asserted; the function prints the selected rank, its validation
    RMSE and the test RMSE, and returns None.

    Raises:
        RuntimeError: if no candidate rank yields a validation RMSE that
            compares lower than infinity (for example when it is NaN), so no
            model could be selected.
    """

    # load the data, an RDD of [(userId, itemId, rating)]
    # split data into train (60%), validation (20%), test(20%)
    #   training (9): data to train the model
    #   validation (3): best performing approach using the validation data
    #   test (3): estimate accuracy of the selected approach
    # TODO: possible split using sklearn's train_test_split?
    trainingArray = [(4, 3, -1), (1, 1, 1), (3, 0, -1), (4, 0, 1), (1, 2, -1),
                     (0, 0, 1), (2, 1, -1), (0, 2, 1), (1, 3, -1)]
    validationArray = [(4, 1, 1), (3, 2, 1), (2, 1, 1)]
    testArray = [(2, 0, 1), (0, 1, -1), (0, 3, 1)]

    trainingRDD = scsingleton.sc.parallelize(trainingArray)
    validationRDD = scsingleton.sc.parallelize(validationArray)
    testRDD = scsingleton.sc.parallelize(testArray)

    # run training algorithm to build the model
    isExplicit = True
    ranks = [3, 5, 7]
    #numIters = [5]      # default value
    #lmbdas = [0.01]     # default value
    #blocks = -1         # default value
    #nonnegative = False # default value
    #seed = None         # default value
    #alpha = [0.01]      # default value
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0

    # with validation: keep the model whose validation RMSE is the lowest
    #for rank, numIter, lmbda in itertools.product(ranks, numIters, lmbdas):
    for rank in ranks:
        if isExplicit:
            model = ALS.train(trainingRDD, rank)
        else:
            # TODO: figure out why trainImplicit crash
            model = ALS.trainImplicit(trainingRDD, rank, iterations=5, alpha=0.01)
        validationPredRDD = model.predictAll(
            validationRDD.map(lambda x: (x[0], x[1])))
        validationRmse = pm.calculate_rmse_using_rdd(validationRDD, validationPredRDD)
        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank

    # A NaN (or infinite) RMSE never compares lower than infinity, which would
    # leave bestModel unset; fail with a clear message instead of crashing on
    # None.predictAll below.
    if bestModel is None:
        raise RuntimeError(
            "no rank in %s produced a usable validation RMSE" % (ranks,))

    print("bestRank = %s, bestValidationRmse = %s"
          % (bestRank, bestValidationRmse))

    # make a prediction
    # (to skip validation instead, train a single model with
    # ALS.train(trainingRDD, rank=3) and predict on testRDD directly)
    testPredRDD = bestModel.predictAll(
        testRDD.map(lambda x: (x[0], x[1]))).cache()

    # calculate RMSE
    testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    # two spaces after "=" reproduce the output of the former
    # `print "... = ", testRmse` statement exactly
    print("testRmse using RDD =  %s" % (testRmse,))