from pyspark.ml.recommendation import ALS


def train_model(df, target_col, parameters, path):
    '''
    Train an ALS model for a specific target column.

    Parameters
    ----------
    df : Spark DataFrame
        DataFrame to use for training.
    target_col : str
        Name of the target (rating) column to use.
    parameters : dict
        Parameters to use for the ALS estimator (maxIter, regParam, rank).
    path : str
        Path on HDFS under which to save the model.

    Returns
    -------
    None
        The fitted model is saved under ``path``.
    '''
    maxIter = parameters["maxIter"]
    regParam = parameters["regParam"]
    rank = parameters["rank"]

    model = ALS(maxIter=maxIter, regParam=regParam, rank=rank, 
            userCol="user", itemCol="tweet", ratingCol=target_col,
            coldStartStrategy="nan", implicitPrefs=True).fit(df)

    model.save(path + target_col + "_als_model")
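# Usage sketch (not part of the original snippet): the SparkSession, the input
# path, the interaction column name "like", and the parameter values below are
# all assumptions made purely to illustrate how train_model is called.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("als-train").getOrCreate()
df = spark.read.parquet("hdfs:///data/interactions.parquet")  # columns: user, tweet, like, ...

params = {"maxIter": 10, "regParam": 0.1, "rank": 16}
train_model(df, target_col="like", parameters=params, path="hdfs:///models/")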
Example #2
from pyspark.ml.recommendation import ALS


def main(spark, rank, regParam, path, fraction):
    train = spark.read.parquet("{}/data/processed/train_{}.parquet".format(path, fraction))
    als = ALS(rank=rank, maxIter=5, seed=42, regParam=regParam,
              userCol='user_id', itemCol='book_id', ratingCol='rating',
              coldStartStrategy="drop")
    print("Training ALS model with rank {} and regularization {} with {} of data...".format(rank, regParam, fraction))
    model = als.fit(train)
    temp_path = "/ALS_{}_{}_{}".format(rank, regParam, fraction)
    als_path = temp_path + "/als"
    print("Saving model...")
    als.save(path + "/models" + als_path)
    model_path = temp_path + "/als_model"
    model.save(path + "/models" + model_path)
Example #3

from pyspark.sql.functions import col, explode

# Run the best model on the test data
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print("RMSE value is", "%.15f" % RMSE)

test_predictions.show()

n_recommendations = best_model.recommendForAllUsers(10)
n_recommendations.limit(10).show()

n_recommendations = n_recommendations \
    .withColumn("rec_exp", explode("recommendations")) \
    .select('userId', col("rec_exp.packageId"), col("rec_exp.rating"))

n_recommendations.limit(10).show()

n_recommendations_550031373 = n_recommendations.join(
    packages, on='packageId').filter('userId = 550031373')

print("category of recommended items for n_recommendations_550031373")
n_recommendations_550031373.join(ratings_orig, on='packageId').filter(
    ratings_orig.userId == '550031373').limit(100).show()

print("category of original score items before recommendation for 550031373")
ratings_orig.join(packages, on='packageId').filter(
    ratings_orig.userId == '550031373').limit(100).show()

# Save the model into cluster as files
als.save('../models/FlexiGYMImplicitRecommender_ALS')
model.save('../models/FlexiGYMImplicitRecommender_ALSModel')
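# Alternative sketch (not from the original project): filtering the output of
# recommendForAllUsers down to one user, as done above for 550031373, scores
# every user first; recommendForUserSubset scores only the users passed in.
# Reuses best_model from above and assumes a `spark` session is in scope and
# that userId is stored as an integer.
single_user = spark.createDataFrame([(550031373,)], ["userId"])
user_recs = best_model.recommendForUserSubset(single_user, 10)
user_recs.show(truncate=False)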
Example #4
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

(training, test) = ratings.randomSplit([0.8, 0.2])

als = ALS(maxIter=5,
          regParam=0.01,
          userCol="uid",
          itemCol="urlid",
          ratingCol="label")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="label",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

predictions.head(5)
predictions.write.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/predictions_test"
)
print "finish predictions"

# Save and load model
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model_test"
als.save(als_path)
als2 = ALS.load(als_path)
print "finish load"