Example #1

# This snippet assumes earlier steps created `model` (a fitted ALS model),
# `df` (the source DataFrame), and the fitted StringIndexer models
# `indexer_acc_fitted` and `indexer_mer_fitted`.
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString
from pyspark.sql.functions import explode

print('############################## - GENERATING RECOMMENDATIONS')

userRecommends = model.recommendForAllUsers(10)
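# `userRecommends` has one row per user with a `recommendations` column: an
# array of (itemIndex, rating) structs holding that user's top-10 items.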

print('############################## - EXPLODING PREDICTIONS')
flatUserRecommends = userRecommends.withColumn(
    'userAndRatings',
    explode(userRecommends.recommendations)).select('userIndex',
                                                    'userAndRatings.*')

print('############################## - CONVERTING INDEXES TO STRING')
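# Map the numeric indexes back to the original string ids, using the labels
# stored in the fitted StringIndexer models.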
userConverter = IndexToString(inputCol='userIndex',
                              outputCol='userId',
                              labels=indexer_acc_fitted.labels)
itemConverter = IndexToString(inputCol='itemIndex',
                              outputCol='itemId',
                              labels=indexer_mer_fitted.labels)

convertedMoviesRecs = Pipeline(
    stages=[userConverter, itemConverter]).fit(df).transform(flatUserRecommends)

print('############################## - SAVING DATA')
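# Release the cached source DataFrame; cache the recommendations instead,
# since they are used twice below (show and write).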
df.unpersist()
convertedMoviesRecs.cache()
convertedMoviesRecs.show()

convertedMoviesRecs.write.json('/ML/movies/usersrec/')

# Example submit command. Note: spark-submit flags must come before the
# application script, and heap sizes go through --driver-memory /
# --executor-memory (Spark rejects -Xms/-Xmx inside extraJavaOptions):
#
# spark-submit --master yarn --deploy-mode client --num-executors 3 \
#   --driver-memory 1g --executor-memory 3g \
#   --driver-java-options "-XX:+UseG1GC -XX:-ResizePLAB -XX:InitiatingHeapOccupancyPercent=35" \
#   --conf "spark.sql.tungsten.enabled=true" \
#   --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
#   --conf "spark.memory.fraction=0.9" \
#   --conf "spark.driver.memoryOverhead=1g" \
#   --conf "spark.executor.memoryOverhead=1g" \
#   --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:-ResizePLAB -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=20" \
#   --conf "spark.scheduler.mode=FAIR" \
#   als-model-predictions.py
Example #2
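
# `wordLengthUDF` and `wordCountUDF` are assumed to be defined earlier in the
# notebook. A minimal sketch of what they might compute (hypothetical
# semantics: average word length and word count):
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType, IntegerType

wordCountUDF = udf(lambda text: len(text.split()), IntegerType())
wordLengthUDF = udf(
    lambda text: float(sum(len(w) for w in text.split())) / max(len(text.split()), 1),
    DoubleType())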
# UDFTransformer (from MMLSpark, now SynapseML, e.g.
# `from synapse.ml.stages import UDFTransformer`) wraps a Python UDF as a
# reusable pipeline stage.
wordLength = "wordLength"
wordCount = "wordCount"
wordLengthTransformer = UDFTransformer(inputCol="text", outputCol=wordLength, udf=wordLengthUDF)
wordCountTransformer = UDFTransformer(inputCol="text", outputCol=wordCount, udf=wordCountUDF)


# COMMAND ----------

from pyspark.ml import Pipeline

# Add the word-level features, then binarize the rating into a boolean label
# (it is cast to a numeric type in section 4a before training).
data = Pipeline(stages=[wordLengthTransformer, wordCountTransformer]) \
       .fit(rawData).transform(rawData) \
       .withColumn("label", rawData["rating"] > 3).drop("rating")

# COMMAND ----------

data.show(5)

# COMMAND ----------

# MAGIC %md ### 4a. Classify using pyspark
# MAGIC 
# MAGIC To choose the best LogisticRegression classifier using the `pyspark`
# MAGIC library, we need to *explicitly* perform the following steps
# MAGIC (a sketch follows after this list):
# MAGIC 
# MAGIC 1. Process the features:
# MAGIC    * Tokenize the text column
# MAGIC    * Hash the tokenized column into a vector using the hashing trick
# MAGIC    * Merge the numeric features with that vector
# MAGIC 2. Process the label column: cast it into the proper type.
# MAGIC 3. Train multiple LogisticRegression models on the `train` dataset
# MAGIC    with different hyperparameters
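
# COMMAND ----------

# MAGIC %md A minimal sketch of the three steps above. It assumes `train` comes
# MAGIC from a split of `data`; the column names `tokenizedText` and
# MAGIC `textFeatures`, the feature size, and the hyperparameter values are
# MAGIC illustrative, not taken from the original notebook.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, VectorAssembler

# Hypothetical split; the notebook defines `train` elsewhere.
train, test = data.randomSplit([0.8, 0.2])

# 1. Featurize: tokenize the text, hash the tokens into a fixed-size vector,
#    and merge it with the numeric word-level features computed earlier.
tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")
hashingTF = HashingTF(inputCol="tokenizedText", outputCol="textFeatures",
                      numFeatures=1 << 17)
assembler = VectorAssembler(inputCols=["textFeatures", wordLength, wordCount],
                            outputCol="features")

# 2. Cast the boolean label into a numeric type the classifier accepts.
train = train.withColumn("label", train["label"].cast("double"))

# 3. Train one LogisticRegression model per hyperparameter setting.
lrModels = []
for regParam in [0.05, 0.1, 0.2]:
    lr = LogisticRegression(regParam=regParam, labelCol="label",
                            featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, assembler, lr])
    lrModels.append(pipeline.fit(train))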