from pyspark.ml.classification import MultilayerPerceptronClassificationModel
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, Word2VecModel
from pyspark.sql.functions import col, regexp_replace, udf
from pyspark.sql.types import ArrayType, FloatType

# Only keep tweets in English
tweets_df = tweets_df.filter(tweets_df.lang == "en")

# Strip URLs, @mentions, newlines, retweet markers and digits
tweets_df = tweets_df.withColumn(
    "cleaned_tweets",
    regexp_replace(col("tweets"), r"http\S+|@\w+|\n|RT|\d+", " "))

# All words are lowercased and tokenized on non-word characters
tweets_df = RegexTokenizer(
    inputCol="cleaned_tweets", outputCol="lowercase_tweets",
    pattern="\\W").transform(tweets_df)

# We remove the stop words
tweets_df = StopWordsRemover(
    inputCol="lowercase_tweets",
    outputCol="processed_tweets").transform(tweets_df)

# We drop the unused columns
tweets_df = tweets_df.drop("cleaned_tweets", "lowercase_tweets", "lang", "date")

# We load the language model
model_path = "s3://" + bucket_name + "/models/w2v_model"
loaded_model = Word2VecModel.load(model_path)

# We add the feature column: the average of the word vectors of each tweet
tweets_df = loaded_model.transform(tweets_df)

# We load the classifier and score the tweets
clf_path = "s3://" + bucket_name + "/models/mpc_model"
loaded_clf = MultilayerPerceptronClassificationModel.load(clf_path)
predictions = loaded_clf.transform(tweets_df)

# We convert the probability vector into a plain array of floats
to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
predictions = predictions.withColumn("probability", to_array("probability"))
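# A possible next step (an assumption, not in the original snippet): keep only
# the probability of the predicted sentiment, indexing the array built above
# with the class index stored in the classifier's "prediction" column.
from pyspark.sql.functions import expr

predictions = predictions.withColumn(
    "predicted_probability",
    expr("probability[cast(prediction as int)]"))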
path = "hdfs:/input/prices.csv", header = False, schema = StructType([StructField("asin", StringType(), True), StructField("price", FloatType(), True)])) reviews = reviews.join(prices, ["asin"], how="leftsemi") # # Use nltk.word_tokenizer to tokenize words # @udf(ArrayType(StringType())) # def tokenize(string): # return word_tokenize(string) # reviews = reviews.withColumn("words", tokenize("reviewText")) reviews = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(reviews) reviews = reviews.drop("reviewText") reviews = reviews.withColumn("num_words", size("words")) reviews = reviews.drop("words") reviews = reviews.groupBy("asin").agg(avg("num_words").alias("average_review_length")) reviews = reviews.drop("num_words") data = reviews.join(prices, ["asin"]) data = data.drop("asin") data = data.repartition(20) xy = data.rdd.map(lambda row: (row.average_review_length, row.price)) xy = xy.coalesce(8) x = xy.map(lambda v: v[0]) y = xy.map(lambda v: v[1])
escape = "\"", schema = StructType([StructField("reviewId", IntegerType(), True), StructField("asin", StringType(), True), StructField("reviewText", StringType(), True)])) df = df.drop("asin") df = df.repartition(20) # # Use nltk.word_tokenizer to tokenize words # @udf(ArrayType(StringType())) # def tokenize(string): # return word_tokenize(string) # df = df.withColumn("words", tokenize("reviewText")) df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df) df = df.drop("reviewText") cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df) vocabulary = cv_model.vocabulary df = cv_model.transform(df) df = df.drop("words") df.cache() df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df) df = df.drop("tf") df.unpersist() @udf(MapType(StringType(), FloatType())) def create_map(vector): zipped = zip(vector.indices, vector.values)