def user_genre_scores(spark, ratings, movies, user_ids):
    """Return a DataFrame of genre scores for users matching the IDs in user_ids."""
    # Find ratings made by the given users
    filtered_ratings = ratings.where(ratings.userId.isin(user_ids))

    # Find the movies associated with those ratings
    movies_ratings = filtered_ratings.join(movies, "movieId")

    # Expand genre arrays into multiple rows
    movies_ratings = movies_ratings.withColumn(
        "genres", explode(movies_ratings.genres)) \
        .withColumnRenamed("genres", "genre")

    # Find the sum and count of ratings for each user and genre
    scores = movies_ratings.groupBy('userId', 'genre').agg(
        count('rating').alias("ratingCount"),
        spark_sum('rating').alias("ratingSum"))

    # Add one 5.0 and one 0.0 rating to the aggregates (smoothing)
    scores = scores.withColumn("ratingCount", expr("ratingCount + 2")) \
        .withColumn("ratingSum", expr("ratingSum + 5"))

    # The mean rating becomes the "score"
    scores = scores.withColumn("score", col("ratingSum") / col("ratingCount")) \
        .drop(col("ratingCount")).drop("ratingSum")
    return scores
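
# A minimal usage sketch for user_genre_scores. The column names (userId, movieId,
# rating, genres) follow the function body above; the sample rows, the import
# aliases, and the app name are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, explode, expr, sum as spark_sum

spark = SparkSession.builder.appName("genre-scores-demo").getOrCreate()

ratings = spark.createDataFrame(
    [(1, 10, 4.0), (1, 11, 2.0), (2, 10, 5.0)],
    ["userId", "movieId", "rating"])
movies = spark.createDataFrame(
    [(10, ["Action", "Comedy"]), (11, ["Drama"])],
    ["movieId", "genres"])

# One smoothed score per (userId, genre) pair for the requested users
user_genre_scores(spark, ratings, movies, [1, 2]).show()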
def get_distribution(self, dataset):
    ticket_categories = dataset.select(self.getInputCols())
    # Sum each input column in a single aggregation, then normalise into a distribution
    aggregation_row = ticket_categories.agg(
        *[spark_sum(c) for c in self.getInputCols()]).collect()
    sums = aggregation_row[0].asDict().values()
    total = sum(sums)
    return [one / total for one in sums]
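
# A standalone sketch of the same column-sum distribution outside the transformer
# class; the column names and sample rows are assumptions made for illustration and
# input_cols stands in for self.getInputCols().
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum

spark = SparkSession.builder.getOrCreate()
input_cols = ["bug", "feature", "question"]
df = spark.createDataFrame([(1, 0, 1), (0, 1, 1)], input_cols)

row = df.agg(*[spark_sum(c) for c in input_cols]).collect()[0]
sums = row.asDict().values()
total = sum(sums)
print([one / total for one in sums])  # -> [0.25, 0.25, 0.5]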
def get_home_usage(nol_usage_sdf):
    """
    Derive a 'homeusage' tier for each row from the ratio of its running 'value'
    total to the total 'weight' ('twh') of its brand/cc group.

    :param nol_usage_sdf: usage DataFrame with at least brand_cs_id, cc, duration,
                          weight and value columns
    :return: the input DataFrame with rw, twh, rw_twh and homeusage columns added
    """
    # Total weight ('twh') per brand/cc pair, counting only rows with positive duration
    twh_usage_sdf = nol_usage_sdf.filter('duration > 0').groupBy(
        'brand_cs_id', 'cc').agg(spark_sum('weight').alias('twh'))

    # Running sum of 'value' within each brand, ordered by cc
    windowval = (Window.partitionBy('brand_cs_id').orderBy('cc').rangeBetween(
        Window.unboundedPreceding, 0))
    sdf_1 = nol_usage_sdf.withColumn(
        'rw', spark_sum('value').over(windowval)).join(
            twh_usage_sdf, ['brand_cs_id', 'cc'])
    sdf_2 = sdf_1.withColumn('rw_twh', sdf_1['rw'] / sdf_1['twh'])

    # Bucket the running share into tiers: 0 for zero duration, then 3/2/1 by thirds
    sdf_3 = sdf_2.withColumn(
        'homeusage',
        when(sdf_2['duration'] == 0.0, 0).otherwise(
            when(sdf_2['rw_twh'] < 1.0 / 3.0, 3).otherwise(
                when(sdf_2['rw_twh'] < 2.0 / 3.0, 2).otherwise(1))))
    return sdf_3
def get_total_revenue(input_df):
    # Sum the revenue of purchase events across each visit; only rows that open a
    # new session carry the per-session total, all other rows get ''.
    w = Window.partitionBy(col("visit_id"))
    return input_df \
        .withColumn(
            'total_revenue_per_session',
            when(col("is_new_session") == '1',
                 spark_sum(
                     when((col("body_t") == 'event') & (col("body_pa") == 'purchase'),
                          col("body_tr")).otherwise(lit(''))
                 ).over(w)).otherwise(lit('')))
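
# A hypothetical usage sketch for get_total_revenue; the clickstream rows below are
# made up, but the column names follow what the function expects.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as spark_sum, when
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
hits = spark.createDataFrame(
    [("v1", "1", "event", "purchase", "19.99"),
     ("v1", "0", "event", "purchase", "5.00"),
     ("v1", "0", "pageview", None, None)],
    ["visit_id", "is_new_session", "body_t", "body_pa", "body_tr"])

# Only the session-opening row carries the per-visit purchase total.
get_total_revenue(hits).show()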
def compute(df):
    """
    Parse JSON and sum all the random-ints.

    :type df: DataFrame
    """
    schema = StructType([
        StructField("random-string", StringType(), nullable=True),
        StructField("random-int", IntegerType(), nullable=True)
    ])
    return df.select(from_json(df.value, schema).alias("js")) \
        .agg(spark_sum("js.random-int").alias("s"))
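
# An illustrative call to compute(); the JSON payloads are made up but match the
# schema declared inside the function, and the input column must be named "value".
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, sum as spark_sum
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame(
    [('{"random-string": "a", "random-int": 3}',),
     ('{"random-string": "b", "random-int": 4}',)],
    ["value"])

compute(raw).show()  # single row: s = 7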
def calculate_prob_h_v(self, visible_activation):
    # print("begin calculate_prob_h_v")
    # visible_activation is a dataframe with 3 columns (self.getUserCol(),
    # col_name_item_index, value); for each item index exactly one softmax unit
    # has a value of 1 (all others are 0).
    # print("user count: " + str(visible_activation.select(self.getUserCol()).distinct().count()))
    prob_h_df = visible_activation.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_visible) \
        .join(broadcast(self._weight.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_weight)),
              [RBMCore.col_name_item_index]) \
        .withColumn(RBMCore.col_name_mul,
                    self.array_dot_udf(col(RBMCore.col_name_value_visible), col(RBMCore.col_name_value_weight))) \
        .groupBy(self.getUserCol(), RBMCore.col_name_hidden_node_index) \
        .agg(spark_sum(RBMCore.col_name_mul).alias("sum")) \
        .join(broadcast(self._hidden_layer_bias.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_hidden)),
              [RBMCore.col_name_hidden_node_index]) \
        .withColumn(RBMCore.col_name_probability,
                    self.sigmoid_double_udf(col("sum") + col(RBMCore.col_name_value_hidden))) \
        .select(self.getUserCol(), RBMCore.col_name_hidden_node_index, RBMCore.col_name_probability)
    # print("prob_h_v_df count: {}".format(prob_h_df.count()))
    # prob_h_df.show()
    # print("done calculate_prob_h_v")
    return prob_h_df
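
# For reference, the quantity that calculate_prob_h_v assembles is the standard RBM
# hidden-unit conditional, assuming array_dot_udf computes a dot product and
# sigmoid_double_udf is the logistic function:
#
#     P(h_j = 1 | v) = sigmoid( b_j + sum_i v_i * W_ij ),   sigmoid(x) = 1 / (1 + exp(-x))
#
# i.e. each (user, hidden node) row holds the probability that hidden unit j turns
# on given that user's visible (softmax) activations.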
def count_null(col_name):
    # 1 for every null value in the column, 0 otherwise, summed up
    return spark_sum(col(col_name).isNull().cast('integer')).alias(col_name)
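
# A typical way the helper above gets used: one null-count aggregation per column.
# The sample frame is made up for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, None), (2, "b"), (None, "c")],
    ["id", "name"])

df.select([count_null(c) for c in df.columns]).show()  # one row: id = 1, name = 1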
def make_udfs():
    # it's faster to sort this data in the driver
    geodata_local = sorted(geodata.collect(), key=lambda x: x[0])
    # "scuffed" reshape: one tuple of network start addresses, one tuple of countries
    geodata_local = (tuple(x["network"] for x in geodata_local),
                     tuple(x["country"] for x in geodata_local))
    geodata_bcast = sc.broadcast(geodata_local)

    def ip2country(ip):
        # Do work analogous to the Hive version; the only difference is that in
        # Spark we can make use of shared (broadcast) variables.
        import bisect
        ip_num = ip2num(ip)
        idx = bisect.bisect_left(geodata_bcast.value[0], ip_num)
        return geodata_bcast.value[1][max(idx - 1, 0)]

    return udf(ip2country, StringType())


ip2country = make_udfs()

# part 2: write a simple query
top_countries = df \
    .select(col("price"), ip2country(col("ip")).alias("country")) \
    .cache() \
    .groupBy("country") \
    .agg(spark_sum(col("price")).alias("total")) \
    .orderBy(col("total").desc()) \
    .limit(10)

top_countries.show()
write_to_mysql(top_countries, "spark_top_countries")
def compute_gradient(self, v0_df, vk_df, ph0_df, phk_df, number_of_user):
    # print("begin compute_gradient")
    # print("compute gradient")
    # vk_df = self.spark.createDataFrame(vk_df.rdd, vk_df.schema).cache()  # cloning prevents a join bug

    # gradient of the hidden bias
    dhb_df = ph0_df.withColumnRenamed(RBMCore.col_name_probability, RBMCore.col_name_value_0) \
        .join(phk_df.withColumnRenamed(RBMCore.col_name_probability, RBMCore.col_name_value_k),
              [self.getUserCol(), RBMCore.col_name_hidden_node_index]) \
        .withColumn(RBMCore.col_name_value,
                    (col(RBMCore.col_name_value_0) - col(RBMCore.col_name_value_k)) / number_of_user) \
        .groupBy(RBMCore.col_name_hidden_node_index) \
        .agg(spark_sum(RBMCore.col_name_value).alias("sum_" + RBMCore.col_name_value)) \
        .withColumnRenamed("sum_" + RBMCore.col_name_value, RBMCore.col_name_value) \
        .select(RBMCore.col_name_hidden_node_index, RBMCore.col_name_value)
    # dhb_df.printSchema()
    # dhb_df.show()

    # gradient of the visible bias
    dvb_df = v0_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_0) \
        .join(vk_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_k),
              [self.getUserCol(), RBMCore.col_name_item_index]) \
        .withColumn(RBMCore.col_name_value,
                    self.array_minus_long_udf(col(RBMCore.col_name_value_0), col(RBMCore.col_name_value_k))) \
        .withColumn(RBMCore.col_name_value,
                    self.array_div_udf(col(RBMCore.col_name_value), lit(number_of_user))) \
        .groupBy(RBMCore.col_name_item_index) \
        .agg(collect_list(RBMCore.col_name_value).alias("list_" + RBMCore.col_name_value)) \
        .withColumn(RBMCore.col_name_value,
                    self.array_sum_array_udf(col("list_" + RBMCore.col_name_value))) \
        .select(RBMCore.col_name_item_index, RBMCore.col_name_value)
    # dvb_df.printSchema()
    # dvb_df.show()

    # gradient of the weights: the data term ...
    weight_0 = v0_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_activation) \
        .join(ph0_df, [self.getUserCol()]) \
        .withColumn(RBMCore.col_name_value_0,
                    self.array_multiply_udf(col(RBMCore.col_name_activation), col(RBMCore.col_name_probability))) \
        .groupBy(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index) \
        .agg(collect_list(RBMCore.col_name_value_0).alias("list_" + RBMCore.col_name_value)) \
        .withColumn(RBMCore.col_name_value_0,
                    self.array_sum_array_udf(col("list_" + RBMCore.col_name_value))) \
        .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value_0)
    # .repartition(ncore, [col_name_item_index, col_name_soft_max_unit_index])
    # weight_0.printSchema()
    # weight_0.cache()
    # weight_0.show()

    # ... and the reconstruction term
    weight_k = vk_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_activation) \
        .join(phk_df, [self.getUserCol()]) \
        .withColumn(RBMCore.col_name_value_k,
                    self.array_multiply_udf(col(RBMCore.col_name_activation), col(RBMCore.col_name_probability))) \
        .groupBy(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index) \
        .agg(collect_list(RBMCore.col_name_value_k).alias("list_" + RBMCore.col_name_value)) \
        .withColumn(RBMCore.col_name_value_k,
                    self.array_sum_array_udf(col("list_" + RBMCore.col_name_value))) \
        .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value_k)
    # .repartition(ncore, [col_name_item_index, col_name_soft_max_unit_index])
    # weight_k.printSchema()
    # weight_k.cache()
    # weight_k.show()

    dw_df = weight_0.join(broadcast(weight_k),
                          [RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index]) \
        .withColumn(RBMCore.col_name_value,
                    self.array_minus_double_udf(col(RBMCore.col_name_value_0), col(RBMCore.col_name_value_k))) \
        .withColumn(RBMCore.col_name_value,
                    self.array_div_udf(col(RBMCore.col_name_value), lit(number_of_user))) \
        .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value)
    # dw_df.printSchema()
    # dw_df.show()
    # print("done compute_gradient")
    return dw_df, dvb_df, dhb_df
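
# For orientation, the three outputs appear to be the usual contrastive-divergence
# (CD-k) gradient estimates, averaged over the N = number_of_user users in the batch
# (v0/ph0 come from the data, vk/phk from the k-th Gibbs step):
#
#     dW[i][j] = (1/N) * sum_u ( v0[u][i] * ph0[u][j] - vk[u][i] * phk[u][j] )
#     dvb[i]   = (1/N) * sum_u ( v0[u][i] - vk[u][i] )
#     dhb[j]   = (1/N) * sum_u ( ph0[u][j] - phk[u][j] )
#
# with the item-indexed quantities held as arrays (one entry per softmax rating value).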
def process_analysis_window(analysis_window, data_table, results_table):
    """
    Process a given analysis window using Spark with the following steps:

    1. Load the news article data from DynamoDB
    2. Start a Spark session if one does not exist
    3. Parallelize the data to the Spark nodes
    4. Process the data from raw text to the final sentiment score and label
    5. Store the results in DynamoDB
    """
    ### 1. Load the news article data from DynamoDB
    dynamo = DynamoDBHelper()
    news_data = dynamo.read_table(table_name=data_table,
                                  target_analysis_window=analysis_window)
    # TODO use logger
    print('@@@Loaded DynamoDB table')

    ### 2. Start Spark session
    # TODO maybe move this to main and make the spark context an argument
    # to this function; that way the spark context can be initialized
    # based on command-line arguments such as --local_test
    # if local_test == True:
    #     spark = (SparkSession.builder
    #              .appName("SparkTest")  # Set app name
    #              .master("local[2]")    # Run locally with 2 cores
    #              .config("spark.driver.memory", "4g")
    #              .config("spark.executor.memory", "3g")
    #              .getOrCreate())
    spark = (
        SparkSession.builder.appName("SparkTest")  # Set app name
        .getOrCreate())
    # TODO use logger
    print('@@@Started Spark Session')
    print('@@@ Configurations @@@')
    print(spark.sparkContext._conf.getAll())

    ### 3. Parallelize the data to the Spark nodes
    # Convert the list to an RDD
    news_rdd = spark.sparkContext.parallelize(news_data)
    # Create a data frame
    news_df = spark.createDataFrame(news_rdd)

    ### 4. Process the data from raw text to the final sentiment score and label
    # TODO use logger
    print('@@@ Initializing Sentiment UDF')
    # Register the sentiment analysis pipeline as a Spark UDF
    s_pipe = SentimentAnalysisPipeline()
    udf_schema = StructType([
        StructField("label", StringType(), nullable=False),
        StructField("score", FloatType(), nullable=False)
    ])
    s_pipe_udf = udf(s_pipe.raw_text_to_sentiment, udf_schema)

    # TODO use logger
    print('@@@ Defining transformations')
    # Start defining Spark transformations. Note that these transformations are
    # lazily evaluated, so they are executed only at the end when an action is
    # triggered.

    # Run all news titles through the sentiment pipeline.
    # TODO switch to full-article analysis with a summarizer later on.
    # Drop nulls; most likely there won't be any for titles.
    title_df = news_df.select('analysis_window', 'analysis_date', 't_symb',
                              'news_timestamp', 'news_title').na.drop()
    sentiment_df = title_df.withColumn('sentiment',
                                       s_pipe_udf(title_df['news_title']))
    # Subset columns
    sentiment_df = sentiment_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'news_timestamp', 'news_title',
                                       'sentiment.label', 'sentiment.score')

    ## Final Label Calculation:
    # We want a final label of either **POSITIVE**, **NEGATIVE**, or **UNCERTAIN**.
    # We use a somewhat naive and simple approach to calculating sentiment through
    # weighted averaging.
    # Criteria for the final score:
    # * The final score should be between -1 and 1.
    # * The older the news, the less important it is: scores are weighted
    #   exponentially less for every 3 hours they trail the most recent news.
    #   E.g. the most recent news has a weight of 1, news 3 hours older than the
    #   MAX timestamp has a weight of 0.5, 6 hours older has a weight of 0.25,
    #   and so on.
    # * Any score strictly between -0.5 and 0.5 is labelled UNCERTAIN
    # * Scores of 0.5 or greater are labelled POSITIVE
    # * Scores of -0.5 or less are labelled NEGATIVE

    ## Scores are bounded between -1 and 1
    # If the label is NEGATIVE, make the score value negative.
    # TODO make this line a little nicer to read.
    sentiment_df = sentiment_df.withColumn(
        'score', (when(sentiment_df.label == 'NEGATIVE',
                       -sentiment_df.score).otherwise(sentiment_df.score)))

    ## Old news weigh less
    # The older the news, the less important it is: scores are weighted
    # exponentially less for every 3 hours they trail the most recent timestamp in
    # the analysis window.
    # Calculate the weight factor.
    # Since we want the weight to halve every 3 hours, the weight is simply
    # 1 / (2^h), where h is the number of hours away from the max divided by 3 and
    # rounded down [h = floor(diff_hour / 3)].
    # E.g. a 5-hour difference from the MAX timestamp means h = floor(5/3) = 1 and
    # weight = 1/(2^1) = 1/2.
    # Spark transformations needed:
    # 1. Get the latest (max) timestamp of news articles for each analysis window and stock
    # 2. Convert news_timestamp, which is seconds from epoch, to hours from epoch
    # 3. Calculate the hour difference between the current row value and the max
    #    value in terms of news_timestamp hours from epoch
    # 4. Divide this difference by 3 and take the floor
    # 5. Calculate the weight, which is 1/(2^h) where h = floor(diff_hour/3) from step (4)
    # 6. Multiply the sentiment score by the weight to get the new time-weighted score column

    # 1. Get the max timestamp (epoch seconds) for each analysis window and stock ticker
    # https://stackoverflow.com/questions/49241264/
    # https://stackoverflow.com/questions/62863632/
    column_list = ['analysis_window', 'analysis_date', 't_symb']
    window_spec = Window.partitionBy([col(x) for x in column_list])
    sentiment_df = sentiment_df.withColumn(
        'max_timestamp',
        spark_max(col('news_timestamp')).over(window_spec))

    # Convert from seconds from epoch to hours from epoch:
    # just divide the timestamp by 3600 (worry about taking the floor later).
    sentiment_df = sentiment_df.withColumn('max_timestamp_hours',
                                           sentiment_df.max_timestamp / 3600)
    sentiment_df = sentiment_df.drop('max_timestamp')

    # 2. Convert news_timestamp, which is seconds from epoch, to hours from epoch
    sentiment_df = sentiment_df.withColumn('news_timestamp_hours',
                                           sentiment_df.news_timestamp / 3600)
    sentiment_df = sentiment_df.drop('news_timestamp')

    # 3. Calculate the hour difference between the current row value and the max
    #    value in terms of news_timestamp hours from epoch
    sentiment_df = sentiment_df.withColumn(
        'diff_hours',
        sentiment_df.max_timestamp_hours - sentiment_df.news_timestamp_hours)
    sentiment_df = sentiment_df.drop('news_timestamp_hours')  # don't need it anymore

    # 4. Divide this difference by 3 and take the floor
    staleness_period = 3
    sentiment_df = sentiment_df.withColumn(
        'weight_denom_power',
        spark_floor(sentiment_df.diff_hours / staleness_period))
    sentiment_df = sentiment_df.drop('diff_hours')

    # Check whether the difference is ever negative and log it or throw an error,
    # because that would mean something is wrong.
    # TODO maybe add this number to the log file or throw an error
    num_negatives = sentiment_df.filter(
        sentiment_df.weight_denom_power < 0).count()

    # 5. Calculate the weight, which is 1/(2^h) where h = floor(diff_hour/3) from step (4)
    sentiment_df = sentiment_df.withColumn(
        'score_weight', 1 / (2**sentiment_df.weight_denom_power))
    sentiment_df = sentiment_df.drop('weight_denom_power')

    # 6. Multiply the sentiment score by the weight to get the new time-weighted score column
    sentiment_df = sentiment_df.withColumn(
        'weighted_score', sentiment_df.score_weight * sentiment_df.score)

    ## Weighted-average scores and final labels
    # Sum all the weighted scores and divide by the sum of the score weights,
    # i.e. take a weighted average. This operation is on rows grouped by their
    # respective analysis window and stock ticker symbol.
    # Then, instead of just **POSITIVE** and **NEGATIVE**, we add one more label,
    # **UNCERTAIN**, for scores strictly between -0.5 and 0.5.

    # Get the sum of weights and the sum of weighted scores
    sentiment_df = (sentiment_df.groupBy(
        'analysis_window', 'analysis_date', 't_symb').agg(
            spark_sum('weighted_score').alias('sum_scores'),
            spark_sum('score_weight').alias('sum_weights')))

    # Calculate the final score for each stock
    sentiment_df = sentiment_df.withColumn(
        'final_score', sentiment_df.sum_scores / sentiment_df.sum_weights)
    sentiment_df = (sentiment_df.withColumn(
        'label',
        when(sentiment_df.final_score >= 0.5, 'POSITIVE')
        .when(sentiment_df.final_score <= -0.5, 'NEGATIVE')
        .otherwise('UNCERTAIN')))

    ### 5. Store the results into DynamoDB
    # Keep only the entries that we need for the website
    # TODO add timestamp EST string
    sentiment_df = sentiment_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'label', 'final_score')

    # Cast float to Decimal
    # precision: the maximum total number of digits (default: 10)
    # scale: the number of digits to the right of the dot (default: 0)
    sentiment_df = sentiment_df.withColumn(
        'final_score',
        sentiment_df.final_score.cast(DecimalType(precision=10, scale=8)))

    # Add the most recent API-success timestamp for each stock, to be used for the
    # "Last Updated at:" entry on the website; we want to reflect the time the data
    # was pulled, not the time the analysis pipeline finished.
    timestamp_groupby = news_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'api_success_e_str').groupBy(
                                           'analysis_window', 'analysis_date',
                                           't_symb')
    max_timestamp_df = timestamp_groupby.agg(
        spark_max('api_success_e_str').alias('api_success_e_str'))

    join_on_list = ["analysis_window", "t_symb"]
    sentiment_df = sentiment_df.join(max_timestamp_df, join_on_list, "inner")

    # TODO use logger
    print('@@@ Executing Transformations')
    # Execute the transformations and collect the final dataframe in the driver
    results = sentiment_df.collect()
    results_dict_list = [row.asDict() for row in results]

    # TODO lots of error catches and logging needed!
    # TODO use logger
    print('Writing results to DynamoDB')
    dynamo.write_table(table_name=results_table,
                       data_dict_list=results_dict_list)
    # TODO use logger
    print('@@@ DONE!')
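
# A hypothetical invocation sketch for process_analysis_window; the analysis-window
# string and the DynamoDB table names are placeholders, not values from the project.
if __name__ == "__main__":
    process_analysis_window(
        analysis_window="2021-06-01-AM",   # assumed window identifier format
        data_table="news_articles",        # assumed source table name
        results_table="sentiment_results") # assumed results table name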