Example #1
def user_genre_scores(spark, ratings, movies, user_ids):
    """ Returns a dataframe containing genre scores for users matching IDs in user_ids """

    # Find ratings made by given users
    filtered_ratings = ratings.where(ratings.userId.isin(user_ids))

    # Join the filtered ratings with their associated movies
    movies_ratings = filtered_ratings.join(movies, "movieId")

    # Expand genre arrays into multiple rows
    movies_ratings = movies_ratings.withColumn(
        "genres", explode(movies_ratings.genres))\
        .withColumnRenamed("genres", "genre")

    # Find sum and count of ratings for each user
    scores = movies_ratings.groupBy('userId', 'genre').agg(
        count('rating').alias("ratingCount"),
        spark_sum('rating').alias("ratingSum"))

    # Add one 5.0 and one 0.0 rating to aggregates
    scores = scores.withColumn("ratingCount", expr("ratingCount + 2")).\
        withColumn("ratingSum", expr("ratingSum + 5"))

    # Find mean rating for "score"
    scores = scores.withColumn("score", col("ratingSum") / col("ratingCount")).\
        drop(col("ratingCount")).drop("ratingSum")

    return scores
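
A minimal usage sketch for the function above (not from the original source): the sample rows, the local SparkSession, and the user IDs are assumptions; the column names match those the function references, and the helpers come from pyspark.sql.functions with sum imported as spark_sum.

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, count, expr, col
from pyspark.sql.functions import sum as spark_sum

spark = SparkSession.builder.master("local[2]").getOrCreate()

# Hypothetical sample data with the columns user_genre_scores expects
ratings = spark.createDataFrame(
    [(1, 10, 4.0), (1, 11, 2.0), (2, 10, 5.0)],
    ["userId", "movieId", "rating"])
movies = spark.createDataFrame(
    [(10, ["Action", "Comedy"]), (11, ["Drama"])],
    ["movieId", "genres"])

# One smoothed score per (userId, genre) pair
user_genre_scores(spark, ratings, movies, [1, 2]).show()
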
Example #2
 def get_distribution(self, dataset):
     ticket_categories = dataset.select(self.getInputCols())
     aggregation_row = ticket_categories.agg(
         *[spark_sum(col) for col in self.getInputCols()]).collect()
     sums = aggregation_row[0].asDict().values()
     total = sum(sums)
     return [one / total for one in sums]
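
The same idea outside the class, as a hedged standalone sketch: sum several numeric columns in a single aggregation and normalize the sums into a distribution. The ticket_a/ticket_b columns and the rows are illustrative only.

from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum

spark = SparkSession.builder.getOrCreate()
input_cols = ["ticket_a", "ticket_b"]  # hypothetical category columns
df = spark.createDataFrame([(1, 3), (2, 4)], input_cols)

row = df.agg(*[spark_sum(c) for c in input_cols]).collect()[0]
sums = row.asDict().values()
total = sum(sums)                         # Python built-in sum over the collected totals
distribution = [s / total for s in sums]  # [0.3, 0.7] for this sample
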
Example #3
def get_home_usage(nol_usage_sdf):
    """
    :param nol_usage_sdf:
    :return:
    """

    twh_usage_sdf = nol_usage_sdf.filter('duration > 0').groupBy(
        'brand_cs_id', 'cc').agg(spark_sum('weight').alias('twh'))

    windowval = (Window.partitionBy('brand_cs_id').orderBy('cc').rangeBetween(
        Window.unboundedPreceding, 0))

    sdf_1 = nol_usage_sdf.withColumn('rw',
                                     spark_sum('value').over(windowval)).join(
                                         twh_usage_sdf, ['brand_cs_id', 'cc'])

    sdf_2 = sdf_1.withColumn('rw_twh', sdf_1['rw'] / sdf_1['twh'])

    sdf_3 = sdf_2.withColumn(
        'homeusage',
        when(sdf_2['duration'] == 0.0, 0).otherwise(
            when(sdf_2['rw_twh'] < 1.0 / 3.0, 3).otherwise(
                when(sdf_2['rw_twh'] < 2.0 / 3.0, 2).otherwise(1))))

    return sdf_3
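
The key pattern above is a running (cumulative) sum per partition using rangeBetween(Window.unboundedPreceding, 0). A self-contained sketch of just that pattern, with made-up columns:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import sum as spark_sum

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1, 10.0), ("a", 2, 20.0), ("b", 1, 5.0)],
    ["grp", "ord", "value"])

w = Window.partitionBy("grp").orderBy("ord").rangeBetween(
    Window.unboundedPreceding, 0)

# Running total of value within each grp, ordered by ord
df.withColumn("running", spark_sum("value").over(w)).show()
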
Example #4
def get_total_revenue(input_df):
    w = Window.partitionBy(col("visit_id"))
    return input_df\
                .withColumn(
                    'total_revenue_per_session',
                    when(col("is_new_session") == '1', 
                        spark_sum(when((col("body_t") == 'event') & (col("body_pa") == 'purchase'), 
                                col("body_tr")).otherwise(lit(''))
                        ).over(w)).otherwise(lit('')))
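
A simplified, hedged sketch of the conditional sum over a window used above: only rows matching a condition contribute to the per-partition total, because sum skips the nulls produced by a when without otherwise. Column names and rows are illustrative.

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, when, sum as spark_sum

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, "event", 10.0), (1, "pageview", 0.0), (2, "event", 3.0)],
    ["visit_id", "body_t", "body_tr"])

w = Window.partitionBy(col("visit_id"))

# Per-visit total of body_tr, counting only 'event' rows
df.withColumn(
    "event_revenue",
    spark_sum(when(col("body_t") == "event", col("body_tr"))).over(w)).show()
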
Example #5
def compute(df):
    """
    Parse json and sum all the random-ints
    :type df: DataFrame
    """
    schema = StructType([
        StructField("random-string", StringType(), nullable=True),
        StructField("random-int", IntegerType(), nullable=True)
    ])
    return df.select(from_json(df.value, schema).alias("js")) \
        .agg(spark_sum("js.random-int").alias("s"))
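
A self-contained way to try compute (the JSON payloads below are made up): build a DataFrame with a string column named value and pass it in. The imports also cover the names compute itself uses.

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('{"random-string": "a", "random-int": 3}',),
     ('{"random-string": "b", "random-int": 4}',)],
    ["value"])

compute(df).show()  # sums the random-int fields: 3 + 4
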
Example #6
 def calculate_prob_h_v(self, visible_activation):
     # print("begin calculate_prob_h_v")
     # v is a dataframe with 3 column (self.getUserCol(),col_name_item_index, value)
     # for each item index there is only 1 softmax has value of 1 (otherwise 0)
     # print("user count: " + str(visible_activation.select(self.getUserCol()).distinct().count()))
     prob_h_df = visible_activation.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_visible) \
         .join(broadcast(self._weight.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_weight)),
               [RBMCore.col_name_item_index]) \
         .withColumn(RBMCore.col_name_mul, self.array_dot_udf(col(RBMCore.col_name_value_visible), col(RBMCore.col_name_value_weight))) \
         .groupBy(self.getUserCol(), RBMCore.col_name_hidden_node_index) \
         .agg(spark_sum(RBMCore.col_name_mul).alias("sum")) \
         .join(broadcast(self._hidden_layer_bias.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_hidden)),
               [RBMCore.col_name_hidden_node_index]) \
         .withColumn(RBMCore.col_name_probability, self.sigmoid_double_udf(col("sum") + col(RBMCore.col_name_value_hidden)))\
         .select(self.getUserCol(), RBMCore.col_name_hidden_node_index, RBMCore.col_name_probability)
     # print("prob_h_v_df count: {}".format(prob_h_df.count()))
     # prob_h_df.show()
     # print("done calculate_prob_h_v")
     return prob_h_df
Example #7
def count_null(col_name):
    return spark_sum(col(col_name).isNull().cast('integer')).alias(col_name)
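
A typical use of a helper like count_null is to count nulls for every column in one pass; the DataFrame below is illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, None), (2, "x"), (None, "y")],
    ["a", "b"])

# One null count per column, in a single aggregation: a -> 1, b -> 1
df.agg(*[count_null(c) for c in df.columns]).show()
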
Example #8
    # it's faster to sort this data in driver
    geodata_local = sorted(geodata.collect(), key=lambda x: x[0])
    geodata_local = tuple(x["network"] for x in geodata_local), tuple(
        x["country"] for x in geodata_local)  # reshape into parallel (network, country) tuples

    geodata_bcast = sc.broadcast(geodata_local)

    def ip2country(ip):
        # Do work analogous to the Hive version; the only difference is that in Spark we can make use of shared (broadcast) variables
        import bisect
        ip_num = ip2num(ip)
        idx = bisect.bisect_left(geodata_bcast.value[0], ip_num)
        return geodata_bcast.value[1][max(idx - 1, 0)]

    return udf(ip2country, StringType())


ip2country = make_udfs()

# part 2: write simple query
top_countries = df \
    .select(col("price"), ip2country(col("ip")).alias("country"))\
    .cache()\
    .groupBy("country")\
    .agg(spark_sum(col("price")).alias("total"))\
    .orderBy(col("total").desc())\
    .limit(10)
top_countries.show()

write_to_mysql(top_countries, "spark_top_countries")
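
The snippet assumes an ip2num helper that is not shown. A hedged sketch of what such a helper might look like for IPv4 dotted-quad strings (an assumption, not the original implementation):

def ip2num(ip):
    # Hypothetical helper: convert "a.b.c.d" to its 32-bit integer value so it
    # can be compared against the sorted network boundaries in geodata_bcast.
    a, b, c, d = (int(part) for part in ip.split("."))
    return (a << 24) | (b << 16) | (c << 8) | d
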
Example #9
    def compute_gradient(self, v0_df, vk_df, ph0_df, phk_df, number_of_user):
        # print("begin compute_gradient")
        # gradient of hidden bias
        # print("compute gradient")
        # vk_df = self.spark.createDataFrame(vk_df.rdd, vk_df.schema).cache() #cloning prevent join bug
        dhb_df = ph0_df.withColumnRenamed(RBMCore.col_name_probability, RBMCore.col_name_value_0) \
            .join(phk_df.withColumnRenamed(RBMCore.col_name_probability, RBMCore.col_name_value_k),
                  [self.getUserCol(), RBMCore.col_name_hidden_node_index]) \
            .withColumn(RBMCore.col_name_value, (col(RBMCore.col_name_value_0) - col(RBMCore.col_name_value_k)) / number_of_user) \
            .groupBy(RBMCore.col_name_hidden_node_index) \
            .agg(spark_sum(RBMCore.col_name_value).alias("sum_" + RBMCore.col_name_value)) \
            .withColumnRenamed("sum_" + RBMCore.col_name_value, RBMCore.col_name_value) \
            .select(RBMCore.col_name_hidden_node_index, RBMCore.col_name_value)
        # dhb_df.printSchema()
        # dhb_df.show()
        # gradient of visible bias
        dvb_df = v0_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_0)\
            .join(vk_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_value_k),
            [self.getUserCol(), RBMCore.col_name_item_index]) \
            .withColumn(RBMCore.col_name_value, self.array_minus_long_udf(col(RBMCore.col_name_value_0), col(RBMCore.col_name_value_k)))\
            .withColumn(RBMCore.col_name_value, self.array_div_udf(col(RBMCore.col_name_value), lit(number_of_user))) \
            .groupBy(RBMCore.col_name_item_index) \
            .agg(collect_list(RBMCore.col_name_value).alias("list_" + RBMCore.col_name_value))\
            .withColumn(RBMCore.col_name_value, self.array_sum_array_udf(col("list_" + RBMCore.col_name_value)))\
            .select(RBMCore.col_name_item_index, RBMCore.col_name_value)
        # dvb_df.printSchema()
        # dvb_df.show()
        # gradient of weight
        weight_0 = v0_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_activation) \
            .join(ph0_df, [self.getUserCol()]) \
            .withColumn(RBMCore.col_name_value_0, self.array_multiply_udf(col(RBMCore.col_name_activation), col(RBMCore.col_name_probability))) \
            .groupBy(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index) \
            .agg(collect_list(RBMCore.col_name_value_0).alias("list_" + RBMCore.col_name_value)) \
            .withColumn(RBMCore.col_name_value_0, self.array_sum_array_udf(col("list_" + RBMCore.col_name_value))) \
            .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value_0) \
            # .repartition(ncore, [col_name_item_index, col_name_soft_max_unit_index])

        # weight_0.printSchema()
        # weight_0.cache()
        # weight_0.show()
        weight_k = vk_df.withColumnRenamed(RBMCore.col_name_value, RBMCore.col_name_activation) \
            .join(phk_df, [self.getUserCol()]) \
            .withColumn(RBMCore.col_name_value_k, self.array_multiply_udf(col(RBMCore.col_name_activation), col(RBMCore.col_name_probability))) \
            .groupBy(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index) \
            .agg(collect_list(RBMCore.col_name_value_k).alias("list_" + RBMCore.col_name_value)) \
            .withColumn(RBMCore.col_name_value_k, self.array_sum_array_udf(col("list_" + RBMCore.col_name_value)))\
            .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value_k) \
            # .repartition(ncore, [col_name_item_index, col_name_soft_max_unit_index])

        # weight_k.printSchema()
        # weight_k.cache()
        # weight_k.show()
        dw_df = weight_0.join(broadcast(weight_k),
                              [RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index]) \
            .withColumn(RBMCore.col_name_value, self.array_minus_double_udf(col(RBMCore.col_name_value_0), col(RBMCore.col_name_value_k))) \
            .withColumn(RBMCore.col_name_value,
                        self.array_div_udf(col(RBMCore.col_name_value), lit(number_of_user))) \
            .select(RBMCore.col_name_item_index, RBMCore.col_name_hidden_node_index, RBMCore.col_name_value)
        # dw_df.printSchema()
        # dw_df.show()
        # print("done compute_gradient")
        return dw_df, dvb_df, dhb_df
Example #10
def process_analysis_window(analysis_window, data_table, results_table):
    """
    Process a given analysis window using Spark for the following steps:

    1. Load the news article data from DynamoDB
    2. Start a Spark session if it does not exist
    3. Parallelize the data to Spark Nodes
    4. Process the data from raw text to final sentiment score and label
    5. Store the results into DynamoDB    
    """
    ### 1. Load the news article data from DynamoDB
    dynamo = DynamoDBHelper()

    news_data = dynamo.read_table(table_name=data_table,
                                  target_analysis_window=analysis_window)

    # TODO use logger
    print('@@@Loaded DynamoDB table')

    ### 2. Start Spark session

    # TODO maybe move this to main and make the spark context an argument
    # to this function, that way the spark context can be initialized
    # based on commandline arguments such as --local_test
    # if local_test == True:
    #     spark = (SparkSession.builder
    #                         .appName("SparkTest") # Set app name
    #                         .master("local[2]") # Run locally with 2 cores
    #                         .config("spark.driver.memory", "4g")
    #                         .config("spark.executor.memory", "3g")
    #                         .getOrCreate())

    spark = (
        SparkSession.builder.appName("SparkTest")  # Set app name
        .getOrCreate())

    # TODO use logger
    print('@@@Started Spark Session')
    print('@@@ Configurations @@@')
    print(spark.sparkContext._conf.getAll())

    ### 3. Parallelize the data to Spark Nodes

    # Convert list to RDD
    news_rdd = spark.sparkContext.parallelize(news_data)

    # Create data frame
    news_df = spark.createDataFrame(news_rdd)

    ### 4. Process the data from raw text to final sentiment score and label

    # TODO use logger
    print('@@@ Initializing Sentiment UDF')

    # Add Sentiment Analysis Pipeline as a UDF to spark
    s_pipe = SentimentAnalysisPipeline()

    udf_schema = StructType([
        StructField("label", StringType(), nullable=False),
        StructField("score", FloatType(), nullable=False)
    ])

    s_pipe_udf = udf(s_pipe.raw_text_to_sentiment, udf_schema)

    # TODO use logger
    print('@@@ Defining transformations')

    # Start defining spark transformations, note that these
    # transformations are lazily evaluated so they are executed
    # only at the end when an action is triggered.

    # Run all news titles through the sentiment pipeline
    # TODO switch to full article analysis with summarizer later on.
    # Drop nulls, most likely there won't be any for titles
    title_df = news_df.select('analysis_window', 'analysis_date', 't_symb',
                              'news_timestamp', 'news_title').na.drop()

    sentiment_df = title_df.withColumn('sentiment',
                                       s_pipe_udf(title_df['news_title']))

    # Subset columns
    sentiment_df = sentiment_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'news_timestamp',
                                       'news_title', 'sentiment.label',
                                       'sentiment.score')
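    # (Selecting 'sentiment.label' and 'sentiment.score' flattens the struct
    #  returned by the UDF into top-level columns named 'label' and 'score'.)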

    ## Final Label Calculation:
    # We want a final label of either **POSITIVE**, **NEGATIVE**, or **UNCERTAIN**.

    # We will use a somewhat naive and simple approach to calculating sentiment through averaging.

    # Criteria for final score:
    # * The final score should be between -1 and 1.
    # * The older the news, the less important it is; scores are weighted exponentially
    # less every 3 hours from the most recent news.
    # * E.g. the most recent news has a weight of 1, news 3 hours away from
    # the MAX timestamp has a weight of 0.5, 6 hours away has a weight of 0.25, and so on.
    # * Any score between -0.7 and 0.7 (exclusive) is labelled UNCERTAIN
    # * Scores that are 0.7 or greater are labelled POSITIVE
    # * Scores that are -0.7 or less are labelled NEGATIVE

    ## Scores are bounded between -1 and 1
    # If the label is NEGATIVE, make the score value negative.
    # TODO make this line a little nicer to read.
    sentiment_df = sentiment_df.withColumn(
        'score', (when(sentiment_df.label == 'NEGATIVE',
                       -sentiment_df.score).otherwise(sentiment_df.score)))

    ## Old news weigh less
    # The older the news, the less important it is, scores are weighted
    # exponentially less every 3 hours from the most recent timestamp in the analysis window.

    # Calculate weight factor
    # Since we want the weight to be halved every 3 hours, the weight is basically
    # 1 / (2^h) where h is the hours away from max divided by 3 and rounded down [h = floor(diff_hour/3)]
    # e.g. a 5-hour difference from the MAX timestamp means h = floor(5/3) = 1 and weight = 1/(2^1) = 1/2

    # Spark transformations needed:
    # 1. Get the latest (max) timestamp of news articles in each analysis window and stock
    # 2. Convert news_timestamp, which is seconds from epoch, to hours from epoch.
    # 3. Calculate the hour difference between the current row value and the max value
    #    in terms of news_timestamp hours from epoch.
    # 4. Divide this difference by 3 and take the floor
    # 5. Calculate the weight, which is 1/(2^h) where h = floor(diff_hour/3); h was calculated in step (4)
    # 6. Multiply the sentiment score by the weight to get the new time-weighted score column

    # 1. Get max timestamp (epoch seconds) for each analysis window and stock ticker
    # https://stackoverflow.com/questions/49241264/
    # https://stackoverflow.com/questions/62863632/

    column_list = ['analysis_window', 'analysis_date', 't_symb']
    window_spec = Window.partitionBy([col(x) for x in column_list])
    sentiment_df = sentiment_df.withColumn(
        'max_timestamp',
        spark_max(col('news_timestamp')).over(window_spec))
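    # (Since window_spec has no ordering or frame, the max is taken over the whole
    #  partition and repeated on every row: e.g. if one window/ticker group has
    #  news at epoch seconds 1000, 4000 and 7000, each of its rows gets
    #  max_timestamp = 7000. The values here are assumed, for illustration only.)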

    # Convert from seconds from epoch to hours from epoch.
    # Just divide the timestamp by 3600 seconds to get the number of hours since epoch.
    # (Worry about taking the floor later)
    sentiment_df = sentiment_df.withColumn('max_timestamp_hours',
                                           sentiment_df.max_timestamp / 3600)
    sentiment_df = sentiment_df.drop('max_timestamp')

    # 2. Convert news_timestamp, which is seconds from epoch, to hours from epoch.
    sentiment_df = sentiment_df.withColumn('news_timestamp_hours',
                                           sentiment_df.news_timestamp / 3600)
    sentiment_df = sentiment_df.drop('news_timestamp')

    # 3. Calculate the hour difference between the current row value and the max value in terms of
    #    news_timestamp hours from epoch.

    sentiment_df = sentiment_df.withColumn(
        'diff_hours',
        sentiment_df.max_timestamp_hours - sentiment_df.news_timestamp_hours)
    sentiment_df = sentiment_df.drop(
        'news_timestamp_hours')  # don't need it anymore

    # 4. Divide this difference by 3 and get the floor
    staleness_period = 3
    sentiment_df = sentiment_df.withColumn(
        'weight_denom_power',
        spark_floor(sentiment_df.diff_hours / staleness_period))
    sentiment_df = sentiment_df.drop('diff_hours')

    # Check whether any difference is negative and throw an error or log it, because something is wrong.
    # TODO maybe add this number to log file or throw error
    num_negatives = sentiment_df.filter(
        sentiment_df.weight_denom_power < 0).count()

    # 5. Calculate the weight, which is 1/(2^h) where h = floor(diff_hour/3); h was calculated in step (4)
    sentiment_df = sentiment_df.withColumn(
        'score_weight', 1 / (2**sentiment_df.weight_denom_power))
    sentiment_df = sentiment_df.drop('weight_denom_power')

    # 6. Multiply the sentiment score by the weight to get the new time weighted score column
    sentiment_df = sentiment_df.withColumn(
        'weighted_score', sentiment_df.score_weight * sentiment_df.score)

    ## Weighted average scores and change the labels

    # First we will sum all the weighted scores and divide it by the sum of
    # the score weights i.e. get a weighted average. This operation will be on
    # rows grouped by their respective analysis window and stock ticker symbol.

    # Then, instead of just **positive** and **negative**, we want one more
    # label called **uncertain** which is for scores less than 0.7
    # for either positive or negative.

    # Get sum of weights and sum of weighted scores
    sentiment_df = (sentiment_df.groupBy(
        'analysis_window', 'analysis_date', 't_symb').agg(
            spark_sum('weighted_score').alias('sum_scores'),
            spark_sum('score_weight').alias('sum_weights')))

    # Calculate final score for each stock
    sentiment_df = sentiment_df.withColumn(
        'final_score', sentiment_df.sum_scores / sentiment_df.sum_weights)

    sentiment_df = (sentiment_df.withColumn(
        'label',
        when(sentiment_df.final_score >= 0.5,
             'POSITIVE').when(sentiment_df.final_score <= -0.5,
                              'NEGATIVE').otherwise('UNCERTAIN')))

    ### 5. Store the results into DynamoDB

    # Keep only entries that we need for the website
    # TODO add timestamp EST string
    sentiment_df = sentiment_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'label', 'final_score')

    # Cast float to Decimal
    # precision: the maximum total number of digits (default: 10)
    # scale: the number of digits to the right of the decimal point. (default: 0)
    sentiment_df = sentiment_df.withColumn(
        'final_score',
        sentiment_df.final_score.cast(DecimalType(precision=10, scale=8)))

    # Add the most recent API success timestamp for each stock, to be used for
    # "Last Updated at:" entry on the website because we want to reflect that the sentiments
    # are based on the time the data was pulled not the time the analysis pipeline finished.
    timestamp_groupby = news_df.select('analysis_window', 'analysis_date',
                                       't_symb', 'api_success_e_str').groupBy(
                                           'analysis_window', 'analysis_date',
                                           't_symb')
    max_timestamp_df = timestamp_groupby.agg(
        spark_max('api_success_e_str').alias('api_success_e_str'))

    join_on_list = ["analysis_window", "t_symb"]
    sentiment_df = sentiment_df.join(max_timestamp_df, join_on_list, "inner")

    # TODO use logger
    print('@@@ Executing Transformations')

    # Execute transformations and collect final dataframe in Driver
    results = sentiment_df.collect()

    results_dict_list = [row.asDict() for row in results]

    # TODO lots of error catches and logging needed!
    # TODO use logger
    print('Writing results to DynamoDB')
    dynamo.write_table(table_name=results_table,
                       data_dict_list=results_dict_list)

    # TODO use logger
    print('@@@ DONE!')