Example #1
 def test_avg(self):
     data = [('Benny', 86), ('Jenny', 77), ('Oscar', 55), ('Scarlett', 89)]
     df = self.spark.createDataFrame(data)
     df = df.withColumnRenamed('_1', 'name').withColumnRenamed('_2', 'marks')
     
     # collect() returns a list of Rows; [0][0] extracts the single aggregated value
     self.assertEqual(76.75, df.select(avg(df.marks)).collect()[0][0])
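# The test above assumes `avg` is imported from pyspark.sql.functions and that
# `self.spark` is a SparkSession created by the test fixture. A minimal sketch of
# that scaffolding (the class and app names are illustrative, not from the original):
import unittest
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

class DataFrameAvgTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.spark = SparkSession.builder.master('local[2]').appName('avg-test').getOrCreate()

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()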
Example #2
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)
            # Convert RDD[String] to RDD[Row] to DataFrame
            parts = rdd.map(lambda line: line.split(","))
            delays_rdd = parts.map(lambda w: Row(carrier=w[0], origin=w[1], delay=float(w[2])))
            delays = sqlContext.createDataFrame(delays_rdd, samplingRatio=1)

            avg_delays = delays.groupBy("origin", "carrier").agg(F.avg(delays.delay).alias('average'))

            avg_delays.write.format("org.apache.spark.sql.cassandra").\
                options(table="task2_part2_group2_1", keyspace="mykeyspace").\
                save(mode="append")

            # Register as table
            #dataFrame.registerTempTable("origin_carrier_delays")
            # Do word count on table using SQL and print it
            #carrier_delays_df = \
            #    sqlContext.sql("SELECT origin, carrier, avg(delay) AS average FROM origin_carrier_delays GROUP BY origin, carrier")
            #carrier_delays_df.registerTempTable("origin_carrier_avg_delays")
            #carrier_avg_delays_df = \
            #    sqlContext.sql("SELECT origin, carrier, avg_delay FROM origin_carrier_avg_delays GROUP BY origin ORDER BY avg_delay LIMIT 10")
            #for i in carrier_delays_df.rdd.takeOrderedByKey(10, sortValue=lambda x: x[2], reverse=False).map(lambda x: x[1]).collect():
            #    print (i)
            #dataFrame.select("origin", "carrier", "delay").write \
            #carrier_delays_df.write \
            #    .format("org.apache.spark.sql.cassandra") \
            #    .options( table = "task2_part2_group2_1", keyspace = "mykeyspace") \
            #    .save(mode="append")
            #carrier_delays_df.show()
        except Exception as e:
            print(e)
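# The handler above relies on getSqlContextInstance(), a lazily created singleton so the
# same SQLContext is reused across micro-batches. A minimal sketch of that helper,
# following the pattern from the Spark Streaming programming guide:
from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']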
Example #3
 def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
     df = self.entity.groupBy(keyFields)
     agg = self.options.get("aggregation",self.getDefaultAggregation(handlerId))
     maxRows = int(self.options.get("rowCount","100"))
     numRows = min(maxRows,df.count())
     valueLists = []
     for valueField in valueFields:
         valueDf = None
         if agg == "SUM":
             valueDf = df.agg(F.sum(valueField).alias("agg"))
         elif agg == "AVG":
             valueDf = df.agg(F.avg(valueField).alias("agg"))
         elif agg == "MIN":
             valueDf = df.agg(F.min(valueField).alias("agg"))
         elif agg == "MAX":
             valueDf = df.agg(F.max(valueField).alias("agg"))
         else:
             valueDf = df.agg(F.count(valueField).alias("agg"))
         for keyField in keyFields:
             valueDf = valueDf.sort(F.col(keyField).asc())
         valueDf = valueDf.dropna()
         rows = valueDf.select("agg").take(numRows)
         valueList = []
         for row in rows:
             valueList.append(row["agg"])
         valueLists.append(valueList)
     return valueLists   
Example #4
def process_ratings(time, rdd):
    print "============== %s ============" % str(time)
    #
    # ts = now()
    # print "TIME AS now(): {}".format(ts)

    local_sql = getSqlContextInstance(rdd.context)
    from datetime import datetime
    ts = datetime.now()

    # from pyspark.sql.types import *
    # schema = StructType([
    #     StructField("user_id", IntegerType(), True),
    #     StructField("movie_id", IntegerType(), True),
    #     StructField("rating", FloatType(), True),
    #     StructField("timestamp")
    #     ]
    #     )

    ratings = rdd.map(lambda line: line.split("::"))
    row_rdd = ratings.map(lambda (user_id, movie_id, rating, timestamp):
                          Row(movie_id=int(movie_id), user_id=int(user_id),
                              rating=float(rating), ts=ts))


    ratings = local_sql.createDataFrame(row_rdd, samplingRatio=1)
    # ratings.show()
    # df.registerTempTable("ratings")

    # I want to get the average rating, and count of the number of ratings for each movie and persist it to cassandra
    from pyspark.sql import functions as F
    # movie_ids = ratings.select("movie_id").distinct()
    # movie_ids.show()

    # create table movie_ratings_time_series ( movie_id int, ts timeuuid, rating float, primary key (movie_id, ts) );

    avg_ratings = ratings.groupBy("movie_id", "ts").agg(F.avg(ratings.rating).alias('rating'))

    avg_ratings.write.format("org.apache.spark.sql.cassandra").\
                options(table="movie_ratings_time_series", keyspace="training").\
                save(mode="append")

    # writer("movie_ratings_time_series", avg_ratings)

    # movie_to_ts = local_sql.sql("select distinct movie_id, ts from ratings")
    # movie_to_ts.registerTempTable("movie_ts")

    # going to join this against itself
    # agg = local_sql.sql("SELECT movie_id, avg(rating) as a, count(rating) as c from ratings group by movie_id")
    # agg.registerTempTable("movie_aggregates")

    # matched = local_sql.sql("select a.movie_id, b.ts, a.a, a.c from movie_aggregates a join movie_ts b on a.movie_id = b.movie_id  ")

    # writer(matched, "movie_stream_ratings")

    print "========== DONE WRITING ============== "
def process_ratings(time, rdd):
    if (rdd.isEmpty()):
        print "============== RDD Is Empty. Give it a few moments to get the stream.  You started the stream right?"
        return

    print "============== %s ============" % str(time)

    import time
    ts = time.time()

    ratings = rdd.map(lambda line: line.split("::"))
    row_rdd = ratings.map(
        lambda (user_id, movie_id, rating, timestamp):
        Row( movie_id=int(movie_id), user_id=int(user_id), rating=float(rating), timestamp=int(timestamp) )
        )


    local_sql = getSqlContextInstance(rdd.context)
    ratings = local_sql.createDataFrame(row_rdd, samplingRatio=1)
    ratings.show()

    # Save dataFrame to rating_by_movie
    #writer('rating_by_movie', ratings)

    #ratings.registerTempTable("ratings")

    # I want to get the average rating, and count of the number of ratings for each movie and persist it to cassandra
    #movie_ids = ratings.select("movie_id").distinct()
    #movie_ids.show()

    # create table movie_ratings_time_series ( movie_id int, ts timeuuid, rating float, primary key (movie_id, ts) );

    from pyspark.sql import functions as F
    avg_ratings = ratings.groupBy("movie_id", "timestamp").agg(F.avg(ratings.rating).alias('rating'))
    avg_ratings.show()

    #avg_ratings.write.format("org.apache.spark.sql.cassandra").\
    #            options(table="movie_ratings_time_series", keyspace="training").\
    #            save(mode="append")

    # writer("movie_ratings_time_series", avg_ratings)

    # movie_to_ts = local_sql.sql("select distinct movie_id, ts from ratings")
    # movie_to_ts.registerTempTable("movie_ts")

    # going to join this against itself
    # agg = local_sql.sql("SELECT movie_id, avg(rating) as a, count(rating) as c from ratings group by movie_id")
    # agg.registerTempTable("movie_aggregates")

    # matched = local_sql.sql("select a.movie_id, b.ts, a.a, a.c from movie_aggregates a join movie_ts b on a.movie_id = b.movie_id  ")

    # writer(matched, "movie_stream_ratings")

    print "============== DONE WRITING ============== "
Example #6
    def handleUIOptions(self, displayColName):
        agg = self.options.get("aggregation")
        valFields = self.options.get("valueFields")

        if agg == 'COUNT':
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
        elif agg == 'SUM':
            return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
        elif agg == 'AVG':
            return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
        elif agg == 'MIN':
            return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
        elif agg == 'MAX':
            return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
        elif agg == 'MEAN':
            return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
        else:
            return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
def match_accidents_with_roads(spark, road_df, accident_df, use_cache=True):
    cache_path = workdir + 'data/matches_accident-road.parquet'
    if isdir(cache_path) and use_cache:
        print('Reading accident-road matches from cache...')
        return spark.read.parquet(cache_path)

    nb_top_road_center_preselected = 5
    max_distance_accepted = 10  # in meters

    # Compute distance between accident and road centers to identify the
    # top nb_top_road_center_preselected closest roads
    road_centers = (road_df.select(['street_id', 'center_long',
                                    'center_lat']).drop_duplicates())

    acc_window = (
        Window.partitionBy("accident_id").orderBy("distance_measure"))
    accidents_top_k_roads = (accident_df.select(
        'loc_lat', 'loc_long',
        'accident_id').crossJoin(road_centers).withColumn(
            'distance_inter',
            distance_intermediate_formula('loc_lat', 'loc_long', 'center_lat',
                                          'center_long')
        ).withColumn('distance_measure', distance_measure()).select(
            'accident_id', 'street_id', 'distance_measure', 'loc_lat',
            'loc_long',
            rank().over(acc_window).alias('distance_rank')).filter(
                col('distance_rank') <= nb_top_road_center_preselected).drop(
                    'distance_measure', 'distance_rank').persist())

    # For each accident identify road point closest
    accidents_roads_first_match = (accidents_top_k_roads.join(
        road_df, 'street_id').withColumn(
            'distance_inter',
            distance_intermediate_formula(
                'loc_lat', 'loc_long', 'coord_lat', 'coord_long')).withColumn(
                    'distance_measure', distance_measure()).select(
                        'accident_id', 'loc_lat', 'loc_long', 'coord_lat',
                        'coord_long', 'street_id', 'street_name',
                        row_number().over(acc_window).alias('distance_rank'),
                        'distance_measure').filter(
                            col('distance_rank') == 1).withColumn(
                                'distance',
                                col('distance_measure') *
                                (6371 * 2 * 1000)).drop(
                                    'distance_rank', 'distance_measure',
                                    'coord_lat', 'coord_long').persist())

    # If the distance is lower than max_distance_accepted we keep the
    # accident/street matches
    accidents_road_correct_match = (accidents_roads_first_match.filter(
        col('distance') < max_distance_accepted).select(
            'accident_id', 'street_id'))

    # If not, we try to get a better match by adding intermediate points on
    # the preselected streets
    # For unsatisfying matches, recompute the k closest roads
    # Recomputing is probably faster than reading from disk
    # cache + joining on accident_ids
    accidents_close_streets_coords = \
        (accidents_roads_first_match
         .filter(col('distance') >= max_distance_accepted)
         .select('accident_id', 'loc_lat', 'loc_long')
         .crossJoin(road_centers)
         .withColumn('distance_inter',
                     distance_intermediate_formula(
                                    'loc_lat',
                                    'loc_long',
                                    'center_lat',
                                    'center_long'))
         .withColumn('distance_measure',
                     distance_measure())
         .select('accident_id', 'street_id',
                 'distance_measure', 'loc_lat', 'loc_long',
                 rank().over(acc_window)
                 .alias('distance_rank'))
         .filter(col('distance_rank') <=
                 nb_top_road_center_preselected)
         .drop('distance_measure', 'distance_rank')
         .join(
             road_df.select('street_id', 'coord_lat', 'coord_long'),
             'street_id'))

    # Add the intermediate points
    street_rolling_window = (
        Window.partitionBy('street_id').orderBy("coord_long").rowsBetween(
            0, +1))
    accidents_close_streets_with_additional_coords = \
        (accidents_close_streets_coords
         .select('accident_id', 'street_id', 'loc_lat', 'loc_long',
                 avg('coord_long')
                 .over(street_rolling_window)
                 .alias('coord_long'),
                 avg('coord_lat')
                 .over(street_rolling_window)
                 .alias('coord_lat'))
         .union(accidents_close_streets_coords)
         .dropDuplicates())
    accidents_close_streets_coords.unpersist()

    # Recompute distances between accident and new set of points
    # and use closest point to identify street
    accidents_roads_first_match_with_additional_coords = \
        (accidents_close_streets_with_additional_coords
         .withColumn('distance_inter', distance_intermediate_formula(
                                                      'loc_lat',
                                                      'loc_long',
                                                      'coord_lat',
                                                      'coord_long'))
         .withColumn('distance_measure', distance_measure())
         .select('accident_id', 'street_id', 'loc_lat', 'loc_long',
                 'coord_lat', 'coord_long',
                 row_number().over(acc_window).alias('distance_rank'))
         .filter(col('distance_rank') == 1)
         .drop('distance_rank', 'loc_lat', 'loc_long',
               'coord_lat', 'coord_long'))

    # Union accidents matched correctly with first method with the accidents
    # for which we used more street points
    final_match = (accidents_road_correct_match.union(
        accidents_roads_first_match_with_additional_coords))

    # Make sure there is only one road per accident
    final_match = (final_match.join(road_centers, 'street_id').join(
        accident_df.select('loc_lat', 'loc_long', 'accident_id'),
        'accident_id').withColumn(
            'distance_inter',
            distance_intermediate_formula(
                'loc_lat', 'loc_long',
                'center_lat', 'center_long')).withColumn(
                    'distance_measure', distance_measure()).withColumn(
                        'dist_rank',
                        row_number().over(acc_window)).filter(
                            col('dist_rank') == 1).select(
                                'accident_id', 'street_id'))

    return final_match
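# match_accidents_with_roads() relies on two helpers not shown in this fragment:
# distance_intermediate_formula() and distance_measure(). Since the final distance is
# obtained as distance_measure * (6371 * 2 * 1000), they look like the two halves of a
# haversine computation. A hedged sketch (column handling assumed from the call sites):
from pyspark.sql.functions import asin, col, cos, radians, sin, sqrt

def distance_intermediate_formula(lat1, long1, lat2, long2):
    # haversine "h" term built from latitude/longitude columns given in degrees
    return (sin(radians(col(lat1) - col(lat2)) / 2) ** 2
            + cos(radians(col(lat1))) * cos(radians(col(lat2)))
            * sin(radians(col(long1) - col(long2)) / 2) ** 2)

def distance_measure():
    # asin(sqrt(h)); multiplying by twice the earth radius gives the distance
    return asin(sqrt(col('distance_inter')))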
Example #8
# imports used below (not shown in the original fragment; validate_params and
# create_spark_session are helpers defined elsewhere in the project)
import pyspark.sql.functions as F
from pyspark.sql.functions import from_json, year, month, dayofmonth, hour, minute
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType


def start_stream(args):
    validate_params(args)
    _, brokers, topic = args

    spark = create_spark_session()

    json = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", brokers) \
        .option("subscribe", topic) \
        .load()

    json.printSchema()

    # Explicitly set schema
    schema = StructType([StructField("symbol", StringType(), False),
                         StructField("timestamp", TimestampType(), False),
                         StructField("price", DoubleType(), False)])

    json_options = {"timestampFormat": "yyyy-MM-dd'T'HH:mm'Z'"}
    stocks_json = json \
        .select(from_json(F.col("value").cast("string"), schema, json_options).alias("content"))

    stocks_json.printSchema()

    stocks = stocks_json.select("content.*")

    ####################################
    # Stream to Parquet
    ####################################
    query = stocks \
        .withColumn('year', year(F.col('timestamp'))) \
        .withColumn('month', month(F.col('timestamp'))) \
        .withColumn('day', dayofmonth(F.col('timestamp'))) \
        .withColumn('hour', hour(F.col('timestamp'))) \
        .withColumn('minute', minute(F.col('timestamp'))) \
        .writeStream \
        .format('parquet') \
        .partitionBy('year', 'month', 'day', 'hour', 'minute') \
        .option('startingOffsets', 'earliest') \
        .option('checkpointLocation', '/dataset/checkpoint') \
        .option('path', '/dataset/streaming.parquet') \
        .trigger(processingTime='30 seconds') \
        .start()

    avg_pricing = stocks \
        .groupBy(F.col("symbol")) \
        .agg(F.avg(F.col("price")).alias("avg_price"))

    ####################################
    # Console Output
    ####################################
    query2 = avg_pricing.writeStream \
        .outputMode('complete') \
        .format("console") \
        .trigger(processingTime="10 seconds") \
        .start()

    ####################################
    # Table in Memory
    ####################################
    # query3 = avg_pricing \
    #     .writeStream \
    #     .queryName("avgPricing") \
    #     .outputMode("complete") \
    #     .format("memory") \
    #     .trigger(processingTime="10 seconds") \
    #     .start()
    #
    # while True:
    #     print('\n' + '_' * 30)
    #     # interactively query in-memory table
    #     spark.sql('SELECT * FROM avgPricing').show()
    #     print(query3.lastProgress)
    #     sleep(10)

    query2.awaitTermination()
    pass
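    # Note (sketch): with two concurrent queries, one could instead block on every active
    # query in the session via the StreamingQueryManager:
    # spark.streams.awaitAnyTermination()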
Example #9
extended_trips = trip_data \
    .withColumn("pick_date", f.to_date(trip_data["lpep_pickup_datetime"])) \
    .withColumn("pick_hour", f.hour(trip_data["lpep_pickup_datetime"]))\
    .withColumn("drop_date", f.to_date(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("drop_hour", f.hour(trip_data["lpep_dropoff_datetime"])) \
    .withColumn("duration", f.unix_timestamp(trip_data["lpep_dropoff_datetime"]) - f.unix_timestamp(trip_data["lpep_pickup_datetime"]))
extended_trips = extended_trips.filter((trip_data["lpep_pickup_datetime"] > '2020-01-01 00:00:00'))

hourly_taxi_trips = extended_trips \
    .groupBy("pick_date", "pick_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )
# hourly_taxi_trips.write.mode("overwrite").parquet("./values/taxi-trips-hourly")

hourly_taxi_trips_drop = extended_trips \
    .groupBy("drop_date", "drop_hour").agg(
        f.count(extended_trips["fare_amount"]).alias("trip_count"),
        f.sum(extended_trips["passenger_count"]).alias("passenger_count"),
        f.sum(extended_trips["fare_amount"]).alias("fare_amount"),
        f.sum(extended_trips["tip_amount"]).alias("tip_amount"),
        f.sum(extended_trips["total_amount"]).alias("total_amount"),
        f.avg(extended_trips["duration"]).alias("avg_duration")
    )

daily_taxi_trips = hourly_taxi_trips.groupBy("pick_date").agg(
    f.sum(hourly_taxi_trips["trip_count"]).alias("trip_count")
)
# imports used below (MainApp and sqlContext are assumed to be defined elsewhere in the project)
import os
import matplotlib.pyplot as plt
import pylab
from pyspark.sql import functions as func

app = MainApp()
app.init()
app.loadData()
app.createCheckInDataPerUser()

df_userLocs = app.user_locations
df_businessLocs =  app.df_business

df_userFeatures = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_reviews  = sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json")
df_business = sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json")

df_finalBusiness = df_business.join(df_businessLocs,df_business.business_id == df_businessLocs.business_id).select(df_business.business_id,df_business.stars)


df_joinBusinessLocsAndReviews = df_businessLocs.join(df_reviews,df_businessLocs.business_id == df_reviews.business_id).select(df_reviews.user_id,df_reviews.business_id,df_reviews.stars)


df_finalUsersBusinessRating =  df_joinBusinessLocsAndReviews.join(df_userLocs,df_userLocs.user_id == df_joinBusinessLocsAndReviews.user_id).select(df_joinBusinessLocsAndReviews.business_id,df_joinBusinessLocsAndReviews.stars).groupBy("business_id").agg(func.avg("stars").alias('avg_rating'))

df = df_finalUsersBusinessRating.join(df_finalBusiness, df_finalUsersBusinessRating.business_id == df_finalBusiness.business_id).select(df_finalUsersBusinessRating.business_id, "stars","avg_rating")

pdf = df.toPandas()

pdf.plot(x='business_id',y='avg_rating',color='y',label='avg_rating_by_users')
pdf.plot(x='business_id',y='stars',color='r',label='business_rating')
plt.legend(loc='lower left', fontsize=20)
pylab.show()
a = python_ratings.join(python_users, python_ratings['user_id']==python_users['user_id'],'inner')\
    .drop(python_users['user_id'])
movie_lens_joined = a.join(python_movie_data, a['movie_id']==python_movie_data['movie_id'], 'inner')\
    .drop(python_movie_data['movie_id'])

#First actual output (user count of new joined dataframe)
print "Record Count of New Movie Lens (Joined) table is: ", movie_lens_joined.count()
print '\n'

#Analysis Piece #3 - Aggregation of ratings to rescale them by occupation

#Extra imports
from pyspark.sql import functions as spfun

#avg rating computation
avgs = movie_lens_joined.groupby('user_id').agg(spfun.avg('rating')\
                                               .alias('avg_rating'))
#join again with initial
final_avgs = movie_lens_joined.join(avgs, movie_lens_joined['user_id']==avgs['user_id'])\
                                   .drop(avgs['user_id'])

#final column for new rescaled ratings by occupation
df = final_avgs.withColumn('rescaled_rating', final_avgs['rating'] - final_avgs['avg_rating'])

#Analysis Piece #4 - Plot rescaled ratings by occupation
matplotlib.style.use('ggplot')

#Spark Dataframe
stats = df.groupby('occupation').avg('rescaled_rating').toPandas()

#Pandas Dataframe
stats.columns = ['occupation', 'rescaled_rating']
Example #12
# array_contains(): check whether an array column contains a given value

# first() last()
from pyspark.sql.functions import first, last
df.select(first("name"),last("salary"))

# min(), max()
from pyspark.sql.functions import min, max
df.select(min("salary"),max("salary")).show()

from pyspark.sql.functions import sum
df.select(sum("salary")).show()

#sumDistinct()
from pyspark.sql.functions import avg
df.select(avg("salary")).show()
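# The "#sumDistinct()" heading above suggests a distinct-sum example was intended; a sketch
# (sumDistinct was renamed sum_distinct in Spark 3.2, but the old name still works):
from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("salary")).show()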


"""自定義函數"""
def toFormat(s):
    return str(s).split(",")[0].replace("[","").replace("'","")

toFormat=udf(toFormat, StringType())
df.withColumn('words',toFormat('keywords')).select("words").show()


"""cache"""
DF1.cache()
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
Example #13
trunc_df = yelp_df.filter("review_count>=10 and open = 'True'").groupBy("state").count()

trunc_df.orderBy(desc("count")).collect()

###################

# source data: /usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv
logs_df = sqlCtx.load(source="com.databricks.spark.csv",header = 'true',inferSchema = 'true',path ='index_data_http.csv')
sc._jsc.hadoopConfiguration().set('textinputformat.record.delimiter','\r\n')
from pyspark.sql.functions import asc, desc
logs_df.groupBy("code").count().orderBy(desc("count")).show()
logs_df.groupBy("code").avg("bytes").show()
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(logs_df.code,F.avg(logs_df.bytes),F.min(logs_df.bytes),F.max(logs_df.bytes)).show()

###########################################
yelp_df = sqlCtx.load(source='com.databricks.spark.csv',header = 'true',inferSchema = 'true',path ='index_data.csv')
yelp_df.registerTempTable("yelp")
filtered_yelp = sqlCtx.sql("SELECT * FROM yelp WHERE useful >= 1")
filtered_yelp.count()

sqlCtx.sql("SELECT MAX(useful) AS max_useful FROM yelp").collect()
useful_perc_data.join(yelp_df,yelp_df.id == useful_perc_data.uid,"inner").select(useful_perc_data.uid, "useful_perc", "review_count")
useful_perc_data.registerTempTable("useful_perc_data")

sqlCtx.sql(
"""SELECT useful_perc_data.uid, useful_perc,
review_count
FROM useful_perc_data""")
#import SQLContext and pyspark SQL functions

from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as func
sqlContext = SQLContext(sc)

inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(","))
auctions = inputRDD.map(lambda p:Row(auctionid=p[0], bid=float(p[1]), bidtime=float(p[2]), bidder=p[3], bidrate=int(p[4]), openbid=float(p[5]), price=float(p[6]), itemtype=p[7], dtl=int(p[8])))

# Infer the schema, and register the DataFrame as a table.
auctiondf = sqlContext.createDataFrame(auctions)
auctiondf.registerTempTable("auctions")

auctiondf.show()

auctiondf.printSchema()

totbids = auctiondf.count()
print totbids

totalauctions = auctiondf.select("auctionid").distinct().count()
print totalauctions

itemtypes = auctiondf.select("itemtype").distinct().count()
print itemtypes
auctiondf.groupBy("itemtype","auctionid").count().show()
auctiondf.groupBy("itemtype","auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show()
auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show()
auctiondf.filter(auctiondf.price>200).count()
xboxes = sqlContext.sql("SELECT auctionid, itemtype,bid,price,openbid FROM auctions WHERE itemtype = 'xbox'").show()
Example #15
    def loadData(self):
        category_list = self.sc.textFile(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/cat_subcat.csv").map(lambda line: (line.split(',')[0], line.split(',')))
        category_schema = StructType([
            StructField("category", StringType(), True),
            StructField("sub_category", ArrayType(StringType()), True)
        ])
        # self.category_list.registerTempTable("categories_list")
        # subcat = self.sqlContext.sql("SELECT sub_category FROM categories_list WHERE category = \"{0}\" LIMIT 1".format(self.category))
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        subcat = category_list.where(category_list.category == self.category).first().sub_category

        self.df_business = self.sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json")
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_business = self.df_business.select("business_id", "name", "stars", "latitude", "longitude", "categories")

        filter_business = partial(isBusinessLocalAndRelevant, latitude = self.loc_lat, longitude = self.loc_long, sub_categories = subcat)
        self.df_business = self.df_business.rdd.filter(filter_business)
        self.df_business = self.sqlContext.createDataFrame(self.df_business)
        self.df_business = self.df_business.select("business_id", "name", "stars")
        self.df_business.registerTempTable("business")

        schema_2 = StructType([
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True)
        ])
        
        schema = StructType([
            StructField("cluster_centers", ArrayType(schema_2), True),
            StructField("user_id", StringType(), True)
        ])

        self.df_user_locations = self.sqlContext.read.json(os.environ['WORKDIR'] + "clustering_models/center.json/dbscan", schema)
        filter_users = partial(isUserlocal, latitude = self.loc_lat, longitude = self.loc_long)
        self.df_user_locations = self.df_user_locations.rdd.filter(filter_users)
        self.df_user_locations = self.sqlContext.createDataFrame(self.df_user_locations)
        self.df_user_locations = self.df_user_locations.select("user_id")
        self.df_user_locations.registerTempTable("user")
        #print "user locations: ", self.self.df_user_locations.count()

        self.df_review = self.sqlContext.read.json(os.environ['WORKDIR'] + "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json")
        self.df_review = self.df_review.select("business_id", "user_id", "stars")
        self.df_review.registerTempTable("review")
        #print "reviews: ", self.self.df_review.count()

        self.df_joined = self.sqlContext.sql("SELECT r.user_id AS user_id, r.business_id AS business_id, first(b.name) AS business_name, first(b.stars) as business_stars, avg(r.stars) AS avg_rev_stars FROM review r, business b, user u WHERE r.business_id = b.business_id AND r.user_id = u.user_id GROUP BY r.user_id, r.business_id")
        self.df_joined.registerTempTable("joined")
        
        self.df_business.unpersist()
        self.df_user_locations.unpersist()
        self.df_review.unpersist()

        self.df_category_pred = self.loadEliteScorePredictionsForCategory()
        self.df_category_pred.registerTempTable("prediction")
        
        self.df_joined = self.sqlContext.sql("SELECT j.*, p.prediction AS elite_score, (j.avg_rev_stars*p.prediction) AS w_score FROM joined j, prediction p WHERE j.user_id = p.user_id") 
        #print "joined: ", self.self.df_joined.count()
        #self.self.df_joined.show()

        self.df_category_pred.unpersist()

        df_grouped = self.df_joined.groupBy("business_id", "business_name", "business_stars").agg(F.avg("w_score").alias("rank"))
        df_grouped = df_grouped.sort("rank", ascending=False)
        print df_grouped.count()
        df_grouped.show()

        self.df_joined.unpersist()

        return df_grouped
Example #16
# imports used below (the `spark` session and the label() UDF are assumed to be defined
# elsewhere; a sketch of label() follows this example)
from pyspark.sql import functions
from pyspark.ml.feature import StringIndexer, VectorAssembler

d1 = spark.read.option("header", "true") \
    .option("sep", ",").option("inferSchema", True) \
    .option("mode", "DROPMALFORMED") \
    .csv("file:///Users/beginspark/Temp/data2.csv")

d2 = d1.toDF("year", "month", "road", "avr_traffic_month", "avr_velo_month", "mon", "tue", "wed", "thu", "fri", "sat",
             "sun")

# inspect the data
d2.printSchema()

# drop rows with null values
d3 = d2.where("avr_velo_month is not null")

# average speed per road
d4 = d3.groupBy("road").agg(functions.round(functions.avg("avr_velo_month"), 1).alias("avr_velo_total"))
d5 = d3.join(d4, ["road"])

# assign label
d6 = d5.withColumn("label", label(d5.avr_velo_month, d5.avr_velo_total).cast("double"))
d6.select("road", "avr_velo_month", "avr_velo_total", "label").show(5, False)
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")

assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")
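# The label() call used above is not defined in this fragment. A plausible sketch, assuming
# it flags months whose average speed is at or above the road's overall average:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

label = udf(lambda velo_month, velo_total: 1.0 if velo_month >= velo_total else 0.0,
            DoubleType())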
Example #17
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

def getSparkSession():
    return SparkSession.builder.appName('API-Test')\
           .config(conf = SparkConf()).getOrCreate()

if __name__ == '__main__':

    spark = getSparkSession()
    data = [('Benny', 86), ('Jenny', 77), ('Oscar', 55), ('Scarlett', 89)]
    df = spark.createDataFrame(data)
    df = df.withColumnRenamed('_1', 'name').withColumnRenamed('_2', 'marks')

    print('Average marks: ', df.select(avg(df.marks)).collect())

    print('Marks between 80 & 90: ', df.filter(df.marks.between(80, 90)).collect())

    print('Marks between 80 & 90 and name starts with \'S\': ', \
          df.filter(df.marks.between(80, 90) & df.name.startswith('S')).collect())
    
    print('Names having \'y\': ', df.filter(df.name.like('%y%')).collect())

    names_with_y = df.filter(df.name.like('%y%'))
    
    print('Avg. of names having \'y\': ', \
          names_with_y.select(avg(names_with_y.marks)).collect())
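    # With this data the prints above work out to: overall average 76.75; marks between
    # 80 and 90 -> Benny (86) and Scarlett (89), of which only Scarlett starts with 'S';
    # names containing 'y' -> Benny and Jenny, whose average mark is 81.5.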
    
# imports used below (sc is an existing SparkContext)
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions

sqlCtx = SQLContext(sc)

lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])

record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))

recordSchema = sqlCtx.createDataFrame(record)

recordSchema.groupBy().agg({"*": "count"}).show()

recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()

recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max(
    recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()

recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
Example #19
def cal_mat_window(sc, sqlContext, dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(-1 * window+1,1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat )
    print dfSC.collect()
Example #20
    def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
        allNumericCols = self.getNumericalFieldNames()
        if len(allNumericCols) == 0:
            self._addHTML("Unable to find a numerical column in the dataframe")
            return
        
                 
        keyFields = self.options.get("keyFields")
        valueField = self.options.get("valueFields")

        if(keyFields==None and valueField==None):
            keyFields=self.getFirstStringColInfo()
            valueField=self.getFirstNumericalColInfo() 
        else:
            keyFields = keyFields.split(',') 
            valueField = valueField.split(',') 
            if(len(valueField) > 1):
                self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField)))
                return
            keyFields = keyFields[0]
            valueField=valueField[0]
        
                
        #if(len(valueFields>)):


    
        #init
        fig=figure
        ax=axes
        
        #fig, ax = plt.subplots()   
        #fig = plt.figure()
        

        params = plt.gcf()
        plSize = params.get_size_inches()
        params.set_size_inches( (plSize[0]*2, plSize[1]*2) )


        agg=self.options.get("aggregation")
        groupByCol=self.options.get("groupByCol")
        
        if (agg=="None" or agg==None):
            colLabel = keyFields
            y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
            x_intv = np.arange(len(y))
            labels =  self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel(valueField, fontsize=18)
        elif(agg=='AVG'):
            y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
            y=y1["avg"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("Average "+valueField, fontsize=18)
        elif(agg=='SUM'):
            y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
            y=y1["sum"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("sum "+valueField, fontsize=18)
        elif(agg=='MAX'):
            y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
            y=y1["max"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("max "+valueField, fontsize=18)
        elif(agg=='MIN'):
            y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
            y=y1["min"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("min "+valueField, fontsize=18)
        elif(agg=='COUNT'):
            y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
            y=y1["count"].dropna().tolist()
            x_intv = np.arange(len(y))
            labels=y1[keyFields].dropna().tolist()
            plt.xticks(x_intv,labels)
            plt.xlabel(keyFields, fontsize=18)
            plt.ylabel("count "+valueField, fontsize=18)

        mpld3.enable_notebook()      
        plt.bar(x_intv,y,color="blue",alpha=0.5)
        ax_fmt = BarChart(labels)
        mpld3.plugins.connect(fig, ax_fmt)
Example #21
simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
              ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
              ("James", "Sales", 3000), ("Scott", "Finance", 3300),
              ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
              ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
schema = ["employee_name", "department", "salary"]

df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: " + str(df2.collect()[0][0]))

print("count: " + str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
Example #22
from pyspark import SparkContext, SparkConf, SQLContext
import pyspark.sql.functions as f

conf = (SparkConf().setMaster("local[20]").setAppName(
    "sample app for reading files").set("spark.executor.memory", "2g"))

sc = SparkContext(conf=conf)

sqlContext = SQLContext(sc)
df = sqlContext.read.load("ratings.csv",
                          format='com.databricks.spark.csv',
                          header='true',
                          inferSchema='true')

df.groupby('movieId').agg(f.avg('rating').alias('movie_rating')).orderBy(
    'movieId', ascending=True).coalesce(1).write.format(
        "com.databricks.spark.csv").save("WowResultsnew.csv")
Example #23
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# ratings schema (the opening of this StructType was truncated in the original fragment;
# field names inferred from how the columns are used below)
ratings_df_schema = StructType(
  [StructField('userId', IntegerType()),
   StructField('movieId', IntegerType()),
   StructField('rating', DoubleType())]
)
movies_df_schema = StructType(
  [StructField('ID', IntegerType()),
   StructField('title', StringType())]
)

spark = SparkSession.builder.appName('ALS Movie Predictions').getOrCreate()

movies_raw_df = spark.read.format("csv").option("header", "true").load("/home/ragesh/Data/Movie_Ratings/movies.csv", schema=movies_df_schema)
ratings_raw_df = spark.read.format("csv").option("header", "true").load("/home/ragesh/Data/Movie_Ratings/ratings.csv", schema=ratings_df_schema)

movies_raw_df.cache()
ratings_raw_df.cache()

# Movies with Highest Average Ratings
movie_ids_with_avg_ratings_df = ratings_raw_df.groupBy('movieId').agg(F.count(ratings_raw_df.rating).alias("count"), F.avg(ratings_raw_df.rating).alias("average"))
# print('movie_ids_with_avg_ratings_df:')
# movie_ids_with_avg_ratings_df.show(3, truncate=False)

movie_names_with_avg_ratings_df = movie_ids_with_avg_ratings_df.join(movies_raw_df, movies_raw_df.ID == movie_ids_with_avg_ratings_df.movieId) \
                                              .select(movie_ids_with_avg_ratings_df['average'], movies_raw_df.title, movie_ids_with_avg_ratings_df['count'], movie_ids_with_avg_ratings_df.movieId)
# print('movie_names_with_avg_ratings_df:')
# movie_names_with_avg_ratings_df.show(3, truncate=False)


# Movies with Highest Average Ratings and at least 500 reviews
movies_with_500_ratings_or_more = movie_names_with_avg_ratings_df.filter(movie_names_with_avg_ratings_df['count'] >= 500)\
                                                                 .sort(movie_names_with_avg_ratings_df['average'].desc())
# print('Movies with highest ratings:')
# movies_with_500_ratings_or_more.show(20, truncate=False)
if __name__ == "__main__":
    conf = get_spark_app_config()
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

    logger = Log4j(spark)

    logger.info("Starting the pyspark application")
    invoice_df = load_invoice_df(spark)

    invoice_df.select(
        f.countDistinct(col("InvoiceNo")).alias("Count_Of_Invoice"),
        f.sum(col("Quantity")).alias("Sum_Of_Quantity"),
        f.avg(col("UnitPrice")).alias("Avg_Of_UnitPrice"),
        f.count("*").alias("Total_Count"),
    ).show()

    summary_df = invoice_df \
          .groupBy(col("Country"),col("InvoiceNo")) \
          .agg(f.sum(col("Quantity")).alias("Total_Quantity"),
               f.round(f.sum(col("Quantity")*col("UnitPrice")),2).alias("InvoiceValue")
              ).show()

    weekly_summary_df = invoice_df \
       .withColumn("WeekNumber", f.weekofyear(f.to_date(f.substring(col("InvoiceDate"), 1, 10), "M-d-yyyy"))
                  ) \
       .groupBy(col("Country"), col("WeekNumber")) \
      .agg(
      f.countDistinct(col("InvoiceNo")).alias("NumInvoices"))
Example #25
        .withColumn('x_centers', x_udf('centers'))
        .withColumn('y_centers', y_udf('centers'))
        .withColumn('group_sizes', group_udf('centers'))
        .withColumn('num_groups', count_udf('group_sizes'))
        .withColumn('velocities', velocity_udf('pair_centers'))
        .withColumn('num_velocities', count_udf('velocities'))
        .withColumn('sum_velocities', sum_udf('velocities')))

"""# Aggregate each 5 minute window to compute:
- average number of people detected
- average group size
- average velocity
"""

window_str = '{} minutes'.format(window_minutes)
agg_df = (df.groupBy(window('timestamp', windowDuration=window_str, slideDuration=window_str))
           .agg(F.sum('num_people'),
                F.sum('num_groups'),
                F.sum('sum_velocities'),
                F.sum('num_velocities'),
                avg('num_people'),
                collect_list('x_centers'),
                collect_list('y_centers'))
           .withColumn('x_centers', flatten_udf('collect_list(x_centers)'))
           .withColumn('y_centers', flatten_udf('collect_list(y_centers)'))
           .drop('collect_list(x_centers)')
           .drop('collect_list(y_centers)')
           .orderBy('window'))

agg_df.show()
Example #26
def cal_mat_window(sc, sqlContext, dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date").rangeBetween(
        -1 * window + 1, 1)
    mat = func.avg("close").over(windowSpec)
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    print dfSC.collect()
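# cal_mat_window assumes Window and the functions module are already in scope; a usage
# sketch (the DataFrame name and window size are illustrative):
#   from pyspark.sql.window import Window
#   from pyspark.sql import functions as func
#   cal_mat_window(sc, sqlContext, stock_quotes_df, 5)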
from pyspark.sql import functions as F

#Creating data frame from list
data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting 
df.filter(df['age'] > 30).select('fname','age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()


from pyspark.sql import functions as Func

def totalAvg(salList):
    totalSal = 0
    count = 0
    for x in salList:
        count = count + 1
        totalSal = totalSal + x[0]
    return totalSal / count

avgSal = totalAvg(salList)
newEmpDt_df = empDt_df.withColumn("avgSalary", Func.lit(avgSal))
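# A shorter sketch of the same idea: let Spark compute the overall average instead of
# looping over a collected list (assumes the same empDt_df and Func alias as above):
# avg_sal = empDt_df.agg(Func.avg("salary").alias("avgSalary")).first()["avgSalary"]
# newEmpDt_df = empDt_df.withColumn("avgSalary", Func.lit(avg_sal))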

# another approach using windows function

from pyspark.sql.window import Window
window = Window.partitionBy(empDt_df.address).orderBy(empDt_df.address.desc())
empDt1_df = empDt_df.withColumn("Avg_salary_country_wise",Func.avg(empDt_df.salary).over(window))

"""
+------+--------+---------+------+-----------------------+
|emp_id|emp_name|  address|salary|Avg_salary_country_wise|
+------+--------+---------+------+-----------------------+
|     4|   Tanya|   Russia|  7500|                23750.0|
|     7|   Jerry|   Russia| 40000|                23750.0|
|     1|     Tim|       US|  4800|                 5750.0|
|     2|  George|       US|  3200|                 5750.0|
|     8|   Cathy|       US|  5000|                 5750.0|
|    10|   Peter|       US| 10000|                 5750.0|
|     3|    Mary|       UK|  8000|                 6700.0|
|     6|     Jim|       UK|  5400|                 6700.0|
|     5|    Rose|Australia|  7000|                13500.0|
|     9|    Andy|Australia| 20000|                13500.0|
+------+--------+---------+------+-----------------------+
"""
    split_col = F.split(df_modified["location"], ',')
    df_modified = df_modified.withColumn("name", split_col.getItem(0))\
        .withColumn("highway", split_col.getItem(1))\
        .withColumn("lanes", split_col.getItem(2))\
        .withColumn("bridge", split_col.getItem(3))\
        .withColumn("lit", split_col.getItem(4))\
        .withColumn("id", split_col.getItem(5))\
        .withColumn("unique_id", split_col.getItem(6))

    # Calculation of the density of vehicles divided by section and type of vehicle
    query = df_modified \
        .withWatermark("timestamp_millisecond", "2 minutes") \
        .groupBy("timestamp_millisecond", "name", "unique_id", "id_object")\
        .agg(
            F.count("*").alias("count"),
            F.avg("speed").alias("average_speed"),
            )

    ############### Writing the modified stream to Parquet #################
    query.writeStream\
        .queryName("streamingOutput")\
        .format("parquet")\
        .option("path", os.path.join(os.getcwd(), 'sink', 'sink_stream_modified'))\
        .option("checkpointLocation", os.path.join(os.getcwd(), 'checkpoint', 'checkpoint_stream_modified'))\
        .start()

    ##################### Aggregation 1 minute ####################
    # Load the previously modified stream
    df_read = spark \
        .readStream \
        .schema(query.schema) \
Example #30
    phy_df = spark.createDataFrame(phy)
    phy_df = phy_df\
             .withColumnRenamed('_1', 'id')\
             .withColumnRenamed('_2', 'name')\
             .withColumnRenamed('_3', 'marks')

    chem_df = spark.createDataFrame(chem)
    chem_df = chem_df.withColumnRenamed('_1', 'id')\
              .withColumnRenamed('_2', 'name')\
              .withColumnRenamed('_3', 'marks')

    phy_df.printSchema()
    chem_df.printSchema()

    phy_df.agg(avg(phy_df.marks)).show()
    chem_df.groupBy().avg('marks').show() # alternate API

    #inner join
    phy_df\
            .join(chem_df, phy_df.id == chem_df.id)\
            .select(phy_df.name, phy_df.marks, chem_df.marks).show()

    #with SQL statements
    phy_df.createOrReplaceTempView('phy')
    chem_df.createOrReplaceTempView('chem')

    sql_str = 'select phy.name, phy.marks, chem.marks from phy, chem where phy.id = chem.id'
    spark.sql(sql_str).show()

    sql_str = 'select phy.name, phy.marks, chem.marks from phy full outer join chem on phy.id = chem.id'
    spark.sql(sql_str).show()
from pyspark.sql import functions as fn

def fill_with_mean(df, include=set()):
    stats = df.agg(*(fn.avg(c).alias(c) for c in df.columns if c in include))
    return df.fillna(stats.first().asDict())
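# Usage sketch for fill_with_mean (the DataFrame and column names are illustrative):
#   filled_df = fill_with_mean(users_df, include={'age', 'salary'})
# Only the columns listed in `include` are filled; the rest keep their nulls.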
Example #32

import pyspark.sql.functions as F

df = sqlContext.createDataFrame([('a', 1), ('b', 2), ('a', 3)], ["key", "value"])
df2 = df.withColumn('key', F.upper(df.key))
df2.groupBy('key').agg(F.avg(df.value)).collect()
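# Expected result (row order may vary): [Row(key='A', avg(value)=2.0), Row(key='B', avg(value)=2.0)]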


Example #33
        return (
            row[0],  # commentBody
            row[1],  # commentID
            row[2],  # createDate
            row[3],  # articleID
            str(nltk_sentiment(row[0])))

    df = df.select('commentBody', 'commentId', 'createDate', 'articleID')
    df = df.na.drop()
    df = df.rdd.map(callback)\
        .toDF(['commentBody', 'commentId', 'createDate', 'articleID', 'sentimentScore'])
    df.show()
    df.write.csv("hdfs://cm:9000/uhadoop2019/dpi/test2")

    df.cache()

    # get avg score per article
    article_df = spark.read.option('header', 'true') \
        .option("delimiter", ",") \
        .option('quote', '"') \
        .option('multiLine', 'true') \
        .option('parserLib', 'univocity') \
        .csv("hdfs://cm:9000/uhadoop2019/dpi/ArticlesApril2017.csv.gz")

    joined_df = article_df.join(df, 'articleID')
    joined_df = joined_df.groupBy('articleID').agg(
        avg('SentimentScore').alias('avg_score'))

    joined_df.write.csv("hdfs://cm:9000/uhadoop2019/dpi/avg_score_article")
    print(joined_df.show())
Example #34
logs_df = sqlCtx.load(source='com.databricks.spark.csv', header='true', inferSchema='true',
    path='file:///usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv')
logs_df.count()

# count by different code type
logs_df.groupBy("code").count().show()
# rank by counts
from pyspark.sql.functions import asc, desc
logs_df.groupBy('code').count().orderBy(desc('count')).show()

# calculate average size of different code
logs_df.groupBy("code").avg("bytes").show()
# more calculation by code - average, min, max
import pyspark.sql.functions as F
logs_df.groupBy("code").agg(
logs_df.code,
F.avg(logs_df.bytes),
F.min(logs_df.bytes),
F.max(logs_df.bytes)
).show()


# homework
# 1
yelp_df.select("cool").agg({"cool" : "mean"}).collect()
# 2
import pyspark.sql.functions as F
yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 3
yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show()
# 4
from pyspark.sql.functions import asc, desc
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
import os
spark = SparkSession.builder.appName("SparkSQLDataframes").getOrCreate()
curwd = os.getcwd()
people = spark.read.option("header", "true").option(
    "inferSchema", "true").csv(f"file:///{curwd}/fakefriends-header.csv")
print("log: check our inferred schema: ")
people.printSchema()

print("showing only name column from data")
people.select("age", "friends").show(10)
print("group by age with average of friends")
people.select("age",
              "friends").groupBy("age").avg("friends").sort("age").show(5)
print("doing the same with agg and with round: ")
people.select("age","friends") \
        .groupBy("age") \
        .agg(func.round(func.avg("friends"),2)) \
        .alias("n_friends_avg") \
        .sort("age") \
        .show(5)

spark.stop()
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC |5.0    |Ella Lola, a la Trilby (1898)|1    |94431  |
# MAGIC |5.0    |Serving Life (2011)          |1    |129034 |
# MAGIC |5.0    |Diplomatic Immunity (2009? ) |1    |107434 |
# MAGIC +-------+-----------------------------+-----+-------+
# MAGIC only showing top 3 rows
# MAGIC ```

# COMMAND ----------

# TODO: Replace <FILL_IN> with appropriate code
from pyspark.sql import functions as F

# From ratingsDF, create a movie_ids_with_avg_ratings_df that combines the two DataFrames
ratings_df.show(3)
movie_ids_with_avg_ratings_df = ratings_df.groupBy('movieId').agg(F.count(ratings_df.rating).alias("count"), F.avg(ratings_df.rating).alias("average"))
print 'movie_ids_with_avg_ratings_df:'
movie_ids_with_avg_ratings_df.show(3, truncate=False)

# Note: movie_names_df is a temporary variable, used only to separate the steps necessary
# to create the movie_names_with_avg_ratings_df DataFrame.
movie_names_df = movie_ids_with_avg_ratings_df.join(movies_df,movie_ids_with_avg_ratings_df["movieId"]==movies_df["Id"])
movie_names_with_avg_ratings_df = movie_names_df.drop("Id")

print 'movie_names_with_avg_ratings_df:'
movie_names_with_avg_ratings_df.show(3, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings (1a)
Test.assertEquals(movie_ids_with_avg_ratings_df.count(), 26744,
Example #37
# Location of raw historian data in ADLS
adls = os.environ["ADLS_PATH"]
testDataDir = os.path.join(adls, os.environ['PROC_HIST_DATA_PATH'], runDay,
                           'clean/test')
predDataDir = os.path.join(adls, os.environ['PROC_HIST_DATA_PATH'], runDay,
                           'clean/predictions')
modelDevDir = os.path.join(adls, os.environ["PROC_HIST_DATA_PATH"], runDay,
                           'models')
modelName = 'spark-lrPipelineModel-v3'

testData = spark.read.load(testDataDir, format="parquet")

# # Data Feature Preparation
# This model requires data to be stored 'hourly' and padded for missing hours with previous values
hourlyData = testData.withColumn('recordInterval',(F.round(F.unix_timestamp('recordTime')/3600)*3600).cast('timestamp'))\
  .groupBy('recordInterval','deviceID').agg(F.avg('value').alias('reading'))

# 'Pad' the data by filling in any missing hours using the last good (ie. non-null) value from the device
paddedHourlyData = util.padData(hourlyData, spark)

# We need the following features:
# * Previous hour's reading
# * Average of last 5 hours reading
window = Window.partitionBy("deviceID").orderBy("recordInterval")
modelData = paddedHourlyData\
  .withColumn('lastReading',F.avg('reading').over(window.rowsBetween(-1,-1)))\
  .withColumn('avgReadings5hr',F.avg('reading').over(window.rowsBetween(-5,-1)))\
  .filter('deviceID=="85WIC2703_COR"')\
  .na.drop()

# # Model Testing

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310


# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

Example #39
def summary_df(df,fn): #,max_date):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])
    # Remove non search sessions
    df = df[df['ad_click']>0]

    # sum ad_click
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\
                                        .agg(F.sum("ad_click").alias("ad_click"))
        
    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\
                               .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\
                               .withColumn("country_code", F.upper(F.col("country_code")))

    # add country field and revenue table - need transform to calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(rev_by_country, sum_search_clients_daily.country == rev_by_country.country_code,how='left_outer')
    spec = Window.partitionBy("client_id","country","submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    # NOTE partition includes country because client may change country over time

    no_country = (
        tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(.005))
    )

    has_country = (
        tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn('rank', F.row_number().over(spec))
        .where("rank = 1")
        .drop('rank')
    )

    tbl2 = no_country.union(has_country)

    # drop each client's first active day when computing monetary value
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc()) # earliest date has row #1
    search_rev = (tbl2
     .withColumn("rank", F.row_number().over(spec2))
     .where("rank > 1")
    ).groupBy("client_id").agg(F.avg(F.col('rev_per_search_float')*F.col('ad_click')).alias("monetary_value"))
    
    # compute the final dataset for the BG/NBD model
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(F.datediff(F.max('activity_date'),F.min("activity_date")).alias("recency"), 
             (F.countDistinct('activity_date')-1).alias("frequency"),
             (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),F.min("activity_date"))).alias("T"),
             F.sum("ad_click").alias("historical_searches"),
             F.sum(F.col('rev_per_search_float')*F.col('ad_click')).alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id", (F.crc32("client_id") % 100).alias("sample_id"), "frequency","recency","T","monetary_value","historical_searches","historical_clv")
      ).fillna(0, subset=['monetary_value'])

    # anonymize client_id    
    dataset = dataset.withColumn('client_id',sha1(dataset.client_id))

    # write dataset recency, freq, age, revenue table per client
    #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
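
# A hedged, illustrative sketch (not part of the function above): the dataset has the
# frequency / recency / T columns expected by BG/NBD tooling such as the `lifetimes`
# package, so fitting a model could look roughly like this, given the DataFrame
# produced above (e.g. read back from `fn`):
def fit_bgnbd_sketch(dataset):
    from lifetimes import BetaGeoFitter

    pdf = dataset.select("frequency", "recency", "T").toPandas()
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(pdf["frequency"], pdf["recency"], pdf["T"])
    return bgf
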
schema1=['id1','name','math']
schema2=['id2','physics','biology']
df1=rdd1.toDF(schema1)
df2=rdd2.toDF(schema2)
print("Time taken for joining 2 DFs:") 
start=time.time()
joined_df=df1.join(df2,(df1.id1 == df2.id2))
print(joined_df.show())
end=time.time()
print(end-start)

##Appending 2 DFs:

# df_appended = df1.unionAll(df2)
df_appended = joined_df.unionAll(joined_df)  # union() is the preferred, non-deprecated form
df_groupby_id = df_appended.groupBy('id1').agg(F.avg(df_appended.math))
df_groupby_id.show()

# DataFrame filter
filtered_data = joined_df.filter(joined_df.math > 95)
filtered_data.show()
# DataFrame sort
df_sortby_name = joined_df.sort(joined_df.name.asc())
df_sortby_name.show()
    

##Groupby:
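
# The filter below assumes an RDD of (key, count) pairs named counted_grped_data,
# which is not built in this excerpt; a minimal sketch with a SparkContext `sc`
# and hypothetical sample data:
words_rdd = sc.parallelize(['hi', 'hello', 'hi', 'hey', 'hi'])
counted_grped_data = words_rdd.map(lambda w: (w, 1)).reduceByKey(lambda a, b: a + b)
print(counted_grped_data.collect())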


######################## filter ########################
filtered_data = counted_grped_data.filter(lambda x: x[0] == 'hi')
Example #41
0
df2 = df2.withColumn("Media_por_org_sup_diaria", udf_to_value(df2["Valor diárias"]))
df2 = df2.withColumn("Min_por_org_sup_diaria",   udf_to_value(df2["Valor diárias"]))
df2 = df2.withColumn("Total_por_org_sup_diaria", udf_to_value(df2["Valor diárias"]))


# In[ ]:


from pyspark.sql import functions as F


# In[ ]:


df2.groupBy("Nome do órgão superior").agg(F.max("Max_por_org_sup"), 
                                          F.avg("Media_por_org_sup"), 
                                          F.min("Min_por_org_sup"), 
                                          F.sum("Total_por_org_sup")).sort('Nome do órgão superior').show( truncate=True)


# In[ ]:


df2.groupBy("Destinos").agg(F.max("Max_por_destinos"), 
                                          F.avg("Media_por_destinos"), 
                                          F.min("Min_por_destinos"), 
                                          F.sum("Total_por_destinos")).sort('Destinos').show( truncate=True)


# In[ ]:
    def _add_average(self, df, low_col, high_col, enddate, compute_term):
        avgDf = df.agg(
            fn.avg(low_col).alias(low_col),
            fn.avg(high_col).alias(high_col),
            fn.avg("avg_chg_market_3d").alias("avg_chg_market_3d"),
            fn.avg("avg_chg_market_5d").alias("avg_chg_market_5d"),
            fn.avg("avg_chg_market_10d").alias("avg_chg_market_10d"),
            fn.avg("avg_chg_industry_3d").alias("avg_chg_industry_3d"),
            fn.avg("avg_chg_industry_5d").alias("avg_chg_industry_5d"),
            fn.avg("avg_chg_industry_10d").alias("avg_chg_industry_10d"),
            fn.avg("top_ind_perc_3d").alias("top_ind_perc_3d"),
            fn.avg("top_ind_perc_5d").alias("top_ind_perc_5d"),
            fn.avg("top_ind_perc_10d").alias("top_ind_perc_10d"),
            fn.avg("avg_chg_stock_3d").alias("avg_chg_stock_3d"),
            fn.avg("avg_chg_stock_5d").alias("avg_chg_stock_5d"),
            fn.avg("avg_chg_stock_10d").alias("avg_chg_stock_10d"),
            fn.avg("top_stock_perc_3d").alias("top_stock_perc_3d"),
            fn.avg("top_stock_perc_5d").alias("top_stock_perc_5d"),
            fn.avg("top_stock_perc_10d").alias("top_stock_perc_10d")
        ).withColumn("busi_date", fn.lit(enddate))\
            .withColumn("compute_term", fn.lit(compute_term))\
            .withColumn("trade_id", fn.lit("average"))
        # Append the column-wise averages as a single extra "average" row
        df = df.union(avgDf)
        return df
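
# The repetitive fn.avg()/alias() list above could also be generated from the
# column-name patterns; a minimal, hypothetical sketch (same columns, same aliases):
from pyspark.sql import functions as fn

def _avg_exprs(low_col, high_col):
    cols = [low_col, high_col]
    cols += ["avg_chg_{}_{}d".format(kind, d)
             for kind in ("market", "industry", "stock") for d in (3, 5, 10)]
    cols += ["top_{}_perc_{}d".format(kind, d)
             for kind in ("ind", "stock") for d in (3, 5, 10)]
    # One averaged column per input column, keeping the original name as the alias.
    return [fn.avg(c).alias(c) for c in cols]

# Usage inside _add_average would then be: avgDf = df.agg(*_avg_exprs(low_col, high_col))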
Example #43
0
# Get Number of Rows of a DataFrame
df_title_basics.count()

# In[6]:

# Groups and Counts: Get column titleTypes values with counts and ordered descending
from pyspark.sql.functions import desc
df_title_basics.groupBy("titleType").count().orderBy(desc("count")).show()

# In[7]:

# Calculate average Movie length in minutes
from pyspark.sql.functions import avg, col
df_title_basics.where(col('titleType') == 'movie').agg(
    avg('runtimeMinutes')).show()

# In[8]:

# Save Dataframe back to HDFS (partitioned) as Parquet files
df_title_basics.repartition('startYear').write.format("parquet").mode(
    "overwrite").partitionBy('startYear').save(
        '/user/hadoop/imdb/title_basics_partitioned_files')
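
# Hedged sketch (not part of the original notebook): reading the partitioned files
# back lets Spark prune partitions when filtering on the partition column.
spark.read.parquet(
    '/user/hadoop/imdb/title_basics_partitioned_files').where('startYear = 2000').show(5)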

# In[9]:

# Save Dataframe back to HDFS (partitioned) as EXTERNAL TABLE and Parquet files
df_title_basics.repartition('startYear').write.format(
    "parquet").mode("overwrite").option(
        'path',
        '/user/hadoop/imdb/title_basics_partitioned_table').partitionBy(
            'startYear').saveAsTable('title_basics_partitioned_table')  # table name assumed to mirror the path

spark.sql("""SELECT first(StockCode) as first, 
                    last(StockCode) as last,
                    min(Quantity)  as minQty,
                    max(Quantity) as maxQty
             FROM dfTable""").show()


# ----------------------------------------------------------
#  Example 3 - sum, sumDistinct, avg
# ----------------------------------------------------------
from pyspark.sql.functions import sum, sumDistinct, avg

df.select(sum("Quantity")).show() 
df.select(sumDistinct("Quantity")).show()
df.select(avg("Quantity")).show()

spark.sql("""SELECT sum(Quantity)  as sumQty, 
                    mean(Quantity) as mean
             FROM dfTable""").show() 

from pyspark.sql.functions import count, mean, expr
df.select(
  count("Quantity").alias("total_transactions"),
  sum("Quantity").alias("total_purchases"),
  avg("Quantity").alias("avg_purchases"),
  expr("mean(Quantity)").alias("mean_purchases")) \
.selectExpr(
  "total_purchases/total_transactions",
  "avg_purchases",
  "mean_purchases").show()
Example #45
0
        .getOrCreate()

    df = spark.read.format("csv").option("header", "true").load(DATA_FILE)
    print('Source data frame:')
    df.show()
    df = df.withColumn("temperatureTmp", df.temperature.cast('float')) \
        .drop("temperature") \
        .withColumnRenamed("temperatureTmp", "temperature")

    df = df.withColumn('month', F.month('date'))

    print('Min, max and avg temperature for each month for each city:')
    temp_df = df.groupBy('city', 'month').agg(
        F.max(F.col('temperature')).alias('max_temperature'),
        F.min(F.col('temperature')).alias('min_temperature'),
        F.avg(F.col('temperature')).alias('avg_temperature')).orderBy(
            'month', 'city')
    temp_df.show()

    if SAVE_RESULTS:
        temp_df.write.csv('data/agg_city_month.csv')

    N = 255  # min examples to take
    print(
        'Min, max and avg temperature for each month for each city having at least {} records:'
        .format(N))
    temp_df = df.groupBy('city', 'month').agg(F.max(F.col('temperature')).alias('max_temperature'),
                                              F.min(F.col('temperature')).alias('min_temperature'),
                                              F.avg(F.col('temperature')).alias('avg_temperature'),
                                              F.count(F.col('temperature')).alias('count_temperature')) \
        .where(F.col('count_temperature') >= N) \
        .orderBy('month', 'city')
    temp_df.show()
    .drop("value", "sensor_id", "id")

# Group all streaming measurements by location and timestamp.
# Aggregate them into 1-minute windows and calculate average air-quality
# metrics for each interval.
# A 5-minute watermark lets late-arriving data update the metrics of the
# previous 5 minutes.
w = df4\
    .withWatermark("timestamp", "5 minutes") \
    .groupBy(window("timestamp", "1 minutes"), \
             col("location_id"), \
             col("latitude"), \
             col("longitude"), \
             col("country")) \
    .agg( \
        avg("temperature").alias("temperature"), \
        avg("humidity").alias("humidity"), \
        avg("pressure").alias("pressure"), \
        avg("P1").alias("P1"), \
        avg("P2").alias("P2")) \
    .withColumn("timestamp", col("window").end).drop("window")

# Create a new DataFrame with a single column, 'value', containing
# each row serialized as a comma-delimited CSV string
output = w.select(
    concat(
        col("timestamp"), lit(","), \
        col("location_id"), lit(","), \
        col("latitude"), lit(","), col("longitude"), lit(","), \
        col("country"), lit(","), \
        col("temperature"), lit(","), col("humidity"), lit(","), \
Example #47
0
                     marginal_y="violin",
                     marginal_x="box",
                     trendline="ols",
                     template="seaborn")
    fig.update_layout(yaxis_tickformat="$.0f" if iscash else ".2%",
                      xaxis_tickformat=".2%")
    return fig


scurve(df_PVDE_F_P, 'BeckPVDE', iscash=True)

# COMMAND ----------

# DBTITLE 1,3D Surface Graph
pivot_Tbl = dfr.groupby("runid", "sourcevarname", "LineNo").pivot("Month").agg(
    avg("variablevalue")).orderBy(*('runid', 'LineNo'), ascending=True)
pivot_Tbl_Pandas = pivot_Tbl.toPandas()
c = pivot_Tbl_Pandas.shape[1] - 3
x = np.linspace(0, 594, c)
df = pivot_Tbl_Pandas.drop(['runid', 'sourcevarname', 'LineNo'], axis=1)

fig_3dsurface = go.Figure(
    data=[go.Surface(z=df.values, x=x, y=pivot_Tbl_Pandas.LineNo)])
fig_3dsurface.show()

# COMMAND ----------

vw_mv_get_df.unpersist()

# COMMAND ----------
Example #48
0
    # Create a schema for dataframe.
    df_schema = StructType([
        StructField("user_id", IntegerType(), False),
        StructField("name", StringType(), False),
        StructField("age", IntegerType(), False),
        StructField("friends_count", IntegerType(), True)
    ])

    dataset_df = spark.read.schema(df_schema).csv(
        'dataset/fakefriends-header.csv',
        header=True,
        sep=',',
        inferSchema=False).cache()

    # Select only the required columns and discard the rest as early as possible to avoid wasting cluster resources.
    age_friends_count_df = dataset_df.select('age', 'friends_count')

    # Simple average
    # avg_friends_count_by_age = age_friends_count_df.groupBy('age').avg('friends_count')

    # Round the average and rename the aggregation column.
    # agg() is required in order to apply extra functions to the aggregate.
    avg_friends_count_by_age = age_friends_count_df.groupBy('age').\
        agg(func.round(func.avg('friends_count'), 2).alias('avg_friends_count')).\
        orderBy('avg_friends_count')

    avg_friends_count_by_age.show()

    spark.stop()
Example #49
0
    '*',
    u_parse_time(cleaned_df['timestamp']).cast('timestamp').alias(
        'time')).drop('timestamp')

logs_df.cache()

# Content Size statistics
# -------------------------
content_size_summary_df = logs_df.describe(['content_size'])
# content_size_summary_df.show()

# Alternate method: aggregate min/avg/max directly over logs_df

content_size_stats = logs_df.agg(
    min(logs_df['content_size']),
    avg(logs_df['content_size']),
    max(logs_df['content_size'])).first()

# print(content_size_stats[1])

# HTTP Status analysis
# --------------------
status_by_count_df = logs_df.groupby('status').count().sort('status')

# status_by_count_df.show()

# Frequent hosts
# --------------
host_sum_df = logs_df.groupBy("host").count()
# host_sum_df.show(10, truncate=False)
host_more_than_10_df = host_sum_df.filter(host_sum_df['count'] > 10).select(
    host_sum_df['host'])
Example #50
0
# MAGIC ### Baseline Model
# MAGIC
# MAGIC A **baseline model** offers an educated best guess to improve upon as different models are trained and evaluated.  It represents the simplest model we can create and is generally taken to be the center of the data.  In the case of regression, this could mean predicting the average of the outcome regardless of the features.  In the case of classification, the center of the data is the mode, or the most common class.
# MAGIC
# MAGIC A baseline model could also be a random value or a preexisting model.  Through each new model, we can track improvements with respect to this baseline.

# COMMAND ----------

# MAGIC %md
# MAGIC Create a baseline model by calculating the most common survival status (the rounded average of `Survived`) in the training dataset.

# COMMAND ----------

from pyspark.sql.functions import avg

trainAvg = trainDF.select(avg("Survived")).first()[0]
trainAvg = float(round(trainAvg))

print("Common Survival Status: {}".format(trainAvg))

# COMMAND ----------

# MAGIC %md
# MAGIC Take the average calculated on the training dataset and append it as the column `prediction` on the test dataset.

# COMMAND ----------

from pyspark.sql.functions import lit

testPredictionDF = testDF.withColumn("prediction", lit(trainAvg))
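
# COMMAND ----------

# Minimal sketch (assumes testDF retains the actual "Survived" label): measure how
# often the constant baseline prediction matches the outcome.
from pyspark.sql.functions import col

baselineAccuracy = (testPredictionDF
                    .where(col("prediction") == col("Survived"))
                    .count() / float(testPredictionDF.count()))
print("Baseline accuracy: {:.3f}".format(baselineAccuracy))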
Example #51
0
    def calculate_avg_node_score(self):
        # TODO Assumes that only one nodes file exists, needs to be fixed for link data
        # Create a SparkSession
        # Note: if this is run on Windows and raises errors, use the following
        # (the temp folder must exist):
        # spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").
        # appName("Postprocessing").getOrCreate()
        spark = SparkSession.builder.appName(
            "Calculate_Controvercy_Score_Nodes").getOrCreate()
        nodes_source = spark.sparkContext.textFile(
            os.path.join(os.getcwd(), self.data_path, self.nodes_files[0]))
        nodes = nodes_source.map(self.mapper_nodes)
        nodes_df = spark.createDataFrame(nodes).cache()
        nodes_df.createOrReplaceTempView("nodes")
        results_file = os.path.join(self.data_path, self.nodes_files[0])
        tmp_results_file = os.path.join(self.data_path,
                                        'tmp_' + self.nodes_files[0])
        spark_results_path = os.path.join(self.data_path,
                                          self.nodes_files[0][:-4])

        for file in self.events_files:
            events_source = spark.sparkContext.textFile(
                os.path.join(self.data_path, file))
            events = events_source.map(self.mapper_events)
            events_df = spark.createDataFrame(events).cache()
            events_df.createOrReplaceTempView("events")
            source_df = spark.sql('SELECT source as node, cscore FROM events')
            target_df = spark.sql('SELECT target as node, cscore FROM events')
            node_cscores_df = source_df.union(target_df)
            avg_node_cscores_df = node_cscores_df.groupby('node').agg(
                avg('cscore').alias('avg_cscore'))
            avg_node_cscores_df.createOrReplaceTempView("cscore_nodes")

            nodes = spark.sql(
                "SELECT n.id, n.title, n.ns, c.avg_cscore as cscore "
                "FROM nodes n LEFT OUTER JOIN cscore_nodes c ON n.id = c.node")

            nodes.write.format('com.databricks.spark.csv').option(
                'header', 'false').option('delimiter',
                                          '\t').save(spark_results_path)

            self.assemble_spark_results(spark_results_path, tmp_results_file)
            os.remove(os.path.join(self.data_path, self.nodes_files[0]))
            os.rename(tmp_results_file, results_file)

            print('results assembled')
            # Handle null values in cscore: replace NULL with zero (option 1)
            print('Handle Cscore Null Values for Nodes')
            nodes = pd.read_csv(results_file,
                                header=None,
                                delimiter='\t',
                                names=['id', 'title', 'ns', 'cscore'],
                                skip_blank_lines=True,
                                na_filter=False,
                                error_bad_lines=False,
                                warn_bad_lines=True)
            print('Number of nodes without cscore')
            print(len(nodes.loc[nodes['cscore'] == ""]))
            nodes.loc[nodes['cscore'] == "", 'cscore'] = 0.0
            nodes.to_csv(results_file,
                         sep='\t',
                         index=False,
                         header=False,
                         mode='w')
        del spark
Example #52
0
    def calculate_avg_edge_score(self):
        # TODO Assumes that only one edges file exists, needs to be fixed for link data
        # Create a SparkSession
        # Note: if this is run on Windows and raises errors, use the following (the temp folder must exist):
        # spark = SparkSession.builder.config("spark.sql.warehouse.dir", "file:///C:/temp").appName("Postprocessing").getOrCreate()
        spark = SparkSession.builder.appName(
            "Calculate_Controvercy_Score_Edges").getOrCreate()

        edges_source = spark.sparkContext.textFile(
            os.path.join(self.data_path, self.edges_files[0]))
        edges = edges_source.map(self.mapper_edges)
        edges_df = spark.createDataFrame(edges).cache()
        edges_df.createOrReplaceTempView("edges")

        results_file = os.path.join(self.data_path, self.edges_files[0])
        tmp_results_file = os.path.join(self.data_path,
                                        'tmp_' + self.edges_files[0])
        spark_results_path = os.path.join(self.data_path,
                                          self.edges_files[0][:-4])

        for file in self.events_files:
            events_source = spark.sparkContext.textFile(
                os.path.join(self.data_path, file))
            events = events_source.map(self.mapper_events)
            events_df = spark.createDataFrame(events).cache()
            events_df.createOrReplaceTempView("events")

            avg_edge_cscores_df = events_df.groupby('source', 'target').agg(
                avg('cscore').alias('avg_cscore'))
            avg_edge_cscores_df.createOrReplaceTempView("cscore_edges")
            edges = spark.sql(
                "SELECT e.source, e.target, e.etype, c.avg_cscore as cscore "
                "FROM edges e LEFT OUTER JOIN cscore_edges c "
                "ON e.source = c.source AND e.target = c.target")
            edges.write.format('com.databricks.spark.csv').option(
                'header', 'false').option('delimiter',
                                          '\t').save(spark_results_path)

            self.assemble_spark_results(spark_results_path, tmp_results_file)

            os.remove(os.path.join(self.data_path, self.edges_files[0]))
            os.rename(tmp_results_file, results_file)

            # Handle null values in cscore: replace NULL with zero
            print('Handle Cscore Null Values for edges.')
            edges = pd.read_csv(results_file,
                                header=None,
                                delimiter='\t',
                                names=['source', 'target', 'type', 'cscore'],
                                skip_blank_lines=True,
                                na_filter=False,
                                error_bad_lines=False,
                                warn_bad_lines=True)
            print('Number of edges without cscore')
            print(len(edges.loc[edges['cscore'] == ""]))
            edges.loc[edges['cscore'] == "", 'cscore'] = 0.0
            edges.to_csv(results_file,
                         sep='\t',
                         index=False,
                         header=False,
                         mode='w')

        del spark
Example #53
0
File: week6-2.py Project: gregce/MIDS
# In[ ]:

sqlCtx.sql("select program,avg(age) AS AverageAge FROM st GROUP BY program").show()


# In[ ]:




# In[ ]:

from pyspark.sql import functions as funcs

AvgMax = students.groupBy('program').agg(funcs.avg('age').alias('AverageAge'), funcs.max('age').alias('MaximumAge'))

AvgMax.show()


# In[ ]:




# #How the queries are optimized

# In[ ]:

sqlCtx.sql("select name, program FROM st").explain()
Example #54
0
(new_fire_df.select("ResponseDelayedinMins").where(
    col("ResponseDelayedinMins") > 5).show(5, False))

# Convert to more usable formats
fire_ts_df = (new_fire_df
    .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
    .drop("CallDate")
    .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
    .drop("WatchDate")
    .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),
                                              "MM/dd/yyyy hh:mm:ss a"))
    .drop("AvailableDtTm"))
# Select the converted columns
fire_ts_df.select("IncidentDate", "OnWatchDate",
                  "AvailableDtTS").show(5, False)
fire_ts_df.select(year('IncidentDate')).distinct().orderBy(
    year("IncidentDate")).show(10, False)

# The most common types of fire calls
(fire_ts_df.select("CallType")
    .where(col("CallType").isNotNull())
    .groupBy("CallType")
    .count()
    .orderBy("count", ascending=False)
    .show(n=10, truncate=False))

# Some computations
(fire_ts_df
    .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
            F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
    .show())
# ----------------------------------------------------------------------------
content_size_summary_df.show()

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC Alternatively, we can use SQL to directly calculate these statistics.  You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC 
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions
contentSizeStats =  (logs_df
                     .agg(sqlFunctions.min(logs_df['content_size']),
                          sqlFunctions.avg(logs_df['content_size']),
                          sqlFunctions.max(logs_df['content_size']))
                     .first())

print('Using SQL functions:')
print('Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC 
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times.  We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.

# COMMAND ----------
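
# Sketch of the query described above (the original cell is not included in this
# excerpt): group by 'status', count, and sort by 'status'.
status_to_count_df = (logs_df
                      .groupBy('status')
                      .count()
                      .sort('status'))
status_to_count_df.show()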
Example #56
0
sqlContext = SQLContext(sc)

#set time variables for date filtering
time = datetime.datetime.now()
epochtime = int(time.strftime("%s"))
start_time = epochtime - 86400
compare_time = datetime.datetime.fromtimestamp(start_time)

#create a dataframe from the raw metrics
rawmetrics = sqlContext.read.format("org.apache.spark.sql.cassandra").options(table="raw_metrics", keyspace="metrics").load()

#filter metrics to those in last 24 hours
last_day = rawmetrics.where(rawmetrics.metric_time > compare_time)

#aggregates
averages = last_day.groupby('device_id').agg(func.avg('metric_value').alias('metric_avg'))
maximums = last_day.groupby('device_id').agg(func.max('metric_value').alias('metric_max'))
minimums = last_day.groupby('device_id').agg(func.min('metric_value').alias('metric_min'))

#rename id columns for uniqueness
averages_a = averages.withColumnRenamed("device_id", "id")
maximums_a = maximums.withColumnRenamed("device_id", "maxid")
minimums_a = minimums.withColumnRenamed("device_id", "minid")

#join the tables above
temp = averages_a.join(maximums_a, averages_a.id == maximums_a.maxid)
aggs = temp.join(minimums, temp.id == minimums.device_id).select('id','metric_min','metric_max','metric_avg')
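
# The three separate aggregations and joins above could also be expressed in a
# single pass; a minimal equivalent sketch:
aggs_single_pass = last_day.groupby('device_id').agg(
    func.avg('metric_value').alias('metric_avg'),
    func.max('metric_value').alias('metric_max'),
    func.min('metric_value').alias('metric_min'))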

#add columns to format for cassandra
addday = aggs.withColumn("metric_day", lit(time))
addname = addday.withColumn("metric_name",lit("KWH"))