Example #1
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = SparkSession.builder.appName("TotalSpentByCustomer").master(
    "local[*]").getOrCreate()

# Create schema when reading customer-orders
customerOrderSchema = StructType([
    StructField("cust_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount_spent", FloatType(), True)
])

# Load up the data into spark dataset
customersDF = spark.read.schema(customerOrderSchema).csv(
    "./data/customer-orders.csv")

totalByCustomer = customersDF\
    .groupBy("cust_id")\
    .agg(func.round(func.sum("amount_spent"), 2).alias("total_spent"))

totalByCustomerSorted = totalByCustomer.sort("total_spent")

totalByCustomerSorted.show(totalByCustomerSorted.count())

spark.stop()
Example #2
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

people = (spark.read.option("header", "true").option(
    "inferSchema", "true").csv("./fakefriends-header.csv"))

# now the schema is
# root
#  |-- userID: integer (nullable = true)
#  |-- name: string (nullable = true)
#  |-- age: integer (nullable = true)
#  |-- friends: integer (nullable = true)

people.select(
    "age", "friends").groupBy("age").avg("friends").sort("age").show()

# use alias and 2 decimal places
(people.select("age", "friends").groupBy("age").agg(
    func.round(func.avg("friends"),
               2).alias("friends_avg")).sort("age").show())

spark.stop()
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as func
from pyspark.sql.functions import format_string, from_unixtime, unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
spark = SparkSession.builder.getOrCreate()
all_trips= StructType([
    StructField("medallion",StringType(),True),
    StructField("hack_license", StringType(),True),
    StructField("vendor_id", StringType(),True),
    StructField("pickup_datetime", StringType(),True),
    StructField("rate_code", StringType(),True),
    StructField("store_and_fwd_flag", StringType(),True),
    StructField("drop_datetime", StringType(),True),
    StructField("passenger_count", StringType(),True),
    StructField("trip_time_in_secs", StringType(),True),
    StructField("trip_distance", StringType(),True),
    StructField("pickup_longitude", StringType(),True),
    StructField("pickup_latitude", StringType(),True),
    StructField("dropoff_longitude", StringType(),True),
    StructField("dropoff_latitude", StringType(),True),
    StructField("payment_type", StringType(),True),
    StructField("fare_amount", StringType(),True),
    StructField("surcharge", StringType(),True),
    StructField("mta_tax", StringType(),True),
    StructField("tip_amount", StringType(),True),
    StructField("tolls_amount", StringType(),True),
    StructField("total_amount", StringType(),True)])
allTrips = spark.read.format('csv').schema(all_trips).options(header='false',inferschema='true').load(sys.argv[1])
allTrips.createOrReplaceTempView("allTrips")

df = spark.sql("select date(pickup_datetime) as date, round(sum(fare_amount + surcharge + tip_amount),2) as total_revenue, round(sum(tolls_amount),2) as total_tolls from allTrips group by date(pickup_datetime) order by date(pickup_datetime) asc")

df.select(format_string('%s,%s,%s', from_unixtime(unix_timestamp(df.date, "yyyy-MM-dd"),'yyyy-MM-dd'), func.round(df.total_revenue,2), func.round(df.total_tolls,2))).write.save("task2c-sql.out", format="text")
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType,StructField,FloatType,IntegerType

spark = SparkSession.builder.appName("customerOrder").getOrCreate()

schema = StructType([StructField("ID",IntegerType(),True),
                    StructField("ITEM_ID",IntegerType(),True),
                    StructField("PRICE",FloatType(),True)])

df = spark.read.schema(schema).csv("file:///ApacheSparkCourse/customer-orders.csv")

customerBuy = df.select("ID","PRICE")
customerBuy = customerBuy.groupBy("ID").agg(func.round(func.sum("PRICE"),2).alias("TOTAL"))

customerBuy = customerBuy.sort("TOTAL")


customerBuy.show(customerBuy.count())

# ## Exercises

# (1) Extract the hour of day and day of week from `rides.date_time`.

from pyspark.sql.functions import hour, dayofweek
rides \
  .withColumn("hour_of_day", hour("date_time")) \
  .withColumn("day_of_week", dayofweek("date_time")) \
  .select("date_time", "hour_of_day", "day_of_week") \
  .show(5)

# (2) Convert `rides.duration` from seconds to minutes.

from pyspark.sql.functions import col, round
rides \
  .withColumn("duration_in_minutes", round(col("duration") / 60, 1)) \
  .select("duration", "duration_in_minutes") \
  .show(5)

# (3) Convert `rides.cancelled` to a Boolean column.

# Using the `cast` method:
rides \
  .withColumn("cancelled", col("cancelled").cast("boolean")) \
  .select("cancelled") \
  .show(5)

# Using a Boolean expression:
rides \
  .withColumn("cancelled", col("cancelled") == 1) \
  .select("cancelled") \
  .show(5)

from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(
     estimator=als,
     evaluator=evaluator,
     estimatorParamMaps=paramGrid,
     numFolds=5
)

model = cv.fit(train)


predictions = model.transform(test)
evaluator.evaluate(predictions)

display(predictions)


predictions = predictions.withColumn("prediction", F.abs(F.round(predictions["prediction"],0)))
display(predictions)

userRecommendations = model.bestModel.recommendForAllUsers(10)
display(userRecommendations)

itemRecommendations = model.bestModel.recommendForAllItems(10)
display(itemRecommendations)

display(userpayment)

display(chefmozaccepts)

chefmozaccepts =  chefmozaccepts.withColumnRenamed("Rpayment", "Upayment")

display(chefmozaccepts)
avgSalaryDF.show()

# COMMAND ----------

# MAGIC %md
# MAGIC Convert that value to an integer using the `round()` function. See
# MAGIC <a href="https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$" class="text-info">the documentation for <tt>round()</tt></a>
# MAGIC for more details.

# COMMAND ----------

from pyspark.sql.functions import round

roundedAvgSalaryDF = avgSalaryDF.select(
    round("averageSalary").alias("roundedAverageSalary"))

roundedAvgSalaryDF.show()

# COMMAND ----------

# MAGIC %md
# MAGIC In addition to the average salary, what are the maximum and minimum salaries?

# COMMAND ----------

from pyspark.sql.functions import min, max

salaryDF = peopleDF.select(
    max("salary").alias("max"),
    min("salary").alias("min"),
# COMMAND ----------

import time

from pyspark.sql import functions as F

days_back = 14
values_per_second = 337
nowTimestamp = time.time()

# COMMAND ----------

dfTimeSeries = sqlContext.range(0, days_back * 24 * 60 * 60 * values_per_second) \
  .withColumn("Timestamp", (nowTimestamp - (F.col("id") / values_per_second)).cast("Timestamp")) \
  .drop("id") \
  .withColumn("Sensor", F.concat_ws('-',
                               1 + (F.rand() * 10).cast("Int"),
                               1 + (F.rand() * 100).cast("Int"),
                               1 + (F.rand() * 350).cast("Int"))) \
  .withColumn("Value", F.round(F.rand() * 100, 3)) \
  .withColumn("year", F.year("Timestamp")) \
  .withColumn("month", F.month("Timestamp")) \
  .withColumn("day", F.dayofmonth("Timestamp"))

display(dfTimeSeries)

# COMMAND ----------

spark.conf.set("fs.azure.account.key.<StorageAccountName>.blob.core.windows.net", \
  "<StorageAccountKey>")

dfTimeSeries.write \
  .mode("overwrite") \
  .partitionBy("year", "month", "day") \
  .csv("wasbs://<StorageContainer>@<StorageAccountName>.blob.core.windows.net/timeseries")
Example #9
from pyspark.sql import DataFrame
from pyspark.sql import functions as f


def calculate_percentage(df: DataFrame, col_name: str):
    return df.groupBy(col_name).agg(f.round(f.count(col_name) * 100 / df.count(), 1).alias('Percentage')) \
        .orderBy('Percentage', ascending=False)
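# Hypothetical usage (survey_df and the column name are assumptions, mirroring the survey example further below):
# calculate_percentage(survey_df, 'OpenSourcer').show(20, False)
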
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('first_spark_application') \
                    .getOrCreate()

cars = spark.read.csv('/Users/wel51x/Box Sync/MyBox/Code/DataCamp/data/cars.csv',
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')

cars = cars.dropna()

# Get number of records
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")
cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))

cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))

cars = cars.withColumn('avg_mpg', round((cars.city_mpg + cars.hwy_mpg) / 2, 1)) \
            .drop("city_mpg", "hwy_mpg")

cars = cars.withColumn('consumption', round((100 * 3.785411784) / (cars.avg_mpg * 1.609344), 2))
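# consumption is litres per 100 km: (100 km * 3.785411784 L/gal) / (avg_mpg * 1.609344 km/mile)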

pd.set_option('display.max_columns', None) # all cols
pd.set_option('display.width', 161)
pd.set_option('display.max_colwidth', 199)
#print(cars.toPandas().sample(8), '\n')

indexer = StringIndexer(inputCol='type',
                        outputCol='type_idx')
	def cal_indexs(self):
		portfolio_acret = self.portfolio_acret
		risk_free_rateline = self.risk_free_rateline
		basedata = portfolio_acret.join(risk_free_rateline,
										'trading_date') \
			.drop(risk_free_rateline.trading_date)# .dropDuplicates(['trading_date', 'com_id'])

		basedata = basedata.withColumn('order_nm_desc', F.row_number().over(self.__constants.w_h_desc)) \
			.withColumn('order_nm', F.row_number().over(self.__constants.w_h)) \
			.withColumn('date_diff',
						F.datediff(F.col('trading_date'), F.min(F.col('trading_date')).over(self.__constants.w_h))) \
			.cache()

		basedata = basedata.withColumn('init_arh',
									   F.first(F.col('accum_ret_h')).over(self.__constants.w_h)) \
			.withColumn('annl_ret_h',
						F.when(basedata.date_diff >= 30,
							   F.round(F.pow(F.col('accum_ret_h') / F.col('init_arh'),
											 242.0 / F.col('order_nm')) - 1.0, 6)) \
						.otherwise(None)) \
			.withColumn('pre1y_arh',
						F.first(F.col('accum_ret_h')).over(self.__constants.w_y)) \
			.withColumn('pre1y_odn',
						F.first(F.col('order_nm')).over(self.__constants.w_y)) \
			.withColumn('annl_ret_1y',
						F.when((basedata.date_diff >= 30) & (basedata.date_diff < 365),
							   F.round(F.pow(F.col('accum_ret_h') / F.col('pre1y_arh'),
											 242.0 / (F.col('order_nm') - F.col('pre1y_odn') + 1)) - 1.0, 6)) \
						.when(basedata.date_diff >= 365,
							  F.round(F.pow(F.col('accum_ret_h') / F.col('pre1y_arh'),
											242.0 / 242.0) - 1.0, 6))
						.otherwise(None))

		basedata = basedata.withColumn('lograte',
									   F.log(basedata.accum_ret_h / F.lag(basedata.accum_ret_h, 1, default=1.0) \
											 .over(self.__constants.w_unbnd))) \
			.withColumn('annl_std_1y',
						F.when(basedata.date_diff >= 30,
							   F.round(F.stddev(F.col('lograte')).over(self.__constants.w_y) * np.sqrt(242.0), 6)) \
						.otherwise(None)
						) \
			.withColumn('annl_std_h',
						F.when(basedata.date_diff >= 30,
							   F.round(F.stddev(F.col('lograte')).over(self.__constants.w_h) * np.sqrt(242.0), 6)) \
						.otherwise(None)
						) \
			.withColumn('drawdown',
						F.col('accum_ret_h') / F.max(F.col('accum_ret_h')).over(self.__constants.w_h) - 1.0) \
			.withColumn('max_drawdown_h',
						F.round(F.min(F.col('drawdown')).over(self.__constants.w_h), 6)) \
			.withColumn('sharp_ratio_1y',
						F.round((F.col('annl_ret_1y') - F.col('rfrate')) / F.col('annl_std_1y'), 6)) \
			.withColumn('sharp_ratio_h',
						F.round((F.col('annl_ret_h') - F.col('rfrate')) / F.col('annl_std_h'), 6)
						).filter(basedata.order_nm_desc <= 242)

		basedata = basedata.withColumn('drawdown_1y',
									   F.col('accum_ret_h') / F.max(F.col('accum_ret_h')).over(self.__constants.w_h) - 1.0) \
			.withColumn('max_drawdown_1y',
						F.round(F.min(F.col('drawdown_1y')).over(self.__constants.w_h), 6))

		init_result = basedata.select('com_id',
									  'com_name',
									  'trading_date',
									  'annl_ret_h',
									  'annl_std_h',
									  'sharp_ratio_h',
									  'max_drawdown_h',
									  'annl_ret_1y',
									  'annl_std_1y',
									  'sharp_ratio_1y',
									  'max_drawdown_1y')
		return init_result
## Some random challenges for the agg function

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count, countDistinct, round
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.getOrCreate()

df.groupby(df["FilamentType"]).agg(avg("LifeInHours")).show(truncate=False)
# withColumn will replace an existing column with the same name
df = df.withColumn("LifeInHours", col("LifeInHours").cast(DoubleType()))
# NOTE: for sum and min/max after groupBy() we don't need agg, as shown below
df.groupby("FilamentType").sum("LifeInHours").show(truncate=False)
df.groupby("FilamentType").agg(
    countDistinct("LifeInHours")).show(truncate=False)
# NOTE: for max after groupBy() we also don't need agg
df.groupby("FilamentType").max("LifeInHours").show(truncate=False)

df.groupby("FilamentType").agg(count("LifeInHours").alias("cnt")) \
    .sort(col("cnt"), ascending=False).show(truncate=False)

df.groupby(col("FilamentType")).agg(
    round(avg(col("LifeInHours"))).alias("avg_rounded")).show(truncate=False)

# Try creating table in-memory in Spark side and make use of the SQL syntax to do the same.
df.createOrReplaceTempView("bulb_table")
sql_1 = spark.sql(
    "select FilamentType, count(distinct LifeInHours) as cnt  from bulb_table group by FilamentType"
)
sql_1.show(truncate=False)

# Computing Average, with round function.
sql_2 = spark.sql(
    " select FilamentType , round(avg(LifeInHours)) as avg_life  from bulb_table "
    "group by FilamentType limit 5")
sql_2.show(truncate=False)
Example #13
    def columns(df, columns, buckets=10):
        """
        Return statistical information about a specific column in json format
        count_data_type()
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets:
        :return: json object with the
        """

        columns = parse_columns(df, columns)

        # Get just a sample to infer the column data type
        # sample_size_number = sample_size(rows_count, 95.0, 2.0)
        # fraction = sample_size_number / rows_count
        # sample = df.sample(False, fraction, seed=1)

        # Initialize Objects
        column_info = {}
        column_info['columns'] = {}

        rows_count = df.count()
        column_info['rows_count'] = rows_count

        count_dtypes = Profiler.count_data_types(df, columns)

        column_info["count_types"] = count_dtypes["count_types"]
        column_info['size'] = human_readable_bytes(df.size())

        def na(col_name):
            return F.count(
                F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name))

        def zeros(col_name):
            return F.count(F.when(F.col(col_name) == 0, col_name))

        # Cast every column to a specific type to ensure correct profiling.
        # For example, if we calculate the min or max of a string column that holds numeric values, the result will be incorrect.
        for col_name in columns:
            dtype = count_dtypes["columns"][col_name]['dtype']
            # Do not force date type conversion; we cannot trust that it is going to be representative
            if dtype in ["string", "float", "int", "bool"]:
                df = df.cols.cast(col_name, dtype)

        stats = df.cols._exprs([
            F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
            F.variance, F.approx_count_distinct, na, zeros
        ], columns)

        for col_name in columns:
            logging.info("Processing column '" + col_name + "'...")

            col_info = {}
            col_info["stats"] = {}
            column_info['columns'][col_name] = {}

            column_type = count_dtypes["columns"][col_name]['type']
            col_info['column_dtype'] = count_dtypes["columns"][col_name][
                'dtype']

            na = stats[col_name]["na"]
            max_value = stats[col_name]["max"]
            min_value = stats[col_name]["min"]

            col_info['name'] = col_name
            col_info['column_type'] = column_type

            # Numeric Column
            if column_type == "numeric" or column_type == "date":
                # Merge
                col_info["stats"] = stats[col_name]

            # Missing
            col_info['stats']['missing_count'] = round(na, 2)
            col_info['stats']['p_missing'] = round(na / rows_count * 100, 2)
            col_info["dtypes_stats"] = count_dtypes["columns"][col_name][
                'details']

            if column_type == "categorical" or column_type == "numeric" or column_type == "date" or column_type == "bool":
                # Frequency

                col_info['frequency'] = (
                    df.groupBy(col_name).count().rows.sort([
                        ("count", "desc"), (col_name, "desc")
                    ]).limit(10).withColumn(
                        "percentage",
                        F.round((F.col("count") / rows_count) * 100,
                                3)).cols.rename(col_name, "value").to_json())

                # Uniques
                uniques = stats[col_name].pop("approx_count_distinct")
                col_info['stats']["uniques_count"] = uniques
                col_info['stats']["p_uniques"] = round(
                    uniques / rows_count * 100, 3)

            if column_type == "numeric":
                # Additional Stats
                # Percentiles cannot be computed with the normal sql.functions; approxQuantile needs an extra pass in this case
                # https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function
                max_value = fast_float(max_value)
                min_value = fast_float(min_value)
                col_info['stats']['quantile'] = df.cols.percentile(
                    col_name, [0.05, 0.25, 0.5, 0.75, 0.95])
                col_info['stats']['range'] = max_value - min_value
                col_info['stats']['median'] = col_info['stats']['quantile'][
                    0.5]
                col_info['stats']['interquartile_range'] = col_info['stats']['quantile'][0.75] - \
                                                           col_info['stats']['quantile'][0.25]
                col_info['stats']['coef_variation'] = round(
                    (col_info['stats']['stddev'] / col_info['stats']['mean']),
                    5)
                col_info['stats']['mad'] = round(df.cols.mad(col_name), 5)

                col_info["hist"] = df.cols.hist(col_name, min_value, max_value,
                                                buckets)

            column_info['columns'][col_name] = col_info

        return column_info
FinalBtc = CleandfBtc.selectExpr("Cleaned_BTC_Time_New as Date_Time", "Price")
FinalBtc = FinalBtc.withColumn("Price",FinalBtc['Price'].cast(DoubleType()))
FinalBtc.show(5)  # In this cell: cast to timestamp, rename columns, and cast the price column to double


# ## Dataframes Look like this...
FinalTw.printSchema()


# In[15]:
FinalBtc.printSchema()
FinalBtc.count()


# ## Truncating timestamps to hours and then grouping them by hour
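# round(unix_timestamp / 3600) * 3600 snaps each timestamp to the nearest hour (minutes 30 and above round up)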
dt_truncated = ((round(unix_timestamp(col('Date_Time')) / 3600) * 3600).cast('timestamp'))
FinalTw = FinalTw.withColumn('dt_truncated', dt_truncated)
FinalTw = FinalTw.selectExpr("dt_truncated as Date_Time","Cleaned_Tweets","p_neg","p_neu","p_pos","p_comp")
UTC = ((unix_timestamp(col('Date_Time'))+ 5*60*60).cast('timestamp'))
FinalTw = FinalTw.withColumn('UTC', UTC)
FinalTw = FinalTw.selectExpr("UTC as Date_Time","Cleaned_Tweets","p_neg","p_neu","p_pos","p_comp")
FinalTw.show(5)


# In[17]:
FinalTw.registerTempTable("temp")
FinalTw_avg = sql.sql("SELECT Date_Time As DateTime,AVG(p_neg) as P_Neg,AVG(p_neu) as P_Neu,AVG(p_pos) as P_Pos,AVG(p_comp) as P_Comp FROM temp GROUP BY Date_Time")
#FinalTw_avg = FinalTw.select("Date_Time","polarity","subj","p_pos","p_neg").groupBy("Date_Time").agg(avg(col("polarity","subj","p_pos","p_neg")))
FinalTw_avg.show(5)

# This cell just collects all the corpus per hour (for future work)
Example #15
cvModel = crossval2.fit(trainingDataSJ)#RFR
cvModel2 = crossval2.fit(trainingDataIQ)#RFR
cvModel3 = crossval.fit(trainingDataSJ)#GLM
cvModel4 = crossval.fit(trainingDataIQ)#GLM

# COMMAND ----------

cvModelF = cvModel.transform(trainingDataSJ)#RFR
cvModel2F = cvModel2.transform(trainingDataIQ)#RFR
cvModel3F = cvModel3.transform(trainingDataSJ)#GLM
cvModel4F = cvModel4.transform(trainingDataIQ)#GLM

# COMMAND ----------

predictionsAndLabels = cvModelF.select(col("city"), col("year"), col("weekofyear"), col("label").cast("double"), round(col("prediction")))
predictionsAndLabels2 = cvModel2F.select(col("city"), col("year"), col("weekofyear"), col("label").cast("double"), round(col("prediction")))
predictionsAndLabels3 = cvModel3F.select(col("city"), col("year"), col("weekofyear"), col("label").cast("double"), round(col("prediction")))
predictionsAndLabels4 = cvModel4F.select(col("city"), col("year"), col("weekofyear"), col("label").cast("double"), round(col("prediction")))

# COMMAND ----------

display(predictionsAndLabels)

# COMMAND ----------

display(predictionsAndLabels2)

# COMMAND ----------

display(predictionsAndLabels3)
Example #16
from pyspark.sql import DataFrame
from pyspark.sql import functions as f


def get_contrib_open_source(df: DataFrame):
    df.groupBy('OpenSourcer').agg(f.round(f.count('OpenSourcer') * 100 / df.count(), 1).alias('Percentage')) \
        .orderBy('Percentage', ascending=False) \
        .show(20, False)
# Pickups/Dropoffs in Single Districts
taxi_dis_df = taxi_df.withColumnRenamed('Pickup_Count', 'Pickup_Count_Dis').withColumnRenamed('Dropoff_Count', 'Dropoff_Count_Dis').cache()

taxi_dis_1h_df = get_agg_taxi_df(taxi_dis_df, 1, index_columns, sum_aggregations('Dis', 1))
taxi_dis_4h_df = get_agg_taxi_df(taxi_dis_df, 4, index_columns, sum_aggregations('Dis', 4))


# Pickups/Dropoffs in Neighbor Districts
taxi_nb_df = sql_context.createDataFrame([], taxi_df.schema)
for i in range(-1, 2):
    for j in range(-1, 2):
        # Exclude current district
        if i == j == 0:
            continue

        tmp_df = taxi_df.withColumn('Lat', func.round(taxi_df.Lat + i * 0.01, 2))
        tmp_df = tmp_df.withColumn('Lon', func.round(taxi_df.Lon + j * 0.01, 2))
        taxi_nb_df = taxi_nb_df.unionAll(tmp_df)

taxi_nb_df = taxi_nb_df.groupby(index_columns).agg(*sum_aggregations('Nb')).cache()

taxi_nb_1h_df = get_agg_taxi_df(taxi_nb_df, 1, index_columns, sum_aggregations('Nb', 1))
taxi_nb_4h_df = get_agg_taxi_df(taxi_nb_df, 4, index_columns, sum_aggregations('Nb', 4))


# Pickups/Dropoffs in entire NYC
taxi_nyc_df = taxi_df.groupby(taxi_df.Time).agg(*sum_aggregations('Nyc')).cache()

taxi_nyc_1h_df = get_agg_taxi_df(taxi_nyc_df, 1, 'Time', sum_aggregations('Nyc', 1))
taxi_nyc_4h_df = get_agg_taxi_df(taxi_nyc_df, 4, 'Time', sum_aggregations('Nyc', 4))
Example #18
# window = Window.partitionBy('LoanID','EffectiveDate_new').orderBy('LoanID', F.asc('EffectiveDate_new'))\
#                .rowsBetween(Window.unboundedPreceding, 0)

window = Window.partitionBy('LoanID').orderBy('LoanID', 'EffectiveDate_new')\
               .rowsBetween(Window.unboundedPreceding, 0)
#pull the first date associated with each multi-payer; some pay multiple times on the same day
dupe_check = (dec_trans.withColumn(
    'duplicate_count',
    (F.count('LoanID').over(window))).filter("duplicate_count == 1"))
#get multi-payer
multi_payer = dec_trans.groupBy("LoanID").count().filter("count > 1").select(
    "LoanID")

#get the sum of all payments, then put the dec df back together
loan_pay_agg = dec_trans.groupby(['LoanID']).agg(
    F.round(F.sum('Amount'), 2).alias('PaymentReceived_sum'))

dec_agg_trans = dupe_check.join(loan_pay_agg, how='left',
                                on='LoanID').drop("duplicate_count", "Amount")

display(dupe_check.orderBy("LoanID"))

# COMMAND ----------

# COMMAND ----------

# DBTITLE 1,6008 multi-payers out of 92194 people
print(
    multi_payer.select("LoanID").distinct().count(),
    dec_trans.select("LoanID").distinct().count(), dec_trans.count(),
    dupe_check.count(), dec_agg_trans.count())
Example #19
                     StructField("date", IntegerType(), True), \
                     StructField("measure_type", StringType(), True), \
                     StructField("temperature", FloatType(), True)])

# Read the file as a dataframe
df = spark.read.schema(schema).csv("1800.csv")
df.printSchema()

# Filter out all but TMIN entries
minTemps = df.filter(df.measure_type == "TMIN")

# Select only stationID and temperature
stationTemps = minTemps.select("stationID", "temperature")

# Aggregate to find minimum temperature for every station
minTempsByStation = stationTemps.groupBy("stationID").min("temperature")
minTempsByStation.show()

# Convert temperature to fahrenheit and sort the dataset
minTempsByStationF = minTempsByStation.withColumn("temperature",
                                                  func.round(func.col("min(temperature)") * 0.1 * (9.0 / 5.0) + 32.0, 2))\
                                                  .select("stationID", "temperature").sort("temperature")

# Collect, format, and print the results
results = minTempsByStationF.collect()

for result in results:
    print(result[0] + "\t{:.2f}F".format(result[1]))

spark.stop()
Example #20
    spark = SparkSession.builder.getOrCreate()
    for file_name in list(glob.glob(str(parent_path / 'jsons' / '*.json'))):
        print(f'processing {file_name}')
        df = spark.read.json(file_name)
        print(f'{df.count()} records read from json')

        df = df.drop('data-group-quantity', 'data-energy-drink', 'data-amount', 'data-item-weight', 'data-energy-drink'). \
            withColumn('old_price', df['data-old-price'].cast(FloatType())). \
            withColumn('old_price_per_kg', df['data-old-price-per-kg'].cast(FloatType())). \
            withColumn('price', df['data-price'].cast(FloatType())). \
            withColumn('price_per_kg', df['data-price-per-kg'].cast(FloatType())). \
            withColumn('product_id', df['data-product-id'].cast(IntegerType())). \
            withColumn('type', df['data-type'].cast(StringType())). \
            withColumn('weight', df['data-weight'].cast(IntegerType()))

        calc_discount = round((1 - df.price / df.old_price) * 100, 2)

        df = df.withColumn('discount', when(df.old_price.isNull(), None).otherwise(calc_discount)). \
            select('shop', 'name', 'weight', 'price', 'old_price', 'price_per_kg',
                   'old_price_per_kg', 'discount', 'cat', 'product_id', 'type', 'datetm', 'url')
        df.sort('name').show(5)

        try:
            store = spark.read.parquet(store_name)
        except Exception as e:
            if 'Path does not exist' in str(e):
                store = spark.createDataFrame([], df.schema)
            else:
                raise e

        store_count = store.count()
import pandas as pd
from pydataset import data
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

languages = pd.DataFrame({
    'language':
    ['ruby', 'python', 'java', 'scala', 'haskell', 'go', 'clojure', 'c++']
})

df = spark.createDataFrame(languages)
df.printSchema()  # shows schema
print((df.count(), len(df.columns)))
df.show(5)

mpg = data('mpg')
mpg = spark.createDataFrame(mpg)

mpg.select(
    F.concat(F.lit('The '), mpg.year, F.lit(' '), mpg.manufacturer,
             F.lit(' '), mpg.model, F.lit(' has a '), mpg.cyl,
             F.lit(' cylinder engine.'))).show(truncate=False)

mpg.select(F.when(mpg.trans.startswith('auto'),
                  'auto').otherwise('manual')).show()

tips = data('tips')
tips = spark.createDataFrame(tips)

tips.filter(tips.smoker == 'Yes').count() / tips.count()

tips = tips.withColumn('percent', F.round(tips.tip / tips.total_bill, 2))

tips.groupBy(tips.sex, tips.smoker).agg(F.avg(tips.percent)).show()
Example #22
File: BDM_HW3.py  Project: Ram-SN/BDM_HW3
df.createOrReplaceTempView('df')

test = df.select('Date received', 'Product', 'Company')

res1 = test.select(year('Date received').alias('year'), 'Product', 'Company')

res1 = res1.orderBy('Product', 'year')

res2 = res1.groupBy('year', 'Product',
                    'Company').agg(func.count('Product').alias('Count_comp'))

res3 = res2.groupBy('year', 'Product').agg(
    func.sum('Count_comp').alias('Total_Complaints'),
    func.countDistinct('Company').alias('Total_Companies'),
    func.max('Count_comp').alias('maximum'))

res3 = res3.filter(res3.Total_Complaints >= 1)

res4 = res3.withColumn(
    'Percentage',
    func.round(func.col('maximum') / func.col('Total_Complaints') * 100))

res4 = res4.drop(res4.maximum).sort('Product', 'year')

res4 = res4.withColumn("Product", func.lower(func.col("Product")))

res4 = res4.select('Product', 'year', 'Total_Complaints', 'Total_Companies',
                   'Percentage')

res4.write.csv(output_file)
Example #23
from pyspark.sql import SparkSession
from pyspark.sql.functions import round
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder \
                    .master('local[*]') \
                    .appName('first_spark_application') \
                    .getOrCreate()

cars = spark.read.csv(
    '/Users/wel51x/Box Sync/MyBox/Code/DataCamp/data/cars.csv',
    sep=',',
    header=True,
    inferSchema=True,
    nullValue='NA')

# Get number of records
print("The data contains %d records." % cars.count(), '\n')

cars = cars.withColumnRenamed("ncyl", "cyl")
cars = cars.withColumn('length_meters', round(cars.length * 0.0254, 3))

cars = cars.withColumn('weight_kg', round(cars.weight / 2.205, 0))

print("Cars with null cyl", cars.filter('cyl IS NULL').count(), '\n')

indexer = StringIndexer(inputCol='type', outputCol='type_idx')
# Assign index values to strings
indexer = indexer.fit(cars)
# Create column with index values
cars = indexer.transform(cars)

#print(cars.toPandas().sample(12))

print(indexer)
# View the first five records
Example #24
def app_open(df):
    """ 应用开启报表 """
    #分析数据
    df.createOrReplaceTempView("v_df")
    sql_0 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_0}" group by package_id,title,source,site,fsk_cid  grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_0=str_dt_0)
    sql_1 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_1}" group by package_id,title,source,site,fsk_cid  grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_1=str_dt_1)
    sql_7 = """select package_id,title,source,site,fsk_cid,grouping_id() id_1,count(custom_uuid) playNum,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum from v_df where date="{date_7}" group by package_id,title,source,site,fsk_cid  grouping sets((package_id,title,source,site,fsk_cid),()) """.format(
        date_7=str_dt_7)
    spark.sql("show databases")
    spark.sql("use sharp")
    df_cube_0 = spark.sql(sql_0)
    df_cube_1 = spark.sql(sql_1)
    df_cube_7 = spark.sql(sql_7)

    ## Join conditions for the day-over-day and week-over-week comparisons
    condition_0 = (F.coalesce(F.col("t_0.package_id"),
                              F.lit("123")) == F.coalesce(
                                  F.col("t_1.package_id"), F.lit("123")))
    condition_1 = (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(
        F.col("t_1.title"), F.lit("123")))
    condition_2 = (F.coalesce(F.col("t_0.source"), F.lit("123")) == F.coalesce(
        F.col("t_1.source"), F.lit("123")))
    condition_3 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(
        F.col("t_1.site"), F.lit("123")))
    condition_4 = (F.coalesce(F.col("t_0.fsk_cid"),
                              F.lit("123")) == F.coalesce(
                                  F.col("t_1.fsk_cid"), F.lit("123")))
    condition_5 = (F.col("t_0.id_1") == F.col("t_1.id_1"))
    condition_6 = (F.coalesce(F.col("t_0.package_id"),
                              F.lit("123")) == F.coalesce(
                                  F.col("t_7.package_id"), F.lit("123")))
    condition_7 = (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(
        F.col("t_7.title"), F.lit("123")))
    condition_8 = (F.coalesce(F.col("t_0.source"), F.lit("123")) == F.coalesce(
        F.col("t_7.source"), F.lit("123")))
    condition_9 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(
        F.col("t_7.site"), F.lit("123")))
    condition_10 = (F.coalesce(F.col("t_0.fsk_cid"),
                               F.lit("123")) == F.coalesce(
                                   F.col("t_7.fsk_cid"), F.lit("123")))
    condition_11 = (F.col("t_0.id_1") == F.col("t_7.id_1"))

    ## Day-over-day join condition
    conditions_0_1 = condition_0 & condition_1 & condition_2 & condition_3 & condition_4 & condition_5
    ## Week-over-week join condition
    conditions_0_7 = condition_6 & condition_7 & condition_8 & condition_9 & condition_10 & condition_11

    ## Final report
    app_report = df_cube_0.alias("t_0").join(df_cube_1.alias("t_1"),conditions_0_1,"left_outer") \
                                       .join(df_cube_7.alias("t_7"),conditions_0_7,"left_outer") \
                                       .select(F.regexp_replace(F.lit(str_dt_0),"-","").cast("int").alias("date"),F.col("t_0.package_id").alias("appId"),F.col("t_0.title").alias("appName"),F.col("t_0.source").alias("appSource"),F.col("t_0.site").alias("channelName"),F.col("t_0.fsk_cid").alias("typeName"),F.col("t_0.id_1").alias("id_1"), \
                                             F.col("t_0.playNum").alias("totalPlayCount"),F.concat(F.round((F.col("t_0.playNum")/F.col("t_1.playNum")-1)*100,2),F.lit("%")).alias("playCountCompareDay"),F.concat(F.round((F.col("t_0.playNum")/F.col("t_7.playNum")-1)*100,2),F.lit("%")).alias("playCountCompareWeek"), \
                                             F.col("t_0.users").alias("totalUserCount"),F.concat(F.round((F.col("t_0.users")/F.col("t_1.users")-1)*100,2),F.lit("%")).alias("userCountCompareDay"),F.concat(F.round((F.col("t_0.users")/F.col("t_7.users")-1)*100,2),F.lit("%")).alias("userCountCompareWeek"), \
                                             F.col("t_0.avgPlayNum").alias("averagePlayCount"),F.concat(F.round((F.col("t_0.avgPlayNum")/F.col("t_1.avgPlayNum")-1)*100,2),F.lit("%")).alias("avgPlayCountCompareDay"),F.concat(F.round((F.col("t_0.avgPlayNum")/F.col("t_7.avgPlayNum")-1)*100,2),F.lit("%")).alias("avgPlayCountCompareWeek"))

    return app_report
# Column manipulation
# The Federal Aviation Administration (FAA) considers a flight to be "delayed" when it arrives 15 minutes or more after its scheduled time.

# The next step of preparing the flight data has two parts:

# convert the units of distance, replacing the mile column with a km column; and
# create a Boolean column indicating whether or not a flight was delayed.
# Instructions
# Import a function which will allow you to round a number to a specific number of decimal places.
# Derive a new km column from the mile column, rounding to zero decimal places. One mile is 1.60934 km.
# Remove the mile column.
# Create a label column with a value of 1 indicating the delay was 15 minutes or more and 0 otherwise.

# Import the required function
from pyspark.sql.functions import round

# Convert 'mile' to 'km' and drop 'mile' column
flights_km = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                    .drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0)
flights_km = flights_km.withColumn('label',
                                   (flights.delay >= 15).cast('integer'))

# Check first five records
flights_km.show(5)
Example #26
data_apr = data_apr.select(data_apr.FL_DATE, data_apr.OP_UNIQUE_CARRIER, data_apr.OP_CARRIER_FL_NUM, data_apr.DEP_DELAY)
data_may = data_may.select(data_may.FL_DATE, data_may.OP_UNIQUE_CARRIER, data_may.OP_CARRIER_FL_NUM, data_may.DEP_DELAY)
data_june = data_june.select(data_june.FL_DATE, data_june.OP_UNIQUE_CARRIER, data_june.OP_CARRIER_FL_NUM, data_june.DEP_DELAY)
data_july = data_july.select(data_july.FL_DATE, data_july.OP_UNIQUE_CARRIER, data_july.OP_CARRIER_FL_NUM, data_july.DEP_DELAY)
data_aug = data_aug.select(data_aug.FL_DATE, data_aug.OP_UNIQUE_CARRIER, data_aug.OP_CARRIER_FL_NUM, data_aug.DEP_DELAY)
data_sep = data_sep.select(data_sep.FL_DATE, data_sep.OP_UNIQUE_CARRIER, data_sep.OP_CARRIER_FL_NUM, data_sep.DEP_DELAY)
data_oct = data_oct.select(data_oct.FL_DATE, data_oct.OP_UNIQUE_CARRIER, data_oct.OP_CARRIER_FL_NUM, data_oct.DEP_DELAY)
data_nov = data_nov.select(data_nov.FL_DATE, data_nov.OP_UNIQUE_CARRIER, data_nov.OP_CARRIER_FL_NUM, data_nov.DEP_DELAY)
data_dec = data_dec.select(data_dec.FL_DATE, data_dec.OP_UNIQUE_CARRIER, data_dec.OP_CARRIER_FL_NUM, data_dec.DEP_DELAY)

from functools import reduce

from pyspark.sql import DataFrame


def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

final_csv_file = unionAll(*[data_jan, data_feb, data_mar,data_apr,data_may,data_june,data_july,data_aug,data_sep,data_oct,data_nov,data_dec])

overall_airlines_perf = final_csv_file.groupBy([final_csv_file.OP_UNIQUE_CARRIER.alias('CARRIER')]).agg(F.round(F.avg('DEP_DELAY'), 0).alias('AVERAGE_DELAY'))
print("Overall flight operator performance, best performers on top: \n")
overall_airlines_perf.sort('AVERAGE_DELAY', ascending=True).show()

flight_perf_per_airlines = final_csv_file.groupBy([final_csv_file.OP_UNIQUE_CARRIER, final_csv_file.OP_CARRIER_FL_NUM]).agg(F.round(F.avg('DEP_DELAY'), 0).alias('Expected_delay'), F.min('DEP_DELAY'), F.max('DEP_DELAY'))
condition_on_join = [df_filter.actual_carrierCode == flight_perf_per_airlines.OP_UNIQUE_CARRIER, df_filter.actual_flight_number == flight_perf_per_airlines.OP_CARRIER_FL_NUM]

final_df = df_filter.join(flight_perf_per_airlines, condition_on_join, 'inner' ).select(df_filter.actual_iatacode_origin.alias('ORIGIN'), \
                                                                          df_filter.actual_at_origin.alias('DEP_TIME'), \
                                                                          df_filter.actual_iatacode_destination.alias('DESTINATION'), \
                                                                          df_filter.actual_at_destination.alias('ARR_TIME'), \
                                                                          df_filter.actual_carrierCode.alias('CARRIER'), \
                                                                          df_filter.actual_flight_number.alias('FL_NUM'), \
                                                                          flight_perf_per_airlines.Expected_delay.alias('EXPECTED_DELAY'))
print("Below are the flights info from your chosen origin, Expected delay in minutes: \n")
final_df.show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnan, round, lit

if __name__ == "__main__":
    dir0 = '/home/cloudera/2.kkbox_churn/data01/big_table_01/has-expired_auto-renew-0/'
    subdir = 'last-1-week_has-record/'
    infile = dir0 + subdir + '00.data'
    outfile = dir0 + subdir + '01.added_features'

    ##
    spark = SparkSession.builder.getOrCreate()
    df0 = spark.read.format('parquet').load(infile)

    df1 = df0.withColumn('last1WeekOnLine_D_last12WeekOnLine',
                         round(col('last1WeekOnLine')/col('last12WeekOnLine'), 2)) \
             .withColumn('w1_Mtotal_D_w12_Mtotal',
                         round(col('w1_Mtotal')/col('w12_Mtotal'), 2))
    
    ##
    df1.write.format('parquet').save(outfile)
    
    
totalcovictions

# In[132]:

monthlyGroupeddf.withColumn(
    'percent', (monthlyGroupeddf.total / totalcovictions) * 100).show()

# In[137]:

import pyspark.sql.functions as func

# In[149]:

updateddf = monthlyGroupeddf.withColumn(
    'percent', func.round((monthlyGroupeddf.total / totalcovictions) * 100, 2))
updateddf.printSchema()
updateddf.show()

# In[150]:

#other aggregations
## convictions based on category in london
df.show()

# In[156]:

df.groupBy('major_category').agg({
    'value': 'sum'
}).withColumnRenamed('sum(value)', 'totalValue').orderBy('totalValue').show()
Example #29
# aggregate by adding values and increment count each time
rddAvgP = rddPenalty.map( lambda x: (x[0], x[2]))\
    .aggregateByKey((0.0,0.0),\
    (lambda x, newVal: ((x[0] + float(newVal)), (x[1] + 1))),\
    (lambda rdd1, rdd2: (rdd1[0] + rdd2[0], rdd1[1] + rdd2[1])))
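# each aggregated value is a (running_sum, running_count) pair; mapValues below divides them to get the average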

# we are penalizing a small amount based on the number of quality measures
rddAvgP = rddAvgP.mapValues(lambda x: round((x[0] / (x[1])), 5))

# break the columns apart after the joins
rddFinal = rddAvgQ.join(rddAvgP).join(rddStdDev).map(
    lambda x: (x[0], x[1][0][0], x[1][0][1], x[1][1]))

# build final dataframes
dfFinal = rddFinal.toDF( ["ProviderID", "QualityScore", "Penalty", "StandardDeviation"])\
          .withColumn("FinalScore", F.round(F.col("QualityScore") - F.col("Penalty"), 5))

dfShow = dfFinal.join( dfHospitals, dfHospitals.id == dfFinal.ProviderID)\
    .select("ProviderID", "name", "state", "rating", "QualityScore", "Penalty", "StandardDeviation", "FinalScore")\
    .sort("FinalScore", ascending = False)

# save this score table for question 4
dfShow.write.parquet("/user/w205/hospital_compare/hospitalQualParquet")

dfShowRank = dfShow.rdd.zipWithIndex().map(lambda x: (x[1] + 1, x[0][0], x[0][1], x[0][2], x[0][3], x[0][4], x[0][5], x[0][6], x[0][7]))\
    .toDF().select(F.col("_1").alias("Rank"), F.col("_2").alias("ProviderID"), F.col("_3").alias("name"), F.col("_4").alias("state"),\
                       F.col("_5").cast("int").alias("rating"), F.col("_6").alias("QualityScore"), F.col("_7").alias("Penalty"),\
                       F.col("_8").alias("StandardDeviation"), F.col("_9").alias("FinalScore"))\
    .show(10, False)
Example #30
def _binary_clf_curve(labelAndVectorisedScores, rawPredictionCol, labelCol):

    # sort the dataframe by pred column in descending order
    localPosProbCol = "pos_probability"
    labelAndPositiveProb = labelAndVectorisedScores.select(
        labelCol,
        getitem(1)(rawPredictionCol).alias(localPosProbCol))

    # round the fractional prediction column
    labelAndPositiveProb = labelAndPositiveProb\
        .withColumn("_tmp_pred", F.round(localPosProbCol, 3))\
        .drop(localPosProbCol)\
        .withColumnRenamed("_tmp_pred", localPosProbCol)\
        .sort(F.desc(localPosProbCol))

    # adding index to the dataframe
    sortedScoresAndLabels = labelAndPositiveProb.rdd.zipWithIndex() \
        .toDF(['data', 'index']) \
        .select('data.' + labelCol, 'data.' + localPosProbCol, "index")

    groupSumLabelCol = "group_sum_labels"
    groupMaxIndexCol = "group_max_indices"
    sortedScoresAndLabels = sortedScoresAndLabels\
        .groupBy([localPosProbCol, labelCol])\
        .agg(F.sum(labelCol).alias(groupSumLabelCol), F.max("index").alias(groupMaxIndexCol))

    # sortedScoresAndLabels = labelAndPositiveProb.sort(F.desc(localPosProbCol))

    # creating rank for pred column
    lookup = (sortedScoresAndLabels.select(localPosProbCol).distinct().sort(
        F.desc(localPosProbCol)).rdd.zipWithIndex().map(
            lambda x: x[0] + (x[1], )).toDF([localPosProbCol, "rank"]))

    # join the dataframe with lookup to assign the ranks
    sortedScoresAndLabels = sortedScoresAndLabels.join(lookup,
                                                       [localPosProbCol])

    # sorting in descending order based on the pred column
    sortedScoresAndLabels = sortedScoresAndLabels.sort(groupMaxIndexCol)

    # saving the dataframe to temporary table
    sortedScoresAndLabels.registerTempTable("processeddata")

    # TODO: avoid the partition-by warning (the ORDER BY window below moves all data to a single partition) and spread the work across cluster nodes
    # creating the cumulative sum for tps
    sortedScoresAndLabelsCumSum = labelAndVectorisedScores.sql_ctx \
        .sql(
        "SELECT " + labelCol + ", " + localPosProbCol + ", " + groupSumLabelCol + ", rank, " + groupMaxIndexCol + ", sum(" + groupSumLabelCol + ") OVER (ORDER BY " + groupMaxIndexCol + ") as tps FROM processeddata ")

    # repartitioning
    sortedScoresAndLabelsCumSum = sortedScoresAndLabelsCumSum.coalesce(
        partition_size)

    # # cache after partitioning
    sortedScoresAndLabelsCumSum.cache()

    # retain only the group-wise (according to threshold) max tps

    df_max_tps_in_group = sortedScoresAndLabelsCumSum.groupBy(
        localPosProbCol).agg(F.max("tps").alias("max_tps"))
    dup_removed_scores_labels = \
        sortedScoresAndLabelsCumSum.join(
            df_max_tps_in_group,
            [sortedScoresAndLabelsCumSum[localPosProbCol] == df_max_tps_in_group[localPosProbCol],
             sortedScoresAndLabelsCumSum["tps"] == df_max_tps_in_group["max_tps"]],
            how="right_outer"
        )\
        .drop(df_max_tps_in_group[localPosProbCol])\
        .drop(df_max_tps_in_group["max_tps"])\
        .groupBy([localPosProbCol, "tps"])\
        .agg(F.max(groupMaxIndexCol).alias("max_index"))

    # creating the fps column based on rank and tps column
    df_with_fps = dup_removed_scores_labels \
        .withColumn("fps", 1 + F.col("max_index") - F.col("tps"))

    return df_with_fps
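
# A hedged follow-up sketch (an assumption, not part of the original source): the "tps"/"fps"
# columns returned by _binary_clf_curve can be turned into ROC coordinates by dividing by the
# total number of positives and negatives.
def _roc_points(clf_curve_df, total_positives, total_negatives):
    # tpr = tps / P, fpr = fps / N, rounded for compact display
    return clf_curve_df.select(
        F.round(F.col("tps") / total_positives, 3).alias("tpr"),
        F.round(F.col("fps") / total_negatives, 3).alias("fpr"))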
Example #31
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName("FriendsByAge").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

lines = spark.read.option("header", "true").option("inferSchema", "true")\
    .csv("file:///opt/bitnami/spark/datasets/fakefriends-header.csv")

# Select only age and numFriends columns
friendsByAge = lines.select("age", "friends")

# From friendsByAge we group by "age" and then compute average
friendsByAge.groupBy("age").avg("friends").show()

# Sorted
friendsByAge.groupBy("age").avg("friends").sort("age").show()

# Formatted more nicely
friendsByAge.groupBy("age").agg(func.round(func.avg("friends"),
                                           2)).sort("age").show()

# With a custom column name
friendsByAge.groupBy("age").agg(
    func.round(func.avg("friends"),
               2).alias("friends_avg")).sort("age").show()

spark.stop()
def process_dataset(df):
    '''
    Function for preparation of dataset for machine learning
    INPUT:
    df - initial dataset loaded from json file
    
    OUTPUT:
    df_ft - new dataset prepared for machine learning
    contains the following columns:
    1. userId - initial id of the user
    2. gender - user's gender
    3. avg_events - average number of events per day for the user
    4. avg_songs - average number of songs the user listens to per day
    5. thumbs_up - number of thumbs up events
    6. thumbs_down - number of thumbs down events
    7. active_days - days since the user's first event
    8. last_location - location of the last event
    9. last_level - user's last level (paid or free)
    10. addfriends - number of add friends events
    '''
    
    # clean dataset using clean_data function
    df = clean_data(df)
    
    # define cancellation udf
    cancellation_event = udf(lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())
    
    # set churn = 1 for rows where page == 'Cancellation Confirmation'
    df = df.withColumn("churn", cancellation_event("page"))
    
    
    # get userId with churn == 1
    cancelled_users = df.select(['userId', 'churn']).where(df.churn == 1).groupby('userId').count().toPandas()['userId'].values
    
    # create udf, which sets churn of a row to 1 if userId is in cancelled_users list
    def replace_data(userId, features):
        if(userId in cancelled_users): return 1
        else : return 0
    
    # set churn == 1 for all rows for users who cancelled their subscription
    fill_array_udf = udf(replace_data, IntegerType())
    df = df.withColumn("churn", fill_array_udf(col("userId"), col("churn")))
        
    # set column last ts with the first and the last event timestamp
    w = Window.partitionBy('userId')
    df = df.withColumn('last_ts', max('ts').over(w))
    df = df.withColumn('first_ts', min('ts').over(w))
    
    # convert timestamp to date (string)
    def get_date(ts):
        return str(datetime.utcfromtimestamp(ts / 1000).strftime('%Y-%m-%d'))
    
    get_date_from_ts_udf = udf(get_date, StringType())
    df = df.withColumn('last_date', get_date_from_ts_udf(col('last_ts')))
    df = df.withColumn('first_date', get_date_from_ts_udf(col('first_ts')))
    
    # add column date and convert timetamp to date
    df = df.withColumn('date', get_date_from_ts_udf(col('ts')))
    
    # set column last_level to level when timestamp is last timestamp
    df = df.withColumn('last_level', when(df.last_ts == df.ts, df.level))
    
    # additional feature: Gender
    # flag_gender = udf(lambda x: 1 if x == 'M' else 0, IntegerType())
    # gender = df.select("userId", "gender").dropDuplicates()
    # gender = df.withColumn("gender", flag_gender("gender"))
    
    # create column avg_songs to calculate average number of events per day
    w = Window.partitionBy('userId', 'date')
    events = df.select('userId', 'date', count('userId').over(w).alias('events')).distinct()
    w = Window.partitionBy('userId')
    events = events.withColumn('avg_events', avg('events').over(w))
    events = events.select(col("userId").alias("events_userId"), 'avg_events')
    events = events.withColumn("avg_events", round(events["avg_events"], 2))
    
    # create column avg_songs to calculate average number of songs per day
    w = Window.partitionBy('userId', 'date')
    songs = df.where(df.page == 'NextSong').select('userId', 'date', count('userId').over(w).alias('songs')).distinct()
    w = Window.partitionBy('userId')
    songs = songs.withColumn('avg_songs', avg('songs').over(w))
    songs = songs.select(col("userId").alias("songs_userId"), 'avg_songs')
    songs = songs.withColumn("avg_songs", round(songs["avg_songs"], 2))
    
    # calculate number of thumbs up for a user
    w = Window.partitionBy('userId')
    thumbsup = df.where(df.page == 'Thumbs Up').select('userId', count('userId').over(w).alias('thumbs_up')).distinct()
    thumbsup = thumbsup.select(col("userId").alias("thumbsup_userId"), 'thumbs_up')
    
    # calculate number of thumbs down for a user
    w = Window.partitionBy('userId')
    thumbsdown = df.where(df.page == 'Thumbs Down').select('userId', count('userId').over(w).alias('thumbs_down')).distinct()
    thumbsdown = thumbsdown.select(col("userId").alias("thumbsdown_userId"), 'thumbs_down')
    
    # calculate days since the date of the first event
    df = df.withColumn("days_active", 
              datediff(to_date(lit(datetime.now().strftime("%Y-%m-%d %H:%M"))),
                       to_date("first_date","yyyy-MM-dd")))
        
    # add column with state of the event based on location column
    def get_state(location):
        location = location.split(',')[-1].strip()
        if (len(location) > 2):
            location = location.split('-')[-1].strip()
    
        return location
    
    state_udf = udf(get_state, StringType())
    df = df.withColumn('state', state_udf(col('location')))
    
    #add column with last location of the user
    df = df.withColumn('last_state',when(df.last_ts == df.ts, df.state))
    
    # calculate number of add friends for a user
    w = Window.partitionBy('userId')
    addfriend = df.where(df.page == 'Add Friend').select('userId', count('userId').over(w).alias('addfriend')).distinct()
    addfriend = addfriend.select(col("userId").alias("addfriend_userId"), 'addfriend')
    
    # merge all results together
    df_ft = df.select('userId', 'gender', 'churn', 'last_level', 'days_active', 'last_state')\
    .dropna().drop_duplicates()
    
    df_ft = df_ft.join(songs, df_ft.userId == songs.songs_userId).distinct()
    df_ft = df_ft.join(events, df_ft.userId == events.events_userId).distinct()
    df_ft = df_ft.join(thumbsup, df_ft.userId == thumbsup.thumbsup_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['thumbs_up'])
    df_ft = df_ft.join(thumbsdown, df_ft.userId == thumbsdown.thumbsdown_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['thumbs_down'])
    df_ft = df_ft.join(addfriend, df_ft.userId == addfriend.addfriend_userId, how='left').distinct()
    df_ft = df_ft.fillna(0, subset=['addfriend'])
    df_ft = df_ft.drop('songs_userId','events_userId', 'thumbsup_userId', 'thumbsdown_userId', 'addfriend_userId')
    
    return df, df_ft
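
# Hypothetical usage sketch (the file name is an assumption, not from the source):
# raw_df = spark.read.json("mini_sparkify_event_data.json")
# df_events, df_features = process_dataset(raw_df)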
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)


# COMMAND ----------

df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)


# COMMAND ----------

from pyspark.sql.functions import corr
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()


# COMMAND ----------

df.describe().show()


# COMMAND ----------
Example #34
d1 = spark.read.option("header", "true") \
    .option("sep", ",").option("inferSchema", True) \
    .option("mode", "DROPMALFORMED") \
    .csv("file:///Users/beginspark/Temp/data2.csv")

d2 = d1.toDF("year", "month", "road", "avr_traffic_month", "avr_velo_month", "mon", "tue", "wed", "thu", "fri", "sat",
             "sun")

# Check the data
d2.printSchema()

# Remove null values
d3 = d2.where("avr_velo_month is not null")

# Average speed per road
d4 = d3.groupBy("road").agg(functions.round(functions.avg("avr_velo_month"), 1).alias("avr_velo_total"))
d5 = d3.join(d4, ["road"])

# Assign labels
d6 = d5.withColumn("label", label(d5.avr_velo_month, d5.avr_velo_total).cast("double"))
d6.select("road", "avr_velo_month", "avr_velo_total", "label").show(5, False)
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")

assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu", "fri", "sat", "sun"],
                            outputCol="features")