from pyspark.sql import SparkSession

from lib.logger import Log4j


def main():
    spark = SparkSession \
        .builder \
        .appName("Hello Spark") \
        .master("local[3]") \
        .getOrCreate()

    logger = Log4j(spark)
    print(logger)

    logger.info("Starting Hello Spark program")
    logger.info("Finished Hello Spark program")

    spark.stop()


if __name__ == "__main__":
    main()
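# The scripts in this project build their logger through lib/logger.py, which
# is not shown in these snippets. The class below is only a sketch of what
# that wrapper is assumed to look like: it fetches the JVM-side Log4j logger
# through the SparkSession and exposes thin info/warn/error/debug methods.
# The root logger name "spark.examples" is an illustrative assumption.
class Log4j:
    def __init__(self, spark):
        if spark is None:
            # The recipe ETL module constructs Log4j(None) as a placeholder,
            # so the wrapper is assumed to tolerate a missing SparkSession.
            self.logger = None
            return
        log4j = spark._jvm.org.apache.log4j
        conf = spark.sparkContext.getConf()
        app_name = conf.get("spark.app.name")
        self.logger = log4j.LogManager.getLogger("spark.examples." + app_name)

    def info(self, message):
        self.logger.info(message)

    def warn(self, message):
        self.logger.warn(message)

    def error(self, message):
        self.logger.error(message)

    def debug(self, message):
        self.logger.debug(message)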
def process(spark, input_file, output_path):
    # This method holds the Spark transformations required to solve the
    # problem statement.
    global logger
    logger = Log4j(spark)  # set the global logger

    if input_file is not None:
        input_df = spark.read.json(str(input_file))
    else:
        # Read the data from the REST API. Reading the file over REST is not
        # recommended in a production environment.
        json_response = requests.get(
            "https://s3-eu-west-1.amazonaws.com/dwh-test-resources/recipes.json"
        )
        input_df = spark.createDataFrame(
            data=[json.loads(line) for line in json_response.iter_lines()])

    # Register the UDFs
    udf_get_seconds_from_duration = udf(get_seconds_from_duration, LongType())
    udf_get_duration_from_seconds = udf(get_duration_from_seconds)

    # Clean the input data by converting the durations to seconds and the
    # date field to DateType.
    clean_df = input_df \
        .withColumn("prepSeconds", udf_get_seconds_from_duration("prepTime")) \
        .withColumn("cookSeconds", udf_get_seconds_from_duration("cookTime")) \
        .withColumn("datePublished", to_date(col("datePublished"))) \
        .persist()  # persist is not needed here, as only a single action reads clean_df

    # Task 2
    output_df = clean_df \
        .where("cookSeconds > 0 and prepSeconds > 0") \
        .selectExpr("cookSeconds + prepSeconds as totalSeconds") \
        .withColumn("difficulty",
                    when(col("totalSeconds") < 60 * 30, lit("easy"))
                    .otherwise(when(col("totalSeconds") > 60 * 30, lit("hard"))
                               .otherwise(lit("medium")))) \
        .groupBy("difficulty").avg() \
        .withColumnRenamed("avg(totalSeconds)", "avg_total_cooking_seconds") \
        .withColumn("avg_total_cooking_time",
                    udf_get_duration_from_seconds("avg_total_cooking_seconds")) \
        .drop("avg_total_cooking_seconds") \
        .persist()

    # output_df.coalesce(1).write.mode("overwrite").csv("../output")
    # toPandas() is not recommended when the data is large, and the relative
    # output path above is also not recommended.
    output_df.coalesce(1).toPandas().to_csv(output_path, index=False)
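# process() assumes two module-level helpers, get_seconds_from_duration and
# get_duration_from_seconds, which are registered as UDFs above but whose
# bodies are not shown. The sketch below is an assumption: since the module
# imports isodate, the durations are taken to be ISO-8601 strings ("PT30M");
# the exact implementations in the source project may differ.
import datetime

import isodate


def get_seconds_from_duration(duration):
    # Convert an ISO-8601 duration string (e.g. "PT1H30M") to whole seconds.
    if not duration:
        return 0
    try:
        return int(isodate.parse_duration(duration).total_seconds())
    except isodate.ISO8601Error:
        return 0


def get_duration_from_seconds(seconds):
    # Convert a number of seconds back into an ISO-8601 duration string.
    return isodate.duration_isoformat(datetime.timedelta(seconds=float(seconds)))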
import os

from pyspark.sql import SparkSession

from lib.logger import Log4j

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Shuffle Join Demo") \
        .master("local[3]") \
        .getOrCreate()

    logger = Log4j(spark)

    d1 = os.path.expanduser("~/Spark-Programming-In-Python-master/data/d1/")
    d2 = os.path.expanduser("~/Spark-Programming-In-Python-master/data/d2/")

    flight_time_df1 = spark.read.json(d1)
    flight_time_df2 = spark.read.json(d2)

    spark.conf.set("spark.sql.shuffle.partitions", 3)

    join_expr = flight_time_df1.id == flight_time_df2.id
    join_df = flight_time_df1.join(flight_time_df2, join_expr, "inner")

    join_df.collect()
    join_df.show()

    input("press a key to stop...")
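# For contrast with the shuffle join demonstrated above, the same join can be
# nudged into a broadcast hash join by wrapping the smaller DataFrame in
# broadcast(), which avoids the shuffle entirely. This sketch reuses
# flight_time_df1, flight_time_df2 and join_expr from the demo above; it is an
# illustration of the alternative plan, not part of the original demo.
from pyspark.sql.functions import broadcast

broadcast_join_df = flight_time_df1.join(broadcast(flight_time_df2), join_expr, "inner")
broadcast_join_df.explain()  # the plan should show BroadcastHashJoin instead of SortMergeJoin
broadcast_join_df.show()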
import datetime
import json
import re

import isodate
import requests
from pyspark.sql.functions import udf, when, col, lit, to_date
from pyspark.sql.types import LongType

from lib.logger import Log4j

__author__ = 'Sumeet Gupta'

# Module-level placeholder logger; it is re-initialised with the live
# SparkSession at the start of process() above.
logger: Log4j = Log4j(None)
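# The entry point that drives process() is not included in these snippets.
# The driver below is a hypothetical sketch: the "Recipe ETL" app name, the
# local[3] master and the command-line handling are all assumptions, chosen
# only to show how the SparkSession and process() fit together.
import sys

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Recipe ETL") \
        .master("local[3]") \
        .getOrCreate()

    # Assumed usage: recipe_etl.py [input_json] <output_csv>
    if len(sys.argv) < 2:
        print("Usage: recipe_etl.py [input_json] <output_csv>")
        sys.exit(-1)

    input_file = sys.argv[1] if len(sys.argv) > 2 else None
    output_path = sys.argv[-1]

    process(spark, input_file, output_path)
    spark.stop()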
import sys

from pyspark.sql import *

from lib.logger import Log4j
from lib.utils import *

if __name__ == "__main__":
    conf = get_spark_app_config()

    spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .appName("HelloSpark") \
        .master("local[2]") \
        .getOrCreate()
    print(spark)

    logger = Log4j(spark)  # create a logger

    print(sys.argv)
    if len(sys.argv) != 2:
        logger.error("Usage: HelloSpark <filename>")
        sys.exit(-1)

    logger.info("Starting HelloSpark")

    survey_raw_df = load_survey_df(spark, sys.argv[1])
    partitioned_survey_df = survey_raw_df.repartition(2)
    count_df = count_by_country(partitioned_survey_df)
    count_df.show()

    logger.info("Finished HelloSpark")
    spark.stop()
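# HelloSpark depends on helpers from lib/utils.py (get_spark_app_config,
# load_survey_df, count_by_country) that are not shown in these snippets.
# The sketch below is an assumption about what they do: the "spark.conf" file
# name, its "SPARK_APP_CONFIGS" section, the CSV format of the survey file and
# the Age/Country columns are all illustrative, not taken from the project.
import configparser

from pyspark import SparkConf


def get_spark_app_config():
    # Load key/value pairs from a spark.conf file into a SparkConf object.
    spark_conf = SparkConf()
    config = configparser.ConfigParser()
    config.read("spark.conf")
    for key, val in config.items("SPARK_APP_CONFIGS"):
        spark_conf.set(key, val)
    return spark_conf


def load_survey_df(spark, data_file):
    # Read the survey CSV with a header row, letting Spark infer the schema.
    return spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(data_file)


def count_by_country(survey_df):
    # Count survey respondents under 40, grouped by country.
    return survey_df \
        .where("Age < 40") \
        .select("Age", "Gender", "Country", "state") \
        .groupBy("Country") \
        .count()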