def create_row_output(original_file, modified_file, source_file=[], target_file=[]):
    """
    :param original_file: label for the original file
    :param modified_file: label for the modified file
    :param source_file: list of paths for source files
    :param target_file: list of paths for target files
    :return: DataFrame combining the row counts of the source and target files,
             labelled by a "Type" column, for comparison purposes.
    """
    try:
        df_original = get_row_count(*source_file).withColumn(
            "Type", F.format_string(original_file))
        df_modified = get_row_count(*target_file).withColumn(
            "Type", F.format_string(modified_file))
        combined = df_original.union(df_modified)
        # Move the "Type" column to the front for readability.
        cols = list(combined.columns)
        cols = [cols[-1]] + cols[:-1]
        rowComparisonDf = combined[cols]
    except LookupError as l:
        print(str(l))
        raise l
    return rowComparisonDf
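# A hedged usage sketch for create_row_output (get_row_count is defined
# elsewhere in the original module; the paths below are hypothetical):
#
#   comparison_df = create_row_output(
#       "original", "modified",
#       source_file=["data/source_a.csv", "data/source_b.csv"],
#       target_file=["data/target_a.csv", "data/target_b.csv"])
#   comparison_df.show()
#
# The result carries a leading "Type" column ("original"/"modified") followed
# by the row-count columns produced by get_row_count.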
def alert_no_threshold(df, keycols, col_dest, str_format, col_origen, col_threshold):
    # Keep only rows where the origin column falls below the threshold and build
    # the alert message from the shortfall, the current value and the threshold.
    col_to_select = keycols + [col_dest]
    dfret = df.filter(col(col_origen) < col(col_threshold)).withColumn(
        col_dest,
        F.when(
            col(col_origen) < col(col_threshold),
            F.format_string(
                str_format,
                col(col_threshold).cast(IntegerType()) - col(col_origen).cast(IntegerType()),
                col(col_origen).cast(IntegerType()),
                col(col_threshold).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
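# A minimal usage sketch for alert_no_threshold (the sample data, column names
# and message format below are illustrative assumptions, not part of the
# original module; col, F and IntegerType are assumed imported as above):
def _demo_alert_no_threshold(spark):
    demo = spark.createDataFrame(
        [("store_1", 5, 10), ("store_2", 12, 10)],
        ["store_id", "stock", "min_stock"])
    # Only store_1 (stock 5 < threshold 10) survives the filter; its alert_msg
    # reads "Missing 5 units: stock 5 below threshold 10".
    return alert_no_threshold(
        demo, keycols=["store_id"], col_dest="alert_msg",
        str_format="Missing %d units: stock %d below threshold %d",
        col_origen="stock", col_threshold="min_stock")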
def prepare_df(df):
    num_rows = df.count()

    # Expand dates.
    df = expand_date(df)

    df = df \
        .withColumn('Open', df.Open != '0') \
        .withColumn('Promo', df.Promo != '0') \
        .withColumn('StateHoliday', df.StateHoliday != '0') \
        .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

    # Merge in store information.
    store = store_csv.join(store_states_csv, 'Store')
    df = df.join(store, 'Store')

    # Merge in Google Trend information.
    google_trend_all = prepare_google_trend()
    df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

    # Merge in Google Trend for whole Germany.
    google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
    google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
    df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

    # Merge in weather.
    weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ['State', 'Date'])

    # Fix null values.
    df = df \
        .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
        .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
        .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
        .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

    # Days & months competition was open, cap to 2 years.
    df = df.withColumn('CompetitionOpenSince',
                       F.to_date(F.format_string('%s-%s-15',
                                                 df.CompetitionOpenSinceYear,
                                                 df.CompetitionOpenSinceMonth)))
    df = df.withColumn('CompetitionDaysOpen',
                       F.when(df.CompetitionOpenSinceYear > 1900,
                              F.greatest(F.lit(0),
                                         F.least(F.lit(360 * 2),
                                                 F.datediff(df.Date, df.CompetitionOpenSince))))
                       .otherwise(0))
    df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # Days & weeks of promotion, cap to 25 weeks.
    df = df.withColumn('Promo2Since',
                       F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
    df = df.withColumn('Promo2Days',
                       F.when(df.Promo2SinceYear > 1900,
                              F.greatest(F.lit(0),
                                         F.least(F.lit(25 * 7),
                                                 F.datediff(df.Date, df.Promo2Since))))
                       .otherwise(0))
    df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

    # Check that we did not lose any rows through inner joins.
    assert num_rows == df.count(), 'lost rows in joins'
    return df
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("parking")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%s, %s\t%d', result.plate_id, result.registration_state,
                      result.total)).write.save("task6-sql.out", format="text")
def main(spark, filename1, filename2):
    parkingDF = spark.read.format('csv').options(header='true',
                                                 inferschema='true').load(filename1)
    openDF = spark.read.format('csv').options(header='true',
                                              inferschema='true').load(filename2)
    parkingDF.createOrReplaceTempView("parking")
    openDF.createOrReplaceTempView("open")
    parkingDF.select('summons_number').subtract(
        openDF.select('summons_number')).createOrReplaceTempView("diffVals")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%d\t%s, %d, %d, %s', result.summons_number,
                      result.plate_id, result.violation_precinct,
                      result.violation_code,
                      date_format(result.issue_date,
                                  'yyyy-MM-dd'))).write.save("task1-sql.out",
                                                             format="text")
def create_edge_id_column(cls, datasource, from_column, to_column):
    """Creates an '~id' column in a DynamicFrame whose values are based on the
    specified from and to columns.

    Example:
    >>> df = GremlinCsvTransforms.create_edge_id_column(df, 'supplierId', 'productId')
    """
    dataframe = datasource.toDF()
    dataframe = dataframe.withColumn('~id',
                                     format_string("%s-%s",
                                                   dataframe[from_column],
                                                   dataframe[to_column]))
    return DynamicFrame.fromDF(dataframe, datasource.glue_ctx, 'create_edge_id_column')
def replicate_df(df, duplicates):
    if duplicates > 1:
        uniques = _get_uniques(duplicates)
        df = (df.crossJoin(uniques.distinct()).withColumn(
            "customerID",
            F.format_string("%s-%s", "customerID", "u_value")).drop("u_value"))
    return df
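# Hedged usage note for replicate_df: _get_uniques is defined elsewhere in the
# original module and is assumed here to return a one-column DataFrame named
# "u_value" with `duplicates` distinct values, so the cross join replicates
# every row and the format_string call suffixes customerID with u_value to keep
# the replicas distinct, e.g.:
#
#   big_df = replicate_df(customers_df, duplicates=4)   # roughly 4x the rows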
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("open")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%s\t%.2f, %.2f', result.license_type, result.Total,
                      result.Average)).write.save("task3-sql.out", format="text")
def alert_less_sps_than_ss(df, keycols, col_dest, str_format, col_sps, col_ss, col_ss_trashed):
    # Keep rows where col_sps falls below col_ss minus col_ss_trashed and build
    # the alert message from the shortfall and the raw counts.
    col_to_select = keycols + [col_dest]
    df = df.withColumn('ss_no_trashed', col(col_ss) - col(col_ss_trashed))
    dfret = df.filter(col(col_sps) < col('ss_no_trashed')).withColumn(
        col_dest,
        F.when(
            col(col_sps) < col('ss_no_trashed'),
            F.format_string(
                str_format,
                col('ss_no_trashed').cast(IntegerType()) - col(col_sps).cast(IntegerType()),
                col(col_sps).cast(IntegerType()),
                col(col_ss).cast(IntegerType()),
                col(col_ss_trashed).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
def alert_trashed_shoes(df, keycols, col_dest, str_format, col_origen, col_excep_1, col_excep_2):
    # Subtract the two exception counts from the trashed count; keep rows where
    # the remainder is positive and build the alert message from the counts.
    col_to_select = keycols + [col_dest]
    df = df.withColumn('shoes_trashed_to_check',
                       col(col_origen) - col(col_excep_1) - col(col_excep_2))
    dfret = df.filter(col('shoes_trashed_to_check') > 0).withColumn(
        col_dest,
        F.when(
            col('shoes_trashed_to_check') > 0,
            F.format_string(
                str_format,
                col('shoes_trashed_to_check'),
                col(col_origen).cast(IntegerType()),
                col(col_excep_1).cast(IntegerType()),
                col(col_excep_2).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
def create_prefixed_columns(cls, datasource, mappings):
    """Creates columns in a DynamicFrame whose values are based on prefixed
    values from another column in the DynamicFrame.

    Example:
    >>> df = GremlinCsvTransforms.create_prefixed_columns(df, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
    """
    dataframe = datasource.toDF()
    for (column_name, source_column, prefix) in mappings:
        dataframe = dataframe.withColumn(column_name,
                                         format_string(prefix + "-%s",
                                                       dataframe[source_column]))
    return DynamicFrame.fromDF(dataframe, datasource.glue_ctx, 'create_vertex_id_columns')
def aggregate_columns(dataframe, dataframe_name):
    f = open('get_aggregate_results.sh', 'w')
    for col in dataframe.columns:
        print(col)
        query = ('select `{0}` as Data_Values, count(`{0}`) as count_data '
                 'from {1} group by `{0}` order by `{0}`').format(col, dataframe_name)
        data = spark.sql(query)
        data.select(format_string('%s,%s', data.Data_Values,
                                  data.count_data)).write.save(
                                      "Projects/" + str(col) + ".txt", format="text")
        f.write('hfs -getmerge Projects/' + str(col) + '.txt Outputs/' + str(col) + '.txt \n')
        f.flush()
    f.close()
    subprocess.call("get_aggregate_results.sh")
def Run(self):
    print('Running DfCollectList test!!!')
    df = self.spark.read.load('sample-data.csv', format="csv",
                              inferSchema="true", header="true")

    from pyspark.sql.functions import concat, regexp_replace, format_string, format_number, struct, collect_list

    collectDf = df.withColumn('country', regexp_replace(df.rpt_cty, ' ', '_')) \
        .withColumn('yrmon', concat(df.year, format_string('%02d', df.mon))) \
        .groupBy('country', 'yrmon') \
        .agg(collect_list(struct(*df.columns)).alias('my_rows'))

    print(collectDf.count())
    collectDf.printSchema()
    collectDf.show(5)
def lyrSizeSpark(COUNT):
    """Count the total number of characters in the lyrics, starting from
    :COUNT: initial bottles of beer.

    Full pyspark version.
    """
    df = spark.createDataFrame(sc.range(COUNT, 0, -1), schema=T.IntegerType())
    df = df.withColumn("count",
                       F.when(F.col("value") > 0,
                              F.col("value").astype(T.StringType())).otherwise("no more"))
    df = df.withColumn("countm",
                       F.when(F.col("value") - 1 > 0,
                              (F.col("value") - 1).astype(T.StringType())).otherwise("no more"))
    df = df.withColumn("plu", F.when(F.col("count") == "1", "").otherwise("s"))
    df = df.withColumn("plum", F.when(F.col("countm") == "1", "").otherwise("s"))
    df = df.withColumn(
        "lyr",
        F.format_string(
            """%s bottle%s of beer on the wall, %s bottle%s of beer.\nTake one down and pass it around, %s bottle%s of beer on the wall.\n\n""",
            F.col("count"), F.col("plu"), F.col("count"), F.col("plu"),
            F.col("countm"), F.col("plum")))
    return df.withColumn("lyrc", F.length(F.col("lyr"))) \
        .select(F.sum(F.col("lyrc")).alias("c")).first()["c"]
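# Example call (assuming an active SparkSession `spark` and SparkContext `sc`
# as used above):
#
#   lyrSizeSpark(99)   # total character count of the lyrics starting at 99 bottles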
def aggregate_columns(dataframe, tableName):
    f = open('get_aggregate_results.sh', 'w')
    for col in dataframe.columns:
        query = ("select " + str(col) + " as Data_Values, count(" + str(col) +
                 ") as count_data from " + tableName + " group by " + str(col) +
                 " order by " + str(col))
        data = spark.sql(query)
        data.select(format_string('%s,%s', data.Data_Values,
                                  data.count_data)).write.save(
                                      "Projects/" + str(col) + ".txt", format="text")
        f.write('hfs -getmerge Projects/' + str(col) + '.txt /home/ak6384/Project/results/' + str(col) + '.txt \n')
        f.flush()
    f.close()
    subprocess.call("get_aggregate_results.sh")
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("parking")
    query = getQuery()
    tempResult = spark.sql(query)
    weekendDF = tempResult.filter(
        tempResult.day == "Weekend").createOrReplaceTempView("weekendTable")
    weekdayDF = tempResult.filter(
        tempResult.day == "Weekday").createOrReplaceTempView("weekdayTable")
    query = getQuery2()
    result = spark.sql(query)
    result.select(
        format_string('%s\t%.2f, %.2f', result.violation_code,
                      result.weekend_average,
                      result.weekday_average)).write.save("task7-sql.out",
                                                          format="text")
def Run(self):
    print('Running DfIter test!!!')
    df = self.spark.read.load('sample-data.csv', format="csv",
                              inferSchema="true", header="true")
    #df.printSchema()
    #df.show()

    from pyspark.sql.functions import concat, regexp_replace, format_string, format_number

    df.withColumn('country', regexp_replace(df.rpt_cty, "[ ,/,\\\]", '_')) \
        .withColumn('yrmon', concat(df.year, format_string('%02d', df.mon))) \
        .repartition('country', 'yrmon') \
        .write.partitionBy('country', 'yrmon') \
        .option('header', 'true') \
        .mode('overwrite') \
        .format('csv') \
        .save('./out/as_of_date=latest/type=full/')
def add_features(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    print("[wrangle.py] adding features")
    max_date = df.select(max("case_closed_date")).first()[0]
    return (
        df.withColumn("num_weeks_late", expr("num_days_late / 7 AS num_weeks_late"))
        .withColumn(
            "council_district",
            format_string("%03d", col("council_district").cast("int")),
        )
        .withColumn("zipcode", regexp_extract("request_address", r"\d+$", 0))
        .withColumn("case_age", datediff(lit(max_date), "case_opened_date"))
        .withColumn("days_to_closed", datediff("case_closed_date", "case_opened_date"))
        .withColumn(
            "case_lifetime",
            when(expr("! case_closed"), col("case_age")).otherwise(
                col("days_to_closed")
            ),
        )
    )
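# Hedged usage sketch: the column names referenced above (case_closed_date,
# council_district, request_address, ...) come from the case DataFrame assumed
# by the original wrangle.py script, e.g.:
#
#   cases = add_features(cases)
#   cases.select("council_district", "zipcode", "case_lifetime").show(5)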
total_rev_df_add = joined_df.select(
    F.col('pickup_datetime'),
    (F.col('fare_amount') + F.col('surcharge') + F.col('tip_amount')).alias('total_revenue'),
    F.col('tolls_amount'))
total_rev_df_add = total_rev_df_add.withColumn(
    'pickup_datetime', F.date_format(F.col('pickup_datetime'), 'yyyy-MM-dd'))
total_rev_grouped = total_rev_df_add.groupBy(
    F.col('pickup_datetime').alias('date')).agg(
        F.sum('total_revenue').alias('total_revenue'),
        F.sum('tolls_amount').alias('total_tolls'))
total_rev_grouped = total_rev_grouped.select(
    'date',
    F.regexp_replace(F.format_number(F.round(F.col('total_revenue'), 2), 2),
                     ',', '').alias('total_revenue'),
    F.regexp_replace(F.format_number(F.round(F.col('total_tolls'), 2), 2),
                     ',', '').alias('total_tolls'))
total_rev_grouped = total_rev_grouped.sort('date')
total_rev_grouped.select(
    F.format_string('%s,%s,%s', total_rev_grouped.date,
                    total_rev_grouped.total_revenue,
                    total_rev_grouped.total_tolls)).write.save('task2c-sql.out',
                                                               format="text")
spark.stop()
import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import format_string, date_format
from csv import reader

spark = SparkSession.builder.getOrCreate()

parking = spark.read.format('csv').options(
    header='true', inferschema='true').load(sys.argv[1])
openv = spark.read.format('csv').options(header='true',
                                         inferschema='true').load(sys.argv[2])
parking.createOrReplaceTempView("parking")
openv.createOrReplaceTempView("openv")

# Perform a left join between parking and open violations.
result = spark.sql(
    "SELECT parking.summons_number, plate_id, violation_precinct, violation_code, parking.issue_date FROM parking LEFT JOIN openv ON parking.summons_number = openv.summons_number WHERE openv.summons_number IS NULL"
)

result.select(
    format_string('%d\t%s, %d, %d, %s', result.summons_number, result.plate_id,
                  result.violation_precinct, result.violation_code,
                  date_format(result.issue_date,
                              'yyyy-MM-dd'))).write.save("task1-sql.result",
                                                         format="text")
import sys
import string
from pyspark.sql.functions import format_string
from csv import reader
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark SQL basic example").config(
    "spark.some.config.option", "some-value").getOrCreate()

#lines = spark.read.format('csv').options(header='true',inferschema='true').load('/user/ecc290/HW1data/open-violations-header.csv')
lines = spark.read.format('csv').options(header='true',
                                         inferschema='true').load(sys.argv[1])
lines.createOrReplaceTempView("lines")

result = spark.sql(
    "SELECT CASE WHEN registration_state='NY' THEN 'NY' ELSE 'Other' END AS registration_state, COUNT(*) as counts FROM lines GROUP BY CASE WHEN registration_state='NY' THEN 'NY' ELSE 'Other' END ORDER BY registration_state"
)

result.select(
    format_string('%s \t %d', result.registration_state,
                  result.counts)).write.save("task4-sql.out", format="text")
#!/usr/bin/env python
import sys
from pyspark.sql import SparkSession
from csv import reader
from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("task2SQL").getOrCreate()

parking_violation = spark.read.format('csv').options(header='true',
                                                     inferschema='true').load(sys.argv[1])
parking_violation.createOrReplaceTempView("parking")

violation_count = spark.sql(
    "SELECT plate_id, registration_state, count(violation_code) violation_code_count FROM parking GROUP BY plate_id, registration_state ORDER BY plate_id"
)
violation_count_temp = violation_count.createOrReplaceTempView("violation_count")

max_violation = spark.sql(
    "SELECT plate_id, registration_state, violation_code_count FROM violation_count ORDER BY violation_code_count DESC LIMIT 20"
)
max_violation.select(
    format_string('%s, %s\t%d', max_violation.plate_id,
                  max_violation.registration_state,
                  max_violation.violation_code_count)).write.save("task6-sql.out",
                                                                  format="text")
#!/usr/bin/env python
import sys
from pyspark.sql import SparkSession
from csv import reader
from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("task2SQL").getOrCreate()

parking_violation = spark.read.format('csv').options(header='true',
                                                     inferschema='true').load(sys.argv[1])
parking_violation.createOrReplaceTempView("parking")

result = spark.sql(
    "SELECT violation_code, count(violation_code) violation_count FROM parking GROUP BY violation_code"
)
result.select(
    format_string('%s\t%d', result.violation_code,
                  result.violation_count)).write.save("task2-sql.out",
                                                      format="text")
StructField("surcharge", StringType(),True), StructField("mta_tax", StringType(),True), StructField("tip_amount", StringType(),True), StructField("tolls_amount", StringType(),True), StructField("total_amount", StringType(),True), StructField("name", StringType(),True), StructField("type", StringType(),True), StructField("current_status", StringType(),True), StructField("DMV_license_plate", StringType(),True), StructField("vehicle_VIN_number", StringType(),True), StructField("vehicle_type", StringType(),True), StructField("model_year", StringType(),True), StructField("medallion_type", StringType(),True), StructField("agent_number", StringType(),True), StructField("agent_name", StringType(),True), StructField("agent_telephone_number", StringType(), True), StructField("agent_website", StringType(), True), StructField("agent_address", StringType(), True), StructField("last_updated_date", StringType(), True), StructField("last_updated_time", StringType(), True)]) fareLicenses = spark.read.format('csv').schema(fare_licenses).options(header='false',inferschema='true').load(sys.argv[1]) fareLicenses.createOrReplaceTempView("fareLicenses") trips = spark.sql("select medallion_type, count(*) as total_trips from fareLicenses group by medallion_type") total_revenue = spark.sql("select medallion_type, sum(fare_amount) as total_revenue from fareLicenses group by medallion_type") tip = spark.sql("select medallion_type, sum(tip_amount) as total_tip from fareLicenses group by medallion_type") df1 = trips.join(total_revenue,["medallion_type"]) df2 = df1.join(tip,["medallion_type"]) finaloutput = df2.select("medallion_type","total_trips", "total_revenue", ((df2.total_tip/df2.total_revenue)*(100/df2.total_trips)).alias("avg_tip_percentage")).sort(df2.medallion_type.asc()) print(finaloutput.show()) finaloutput.select(format_string('%s,%s,%s,%s',finaloutput.medallion_type,finaloutput.total_trips,finaloutput.total_revenue,finaloutput.avg_tip_percentage)).write.save("task4b-sql.out", format="text")
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, format_string

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

parking = spark.read.format('csv').options(
    header='true', inferschema='true').load(sys.argv[1])
parking.createOrReplaceTempView("parking")

#weekend_dates=[(5,),(6,),(12,),(13,),(19,),(20,),(26,),(27,)]
weekend_dates = [("2016-03-05", ), ("2016-03-06", ), ("2016-03-12", ),
                 ("2016-03-13", ), ("2016-03-19", ), ("2016-03-20", ),
                 ("2016-03-26", ), ("2016-03-27", )]
weekend = spark.createDataFrame(weekend_dates, ["date"])
weekend.createOrReplaceTempView("weekend")

#result = spark.sql("select violation_code, round(sum(case when w.date is null then 1 else 0 end)/23,2) as wkd, round(count(w.date)/8,2) as ctr_weekend from parking p left outer join weekend w on extract(DAY from parking.issue_date)=w.date group by violation_code")
result = spark.sql(
    "select violation_code, round(sum(case when w.date is null then 1 else 0 end)/23,2) as wkd, round(count(w.date)/8,2) as ctr_weekend from parking p left outer join weekend w on issue_date=w.date group by violation_code"
)
#result.show(result.count(), False)

result.select(
    format_string("%s\t%.2f, %.2f", result.violation_code, result.ctr_weekend,
                  result.wkd)).write.save("task7-sql.out", format="text")
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum, format_string

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

open = spark.read.format('csv').options(header='true',
                                        inferschema='true').load(sys.argv[1])
#open.select(open["license_type"], open["amount_due"]).groupBy("license_type").count().show()
#open.groupBy("license_type").agg(avg("amount_due"), sum("amount_due")).show()
#result = open.groupBy("license_type").count()
#result.select(format_string("%d",result.count)).show()
open.createOrReplaceTempView("open")

result = spark.sql("select license_type, round(avg(amount_due),2) as avg, round(sum(amount_due),2) as sum from open group by license_type")
#result.printSchema()
#result = result.select(result.license_type, result.sum.cast("float"), result.avg.cast("float"))

result.select(
    format_string("%s\t%.2f, %.2f", result.license_type, result.sum,
                  result.avg)).write.save("task3-sql.out", format="text")
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
import pyspark.sql.functions as F
import sys

spark = SparkSession.builder.appName("my_pp").getOrCreate()

# using joined_df
joined_df = spark.read.format('csv').options(
    header='false', inferschema='true').load(sys.argv[1]).select(
        F.col('_c15').alias('fare_amount'))

invalid_fare_amounts = joined_df.filter(F.col('fare_amount') < 0).select(
    F.count('*').alias('invalid_fare_amounts'))

invalid_fare_amounts.select(
    format_string("%s", invalid_fare_amounts.invalid_fare_amounts)).write.save(
        "task3a-sql.out", format="text")

spark.stop()
#!/usr/bin/env python
import sys
import csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("Python Spark SQL basic example").config(
    "spark.some.config.option", "some-value").getOrCreate()

parking = spark.read.format('csv').options(header='true',
                                           inferschema='true').load(sys.argv[1])
parking.createOrReplaceTempView("parking")

result = spark.sql("select * from (select registration_state, plate_id, count(violation_code) as total from parking group by plate_id, registration_state) g order by g.total desc, plate_id limit 20")
result.select(
    format_string('%s, %s\t%d', result.plate_id, result.registration_state,
                  result.total)).write.save("task6-sql.out", format="text")
""" Task-7 : For each violation code, list the average number of violations with that code issued per day on weekdays and weekend days Author : Chinmay Wyawahare """ import sys from pyspark.sql import SparkSession from pyspark.sql import SQLContext from pyspark.sql import Row from pyspark.sql.functions import format_string from csv import reader spark = SparkSession.builder.getOrCreate() parkingdata = spark.read.format('csv').options(header='true', inferschema='true').load( sys.argv[1]) parkingdata.createOrReplaceTempView("parkingdata") out = spark.sql( "SELECT t1.violation_code, ROUND(t1.Weekend,2), ROUND(t2.Weekdays,2) FROM (SELECT violation_code, COUNT(violation_code)/8 AS Weekend FROM parkingdata WHERE (EXTRACT(DAY FROM issue_date)=27 OR EXTRACT(DAY FROM issue_date)=26 OR EXTRACT(DAY FROM issue_date)=20 OR EXTRACT(DAY FROM issue_date)=19 OR EXTRACT(DAY FROM issue_date)=13 OR EXTRACT(DAY FROM issue_date)=12 OR EXTRACT(DAY FROM issue_date)=6 OR EXTRACT(DAY FROM issue_date)=5) GROUP BY violation_code) t1 INNER JOIN (SELECT violation_code, COUNT(violation_code)/23 AS Weekdays FROM parkingdata WHERE NOT (EXTRACT(DAY FROM issue_date)=27 OR EXTRACT(DAY FROM issue_date)=26 OR EXTRACT(DAY FROM issue_date)=20 OR EXTRACT(DAY FROM issue_date)=19 OR EXTRACT(DAY FROM issue_date)=13 OR EXTRACT(DAY FROM issue_date)=12 OR EXTRACT(DAY FROM issue_date)=6 OR EXTRACT(DAY FROM issue_date)=5) GROUP BY violation_code) t2 ON t1.violation_code = t2.violation_code" ) out.select( format_string('%d\t%d, %d', t1.violation_code, ROUND(t1.Weekend, 2), ROUND(t2.Weekdays, 2))).write.save("task7-sql.out", format="text")
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # create new columns in the DataFrame by filtering out special events
    # (promo/holiday where sales was zero or the store was closed)
    df = (df.withColumn("Open", df.Open != "0")
          .withColumn("Promo", df.Promo != "0")
          .withColumn("StateHoliday", df.StateHoliday != "0")
          .withColumn("SchoolHoliday", df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all, ["State", "Year", "Week"]).select(df["*"], google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file == "Rossmann_DE"].withColumnRenamed("trend", "trend_de")
    df = df.join(google_trend_de, ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn("CompetitionOpenSinceYear",
                        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)))
          .withColumn("CompetitionOpenSinceMonth",
                      F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)))
          .withColumn("Promo2SinceYear", F.coalesce(df.Promo2SinceYear, F.lit(1900)))
          .withColumn("Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek, F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                                  df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks", (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df