def __appendAggKey(tsdf, freq=None):
    """
    :param tsdf: TSDF object as input
    :param freq: frequency at which to upsample
    :return: return a TSDF with a new aggregate key (called agg_key)
    """
    df = tsdf.df
    checkAllowableFreq(freq)

    # compute timestamp columns
    sec_col = f.second(f.col(tsdf.ts_col))
    min_col = f.minute(f.col(tsdf.ts_col))
    hour_col = f.hour(f.col(tsdf.ts_col))

    if freq == SEC:
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(" "),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lpad(min_col, 2, '0'), f.lit(':'),
                           f.lpad(sec_col, 2, '0')).cast("timestamp")
    elif freq == MIN:
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lpad(min_col, 2, '0'), f.lit(':'),
                           f.lit('00')).cast("timestamp")
    elif freq == HR:
        agg_key = f.concat(f.col(tsdf.ts_col).cast("date"), f.lit(' '),
                           f.lpad(hour_col, 2, '0'), f.lit(':'),
                           f.lit('00'), f.lit(':'),
                           f.lit('00')).cast("timestamp")
    elif freq == DAY:
        agg_key = f.col(tsdf.ts_col).cast("date").cast("timestamp")

    df = df.withColumn("agg_key", agg_key)
    return tempo.TSDF(df, tsdf.ts_col, partition_cols=tsdf.partitionCols)
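# A minimal sketch (not part of the original library) of the same bucketing idea
# using Spark's built-in date_trunc, which collapses a timestamp to the start of
# the given unit; the DataFrame and the column name event_ts below are
# assumptions for illustration only.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
trunc_demo = spark.createDataFrame(
    [("2021-03-05 14:37:42",)], ["event_ts"]
).withColumn("event_ts", f.col("event_ts").cast("timestamp"))

trunc_demo.select(
    f.date_trunc("second", "event_ts").alias("sec_key"),   # 2021-03-05 14:37:42
    f.date_trunc("minute", "event_ts").alias("min_key"),   # 2021-03-05 14:37:00
    f.date_trunc("hour", "event_ts").alias("hour_key"),    # 2021-03-05 14:00:00
    f.date_trunc("day", "event_ts").alias("day_key"),      # 2021-03-05 00:00:00
).show(truncate=False)
# Whether date_trunc is a drop-in substitute depends on the Spark version the
# TSDF code targets; the concat/lpad approach above also works on older releases.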
def vwap(self, frequency='m', volume_col="volume", price_col="price"):
    # set pre_vwap as self or enrich with the frequency
    # frequency codes: 'm' = minute, 'H' = hour, 'D' = day buckets
    pre_vwap = self.df
    print('input schema: ')
    pre_vwap.printSchema()
    if frequency == 'm':
        pre_vwap = self.df.withColumn(
            "time_group",
            f.concat(f.lpad(f.hour(f.col(self.ts_col)), 2, '0'), f.lit(':'),
                     f.lpad(f.minute(f.col(self.ts_col)), 2, '0')))
    elif frequency == 'H':
        pre_vwap = self.df.withColumn(
            "time_group", f.lpad(f.hour(f.col(self.ts_col)), 2, '0'))
    elif frequency == 'D':
        pre_vwap = self.df.withColumn(
            "time_group", f.lpad(f.dayofmonth(f.col(self.ts_col)), 2, '0'))

    group_cols = ['time_group']
    if self.partitionCols:
        group_cols.extend(self.partitionCols)
    vwapped = (pre_vwap.withColumn("dllr_value",
                                   f.col(price_col) * f.col(volume_col))
               .groupby(group_cols)
               .agg(f.sum('dllr_value').alias("dllr_value"),
                    f.sum(volume_col).alias(volume_col),
                    f.max(price_col).alias("_".join(["max", price_col])))
               .withColumn("vwap", f.col("dllr_value") / f.col(volume_col)))
    return TSDF(vwapped, self.ts_col, self.partitionCols)
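# Standalone sketch (an assumption, not part of the TSDF class) of the VWAP
# aggregation the method above performs, written against a plain DataFrame so it
# can run in isolation; the symbol/price/volume sample rows are made up.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
trades = spark.createDataFrame(
    [("AAPL", "2021-03-05 09:30:01", 120.10, 100),
     ("AAPL", "2021-03-05 09:30:45", 120.30, 200)],
    ["symbol", "event_ts", "price", "volume"],
).withColumn("event_ts", f.col("event_ts").cast("timestamp"))

vwap_by_minute = (
    trades
    .withColumn("time_group", f.date_format("event_ts", "HH:mm"))  # minute bucket
    .groupBy("symbol", "time_group")
    .agg(f.sum(f.col("price") * f.col("volume")).alias("dllr_value"),
         f.sum("volume").alias("volume"))
    .withColumn("vwap", f.col("dllr_value") / f.col("volume"))
)
vwap_by_minute.show()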
def read_customer_hierarchy_from_db(
        spark: pyspark.sql.SparkSession, ) -> pyspark.sql.DataFrame:
    """
    Reads customer hierarchy data from
    hierarchies_customer_curr_hierarchy_int_00_001 and adjusts a few columns

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to read data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with customer hierarchy data
    """
    # Read customer hierarchy data - for retailer name & ID
    cust_hier = spark.table("hierarchies_customer_curr_hierarchy_int_00_001")
    # Rename distribution channel ID column
    cust_hier = cust_hier.withColumnRenamed("distribution_channel_id",
                                            "distribution_channel_id_cust")
    # Pad store_id to 10 characters with leading zeros
    cust_hier = cust_hier.withColumn(
        "store_id", lpad(cust_hier["store_id"], 10, "0").alias("store_id"))
    cust_hier.createOrReplaceTempView("hierarchies_customer")
    return cust_hier
def read_product_hierarchy_from_db(
        spark: pyspark.sql.SparkSession, ) -> pyspark.sql.DataFrame:
    """
    Reads product hierarchy data from
    hierarchies_product_curr_hierarchy_int_00_001 and adjusts a few columns

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to read data from Hive

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with product hierarchy data
    """
    # Translates SKU to PPG
    prod_hier = spark.table("hierarchies_product_curr_hierarchy_int_00_001")
    # Rename distribution channel ID column
    prod_hier = prod_hier.withColumnRenamed("distribution_channel_id",
                                            "distribution_channel_id_prod")
    # Pad sku_id to 18 characters with leading zeros
    prod_hier = prod_hier.withColumn(
        "sku_id", lpad(prod_hier["sku_id"], 18, "0").alias("sku_id"))
    prod_hier.createOrReplaceTempView("hierarchies_product")
    return prod_hier
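# Hypothetical usage sketch for the two readers above: both register temp views,
# so downstream code can combine them via SQL. The join key below is an
# assumption for illustration; the real key depends on the hierarchy schemas.
# `spark` is assumed to be an existing SparkSession with access to the Hive tables.
cust_hier = read_customer_hierarchy_from_db(spark)
prod_hier = read_product_hierarchy_from_db(spark)

hier_joined = spark.sql("""
    SELECT c.store_id, c.distribution_channel_id_cust,
           p.sku_id, p.distribution_channel_id_prod
    FROM hierarchies_customer c
    JOIN hierarchies_product p
      ON c.distribution_channel_id_cust = p.distribution_channel_id_prod
""")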
def compile_lpad(t, expr, scope, **kwargs): op = expr.op() src_column = t.translate(op.arg, scope) length = op.length.op().value pad = op.pad.op().value return F.lpad(src_column, length, pad)
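# The translator above ultimately emits a plain pyspark.sql.functions.lpad call.
# Minimal equivalent sketch, with an assumed column name:
import pyspark.sql.functions as F

padded = F.lpad(F.col("account_id"), 10, "0")   # e.g. '42' -> '0000000042'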
def create_train_data(): w1 = Window.orderBy("uid") w2 = Window.partitionBy("seg").orderBy("uid") df_train = spark.read.csv( os.path.join("datasets", "train.csv"), header=True, schema=schema).withColumn( "uid", monotonically_increasing_id()).withColumn( "idx", row_number().over(w1).cast(IntegerType())).withColumn( "seg", fn.floor(((fn.col("idx") - 1) / 150000)).cast( IntegerType())).withColumn( "no", row_number().over(w2).cast( IntegerType())).withColumn( "name", fn.concat( lit("raw_"), fn.lpad(fn.col("seg"), 4, "0").cast( StringType()))).withColumn( "set", lit(0)) df_train.createOrReplaceTempView("data") df_train_f = spark.sql(""" SELECT uid, set, seg, no, name, x, y FROM data ORDER BY set, seg, no, uid """) df_train_f = df_train_f.repartition(1) df_train_f.write.mode("overwrite").parquet( os.path.join("datasets", "train.parquet"))
def get_column_spec(self, source_df: Optional[DataFrame], current_column: Optional[Column]) -> Column: column_spec = lpad( col=self.column.get_column_spec(source_df=source_df, current_column=current_column), len=self.length, pad=self.pad, ) return column_spec
def assign_sk(self, df: DataFrame, orderByCol: str): now = datetime.now() # current date and time fmt = '%y%m%d%H' yymmddhh = now.strftime(fmt) df_with_row_num = df.withColumn( "row_num", row_number().over(Window.orderBy(col(orderByCol)))) sk_df = df_with_row_num.select( concat(lit(yymmddhh), lpad(col("row_num"), 5, "0")).cast("long").alias("sys_sk"), col("*")).drop(col("row_num")) return sk_df
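# Hypothetical illustration of the surrogate-key format produced by assign_sk:
# <yymmddhh> followed by a 5-digit, zero-padded row number, cast to a long.
from datetime import datetime

yymmddhh = datetime(2021, 3, 5, 14).strftime('%y%m%d%H')   # '21030514'
first_sys_sk = int(yymmddhh + str(1).zfill(5))              # 2103051400001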
def _setup_dataframe(spark, sqlContext, dataset_multiplier_factor, append_ids=True) -> pyspark.sql.DataFrame: """Setup a pyspark dataframe to run against. Then creates a PySpark dataframe, and crossjoins with a table of length :dataset_multiplier_factor: to increase the volume of data for benchmarking. Returns: A Pyspark dataframe with random phrases for string distance testing. """ df = _fetch_phrase_pairs() logger.info(f'{len(df):,} word pairs') pyspark_df = spark.createDataFrame(df, ['left', 'right']) pyspark_df = pyspark_df.repartition(10) pyspark_df.cache().count() logger.debug('Increasing data volume') range_df = sqlContext.range(dataset_multiplier_factor) if append_ids: range_df = range_df.withColumn('id_string', ps_funcs.lpad('id', 12, "0")) pyspark_df = range_df.crossJoin(pyspark_df).select( ps_funcs.concat_ws(' ', ps_funcs.col('left'), ps_funcs.col('id_string')).alias('left'), ps_funcs.concat_ws(' ', ps_funcs.col('right'), ps_funcs.col('id_string')).alias('right')) else: pyspark_df = range_df.crossJoin(pyspark_df).select( ps_funcs.col('left'), ps_funcs.col('right')) pyspark_df = pyspark_df.repartition(__DATASET_PARTITIONS) record_count = pyspark_df.cache().count() logger.info(f'Generated dataframe with {record_count:,} records') sample_data = pyspark_df.sample(withReplacement=False, fraction=0.01).limit(1).collect() logger.info(f'Sample of benchmarking data: {sample_data}') return pyspark_df
def test_auto_mapper_lpad(spark_session: SparkSession) -> None: # Arrange spark_session.createDataFrame( [ (1, "1234"), (2, "1234567"), (3, "123456789"), ], ["member_id", "empi"], ).createOrReplaceTempView("patients") source_df: DataFrame = spark_session.table("patients") df = source_df.select("member_id") df.createOrReplaceTempView("members") # Act mapper = AutoMapper( view="members", source_view="patients", keys=["member_id"] ).columns(my_column=A.lpad(column=A.column("empi"), length=9, pad="0")) # Assert assert isinstance(mapper, AutoMapper) sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df) assert str(sql_expressions["my_column"]) == str( lpad(col=col("b.empi"), len=9, pad="0").alias("my_column") ) result_df: DataFrame = mapper.transform(df=df) # noinspection SpellCheckingInspection assert ( result_df.where("member_id == 1").select("my_column").collect()[0][0] == "000001234" ) # noinspection SpellCheckingInspection assert ( result_df.where("member_id == 2").select("my_column").collect()[0][0] == "001234567" ) # noinspection SpellCheckingInspection assert ( result_df.where("member_id == 3").select("my_column").collect()[0][0] == "123456789" )
def generate_customers_from_hierarchies(hierarchy_list): """ Accepts a list of hierarchy numbers and returns a list of customer numbers """ hierarchy_list = hierarchy_list.withColumnRenamed('_c0', 'input_lvl_1') hierarchy_list = hierarchy_list.withColumn( 'input_lvl_1', F.lpad(hierarchy_list['input_lvl_1'], 10, '0')) pricing_hierarchy = spark.read.parquet(location + 'extracts/pricing_hierarchy_table') hierarchy_list = (hierarchy_list.join( pricing_hierarchy, hierarchy_list.input_lvl_1 == pricing_hierarchy.higherlevel_customer_hkunnr, 'left_outer').drop('higherlevel_customer_hkunnr')) hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr', 'input_lvl_2') hierarchy_list = (hierarchy_list.join( pricing_hierarchy, hierarchy_list.input_lvl_2 == pricing_hierarchy.higherlevel_customer_hkunnr, 'left_outer').drop('higherlevel_customer_hkunnr')) hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr', 'input_lvl_3') hierarchy_list = (hierarchy_list.join( pricing_hierarchy, hierarchy_list.input_lvl_3 == pricing_hierarchy.higherlevel_customer_hkunnr, 'left_outer').drop('higherlevel_customer_hkunnr')) hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr', 'input_lvl_4') hierarchy_list = (hierarchy_list.join( pricing_hierarchy, hierarchy_list.input_lvl_4 == pricing_hierarchy.higherlevel_customer_hkunnr, 'left_outer').drop('higherlevel_customer_hkunnr')) hierarchy_list = hierarchy_list.withColumnRenamed('customer_kunnr', 'input_lvl_5') def pick_highest_hierarchy(input_1, input_2, input_3, input_4, input_5): if (input_5): return input_5 elif (input_4): return input_4 elif (input_3): return input_3 elif (input_2): return input_2 else: return input_1 hierarchy_selector_UDF = F.udf(pick_highest_hierarchy, StringType()) hierarchy_list = hierarchy_list.withColumn( 'customer_kunnr', hierarchy_selector_UDF(hierarchy_list.input_lvl_1, hierarchy_list.input_lvl_2, hierarchy_list.input_lvl_3, hierarchy_list.input_lvl_4, hierarchy_list.input_lvl_5)) return hierarchy_list.selectExpr('customer_kunnr as _c0')
if (customer_option == '1'): customer_input = spark.read.csv("gs://data-discovery/" + customer_file) elif (customer_option == '2'): customer_input = generate_customers_from_hierarchies( spark.read.csv("gs://data-discovery/" + customer_file)) # Combine all of the eligibility files into one dataframe customer_eligibility = filter_by_customer_eligibility( spark.read.parquet(location + 'eligibility_files/customer_eligibility_table')) # Filter the eligibility dataframe to the relevant customers if (customer_option == '1' or customer_option == '2'): padded_customer_input = customer_input.withColumn( 'customer_kunnr', F.lpad(customer_input['_c0'], 10, '0')).drop('_c0') customer_eligibility = (customer_eligibility.drop( 'vertical_mkt_desc').drop('customer_market_description').drop( 'customer_submarket_description').drop('cot_desc').drop( 'district_name_bztxt')) filtered_eligibility = customer_eligibility.join( F.broadcast(padded_customer_input), 'customer_kunnr') else: customer_attributes_dict = create_customer_attributes_dict( customer_attributes) filtered_eligibility = filter_by_attributes(customer_eligibility, customer_attributes_dict)
def get_events_by_tier_month( spark, start_date, end_date, tiers_raw=None, remove_raw=None, skims_raw=None, only_valid_files=False, verbose=False, ): """ Generate a pandas dataframe containing data_tier_name, month, nevents for the given time period. It will add virtual tiers based on the skims. args: - spark: Spark session - start_date: String with the date y format yyyy/MM/dd - end_date: String with the date y format yyyy/MM/dd - tiers_raw: List of tiers - remove_raw: List of remove patterns - skims_raw: List of skim patterns - only_valid_files: True if you want to take into account only the valid files. - verbose: True if you want additional output messages, default False. """ if tiers_raw is None: tiers_raw = [".*"] if skims_raw is None: skims_raw = [] if remove_raw is None: remove_raw = [] tiers = "^({})$".format("|".join( ["{}".format(tier.strip()) for tier in tiers_raw])) skims_rlike = (".*-({})-.*".format("|".join([l.strip() for l in skims_raw ])) if skims_raw else "^$") remove_rlike = (".*({}).*".format("|".join( [l.strip().lower() for l in remove_raw])) if remove_raw else "^$") tables = spark_utils.dbs_tables(spark, tables=["ddf", "bdf", "fdf", "dtf"]) if verbose: logger.info("remove %s", remove_rlike) logger.info("skims %s", skims_rlike) for k in tables: # tables[k].cache() logger.info(k) tables[k].printSchema() tables[k].show(5, truncate=False) datablocks_file_events_df = spark.sql( """SELECT sum(fdf.f_event_count) as f_event_count, max(ddf.d_data_tier_id) as d_data_tier_id, d_dataset, b_block_name, max(b_creation_date) as b_creation_date, max(b_block_size) as size FROM ddf JOIN bdf on ddf.d_dataset_id = bdf.b_dataset_id JOIN fdf on bdf.b_block_id = fdf.f_block_id WHERE d_is_dataset_valid = 1 {} group by d_dataset, b_block_name """.format( "AND f_is_file_valid = 1" if only_valid_files else "")) fiter_field = "b_creation_date" datablocks_file_events_df = (datablocks_file_events_df.withColumn( fiter_field, from_unixtime(fiter_field)).filter( fiter_field + " between '{}' AND '{}' ".format(start_date.replace( "/", "-"), end_date.replace("/", "-"))).withColumn( "month", concat(year(fiter_field), lpad(month(fiter_field), 2, "0")))) datablocks_file_events_df.registerTempTable("dbfe_df") # Union of two queries: # - The first query will get all the selected data tiers, # excluding the datasets who match the skims # - The second query will get all the selected data tiers, # but only the dataset who match the skims. grouped = spark.sql(""" select month, data_tier_name, sum(f_event_count) as nevents from dbfe_df join dtf on data_tier_id = d_data_tier_id where data_tier_name rlike '{tiers}' and lower(d_dataset) not rlike '{remove}' and d_dataset not rlike '{skims}' group by month, data_tier_name UNION select month, concat(data_tier_name, '/',regexp_extract(d_dataset,'{skims}',1)) AS data_tier_name, sum(f_event_count) as nevents from dbfe_df join dtf on dtf.data_tier_id = d_data_tier_id where data_tier_name rlike '{tiers}' and lower(d_dataset) not rlike '{remove}' and d_dataset rlike '{skims}' group by month, concat(data_tier_name, '/',regexp_extract(d_dataset,'{skims}',1)) """.format(tiers=tiers, remove=remove_rlike, skims=skims_rlike)) return grouped.toPandas()
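# Minimal sketch of the month-key derivation used above: an epoch-seconds
# creation date becomes a 'yyyyMM' string. The sample value and column name are
# assumptions; the exact day can shift with the session time zone.
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, from_unixtime, lpad, month, year

spark = SparkSession.builder.getOrCreate()
blocks = spark.createDataFrame([(1583411862,)], ["b_creation_date"])  # March 2020

blocks.select(
    concat(
        year(from_unixtime("b_creation_date")),
        lpad(month(from_unixtime("b_creation_date")), 2, "0"),
    ).alias("month")
).show()   # -> 202003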
def process_covid_country_fact(spark, covid19_lake, output_data): """ Write countries covid data to parquet files on S3. Keyword arguments: spark -- a spark session covid19_lake -- the script reads data from S3 or public datalake output_data -- writes province and country to partitioned parquet on S3 """ # get filepath to country fact table covid_country_data = covid19_lake + 'tableau-jhu/csv/COVID-19-Cases.csv' # define the data frames country_data_df = spark.read.load( covid_country_data, format="csv", sep=",", inferSchema="true", header="true") country_data_df = country_data_df.dropDuplicates() split_date = F.split(country_data_df["Date"], '/') country_data_df = country_data_df.withColumn( 'Month', split_date.getItem(0)) country_data_df = country_data_df.withColumn( 'Day', split_date.getItem(1)) country_data_df = country_data_df.withColumn( 'Year', split_date.getItem(2)) country_data_df.show(5) country_data_df = country_data_df.select( country_data_df.Country_Region, \ country_data_df.Cases, \ country_data_df.Case_Type, \ country_data_df.Combined_Key, \ country_data_df.Date, \ F.lpad(country_data_df.Month,2,'0').alias('Month'), \ F.lpad(country_data_df.Day,2,'0').alias('Day'), \ F.lpad(country_data_df.Year,4,'0').alias('Year'), \ F.to_date( F.concat_ws( '-', country_data_df.Month, country_data_df.Day, country_data_df.Year), \ 'MM-dd-yyyy').alias('Date_format')) country_data_df.createOrReplaceTempView("country_data") country_data_df.createOrReplaceTempView("country_p_data") country_data_df.show(5) fact_covid_country_confirmed = spark.sql( "SELECT DISTINCT \ bd.country_region as country_name, \ to_date(bd.Date_format) as date, \ case when bd.case_type = 'Confirmed' \ then (bd.cases - bp.cases) end as confirmed_cases, \ case when bd.case_type = 'Deaths' \ then (bd.cases - bp.cases) end as death_cases, \ case when bd.case_type = 'Confirmed' \ then bd.cases end as sum_confirmed_cases, \ case when bd.case_type = 'Deaths' \ then bd.cases end as sum_death_cases \ FROM country_data bd \ JOIN country_p_data bp \ ON (date_add(to_date(bd.Date_format), -1) \ = to_date(bp.Date_format) \ AND bd.case_type = bp.case_type \ AND bd.country_region = bp.country_region \ AND bd.combined_key = bp.combined_key) \ WHERE \ bd.case_type = 'Confirmed' \ AND bp.case_type = 'Confirmed'" ) fact_covid_country_confirmed.show(10) fact_covid_country_death = spark.sql( "SELECT DISTINCT \ bd.country_region as country_name, \ to_date(bd.Date_format) as date, \ case when bd.case_type = 'Confirmed' \ then (bd.cases - bp.cases) end as confirmed_cases, \ case when bd.case_type = 'Deaths' \ then (bd.cases - bp.cases) end as death_cases, \ case when bd.case_type = 'Confirmed' \ then bd.cases end as sum_confirmed_cases, \ case when bd.case_type = 'Deaths' \ then bd.cases end as sum_death_cases \ FROM country_data bd \ JOIN country_p_data bp \ ON (date_add(to_date(bd.Date_format),-1) \ = to_date(bp.Date_format) \ AND bd.case_type = bp.case_type \ AND bd.country_region = bp.country_region \ ANd bd.combined_key = bp.combined_key) \ WHERE \ bd.country_region not in ('Brazil', 'US') \ AND bp.country_region not in ('Brazil', 'US') \ AND bd.case_type = 'Deaths' \ AND bp.case_type = 'Deaths'" ) fact_covid_country = fact_covid_country_confirmed \ .unionByName(fact_covid_country_death) fact_covid_country.createOrReplaceTempView("fact_covid_country") fact_covid_country = spark.sql( "SELECT \ country_name, \ date, \ sum(confirmed_cases) as confirmed_cases, \ sum(death_cases) as death_cases, \ sum(sum_confirmed_cases) as sum_confirmed_cases, \ 
sum(sum_death_cases) as sum_death_cases \ FROM fact_covid_country \ GROUP BY \ country_name, date" ) fact_covid_country.show(10) # write tables to partitioned parquet files fact_covid_country.write.mode('append') \ .partitionBy('country_name', 'date') \ .parquet(output_data + 'fact_covid_country') count_fact = {} count_fact['country'] = spark.sql( "SELECT COUNT (DISTINCT country_name) as countries \ FROM fact_covid_country").collect()[0].countries count_fact['date'] = spark.sql( "SELECT COUNT (DISTINCT date) as dates \ FROM fact_covid_country").collect()[0].dates return count_fact
def process_covid_dimension(spark, input_data, covid19_lake, output_data): """ Write country, province and time dimensions to parquet files on S3. Keyword arguments: spark -- a spark session input_data -- the script reads data from S3 or public datalake covid19_lake -- the script reads data from S3 or public datalake output_data -- the script writes dimension to partitioned parquet on S3 """ # get filepath to dimensions covid_global_data = covid19_lake + 'tableau-jhu/csv/COVID-19-Cases.csv' covid_brazil_data = input_data + 'COVID-19-Brazil.csv.gz' brazil_provinces = input_data + 'provinces_brazil.csv' # define the data frames global_data_df = spark.read.load( covid_global_data, format="csv", sep=",", inferSchema="true", header="true") global_data_df = global_data_df.dropDuplicates() global_data_df.createOrReplaceTempView("global_data") brazil_data_df = spark.read.load( covid_brazil_data, format="csv", sep=",", inferSchema="true", header="true") brazil_data_df = brazil_data_df.dropDuplicates() brazil_data_df.createOrReplaceTempView("brazil_data") brazil_provinces_df = spark.read.load( brazil_provinces, format="csv", sep=",", inferSchema="true", header="true") brazil_provinces_df = brazil_provinces_df.dropDuplicates() brazil_provinces_df.createOrReplaceTempView("brazil_provinces") date_df = global_data_df.filter("country_region = 'Brazil'") date_df = date_df.select("Date") date_df = date_df.dropDuplicates() split_date = F.split(date_df["Date"],'/') date_df = date_df.withColumn('Month', split_date.getItem(0)) date_df = date_df.withColumn('Day', split_date.getItem(1)) date_df = date_df.withColumn('Year', split_date.getItem(2)) date_df = date_df.select( date_df.Date, F.lpad(date_df.Month,2,'0').alias('Month'), F.lpad(date_df.Day,2,'0').alias('Day'), F.lpad(date_df.Year,4,'0').alias('Year')) date_df = date_df.select( date_df.Date, date_df.Month, date_df.Day, date_df.Year, F.to_date(F.concat_ws('-',date_df.Month, date_df.Day, date_df.Year), 'MM-dd-yyyy').alias('Date_format')) date_df = date_df.select( date_df.Date, date_df.Date_format, date_df.Year, F.weekofyear(date_df.Date_format).alias('Week'), date_df.Month, date_df.Day, F.dayofweek(date_df.Date_format).alias('Week_Day')) date_df.show(2) date_df.createOrReplaceTempView("date") # extract columns to create tables dim_country = spark.sql( "SELECT DISTINCT \ Country_Region, \ iso2, \ iso3, \ sum(Population_Count) as population_count \ FROM global_data \ WHERE \ case_type = 'Deaths' \ AND Date = '5/19/2020' \ GROUP BY \ Country_Region, \ iso2, \ iso3" ) dim_province_us = spark.sql( "SELECT DISTINCT \ country_region, \ province_state, \ max(Population_Count) as population_count \ FROM global_data \ WHERE \ case_type = 'Deaths' \ AND country_region = 'US' \ AND Date = '5/19/2020' \ GROUP BY \ Country_Region, \ province_state" ) dim_province_br = spark.sql( "SELECT DISTINCT \ 'Brazil' as country_region, \ bp.province_state as province_state, \ sum(estimated_population_2019) as population_count \ FROM brazil_data bd \ JOIN brazil_provinces bp ON bd.state = bp.state \ WHERE \ place_type = 'state' \ AND date = '2020-05-19' \ GROUP BY \ Country_Region, \ province_state" ) dim_province = dim_province_us.unionByName(dim_province_br) dim_time = spark.sql( "SELECT DISTINCT \ date, year, week, month, day, week_day \ FROM date" ) # write tables to partitioned parquet files dim_country.write.mode('overwrite') \ .partitionBy('country_region') \ .parquet(output_data + 'dim_country') dim_province.write.mode('overwrite') \ .partitionBy('country_region', 
'province_state') \ .parquet(output_data + 'dim_province') dim_time.write.mode('overwrite') \ .partitionBy('year','month') \ .parquet(output_data + 'dim_time') count_dim = {} count_dim['country'] = spark.sql( "SELECT COUNT (DISTINCT country_region) as countries \ FROM global_data").collect()[0].countries count_dim['date'] = spark.sql( "SELECT COUNT (DISTINCT date) as dates \ FROM global_data").collect()[0].dates return count_dim
def main(): # Download data now = datetime.datetime.now().strftime("%m-%d-%H-%M-%S") folder = config['local_folder'] os.system(f"rm -r {folder}") # Daily dataset ##big_table_name = 'vartefact.forecast_sprint4_add_dm_to_daily' # Promo dataset big_table_name = f"{config['database']}.forecast_sprint4_promo_mecha_v4" file_name = 'test_to_delete_' + now download_data(folder=folder, big_table_name=big_table_name, file_name=file_name) os.system( f"cp {config['config_folder']}/calendar.pkl {config['local_folder']}") # Preprocess the datase big_table = file_name + '.csv' sql_table = big_table_name target_value = 'dm_sales_qty' dataset_name = 'dataset_test' preprocess_promo(folder=folder, big_table=big_table, sql_table=sql_table, target_value=target_value, dataset_name=dataset_name) # Train the model desc = 'promo_test' data_name = dataset_name + '.pkl' target_value = 'dm_sales_qty' date_stop_train = config['date_stop_train'] learning_rate = 0.3 folder_name = train(desc=desc, folder=folder, data_name=data_name, target_value=target_value, learning_rate=learning_rate, date_stop_train=date_stop_train) # Make a prediction prediction(folder_name, folder, data_name, target_value, learning_rate, date_stop_train) # save csv as table print('saving csv as table') file_list = os.listdir(config['local_folder']) for f in file_list: if desc in f: result_file_name = f result_csv_path = f"{config['local_folder']}/{result_file_name}/promo_sales_order_prediction_by_item_store_dm.csv" os.system( f"hadoop fs -rm promo_sales_order_prediction_by_item_store_dm.csv") os.system(f"hadoop fs -put -f {result_csv_path}") spark = SparkSession \ .builder \ .appName("Forecast_saveastable") \ .config("spark.sql.warehouse.dir", warehouse_location) \ .config("spark.num.executors", '10') \ .config("spark.executor.memory", '15G') \ .config("spark.executor.cores", '20') \ .enableHiveSupport() \ .getOrCreate() sqlContext = SQLContext(spark) spark_df = sqlContext.read.format('com.databricks.spark.csv').options( header='true').load( f"promo_sales_order_prediction_by_item_store_dm.csv") spark_df = spark_df.withColumn("item_id", spark_df["item_id"].cast(IntegerType())) spark_df = spark_df.withColumn("sub_id", spark_df["sub_id"].cast(IntegerType())) spark_df = spark_df.withColumn( "current_dm_theme_id", spark_df["current_dm_theme_id"].cast(IntegerType())) spark_df = spark_df.withColumn("store_code", lpad(spark_df['store_code'], 3, '0')) spark_df = spark_df.withColumn("sales", spark_df["sales"].cast(FloatType())) spark_df = spark_df.withColumn( "sales_prediction", spark_df["sales_prediction"].cast(FloatType())) spark_df = spark_df.withColumn( "squared_error_predicted", spark_df["squared_error_predicted"].cast(FloatType())) spark_df = spark_df.withColumn( "std_dev_predicted", spark_df["std_dev_predicted"].cast(FloatType())) spark_df = spark_df.withColumn( "confidence_interval_max", spark_df["confidence_interval_max"].cast(FloatType())) spark_df = spark_df.withColumn( "order_prediction", spark_df["order_prediction"].cast(FloatType())) spark_df = spark_df.withColumn('sub_family_code', regexp_replace('sub_family_code', '.0', '')) spark_df.write.mode('overwrite').saveAsTable( f"{config['database']}.promo_sales_order_prediction_by_item_store_dm") spark.stop() sql = f""" invalidate metadata {config['database']}.promo_sales_order_prediction_by_item_store_dm """ impalaexec(sql) print('csv saved in the table')
.option("inferSchema","true")\ .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv') #data schema 확인 df.printSchema() #initcap : 주어진 문자열에서 공백을 나눠 첫글자를 대문자로 반환 df.select(initcap(col("Description"))).show(2, False) #lower // upper df.select(lower(col("StockCode"))).show(2) #공백 추가 및 제거 (lit,ltrim,rtrim,rpad,lpad,trim) from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lpad"), rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2) ##정규 표현식 #description컬럼의 값을 COLOR 값으로 치환 from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) #주어진 문자를 다른 문자로 치환 from pyspark.sql.functions import translate df.select(translate(col("Description"), "WHI", "123")).show(2) #color name 추출
df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# select description, lower(Description), upper(lower(Description)) from dfTable

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
    'ltrim("HELLO") as ltrim',
    'rtrim("HELLO") as rtrim',
    'trim("HELLO") as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 3, " ") as rp').show(2)
# select
#   ltrim('    HELLO    '),
#   rtrim('    HELLO    '),
#   trim('    HELLO    '),
#   lpad('HELLO', 3, ' '),
#   rpad('HELLO', 10, ' ')
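# One behaviour worth noting in the padding examples above: when the requested
# length is shorter than the input, lpad/rpad truncate rather than pad, which is
# why lpad('HELLO', 3, ' ') yields 'HEL'. Minimal check (sketch only):
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, lpad, rpad

spark = SparkSession.builder.getOrCreate()
spark.range(1).select(
    lpad(lit("HELLO"), 3, " ").alias("lp"),    # 'HEL'
    rpad(lit("HELLO"), 10, " ").alias("rp"),   # 'HELLO     '
).show()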
print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves), flush=True) print("Execution time #0 %f" % (time.time() - startTime), flush=True) # ## Read two mapping files into dataframes # - Read files from Amazon S3 bucket into Spark dataframes # - Format columns as required to enable joins to dataset below # read and process city FIPS to county FIPS mapping file city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \ .csv(latlonFilename) # .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv") city_to_fips = city_to_fips.withColumn( "county_FIPS", f.lpad(city_to_fips['county_FIPS'], 5, "0")) city_to_fips = city_to_fips.drop("city", "zip", "id", "source", "population") city_to_fips = city_to_fips.withColumn( "city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.')) city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \ .withColumnRenamed("state_name","StateDesc") \ .withColumnRenamed("county_FIPS","FIPS") print((city_to_fips.count(), len(city_to_fips.columns))) city_to_fips.limit(5).toPandas() # read and process commuting zone to county FIPS mappingfile cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \ .csv(lmaFilename) # .csv("/home/ubuntu/project/data/1990LMAascii.csv")
def process_temperatures_data(spark, input_data, output_data):
    """
    input parameters:
    - spark: Spark session object
    - input_data: s3a route to public udacity input data
    - output_data: s3 route to private results

    output parameters:
    - none

    Description: Reads the input source for temperatures data, performs ETL and
    wrangling, and stores the final information in the output source in parquet
    format
    """
    # get filepath to temperatures data file
    temp_data_path = input_data + "city_temperature.zip"
    print("Getting Path to Temperatures ...")

    # Read in the data here
    df3 = pd.read_csv(temp_data_path, sep=',', low_memory=False, compression='zip')

    # Filter for only US cities
    df3 = df3[df3['Country'] == 'US']

    # Drop unused columns and duplicates
    df3.drop('Region', axis='columns', inplace=True)
    df3.drop('Country', axis='columns', inplace=True)
    df3 = df3.drop_duplicates()

    # Create a reverse dict from SAS i94addrl
    i94addrl_r = dict(zip(i94addrl.values(), i94addrl.keys()))

    # Convert to spark Dataframe
    df3_spark = spark.createDataFrame(df3)

    # Define a lookup UDF and create a stateCode column based on the State name
    def getState(key, default=None):
        if str(key).upper() in i94addrl_r:
            return i94addrl_r[str(key).upper()]
        return default

    get_i94addrl_r = udf(lambda x: getState(x))
    df3_spark = df3_spark.withColumn("stateCode", get_i94addrl_r(df3_spark.State))

    # Create cityCode by concatenation
    df3_spark = df3_spark.withColumn(
        'cityCode', sf.concat(sf.col('City'), sf.lit('_'), sf.col('stateCode')))

    # Group the observation date into a single column
    df3_spark = df3_spark.withColumn(
        'temperatureDate',
        sf.concat(sf.col('Year'), sf.lit('-'),
                  sf.lpad(df3_spark['Month'], 2, '0'), sf.lit('-'),
                  sf.lpad(df3_spark['Day'], 2, '0')))

    # Remove unused columns
    df3_spark = df3_spark.drop('State')
    df3_spark = df3_spark.drop('City')

    # Remove null values
    df3_spark = df3_spark.na.drop()

    # Write to parquet file (pending partition by STATECODE / YEAR / MONTH)
    df3_spark.write.mode('append').partitionBy("stateCode", "Year", "Month").parquet(
        output_data + "temperatures/temperatures.parquet")
    print("Writing Temperatures Table to Parquet")
    print("Number of rows written: ", df3_spark.count())
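# Minimal sketch of the temperatureDate assembly above: integer Year/Month/Day
# columns become a zero-padded string that to_date can parse. The sample row is
# an assumption for illustration.
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

spark = SparkSession.builder.getOrCreate()
ymd = spark.createDataFrame([(1995, 7, 3)], ["Year", "Month", "Day"])

ymd.select(
    sf.to_date(
        sf.concat(sf.col("Year"), sf.lit("-"),
                  sf.lpad(sf.col("Month"), 2, "0"), sf.lit("-"),
                  sf.lpad(sf.col("Day"), 2, "0")),
        "yyyy-MM-dd",
    ).alias("temperatureDate")
).show()   # -> 1995-07-03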
def transform(self, sources: dict) -> DataFrame: ri = self.invoice_dataframe(sources['rptt_invoice']) rst = self.read_source(source=sources['rptm_sbu_subset_txt']) cmf = self.read_source(source=sources['customer_mapping']) cmf = cmf.withColumnRenamed('sales_rep', 'sales_rep_override') cmf = cmf.withColumnRenamed('sales_rep_id', 'sales_rep_id_override') cmf = cmf.withColumnRenamed('end_market', 'cmf_end_market') mmf = self.read_source(source=sources['material_mapping']) srtr = self.read_source(source=sources['sales_rep_to_region']) rsrt = self.read_source(source=sources['rptm_sales_rep_txt']) rsrt = rsrt.withColumnRenamed('med_desc', 'sales_rep_original') edataA = self.read_source(source=sources['exclusion_dataA']) edataA = edataA.withColumnRenamed('sold_customer_id', 'edataA_sold_customer_id') # Source contains system_id/material_id pairs that need excluded excmat = self.read_source(source=sources['exclude_mat']) cerd = self.read_source(source=sources['currency_exchange_rates']) cerd = fixCurExchangeToAvg(self, cerd) cers = cerd.select('currency_code_from', 'cur_year', 'cur_month', 'conversion_rate_multiplier') cers = cers.withColumnRenamed('currency_code_from', 'std_currency_code_from') cers = cers.withColumnRenamed('cur_year', 'std_cur_year') cers = cers.withColumnRenamed('cur_month', 'std_cur_month') cers = cers.withColumnRenamed('conversion_rate_multiplier', 'std_conversion_rate_multiplier') dcust_sold = self.read_source(source=sources['dim_customer']) dcust_sold = dcust_sold.withColumnRenamed('dim_customer_id', 'sold_dim_customer_id') dcust_ship = self.read_source(source=sources['dim_customer']) dcust_ship = dcust_ship.withColumnRenamed('dim_customer_id', 'ship_dim_customer_id') dcust_brand = self.read_source(source=sources['dim_customer']) dcust_brand = dcust_brand.withColumnRenamed('dim_customer_id', 'brand_dim_customer_id') dloc_ship = self.read_source(source=sources['dim_location']) dloc_ship = dloc_ship.withColumnRenamed('dim_location_id', 'ship_from_dim_location_id') dloc_inv = self.read_source(source=sources['dim_location']) dloc_inv = dloc_inv.withColumnRenamed('dim_location_id', 'invoice_dim_location_id') dmat = self.read_source(source=sources['dim_material']) dmat = dmat.withColumnRenamed('dim_material_id', 'ship1_dim_material_id') df = (ri.join(excmat, [ excmat.material_id == ri.ship1_material_id_int, excmat.system == ri.system_id ], 'left_anti').join( rst, [rst.sbu_subset_id == ri.sbu_subset_id], 'left_outer').join( mmf, [mmf.material == ri.mmf_material], 'left_outer').join(dmat, [ dmat.billing_system == ri.system_id, dmat.material_id == ri.ship_mat1_id, dmat.end_market_or_prime == F.when( ri.prime_flag == 1, 'Prime').otherwise('Non-Prime') ], 'left_outer').join(cmf, [ F.upper(F.trim(cmf.sold_to_ship_to)) == ri.commercial_print_customer_key, F.upper(F.trim(cmf.cmf_end_market)) == F.upper( dmat.end_market) ], 'left_outer').join(srtr, [ srtr.sales_rep_id == cmf.sales_rep_id_override ], 'left_outer').join(cerd, [ cerd.currency_code_from == ri.currency_id, cerd.cur_year == ri.inv_year, cerd.cur_month == ri.inv_month ], 'left_outer').join(cers, [ cers.std_currency_code_from == ri.std_cost_currency_id, cers.std_cur_year == ri.inv_year, cers.std_cur_month == ri.inv_month ], 'left_outer').join(dcust_sold, [ dcust_sold.billing_system == ri.system_id, dcust_sold.customer_id == ri.sold_customer_id ], 'left_outer').join(dcust_ship, [ dcust_ship.billing_system == ri.system_id, dcust_ship.customer_id == ri.ship_customer_id ], 'left_outer').join(dcust_brand, [ dcust_brand.billing_system 
== ri.system_id, dcust_brand.customer_id == ri.brand_owner ], 'left_outer').join(dloc_ship, [ dloc_ship.location_id == ri.ship_location_id ], 'left_outer').join( dloc_inv, [dloc_inv.location_id == ri.mfg_location_id], 'left_outer').join(edataA, [ edataA.edataA_sold_customer_id == ri.sold_customer_id_lstrip_0, ri.system_id == 'S3', ri.rev_acct_id == 'R6000' ], 'left_anti').join( rsrt, [rsrt.sales_rep_id == ri.ri_sales_rep_id], 'left_outer').select( ri.system_id, ri.invoice_id, ri.line_number, ri.month, ri.source_type, ri.rev_acct_id, ri.weight_qty, ri.currency_id, ri.std_cost_currency_id, ri.inv_date, ri.quality_class, ri.sale_type, ri.invoice_line_value, ri.line_qty, ri.invoice_uom_id, ri.inv_line_std_cost, ri.period, ri.year, ri.sales_order, ri.ri_sales_rep_id, ri.line_desc1, rst.med_desc, mmf.cp_subset, cmf.channel, cmf.drop_ship_into_stock, cmf.sales_rep_override, cmf.cmf_end_market, cmf.sales_rep_id_override, cerd.conversion_rate_multiplier, cers.std_conversion_rate_multiplier, dmat.ship1_dim_material_id, dmat.product_code, dmat.force_product_code, dmat.nominal_basis_weight, dmat.material_id, dmat.end_market, dloc_ship.ship_from_dim_location_id, dloc_inv.invoice_dim_location_id, dcust_ship.ship_dim_customer_id, dcust_sold.sold_dim_customer_id, dcust_brand.brand_dim_customer_id, rsrt.sales_rep_original, srtr.region, ri.invoice_volume)) df = df.where( "case when system_id = 'S3' then product_code else '~' end not in ('SC', 'CR')" ) df = df.withColumn('iptmeta_source_system', F.lit('dataA')) df = df.withColumn('bol_number', F.lit(MISSING_NUMBER)) df = df.withColumn( 'product_sold_flag', F.when((df.weight_qty.isNull()) | (df.weight_qty == 0), F.lit('N')).otherwise(F.lit('Y'))) df = df.withColumn( 'fx_conversion_to_usd', F.coalesce( F.when(df.currency_id == 'USD', 1).otherwise( df.conversion_rate_multiplier.cast(T.DoubleType())), F.lit(MISSING_NUMBER))) df = df.withColumn( 'std_fx_conversion_to_usd', F.coalesce( F.when(df.std_cost_currency_id == 'USD', 1).otherwise( df.std_conversion_rate_multiplier.cast(T.DoubleType())), F.lit(MISSING_NUMBER))) df = df.withColumn('grade', df.product_code) df = df.withColumn('invoice_date', F.to_date(df.inv_date)) df = prime_enrich(df) df = df.withColumn('sales_order_number', F.coalesce(df.sales_order, F.lit('0'))) df = df.withColumn( 'sale_type', F.when(df.sale_type == 'I', F.lit('Internal')).when( df.sale_type == 'E', F.lit('External')).otherwise(df.sale_type)) df = df.withColumn( 'subset', F.coalesce(df.cp_subset, df.med_desc, F.lit(NOT_APPLICABLE_DESC))) df = ( df.withColumn( 'claims', F.when(df.rev_acct_id.isin('R4900', 'R4350'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'discounts', F.when(df.rev_acct_id.isin('R4500'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). withColumn("freight_invoice_calc", F.lit('actual')).withColumn( 'freight_invoice', F.when(df.rev_acct_id.isin('R8200'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'freight_upcharge', F.when(df.rev_acct_id.isin('R0300'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). withColumn( 'gross_price', F.when(df.rev_acct_id.isin('R0100', 'R0500', 'R0700', 'R0105'), df.invoice_line_value * df.fx_conversion_to_usd). otherwise(MISSING_NUMBER)).withColumn( 'other_deductions', F.when(df.rev_acct_id.isin('R5300'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)). 
withColumn( 'standard_cost', F.coalesce( df.inv_line_std_cost * df.std_fx_conversion_to_usd, F.lit(MISSING_NUMBER))).withColumn( 'rebates', F.when( df.rev_acct_id.isin( 'R4110', 'R4130'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER)) # TODO Confirm exclusions and/or data predicate should be here .withColumn( 'service_allowances', F.when(df.rev_acct_id.isin('R6000'), df.invoice_line_value * df.fx_conversion_to_usd).otherwise(MISSING_NUMBER))) df = df.withColumn( 'msf', F.when(df.invoice_uom_id == 'MSF', df.line_qty).when(df.invoice_uom_id == 'M2', df.line_qty * .0107639).otherwise(0)) df = df.withColumn('nominal_tons', df.nominal_basis_weight * df.msf / 2000) df = df.withColumn( 'net_price', df.gross_price + df.discounts + df.rebates + df.claims + df.freight_upcharge + df.other_deductions + df.service_allowances) df = df.withColumn( 'standard_gross_margin', df.net_price - (df.standard_cost + df.freight_invoice)) df = dataA_sales_rep_override(df) df = df.withColumn( 'sales_rep_id', F.coalesce(df.sales_rep_id_override, df.ri_sales_rep_id, F.lit(MISSING_NUMBER))) df = ( df.withColumn( 'ship_from_dim_location_id', F.coalesce( df.ship_from_dim_location_id, F.lit(MISSING_STRING_ID))).withColumn( 'invoice_dim_location_id', F.coalesce( df.invoice_dim_location_id, F.lit(MISSING_STRING_ID))).withColumn( 'ship1_dim_material_id', F.coalesce( df.ship1_dim_material_id, F.lit(MISSING_STRING_ID))).withColumn( 'channel', F.coalesce( df.channel, F.lit(MISSING_DESC))).withColumn( 'drop_ship_into_stock', F.coalesce( df.drop_ship_into_stock, F.lit(MISSING_DESC))). withColumn('region', F.coalesce( df.region, F.lit(MISSING_DESC))).withColumn( 'ship_dim_customer_id', F.coalesce( df.ship_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'sold_dim_customer_id', F.coalesce( df.sold_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'brand_dim_customer_id', F.coalesce( df.brand_dim_customer_id, F.lit(MISSING_STRING_ID))).withColumn( 'invoice_period', F.lpad(df.month, 6, '0'))) df = (df.withColumnRenamed( 'system_id', 'billing_system').withColumnRenamed( 'rev_acct_id', 'invoice_line_code').withColumnRenamed( 'invoice_id', 'invoice_number').withColumnRenamed( 'line_number', 'invoice_line_number').withColumnRenamed( 'source_type', 'invoice_source_type').withColumnRenamed( 'channel', 'commercial_print_channel').withColumnRenamed( 'drop_ship_into_stock', 'commercial_print_mode').withColumnRenamed( 'region', 'commercial_print_region'). 
withColumnRenamed('currency_id', 'invoiced_currency').withColumnRenamed( 'weight_qty', 'actual_tons').withColumnRenamed( 'period', 'report_month').withColumnRenamed( 'year', 'report_year').withColumnRenamed( 'line_desc1', 'invoice_line_desc_1')) df = df.select( df.billing_system, df.invoice_number, df.invoice_line_number, df.invoice_period, df.invoice_source_type, df.invoice_line_code, df.iptmeta_source_system, df.product_sold_flag, df.commercial_print_channel, df.commercial_print_mode, df.fx_conversion_to_usd, df.grade, df.invoice_date, df.ship_from_dim_location_id, df.invoiced_currency, df.ship1_dim_material_id, df.prime, df.sales_order_number, df.sale_type, df.sales_representative, df.ship_dim_customer_id, df.sold_dim_customer_id, df.brand_dim_customer_id, df.subset, df.actual_tons, df.claims, df.discounts, df.freight_invoice, df.freight_invoice_calc, df.freight_upcharge, df.gross_price, df.msf, df.net_price, df.nominal_tons, df.other_deductions, df.rebates, df.service_allowances, df.standard_cost, df.standard_gross_margin, df.invoice_dim_location_id, df.commercial_print_region, df.invoice_volume, df.invoice_uom_id, df.bol_number, df.report_month, df.report_year, df.sales_rep_id, df.invoice_line_desc_1) return df
def generate_data(sale_hdfs_path, store_path): spark = SparkSession.builder.\ master("local[*]").\ appName("spark_df").\ config("spark.some.config.option", "some-value").\ getOrCreate() # sale qty try: salesfile = "hdfs://10.230.0.11:8020" + sale_hdfs_path sale_df = spark.read.parquet(salesfile) except: salesfile = "hdfs://10.230.0.10:8020" + sale_hdfs_path sale_df = spark.read.parquet(salesfile) sele_col = ["store_id", "good_gid", "date", "fill_sale_qty"] sale_df = sale_df.select(*sele_col) # set_name = ["store_id","good_gid","date","fill_sale_qty"] # sale_df = sale_df.toDF(*set_name) #94626013 from pyspark.sql.functions import lpad # df22 = sale_df.withColumn("store_id", lpad(sale_df['store_id'],6,'0')).withColumn("good_gid", sale_df['good_gid'].cast('string')).cache() df22 = sale_df.withColumn("store_id", lpad(sale_df['store_id'], 6, '0')).withColumn( "good_gid", sale_df['good_gid'].cast('string')) # store try: storfile = "hdfs://10.230.0.11:8020" + store_path store_df = spark.read.parquet(storfile) except: storefile = "hdfs://10.230.0.10:8020" + store_path store_df = spark.read.parquet(storefile) # store_path = "/home/test/nby/data_dir/storeinfo.csv" # store_df = spark.read.format("com.databricks.spark.csv").option("inferSchema","true").option("header","true").load(store_path) store_col = ["store_code", "wh_name", "alcscm"] store_df = store_df.select(*store_col) store_set = ["store_id", "warename", "ALCSCM"] store_df = store_df.toDF(*store_set) from pyspark.sql import functions as F store_df = store_df.withColumn( 'ALCSCM', F.when(store_df['ALCSCM'] == '-', '01').otherwise(store_df['ALCSCM'])) store_df = store_df.withColumn("store_id", lpad(store_df['store_id'], 6, '0')) df22_join = df22.join(store_df, df22.store_id == store_df.store_id, "inner").drop(store_df.store_id) from pyspark.sql.functions import concat, col, lit from pyspark.sql.functions import year, month, dayofmonth df22_join = df22_join.withColumn( "date", concat(year("date"), lit("-"), lpad(month("date"), 2, '0'))) # 89847788 # 3148347 # count store df22_join_uniq = df22_join.dropDuplicates(['store_id', 'good_gid', 'date']) # 14467 df22_join_uniq_gb = df22_join_uniq.groupBy("good_gid", "date", "ALCSCM").count() # sum qty df22_join_sum = df22_join.groupBy( "good_gid", "date", "ALCSCM", "warename").sum("fill_sale_qty").withColumnRenamed( 'sum(fill_sale_qty)', 'sum_sale_qty') # df22_join_uniq_gb.filter("good_gid =='30018144' AND date =='2017-09'").show() df22_join_storecount = df22_join_sum.alias("a").join( df22_join_uniq_gb.alias("b"), (df22_join_sum.good_gid == df22_join_uniq_gb.good_gid) & (df22_join_sum.ALCSCM == df22_join_uniq_gb.ALCSCM) & (df22_join_sum.date == df22_join_uniq_gb.date), "left_outer").select("a.good_gid", "a.date", "a.ALCSCM", "a.warename", "a.sum_sale_qty", "b.count") # df22_join.groupBy("good_gid","date","ALCSCM").agg(sum("fill_sale_qty").alias("sum_sale_qty"),func.countDistinct("good_gid","date","ALCSCM")).show() sale_data_top100 = df22_join_storecount.toPandas() sale_data_top100['good_gid'] = sale_data_top100['good_gid'].astype(str) sale_data_top100 = sale_data_top100.sort_values( by=['warename', 'good_gid', 'date'], ascending=True) sale_data_top100['ALCSCM'] = sale_data_top100['ALCSCM'].astype(str).map( lambda x: x.zfill(2)) sale_data_top100['sale_qty_perstore'] = sale_data_top100[ 'sum_sale_qty'] / sale_data_top100['count'] sale_data_top100['uniq_ID'] = sale_data_top100[ 'good_gid'] + '_' + sale_data_top100['ALCSCM'] sale_data_top100 = sale_data_top100.loc[sale_data_top100.ALCSCM != '09', :] return 
sale_data_top100
# for the cols. However, if you want to custom specify a schema, you can use the approach below. A benefit for the below # approach is that you might have, through your pre-processing data validation stage identified data types most suited # to your dataset. Additionally, you can pass in the lists below from various sources including as parameters # to your pipeline. to_int_cols = [ 'WindSpeed', 'WindDirection', 'WindGust', 'Pressure', 'SignificantWeatherCode' ] to_long_cols = ['ForecastSiteCode', 'Visibility'] to_date_cols = ['ObservationDate'] to_double_cols = ['ScreenTemperature', 'Latitude', 'Longitude'] # the assumption was that time fields in the weather datasets were of int type, and required formatting to # a time format clean_df = clean_df.withColumn( 'ObservationTime', F.lpad(clean_df['ObservationTime'], 4, '0').substr(3, 4)) clean_df = clean_df.withColumn('ObservationTime', F.rpad(clean_df['ObservationTime'], 6, '0')).\ withColumn("ObservationTime", (F.regexp_replace('ObservationTime',"""(\d\d)""", "$1:")).substr(0,8)) # clean_df.select('ObservationTime').distinct().show() # using a cast function from spark to modify the data types for col in clean_df.columns: try: if col in to_int_cols: clean_df = clean_df.withColumn(col, F.col(col).cast('int')) elif col in to_long_cols: clean_df = clean_df.withColumn(col, F.col(col).cast('long')) elif col in to_date_cols: clean_df = clean_df.withColumn(col, F.col(col).cast('date')) elif col in to_double_cols: clean_df = clean_df.withColumn(col, F.col(col).cast('double'))
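# Worked trace (sketch only) of the ObservationTime formatting above for an
# assumed raw value of 17, i.e. an integer hour:
#   lpad(17, 4, '0')         -> '0017'
#   .substr(3, 4)            -> '17'
#   rpad(..., 6, '0')        -> '170000'
#   regexp_replace '(\d\d)'  -> '17:00:00:'   then .substr(0, 8) -> '17:00:00'
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
obs = spark.createDataFrame([(17,)], ["ObservationTime"])

obs.select(
    F.regexp_replace(
        F.rpad(F.lpad("ObservationTime", 4, "0").substr(3, 4), 6, "0"),
        r"(\d\d)", "$1:"
    ).substr(0, 8).alias("hh_mm_ss")   # '17:00:00'
).show()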
def collect_orderby(sdf, cols, groupby, orderby=None, suffix='__collect', sep='____', orderby_func={}, dtype=StringType(), ascending=True, drop_null=True): # 暂时不考虑空值填充,orderby=None时将去除空值 """ Paramter: ---------- sdf: pyspark dataframe to be processed cols: str/list of the sdf'cols to be processed groupby: str/list of sdf' cols to be groupbyed when collect_orderby orderby: str/list of sdf' cols to be orderbyed when collect_orderby suffix: str of cols' names converted bycollect_orderby(renamed by cols+suffix) sep: str of the sep when concat_ws(don't change by default) dtype: pyspark.sql.types of the return values Return: ---------- sdf: pyspark dataframe of collect_list orderby Example: ---------- sdf=collect_orderby(sdf,cols,groupby='user_id',orderby='time') """ # cols:需collect_list项 # groupby:为空时可传入[] # orderby:必为string、int、float项(也可有int,float型) assert not orderby_func or orderby orderby_agg_func = [] orderby_agg_cols = [] orderby_copy_cols_dict, orderby_recover_cols_dict = {}, { } # 用于orderby中有非string字段进行collect时的名称统一 if not isinstance(cols, list): cols = [cols] if not isinstance(groupby, list): groupby = [groupby] if orderby is None: orderby = [] orderby_func = {} if not isinstance(orderby, list): orderby = [orderby] # 如果orderby有字段也要进行collect且是非string类型时,需要做一次字段复制,否则会将1变成'01' for i, c in enumerate(orderby): if c in cols and dict(sdf.select(orderby).dtypes)[c] != 'string': c_orderby = f"{c}{sep}orderby" sdf = sdf.withColumn(c_orderby, F.col(c)) orderby[i] = c_orderby orderby_copy_cols_dict[c_orderby] = c if not isinstance(orderby_func, dict): if not isinstance(orderby_func, list): orderby_func = [orderby_func] orderby_func = dict(zip(orderby, [orderby_func] * len(orderby))) if not drop_null: split_udf = F.udf( lambda x: [ i.split(sep)[-1] if len(i.split(sep)) > 1 else None #当原始字段包含sep时,这里将有问题!!!! for i in x ], ArrayType(dtype)) else: split_udf = F.udf( lambda x: [ i.split(sep)[-1] #当原始字段包含sep时,这里将有问题!!!! 
for i in x if len(i.split(sep)) > 1 ], ArrayType(dtype)) for c in [ k for k, v in sdf.dtypes if k in cols and len(re.findall(re.compile(r'^(array|vector)'), v)) > 0 ]: logstr(f'{c}类型转换为StringType') sdf = sdf.withColumn(c, cast2str_udf(c)) # 不符合要求的先统计转为StringType() logstr('orderby', orderby) if len(orderby) != 0: # 处理orderby_func for c, f_list in orderby_func.items(): if not isinstance(f_list, list): f_list = [f_list] for i, f in enumerate(f_list): if c not in orderby: continue if isinstance(f, str): f = f_list[i] = eval(f"F.{f}") key = f"{c}{sep}{f.__name__}" orderby_agg_func.append(f(c).alias(key)) orderby_agg_cols.append(key) if c in orderby_copy_cols_dict: orderby_recover_cols_dict[ key] = f"{orderby_copy_cols_dict[c]}{sep}{f.__name__}" # 处理非字符型orderby order_int_list = [ k for k, v in sdf.dtypes if k in orderby and len(re.findall(re.compile(r'(int)'), v)) > 0 ] order_float_list = [ k for k, v in sdf.dtypes if k in orderby and len(re.findall(re.compile(r'(float|double)'), v)) > 0 ] if order_int_list: logstr('order_int_list', order_int_list) order_int_max_sdf = sdf.select(order_int_list).agg( *[F.max(c).alias(c) for c in order_int_list]) order_int_max_df = sdf.select(order_int_list).agg( *[F.max(c).alias(c) for c in order_int_list]).toPandas() order_int_max_dict = dict( zip(order_int_max_df.keys(), order_int_max_df.values.flatten().tolist())) logstr('order_int_max_dict', order_int_max_dict) for c in order_int_list: sdf = sdf.withColumn( c, F.lpad( F.col(c).cast(StringType()), len(str(order_int_max_dict[c])), '0')) if order_float_list: logstr('order_float_list', order_float_list) for c in order_float_list: sdf = sdf.withColumn(c, F.col(c).cast(StringType())) max_df = sdf.select(F.split(c, r"\.").alias(c)).select([ F.length(F.col(c)[i]).alias(c + f"__{i}") for i in range(2) ]).agg(*[ F.max(c + f"__{i}").alias(c + f"__{i}") for i in range(2) ]).toPandas() max_dict = dict( zip(max_df.keys(), max_df.values.flatten().tolist())) logstr('max_dict', max_dict) sdf = sdf.withColumn( c, F.lpad( F.col(c).cast(StringType()), max_dict[c + "__0"], '0')).withColumn( c, F.rpad( F.col(c).cast(StringType()), max_dict[c + "__1"], '0')) agg_fun_list = [ F.sort_array(F.collect_list(f"%s{sep}{c}" % '_'.join(orderby)), asc=ascending).alias(c + '_temp') for c in cols ] # 这里对于Null值的处理仍不友好,即空值会以['a',,'b']这种形式给出 sdf = sdf.select([ F.concat_ws(sep, *orderby, c).alias(f"%s{sep}{c}" % '_'.join(orderby)) for c in cols ] + groupby + orderby) sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func)) sdf = sdf.select( [split_udf(c + '_temp').alias(c + suffix) for c in cols] + orderby_agg_cols + groupby) else: agg_fun_list = [F.collect_list(c).alias(c + '_temp') for c in cols] sdf = sdf.select(cols + groupby + orderby) sdf = sdf.groupBy(groupby).agg(*(agg_fun_list + orderby_agg_func)) sdf = sdf.select([ F.col(c + '_temp').cast(ArrayType(dtype)).alias(c + suffix) for c in cols ] + orderby_agg_cols + groupby) for c1, c2 in orderby_recover_cols_dict.items(): sdf = sdf.withColumnRenamed(c1, c2) return sdf
pd.set_option('display.width',110); pd.set_option('display.max_columns',100) pd.set_option('display.max_colwidth', 200); pd.set_option('display.max_rows', 500) print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves), flush=True) print("Execution time #0 %f" % (time.time() - startTime),flush=True) # ## Read two mapping files into dataframes # - Read files from Amazon S3 bucket into Spark dataframes # - Format columns as required to enable joins to dataset below # read and process city FIPS to county FIPS mapping file city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \ .csv(latlonFilename) # .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv") city_to_fips = city_to_fips.withColumn("county_FIPS", f.lpad(city_to_fips['county_FIPS'],5,"0")) city_to_fips = city_to_fips.drop("city","zip","id","source","population") city_to_fips = city_to_fips.withColumn("city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.')) city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \ .withColumnRenamed("state_name","StateDesc") \ .withColumnRenamed("county_FIPS","FIPS") print((city_to_fips.count(), len(city_to_fips.columns))) city_to_fips.limit(5).toPandas() # read and process commuting zone to county FIPS mappingfile cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \ .csv(lmaFilename) # .csv("/home/ubuntu/project/data/1990LMAascii.csv")
# COMMAND ---------- from pyspark.sql.functions import lower, upper df.select(col("Description"), lower(col("Description")), upper(lower(col("Description")))).show(2) # COMMAND ---------- from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lp"), rpad(lit("HELLO"), 10, " ").alias("rp")).show(2) # COMMAND ---------- from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) # COMMAND ---------- from pyspark.sql.functions import translate
def execute(self, spark: SparkSession, args: Namespace): # select and cleanup the raw events for the data date events = spark \ .table(f'{args.input_database_name}.{self.partner}') \ .withColumnRenamed('impressionid', 'impression_id') \ .withColumnRenamed('sourceid', 'measurement_source_id') \ .withColumnRenamed('advertiserid', 'partner_measured_advertiser_id') \ .withColumnRenamed('campaignid', 'partner_measured_campaign_id') \ .withColumn('partner_measured_channel_id', lit(None).cast(LongType())) \ .withColumnRenamed('placementid', 'partner_measured_placement_id') \ .withColumnRenamed('creativeid', 'partner_measured_creative_id') \ .withColumnRenamed('mediatypeid', 'media_type_id') \ .withColumnRenamed('inview50', 'in_view_50') \ .withColumnRenamed('inview100', 'in_view_100') \ .withColumnRenamed('noncontinuousplaytime', 'non_continuous_play_time') \ .withColumnRenamed('audibleplaytime', 'audible_play_time') \ .withColumnRenamed('fullscreen', 'full_screen') \ .withColumnRenamed('fraudcategory', 'fraud_category') \ .withColumnRenamed('givt', 'general_invalid') \ .filter(get_source_partitions_condition( args.data_date, args.data_date_tz_offset )) \ .dropDuplicates(['impression_id', 'type', 'timestamp']) # assemble ad sessions adsessions = events \ .groupBy('impression_id') \ .agg( first(lpad( ( ( col('utchour').cast(IntegerType()) + lit(24 + args.data_date_tz_offset / 3600).cast(IntegerType()) ) % lit(24) ).cast(StringType()), 2, '0' )).alias('hour'), collect_set('type').alias('events'), _event_prop('impression', col('site')).alias('site'), _event_prop('impression', col('measurement_source_id')).alias('measurement_source_id'), _event_prop('impression', col('partner_measured_advertiser_id')) \ .alias('partner_measured_advertiser_id'), _event_prop('impression', col('partner_measured_campaign_id')).alias('partner_measured_campaign_id'), _event_prop('impression', col('partner_measured_channel_id')).alias('partner_measured_channel_id'), _event_prop('impression', col('partner_measured_placement_id')).alias('partner_measured_placement_id'), _event_prop('impression', col('partner_measured_creative_id')).alias('partner_measured_creative_id'), _event_prop('impression', col('media_type_id')).alias('media_type_id'), _event_prop('impression', col('endpoint')).alias('endpoint'), _event_prop('impression', col('suspicious')).alias('suspicious'), _event_prop('impression', col('general_invalid')).alias('general_invalid'), _event_prop('impression', col('fraud_category')).alias('fraud_category'), _event_prop('impression', (col('measurable') == 1) | (col('measurable') == 3)) \ .alias('mrc_measurable'), _event_prop('impression', (col('measurable') == 2) | (col('measurable') == 3)) \ .alias('groupm_measurable'), first( when( ( ((col('endpoint') == 'display') & (col('type') == 'impression')) | ((col('endpoint') == 'video') & (col('type') == 'started')) ), col('in_view_50') ), ignorenulls=True ).alias('mrc_in_view_on_load'), _event_prop('unload', col('in_view_50')).alias('mrc_in_view_on_unload'), _event_prop('first-quartile', col('in_view_50')).alias('mrc_in_view_1q'), _event_prop('second-quartile', col('in_view_50')).alias('mrc_in_view_2q'), _event_prop('thrid-quartile', col('in_view_50')).alias('mrc_in_view_3q'), _event_prop('completed', col('in_view_50')).alias('mrc_in_view_4q'), _event_prop('started', col('in_view_100')).alias('groupm_in_view_on_load'), _event_prop('unload', col('in_view_100')).alias('groupm_in_view_on_unload'), _event_prop('first-quartile', col('in_view_100')).alias('groupm_in_view_1q'), 
_event_prop('second-quartile', col('in_view_100')).alias('groupm_in_view_2q'), _event_prop('thrid-quartile', col('in_view_100')).alias('groupm_in_view_3q'), _event_prop('completed', col('in_view_100')).alias('groupm_in_view_4q'), ((smax('non_continuous_play_time') - smax('audible_play_time')) > lit(1000)).alias('muted'), _event_prop('impression', col('full_screen')).alias('full_screen') ) \ .filter(array_contains(col('events'), 'impression')) \ .withColumn('date', lit(''.join(args.data_date.split('-')))) \ .withColumn('mrc_viewable', ( col('mrc_measurable') & ( ((col('endpoint') == 'display') & array_contains(col('events'), 'in-view')) | ((col('endpoint') == 'video') & array_contains(col('events'), 'in-view-50')) ) )) \ .withColumn('groupm_viewable', ( col('groupm_measurable') & array_contains(col('events'), '100-percent-in-view') )) \ .withColumn('clean', ( ~col('suspicious') & ~col('general_invalid') )) \ .select( 'date', 'hour', 'impression_id', 'site', 'measurement_source_id', 'partner_measured_advertiser_id', 'partner_measured_campaign_id', 'partner_measured_channel_id', 'partner_measured_placement_id', 'partner_measured_creative_id', 'media_type_id', lit(None).cast(BooleanType()).alias('below_the_fold'), lit(None).cast(BooleanType()).alias('on_the_fold'), lit(None).cast(BooleanType()).alias('above_the_fold'), lit(None).cast(IntegerType()).alias('time_on_page'), lit(None).cast(IntegerType()).alias('in_view_time'), (col('clean') & col('mrc_viewable')).alias('in_view'), (col('clean') & array_contains(col('events'), 'in-view-5s')).alias('in_view_5s'), (col('clean') & array_contains(col('events'), 'in-view-15s')).alias('in_view_15s'), (col('clean') & ( ((col('endpoint') == 'display') & ~array_contains(col('events'), 'in-view')) | ((col('endpoint') == 'video') & ~array_contains(col('events'), 'in-view-50')) )).alias('not_in_view'), lit(None).cast(BooleanType()).alias('never_in_view'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_on_load')).alias('in_view_load'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_on_unload')).alias('in_view_unload'), (col('clean') & array_contains(col('events'), 'first-quartile')).alias('completed_1q'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_1q')).alias('in_view_1q'), (col('clean') & array_contains(col('events'), 'second-quartile')).alias('completed_2q'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_2q')).alias('in_view_2q'), (col('clean') & array_contains(col('events'), 'thrid-quartile')).alias('completed_3q'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_3q')).alias('in_view_3q'), (col('clean') & array_contains(col('events'), 'completed')).alias('completed_4q'), (col('clean') & col('mrc_measurable') & col('mrc_in_view_4q')).alias('in_view_4q'), (col('clean') & ~array_contains(col('events'), 'started')).alias('never_started'), (col('clean') & col('muted')).alias('muted'), (col('clean') & col('full_screen')).alias('full_screen'), lit(False).alias('click_through'), (~col('clean') & col('mrc_viewable')).alias('sivt_in_view'), (~col('clean') & ~col('mrc_viewable')).alias('sivt_not_in_view'), lit(None).cast(IntegerType()).alias('groupm_time_on_page'), lit(None).cast(IntegerType()).alias('groupm_in_view_time'), (col('clean') & col('groupm_viewable')).alias('groupm_in_view'), lit(False).alias('groupm_in_view_5s'), lit(False).alias('groupm_in_view_15s'), (col('clean') & ~col('groupm_viewable')).alias('groupm_not_in_view'), lit(False).alias('groupm_never_in_view'), (col('clean') & col('groupm_measurable') 
& col('groupm_in_view_on_load')).alias('groupm_in_view_load'), (col('clean') & col('groupm_measurable') & col('groupm_in_view_on_unload')) \ .alias('groupm_in_view_unload'), (col('clean') & array_contains(col('events'), 'first-quartile')).alias('groupm_completed_1q'), (col('clean') & col('groupm_measurable') & col('groupm_in_view_1q')).alias('groupm_in_view_1q'), (col('clean') & array_contains(col('events'), 'second-quartile')).alias('groupm_completed_2q'), (col('clean') & col('groupm_measurable') & col('groupm_in_view_2q')).alias('groupm_in_view_2q'), (col('clean') & array_contains(col('events'), 'thrid-quartile')).alias('groupm_completed_3q'), (col('clean') & col('groupm_measurable') & col('groupm_in_view_3q')).alias('groupm_in_view_3q'), (col('clean') & array_contains(col('events'), 'completed')).alias('groupm_completed_4q'), (col('clean') & col('groupm_measurable') & col('groupm_in_view_4q')).alias('groupm_in_view_4q'), (col('clean') & ~array_contains(col('events'), 'started')).alias('groupm_never_started'), (col('clean') & col('muted')).alias('groupm_muted'), (col('clean') & col('full_screen')).alias('groupm_full_screen'), lit(False).alias('groupm_click_through'), (~col('clean') & col('groupm_viewable')).alias('groupm_sivt_in_view'), (~col('clean') & ~col('groupm_viewable')).alias('groupm_sivt_not_in_view'), 'suspicious', lit(None).cast(BooleanType()).alias('measured'), lit(None).cast(BooleanType()).alias('groupm_measured'), 'general_invalid', col('mrc_measurable').alias('viewability_measurement_trusted'), (col('suspicious') & (col('fraud_category') == lit('Sitting Duck'))).alias('sitting_duck_bot'), (col('suspicious') & (col('fraud_category') == lit('Standard'))).alias('standard_bot'), (col('suspicious') & (col('fraud_category') == lit('Volunteer'))).alias('volunteer_bot'), (col('suspicious') & (col('fraud_category') == lit('Profile'))).alias('profile_bot'), (col('suspicious') & (col('fraud_category') == lit('Masked'))).alias('masked_bot'), (col('suspicious') & (col('fraud_category') == lit('Nomadic'))).alias('nomadic_bot'), (col('suspicious') & (col('fraud_category') == lit('Other'))).alias('other_bot'), lit(None).cast(BooleanType()).alias('true_view_viewable'), lit(None).cast(BooleanType()).alias('true_view_measurable'), lit(None).cast(BooleanType()).alias('yahoo_gemini_billable'), (col('clean') & col('groupm_measurable') & array_contains( col('events'), '100-percent-in-view-2-seconds' )).alias('full_ad_in_view'), (col('clean') & col('groupm_measurable') & array_contains( col('events'), '100-percent-in-view-2-seconds' )).alias('publicis_in_view'), lit(None).cast(BooleanType()).alias('yahoo_gemini_billable_suspicious'), lit(None).cast(DoubleType()).alias('average_in_view_time'), lit(None).cast(BooleanType()).alias('in_view_lt_1s'), lit(None).cast(BooleanType()).alias('in_view_1s_2s'), lit(None).cast(BooleanType()).alias('in_view_2s_5s'), lit(None).cast(BooleanType()).alias('in_view_5s_10s'), lit(None).cast(BooleanType()).alias('in_view_10s_15s'), lit(None).cast(BooleanType()).alias('in_view_15s_20s'), lit(None).cast(BooleanType()).alias('in_view_20s_25s'), lit(None).cast(BooleanType()).alias('in_view_25s_30s'), lit(None).cast(BooleanType()).alias('in_view_30s_35s'), lit(None).cast(BooleanType()).alias('in_view_35s_40s'), lit(None).cast(BooleanType()).alias('in_view_40s_45s'), lit(None).cast(BooleanType()).alias('in_view_45s_50s'), lit(None).cast(BooleanType()).alias('in_view_ge_50s'), (col('mrc_measurable') | col('groupm_measurable') | ~col('clean')) \ 
.alias('viewability_measured_or_fraud') ) # save ad sessions adsessions \ .write \ .mode('overwrite') \ .partitionBy('date', 'hour') \ .format('avro') \ .option('compression', 'snappy') \ .save( f's3://{args.output_bucket_name}/{args.output_bucket_data_prefix}' f'/{self.partner}/' )
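_event_prop is used throughout the aggregation above but is not defined in this excerpt. A minimal sketch, assuming it returns the first non-null value of an expression over rows of a given event type (mirroring the explicit first(when(...), ignorenulls=True) call used for mrc_in_view_on_load):

from pyspark.sql import Column
from pyspark.sql.functions import col, first, when


def _event_prop(event_type: str, prop: Column) -> Column:
    # First non-null value of `prop` across rows whose 'type' matches
    # event_type; meant to be used inside groupBy(...).agg(...).
    return first(when(col('type') == event_type, prop), ignorenulls=True)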
temp = temp.withColumn('measure', lit('total distinct collectors'))
temp = temp.withColumn('customer_type', lit('Overall'))
totals = totals.union(temp.select('customer_type', 'total', 'measure'))
totals.write.format('jdbc').options(
    url='jdbc:mysql://35.238.212.81:3306/assignment_db',
    driver='com.mysql.jdbc.Driver',
    dbtable='totals',
    user='******',
    password='******').mode('overwrite').save()

# Make the transaction date format consistent ('/' vs '-') and replace the day
# part of trans_dt with 01 so monthly aggregates can be computed.
df_trans_date_slash = df.filter(df.trans_dt.contains('/'))
split_col = split(df_trans_date_slash['trans_dt'], '/')
df_trans_date_slash = df_trans_date_slash.withColumn('month', split_col.getItem(0))
df_trans_date_slash = df_trans_date_slash.withColumn('month', lpad('month', 2, "0"))
df_trans_date_slash = df_trans_date_slash.withColumn('year', split_col.getItem(2))
df_trans_date_slash = df_trans_date_slash.withColumn(
    'trans_dt', concat(col('month'), lit('-01-'), col('year')))

df_trans_date_hyphen = df.filter(df.trans_dt.contains('-'))
split_col = split(df_trans_date_hyphen['trans_dt'], '-')
df_trans_date_hyphen = df_trans_date_hyphen.withColumn('month', split_col.getItem(1))
df_trans_date_hyphen = df_trans_date_hyphen.withColumn('month', lpad('month', 2, "0"))
df_trans_date_hyphen = df_trans_date_hyphen.withColumn('year', split_col.getItem(0))
df_trans_date_hyphen = df_trans_date_hyphen.withColumn(
    'trans_dt', concat(col('month'), lit('-01-'), col('year')))

df = df_trans_date_hyphen.union(df_trans_date_slash)
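A self-contained sketch of the same normalization; the sample trans_dt values are assumptions about the two formats being handled ('M/d/yyyy' and 'yyyy-M-d'):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, lpad, split

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([('3/15/2020',), ('2020-3-15',)], ['trans_dt'])

slash = sample.filter(col('trans_dt').contains('/'))
parts = split(slash['trans_dt'], '/')
slash = slash.withColumn(
    'trans_dt', concat(lpad(parts.getItem(0), 2, '0'), lit('-01-'), parts.getItem(2)))

hyphen = sample.filter(col('trans_dt').contains('-'))
parts = split(hyphen['trans_dt'], '-')
hyphen = hyphen.withColumn(
    'trans_dt', concat(lpad(parts.getItem(1), 2, '0'), lit('-01-'), parts.getItem(0)))

hyphen.union(slash).show()  # both rows become '03-01-2020'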
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for the dim_date table.
        number_years_out_from_start (int): number of years beyond `start_year` to generate.

    Returns:
        Spark DataFrame.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = [i for i in range(1, 13)]
    days = [i for i in range(1, 32)]

    years_df = spark.createDataFrame(
        pd.DataFrame({
            'year': years,
            'temp_join_key': '1'
        }))
    months_df = spark.createDataFrame(
        pd.DataFrame({
            'month': months,
            'temp_join_key': '1'
        }))
    days_df = spark.createDataFrame(
        pd.DataFrame({
            'day_of_month': days,
            'temp_join_key': '1'
        }))

    years_months_df = (years_df.join(months_df, ['temp_join_key'], how='inner'))
    years_month_days_df = (years_months_df.join(days_df, ['temp_join_key'], how='inner'))

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'),
                       lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')),
                'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL")
        .withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-', '').cast('integer')))

    date_features = (
        date_keys
        # get `week` and `quarter`
        .withColumn('week', weekofyear(col('date')))
        .withColumn('quarter', quarter(col('date')))
        # get `day_name` and `month_name`
        .withColumn('day_name', date_format(col('date'), 'EEEE'))
        .withColumn('month_name', date_format(col('date'), 'MMMM'))
        # get `date_year`, `date_quarter`, `date_month`, `date_week`
        .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)"))
        .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
        .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)"))
        .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
        # get `day_of_week`, `day_of_quarter`, `day_of_year`
        .withColumn('day_of_week', dayofweek(col('date')))
        .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1))
        .withColumn('day_of_year', dayofyear(col('date')))
        # get `weekend_flag`, `us_holiday_flag`, `us_biz_day_flag`, `leap_year_flag`,
        # `month_start_flag`, `month_end_flag`
        .withColumn('weekend_flag',
                    when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
        .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp')))
        # a business day is neither a weekend day nor a US holiday
        .withColumn('us_biz_day_flag',
                    when((col('weekend_flag') == lit('Y')) |
                         (col('us_holiday_flag') == lit('Y')), 'N').otherwise('Y'))
        .withColumn('leap_year_flag',
                    when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y')
                    .otherwise('N'))
        .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
        .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N'))
        # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
        .withColumn('pct_into_month',
                    (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
        .withColumn('date_quarter_end',
                    when(col('quarter') == lit(1), concat(col('year'), lit('-03-31')))
                    .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30')))
                    .when(col('quarter') == lit(3), concat(col('year'), lit('-09-30')))
                    .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31')))
                    .otherwise(None)
                    .cast('date'))
        .withColumn('days_in_quarter',
                    datediff(col('date_quarter_end'), col('date_quarter')) + lit(1))
        .withColumn('pct_into_quarter',
                    (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)'))
        .withColumn('pct_into_year',
                    (col('day_of_year') /
                     when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0))
                    .cast('decimal(7, 6)'))
        # get seasons
        .withColumn('season_northern',
                    when(col('month').isin(12, 1, 2), 'Winter')
                    .when(col('month').isin(3, 4, 5), 'Spring')
                    .when(col('month').isin(6, 7, 8), 'Summer')
                    .when(col('month').isin(9, 10, 11), 'Fall')
                    .otherwise('UNKNOWN'))
        .withColumn('season_southern',
                    when(col('month').isin(6, 7, 8), 'Winter')
                    .when(col('month').isin(9, 10, 11), 'Spring')
                    .when(col('month').isin(12, 1, 2), 'Summer')
                    .when(col('month').isin(3, 4, 5), 'Fall')
                    .otherwise('UNKNOWN')))

    dim_date = (date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter', 'date_year',
        'day_of_week', 'day_of_month', 'day_of_quarter', 'day_of_year',
        'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ]))

    return dim_date
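A hypothetical usage sketch, assuming pandas (pd), the PySpark functions referenced above, and the pd_is_holiday_usa UDF are importable in the session; a two-year window keeps the year-month-day cross join small for a quick check:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
dim_date = generate_dim_date(spark, start_year=2020, number_years_out_from_start=1)
dim_date.filter("date = '2020-02-29'") \
    .select('date_key', 'day_name', 'quarter', 'leap_year_flag', 'pct_into_year') \
    .show()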
def main(): # Download data now = datetime.datetime.now().strftime("%m-%d-%H-%M-%S") folder = config['local_folder'] os.system(f"rm -r {folder}") # Daily dataset ##big_table_name = 'vartefact.forecast_sprint4_add_dm_to_daily' # Promo dataset ##big_table_name = 'vartefact.forecast_sprint4_promo_mecha_v4' # Weekly dataset big_table_name = f"{config['database']}.forecast_sprint3_v10_flag_sprint4" file_name = 'test_to_delete_' + now download_data(folder=folder, big_table_name=big_table_name, file_name=file_name) # Preprocess the datase big_table = file_name + '.csv' sql_table = big_table_name target_value = 'sales_qty_sum' dataset_name = 'dataset_test_weekly' preprocess(folder=folder, big_table=big_table, sql_table=sql_table, target_value=target_value, dataset_name=dataset_name) # Train the model desc = 'weekly_test' data_set1 = dataset_name + '_part1.pkl' data_set2 = dataset_name + '_part2.pkl' date_stop_train = config['date_stop_train'] os.system( f"cp {config['config_folder']}/calendar.pkl {config['local_folder']}") # learning_rate = 0.3 run_model(folder=folder, data_set1=data_set1, data_set2=data_set2, futur_prediction=True, date_stop_train=date_stop_train) # save csv as table print('saving csv as table') file_list = os.listdir(config['local_folder']) for f in file_list: if 'weekly_model_training_for_futur_predictions' in f: result_file_name = f result_csv_path = f"{config['local_folder']}/{result_file_name}/resulst_forecast_10w_on_the_fututre.csv" os.system(f"hadoop fs -rm resulst_forecast_10w_on_the_fututre.csv") os.system(f"hadoop fs -put -f {result_csv_path}") spark = SparkSession \ .builder \ .appName("Forecast_saveastable") \ .config("spark.sql.warehouse.dir", warehouse_location) \ .config("spark.num.executors", '10') \ .config("spark.executor.memory", '15G') \ .config("spark.executor.cores", '20') \ .config("spark.sql.crossJoin.enable", True) \ .enableHiveSupport() \ .getOrCreate() sqlContext = SQLContext(spark) spark_df = sqlContext.read.format('com.databricks.spark.csv').options( header='true').load(f"resulst_forecast_10w_on_the_fututre.csv") split_col = pyspark.sql.functions.split(spark_df['full_item'], '_') spark_df = spark_df.withColumn('item_id', split_col.getItem(0)) spark_df = spark_df.withColumn('sub_id', split_col.getItem(1)) spark_df = spark_df.withColumn("item_id", spark_df["item_id"].cast(IntegerType())) spark_df = spark_df.withColumn("sub_id", spark_df["sub_id"].cast(IntegerType())) spark_df = spark_df.withColumn("week", spark_df["week"].cast(IntegerType())) spark_df = spark_df.withColumn( "train_mape_score", spark_df["train_mape_score"].cast(FloatType())) spark_df = spark_df.withColumn("predict_sales", spark_df["predict_sales"].cast(FloatType())) spark_df = spark_df.withColumn( "predict_sales_error_squared", spark_df["predict_sales_error_squared"].cast(FloatType())) spark_df = spark_df.withColumn( "predict_sales_max_confidence_interval", spark_df["predict_sales_max_confidence_interval"].cast(FloatType())) spark_df = spark_df.withColumn( "order_prediction", spark_df["order_prediction"].cast(FloatType())) spark_df = spark_df.withColumn("store_code", lpad(spark_df['store_code'], 3, '0')) spark_df = spark_df.withColumnRenamed('week', 'week_key') spark_df = spark_df.withColumnRenamed('predict_sales', 'sales_prediction') spark_df = spark_df.withColumnRenamed( 'predict_sales_max_confidence_interval', 'max_confidence_interval') spark_df.write.mode('overwrite').saveAsTable( f"{config['database']}.result_forecast_10w_on_the_fututre") sql = f""" invalidate metadata 
{config['database']}.result_forecast_10w_on_the_fututre """ impalaexec(sql) print('csv saved in the table') ############ forecast_sprint3_v10_flag_sprint4 sales information ## get max_week_key min_week_key big_table_name = f"{config['database']}.forecast_sprint3_v10_flag_sprint4" week_numbes_to_roll_median = 4 sql_query = f""" select cast((cast(max(week_key) as int) - {week_numbes_to_roll_median}) as varchar(10)) as min_week_key, max(week_key) as max_week_key from {big_table_name} """ week_keys = spark.sql(sql_query).toPandas() max_week = week_keys['max_week_key'].iloc[0] min_week = week_keys['min_week_key'].iloc[0] ## get median median_sales sql_query = f""" select sales_qty_sum, sub_id, week_key, store_code from {big_table_name} where week_key >= {min_week} """ df_from_sql = spark.sql(sql_query).toPandas() df_from_sql['sales_qty_sum'] = df_from_sql['sales_qty_sum'].astype(float) median_df = df_from_sql[['sales_qty_sum', 'sub_id', 'store_code']].groupby(['sub_id', 'store_code' ]).median().reset_index() median_df = median_df.rename({'sales_qty_sum': 'median_sales'}, axis=1) ############ result_forecast_10w_on_the_fututre_all prediction result information results_table = f"{config['database']}.result_forecast_10w_on_the_fututre" sql_query = f""" select * from {results_table} """ file_name = 'results' forecast_df = download_csv_by_spark(spark, sql_query, file_name) print('length before merge: ', len(forecast_df)) key_columns = ['sub_id', 'store_code'] for col in key_columns: median_df[col] = median_df[col].astype(float) forecast_df[col] = forecast_df[col].astype(float) forecast_df_w_sales = forecast_df.merge(median_df, how='left') print('after merge: ', len(forecast_df_w_sales)) forecast_df_w_sales.loc[:, 'forecast_min_median'] = forecast_df_w_sales[[ 'median_sales', 'order_prediction' ]].max(axis=1) forecast_df_w_sales.loc[(forecast_df_w_sales['sales_prediction'] > 7) & (forecast_df_w_sales['forecast_min_median'] > forecast_df_w_sales['median_sales']), 'max_order_3xmedian'] = np.minimum( 3 * forecast_df_w_sales['median_sales'], forecast_df_w_sales['forecast_min_median']) df_prediction_final = forecast_df_w_sales.copy() df_prediction_final['max_confidence_interval'] = df_prediction_final[[ 'max_order_3xmedian', 'forecast_min_median' ]].min(axis=1) df_prediction_final['sales_prediction'] = df_prediction_final[[ 'max_confidence_interval', 'sales_prediction' ]].min(axis=1) df_prediction_final['order_prediction'] = df_prediction_final[[ 'order_prediction', 'max_confidence_interval' ]].min(axis=1) df_prediction_final = df_prediction_final.drop( ['median_sales', 'forecast_min_median', 'max_order_3xmedian'], axis=1) df_prediction_final.describe().T df_prediction_final.to_csv(config['local_folder'] + 'df_prediction_final_roger_normal_adhoc.csv', index=False) result_csv_path = f"{config['local_folder']}df_prediction_final_roger_normal_adhoc.csv" os.system(f"hadoop fs -rm df_prediction_final_roger_normal_adhoc.csv") os.system(f"hadoop fs -put -f {result_csv_path}") spark_df = sqlContext.read.format('com.databricks.spark.csv').options( header='true').load(f"df_prediction_final_roger_normal_adhoc.csv") split_col = split(spark_df['full_item'], '_') spark_df = spark_df.withColumn('item_id', split_col.getItem(0)) spark_df = spark_df.withColumn('sub_id', split_col.getItem(1)) spark_df = spark_df.withColumn("item_id", spark_df["item_id"].cast(IntegerType())) spark_df = spark_df.withColumn("sub_id", spark_df["sub_id"].cast(IntegerType())) spark_df = spark_df.withColumn("week_key", 
spark_df["week_key"].cast(IntegerType())) spark_df = spark_df.withColumn( "train_mape_score", spark_df["train_mape_score"].cast(FloatType())) spark_df = spark_df.withColumn( "sales_prediction", spark_df["sales_prediction"].cast(FloatType())) spark_df = spark_df.withColumn( "predict_sales_error_squared", spark_df["predict_sales_error_squared"].cast(FloatType())) spark_df = spark_df.withColumn( "max_confidence_interval", spark_df["max_confidence_interval"].cast(FloatType())) spark_df = spark_df.withColumn( "order_prediction", spark_df["order_prediction"].cast(FloatType())) spark_df = spark_df.withColumn("store_code", lpad(spark_df['store_code'], 3, '0')) spark_df.write.mode('overwrite').saveAsTable( f"{config['database']}.result_forecast_10w_on_the_fututre") sql = f""" invalidate metadata {config['database']}.result_forecast_10w_on_the_fututre """ impalaexec(sql) print('csv saved in the table') spark.conf.set("spark.sql.crossJoin.enabled", True) sql = f''' with latest_prediction as ( select * from {config['database']}.result_forecast_10w_on_the_fututre ), week_key_list as ( select distinct week_key from latest_prediction ), count_predict_week_number as ( select item_id, sub_id, store_code, count(distinct week_key) as week_number from latest_prediction group by item_id, sub_id, store_code ), get_missing_item_list as ( select distinct item_id, sub_id, store_code from count_predict_week_number where week_number != 10 ), get_missed_item_full_list as ( select get_missing_item_list.item_id, get_missing_item_list.sub_id, get_missing_item_list.store_code, week_key_list.week_key from get_missing_item_list left join week_key_list on 1=1 ), get_missing_item_week_list as ( select distinct item_id, sub_id, store_code, week_key from latest_prediction where item_id in (select item_id from get_missing_item_list) ), get_final_list as ( select get_missed_item_full_list.item_id, get_missed_item_full_list.sub_id, get_missed_item_full_list.store_code, get_missed_item_full_list.week_key, missed.item_id as m_item_id, missed.week_key as m_week_key from get_missed_item_full_list left join get_missing_item_week_list missed on missed.item_id = get_missed_item_full_list.item_id and missed.sub_id = get_missed_item_full_list.sub_id and missed.week_key = get_missed_item_full_list.week_key and missed.store_code = get_missed_item_full_list.store_code ) insert into {config['database']}.result_forecast_10w_on_the_fututre select concat(cast(item_id as string),'_',cast(sub_id as string)) as full_item, store_code, week_key, 0 as train_mape_score, 0 as sales_prediction, 0 as predict_sales_error_squared, 0 as max_confidence_interval, 0 as order_prediction, item_id, sub_id from get_final_list where m_item_id is null ''' df_get_final_list2 = spark.sql(sql).toPandas() spark.stop() sql = f""" invalidate metadata {config['database']}.result_forecast_10w_on_the_fututre """ impalaexec(sql) print('csv saved in the table')
# earlier rule (statement truncated above): keep dpc when it is already
# populated, otherwise extract the third capture group from housenumber
df_test = df_test.withColumn(
    'dpc',
    f.when(f.col('dpc').isNotNull(), f.col('dpc'))
     .otherwise(f.regexp_extract(f.col('housenumber'), '([0-9]+)([.*-])([0-9]+)', 3)))
df_test = df_test.withColumn('dpc', blanks_to_null('dpc'))
df_test.show(30)

# In[ ]:

# rule13: default missing dpc values to '99'
df_test = df_test.withColumn('dpc', f.coalesce(f.col('dpc'), f.lit('99')))
df_test.show(30)

# In[ ]:

# rule3: left-pad dpc to two digits
df_test = df_test.withColumn('dpc', f.lpad('dpc', 2, '0'))
df_test.show(30)

# In[ ]:
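A self-contained sketch of rules 13 and 3 in sequence, with made-up dpc values to show the combined effect:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([('4',), (None,)], ['dpc'])

sample = sample.withColumn('dpc', f.coalesce(f.col('dpc'), f.lit('99')))  # rule13
sample = sample.withColumn('dpc', f.lpad('dpc', 2, '0'))                  # rule3
sample.show()  # '4' -> '04', null -> '99'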