def registry_etl(
    spark: SparkSession,
    config: confuse.core.Configuration,
    icd_to_desc_map: pd.DataFrame,
) -> pd.DataFrame:
    registry_claims_bucket = config["buckets"]["registry_claims"].get(str)
    log.info(f"Registry claim bucket: {registry_claims_bucket}")
    log.info("Reading in registry claims data")
    registry_rdd = spark.read.parquet(
        registry_claims_bucket.replace("s3:", "s3a:")).withColumnRenamed(
            "patient_id", "registry_id")

    # Select claims in the year before the patient reference date and restrict to
    # reference dates after 2017 to remove ICD-9 codes
    registry_rdd = (
        registry_rdd.where(
            # Keep claims falling before the reference date
            F.col("claim_date") < F.date_sub(F.col("reference_date"), 0))
        .where(F.col("claim_date") > F.date_sub(F.col("reference_date"), 1 * 365))  # TODO [Low] Move time length into YAML
        .where(F.col("reference_date") > F.lit("2017-01-01"))  # TODO [Low] Move cut-off date into YAML
    )

    registry_id_count = registry_rdd.select("registry_id").distinct().count()
    log.info(f"Registry ID Count: {registry_id_count}")
    registry_count_min = config["etl"]["registry_count_min"].get(int)

    log.info("ETL of registry data and bringing to pandas")
    registry_df = (registry_rdd.select(
        "registry_id",
        F.explode(F.col("dx_list")).alias("code")).where(
            F.col("code").isNotNull()).groupBy("code").agg(
                F.collect_set(F.col("registry_id")).alias("registry_ids"),
                F.countDistinct(F.col("registry_id")).alias("registry_count"),
            ).where(F.col("registry_count") > registry_count_min).withColumn(
                "registry_rate",
                F.col("registry_count") / F.lit(registry_id_count))).toPandas()

    # TODO [Medium] Move below this into Spark
    log.info("Registry ETL pandas operations")
    registry_df.drop(
        registry_df.index[~registry_df.code.isin(icd_to_desc_map.code)],
        inplace=True)
    # sort_values returns a copy, so reassign instead of chaining an in-place reset_index
    registry_df = registry_df.sort_values("code").reset_index(drop=True)

    n_total_test = int(
        round(
            registry_df["registry_count"].iloc[0] /
            registry_df["registry_rate"].iloc[0],
            0,
        ))
    log.info(f"N Total Test: {n_total_test}")
    registry_df["n_total_test"] = n_total_test
    return registry_df
def get_previous_bus_day(df_calendar: pyspark.sql.DataFrame, p_end_dt, col_name: str):
    """
    Return the previous business day's key (col_name) relative to the current day
    (p_end_dt), looking back at most 7 days.
    """
    # filter on the full calendar first, then project the requested column
    rows = df_calendar \
        .where(F.col("bus_day_flg") == F.lit("Y")) \
        .filter(F.col("cal_day").between(
            F.to_timestamp(F.date_sub(F.lit(p_end_dt), 7), "yyyy-MM-dd"),
            F.to_timestamp(F.date_sub(F.lit(p_end_dt), 1), "yyyy-MM-dd"))) \
        .orderBy(F.col("cal_day").desc()) \
        .select(col_name) \
        .collect()

    return rows[0][col_name]
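# A minimal usage sketch for get_previous_bus_day, assuming a hypothetical calendar
# DataFrame with columns cal_day, bus_day_flg and day_key (these names are illustrative,
# not taken from the original code base).
import datetime
import pyspark.sql.functions as F
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()
calendar_df = spark.createDataFrame([
    Row(cal_day=datetime.datetime(2021, 12, 23), bus_day_flg="Y", day_key=20211223),
    Row(cal_day=datetime.datetime(2021, 12, 24), bus_day_flg="Y", day_key=20211224),
    Row(cal_day=datetime.datetime(2021, 12, 25), bus_day_flg="N", day_key=20211225),  # holiday
    Row(cal_day=datetime.datetime(2021, 12, 26), bus_day_flg="N", day_key=20211226),  # Sunday
])
# The previous business day before Monday 2021-12-27 should be Friday the 24th.
print(get_previous_bus_day(calendar_df, "2021-12-27", "day_key"))  # -> 20211224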
def test_date_sub_function(self):
    dt = datetime.date(2021, 12, 27)

    # Note: a Python int gets converted to a LongType column;
    # this is not supported by the function, so cast to Integer explicitly
    df = self.spark.createDataFrame([Row(date=dt, sub=2)], "date date, sub integer")

    self.assertTrue(
        all(
            df.select(
                date_sub(df.date, df.sub) == datetime.date(2021, 12, 25),
                date_sub(df.date, "sub") == datetime.date(2021, 12, 25),
                date_sub(df.date, 3) == datetime.date(2021, 12, 24),
            ).first()))
def cal_F1(submit, fault_tag_df): npp = submit.count() tmp = submit.join(fault_tag_df, on=['model', 'serial_number'], how='left') ntpp = tmp.filter(F.col('fault_time').isNotNull()).filter( F.date_sub(tmp['fault_time'], 30) <= tmp['dt']).count() precision = ntpp / (npp + 1e-20) pr_df = fault_tag_df.filter( fault_tag_df['fault_time'] >= test1_begin_date).filter( fault_tag_df['fault_time'] < test1_end_date).repartition( 1) # .filter(F.col('model')==1) npr = pr_df.count() ntpr = pr_df.join(submit, on=['model', 'serial_number'], how='left') \ .filter(F.col('dt').isNotNull()).filter(F.col('fault_time') >= F.col('dt')).count() recall = ntpr / (npr + 1e-20) F1_score = 2 * precision * recall / (precision + recall + 1e-10) return ({ 'npp': npp, 'ntpp': ntpp, 'npr': npr, 'ntpr': ntpr, 'precision': precision, 'recall': recall, 'F1': F1_score })
def county_reality_supply():
    # Last week's allocation volume of this item spec for each county and customer tier
    try:
        print(f"{str(dt.now())} last week's allocation volume per county and tier")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id", "cust_seg")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # get last week's actual allocation volume
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun"))) \
            .join(co_cust, "cust_id")

        # column name of the value to compute
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id", "sale_center_id", "cust_seg", "item_id") \
            .agg(f.sum("qty_allocco").alias(colName))

        columns = ["com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
                   "gauge_name", "city", "gears_data_marker", colName]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"), col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county, "sale_center_id") \
            .withColumnRenamed("item_id", "gauge_id") \
            .withColumnRenamed("item_name", "gauge_name") \
            .withColumnRenamed("cust_seg", "gears") \
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
def read_buste_data_v3(filepath, sqlContext): data_frame = sqlContext.read.csv(filepath, header=True, inferSchema=True, nullValue="-") date = "-".join(filepath.split("/")[-1].split("_")[:3]) data_frame = data_frame.withColumn("date", F.lit(date)) data_frame = data_frame.withColumn( "date", F.unix_timestamp(F.date_sub(F.col("date"), 1), 'yyyy-MM-dd')) return data_frame
def get_listens_for_rec_generation_window(mapped_df): """ Get listens to fetch top artists. Args: mapped_df (dataframe): Dataframe with all the columns/fields that a typical listen has. """ df = mapped_df.select('*') \ .where((col('listened_at') >= to_timestamp(date_sub(current_timestamp(), config.RECOMMENDATION_GENERATION_WINDOW))) & (col('listened_at') <= current_timestamp())) return df
def read_hdfs_folder(sqlContext, folderpath): data_frame = sqlContext.read.csv(folderpath, header=True, inferSchema=True, nullValue="-") data_frame = rename_columns(data_frame, [ ("cardNum18", "cardNum"), ("cardNum19", "userGender"), ]) date = "-".join(folderpath.split("/")[-2].split("_")[:3]) data_frame = data_frame.withColumn("date", F.lit(date)) data_frame = data_frame.withColumn("date", F.date_sub(F.col("date"), 1)) return data_frame
def clear_periods_intersections(source: DataFrame, columns_subset: List[str]) -> DataFrame:
    """
    Merge intersecting (and contiguous) "start_date - end_date" periods for the given
    subset of columns, so that the result contains non-overlapping, linearly continued periods.

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-07-01|2018-09-01|
    |  5|2018-08-15|2018-09-13|
    |  5|2018-07-20|2018-08-30|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-07-01|2018-09-13|
    +---+----------+----------+

    :param source: source data frame.
    :param columns_subset: subset of data frame columns whose periods must not intersect
    """
    custom_window = Window.partitionBy(*columns_subset).orderBy('start_date')

    result = source \
        .withColumn('lag_end', F.lag('end_date').over(custom_window)) \
        .withColumn('flg', F.when(F.col('lag_end') < F.date_sub(F.col('start_date'), 1), 1)
                    .otherwise(0)) \
        .withColumn('grp', F.sum('flg').over(custom_window)) \
        .groupBy(*columns_subset, 'grp') \
        .agg(F.min('start_date').alias('start_date'),
             F.max('end_date').alias('end_date')) \
        .drop('grp')

    return result
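# A minimal, self-contained sketch of clear_periods_intersections on the docstring's
# sample data. The SparkSession construction is illustrative; the function and its
# imports (F, Window, DataFrame) are assumed to be in scope as defined above.
import datetime
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
source = spark.createDataFrame(
    [
        (5, datetime.date(2018, 7, 1), datetime.date(2018, 9, 1)),
        (5, datetime.date(2018, 8, 15), datetime.date(2018, 9, 13)),
        (5, datetime.date(2018, 7, 20), datetime.date(2018, 8, 30)),
    ],
    ["key", "start_date", "end_date"],
)
# Expect a single merged period 2018-07-01 .. 2018-09-13 for key=5.
clear_periods_intersections(source, ["key"]).show()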
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, date_sub, date_add, datediff, months_between, to_date, lit

spark = SparkSession.builder.appName("Pyspark example").getOrCreate()

dateDf = spark.range(10).withColumn("Today", current_date()).withColumn(
    "Now", current_timestamp())
dateDf.createOrReplaceTempView("dftable")
dateDf.show()
dateDf.printSchema()

# 'date_sub' and 'date_add' can be used to subtract from and add to a particular date
dateDf.select(date_sub(col("Today"), 5), date_add(col("Today"), 4)).show(2)

# 'datediff' and 'months_between' give the number of days and months between two dates
dateDf.withColumn("week_ago", date_sub(col("Today"), 7)).select(
    datediff(col("week_ago"), col("Today"))).show(1)

dateformat = "yyyy-dd-MM"
dateDF1 = spark.range(1).select(
    to_date(lit("2020-27-08"), dateformat).alias("date"),
    to_date(lit("2020-26-08"), dateformat).alias("date1"))
dateDF1.createOrReplaceTempView("dataTable2")
dateDF1.show()
def clean_columns(prepared_clients, effective_version, start_ds): """Clean columns in preparation for aggregation. This removes invalid values, tidies up dates, and limits the scope of several dimensions. :param prepared_clients: `main_summary` rows that conform to the schema of `prepare_client_rows(...)` :param effective_version: DataFrame mapping dates to the active Firefox version that was distributed at that time :param start_ds: DateString to determine whether a row that is being processed is active during the current week :returns: DataFrame with cleaned columns and rows """ # Temporary column used for determining the validity of a row is_valid = "_is_valid" pcd = F.from_unixtime(F.col("profile_creation_date") * SECONDS_PER_DAY) client_date = utils.to_datetime('subsession_start_date', "yyyy-MM-dd") days_since_creation = F.datediff(client_date, pcd) is_funnelcake = F.col('distribution_id').rlike("^mozilla[0-9]+.*$") attr_mapping = { 'distribution_id': None, 'default_search_engine': None, 'locale': None, 'subsession_start': client_date, 'channel': (F.when( is_funnelcake, F.concat(F.col("normalized_channel"), F.lit("-cck-"), F.col("distribution_id"))).otherwise( F.col(("normalized_channel")))), 'geo': in_top_countries("country"), # Bug 1289573: Support values like "mozilla86" and "mozilla86-utility-existing" 'is_funnelcake': (F.when(is_funnelcake, F.lit("yes")).otherwise(F.lit("no"))), 'acquisition_period': F.date_format(F.date_sub(F.next_day(pcd, 'Sun'), 7), "yyyy-MM-dd"), 'sync_usage': sync_usage("sync_count_desktop", "sync_count_mobile", "sync_configured"), 'current_version': F.col("app_version"), 'current_week': ( # -1 is a placeholder for bad data F.when(days_since_creation < 0, F.lit(-1)).otherwise(F.floor(days_since_creation / 7) ).cast("long")), 'source': F.col('attribution.source'), 'medium': F.col('attribution.medium'), 'campaign': F.col('attribution.campaign'), 'content': F.col('attribution.content'), 'is_active': (F.when(client_date < utils.to_datetime(F.lit(start_ds)), F.lit("no")).otherwise(F.lit("yes"))), } usage_hours = F.col('usage_seconds') / SECONDS_PER_HOUR metric_mapping = { 'n_profiles': F.lit(1), 'total_uri_count': None, 'unique_domains_count_per_profile': None, 'usage_hours': usage_hours, 'sum_squared_usage_hours': F.pow(usage_hours, 2), } # Set the attributes to null if it's invalid select_attr = utils.build_col_expr({ attr: F.when(F.col(is_valid), expr).otherwise(F.lit(None)) for attr, expr in utils.preprocess_col_expr(attr_mapping).items() }) select_metrics = utils.build_col_expr(metric_mapping) select_expr = select_attr + select_metrics cleaned_data = ( # Compile per-client rows for the current retention period prepared_clients # Filter out seemingly impossible rows. One very obvious notion # is to make sure that a profile is always created before a sub-session. # Unlike `sane_date` in previous versions, this is idempotent and only # depends on the data. 
.withColumn( "profile_creation", F.date_format(pcd, 'yyyy-MM-dd')).withColumn( is_valid, (F.col("profile_creation").isNotNull() & (F.col("profile_creation") > DEFAULT_DATE) & (pcd <= client_date))).select( # avoid acquisition dates in the future (F.when( F.col(is_valid), F.col("profile_creation")).otherwise( F.lit(None)).alias("profile_creation")), *select_expr) # Set default values for the rows .fillna({ 'acquisition_period': DEFAULT_DATE, 'is_funnelcake': "no", "current_week": -1, }).fillna(0).fillna(0.0).fillna('unknown')) result = release.with_effective_version(cleaned_data, effective_version, "profile_creation") return result
def df_smartstore_sourcing_one_day_ago(df_smartstore_sourcing): yield df_smartstore_sourcing.withColumn( DATE_ID_COLUMN_NAME, date_sub(col(DATE_ID_COLUMN_NAME), 1) )
# 4. Deduplicate:
df.select('name').dropDuplicates().show()

# 5. Split a string:
df.withColumn("splited_name", split(df_webpages["name"], ",")[1]).show()

"""Working with dates"""
from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10).withColumn("today", current_date())\
    .withColumn("now", current_timestamp())

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5))

# difference between two dates
from pyspark.sql.functions import to_date, datediff, months_between
dateDF.select(to_date(lit("2020-06-01")).alias("start"),\
              to_date(lit("2022-06-01")).alias("end"))\
    .select(months_between(col("start"), col("end"))).show(1)

"""Handling missing values"""
df.na.drop()
df.na.fill(0)

"""Handling complex types"""
# does an array contain a given value: array_contains()
# first() / last()
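# A small illustrative sketch of array_contains on a throwaway DataFrame
# (the column names here are made up for the example).
from pyspark.sql.functions import array_contains, col, split

tags_df = spark.createDataFrame([("a,b,c",), ("x,y",)], ["tags_csv"])
tags_df = tags_df.withColumn("tags", split(col("tags_csv"), ","))
# True for rows whose tags array contains "b"
tags_df.select(array_contains(col("tags"), "b").alias("has_b")).show()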
# for debugging
corona_in.printSchema()
corona_in.show(10)

# select only the necessary columns
corona = corona_in.select(to_date('dateRep', format='dd/MM/yyyy').alias('date'),
                          corona_in.cases,
                          corona_in.deaths,
                          corona_in.countriesAndTerritories.alias('country'),
                          corona_in.continentExp.alias('continent'))
corona0 = corona.filter(corona.continent == "Europe")
corona0.show(50)

# calculate the relative difference to the previous day for the number of cases and deaths
corona1 = corona0.withColumn("prev_date", date_sub(corona.date, 1))

# register the dataframe as a temporary view for querying and transforming it
corona1.createOrReplaceTempView("corona_out")
corona1.show(10)
corona_out1 = spark.sql("SELECT cor1.date, cor1.cases, cor1.deaths, cor1.country, cor1.continent, \
                         cor2.cases as cases_prev, cor2.deaths as dead_prev \
                         FROM corona_out as cor1 \
                         LEFT OUTER JOIN corona_out as cor2 on cor2.date = cor1.prev_date and cor1.country=cor2.country ")

corona_out = corona_out1.withColumn("cases_rel_diff", expr("(cases-cases_prev)/cases_prev"))\
    .withColumn("deaths_rel_diff", expr("(deaths-dead_prev)/dead_prev"))

# for debugging
corona_out.show(10)
def get_item_stats_info_weekly():
    # Last week's sales of each cigarette item per city/county, plus week-over-week and year-over-year comparisons
    try:
        # item id and item name
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its counties (as a list)
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # values for the marker column
        markers = ["1", "3"]
        # aggregate by city or by sales center (county)
        groups = ["com_id", "sale_center_id"]
        joins = [city, county]
        # columns other than the computed values
        cols_comm = [["city", "gauge_id", "gauge_name", "ciga_data_marker"],
                     ["county", "sale_center_id", "gauge_id", "gauge_name", "ciga_data_marker"]]
        # column names of the values to compute
        cols = [["gauge_city_week_sales", "gauge_city_week_sales_last_year", "gauge_city_week2_sales"],
                ["gauge_county_week_sales", "gauge_county_week_sales_last_year", "gauge_county_week2_sales"]]

        co_co_line = get_co_co_line(spark, scope=[1, 2], filter="week") \
            .select("item_id", "com_id", "sale_center_id", "qty_ord", "week_diff")
        co_co_line.cache()

        last_year = spark.sql("select * from DB2_DB2INST1_CO_CO_LINE") \
            .where(col("com_id").isin(cities)) \
            .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
            .withColumn("last_year_today", f.date_sub(f.current_date(), 365)) \
            .withColumn("week_diff", week_diff("last_year_today", "born_date")) \
            .where(col("week_diff") == 1) \
            .withColumn("qty_ord", col("qty_ord").cast("float")) \
            .select("com_id", "sale_center_id", "item_id", "qty_ord")
        last_year.cache()

        for i in range(len(groups)):
            group = groups[i]
            join = joins[i]
            c = cols[i]
            marker = markers[i]
            try:
                # order-line data for last week
                last_week = co_co_line.where(col("week_diff") == 1)

                # 1. order quantity of each item per city/county last week
                print(f"{str(dt.now())} last week's order quantity of each item by {group}")
                qty_amt = last_week.groupBy(group, "item_id").agg(
                    f.sum(col("qty_ord")).alias(c[0]))

                # ------------------- week-over-week
                # order-line data for the week before last
                last_two_week = co_co_line.where(col("week_diff") == 2)

                # 2. sales of each item per city/county in the previous period
                print(f"{str(dt.now())} order quantity of each item by {group} in the week before last")
                qty_amt_last = last_two_week.groupBy(group, "item_id").agg(
                    f.sum(col("qty_ord")).alias(c[1]))

                # # 2. week-over-week sales ratio of each item per city/county
                # print(f"{str(dt.now())} week-over-week sales ratio of each item by {group}")
                # qty_amt_last.join(qty_amt, [group, "item_id"]) \
                #     .withColumn("qty_ring_rate", period_udf(col("qty_ord"), col("qty_ord_last")))

                # -------------------- year-over-year
                # 3. order quantity of each item per city/county in the same week last year
                print(f"{str(dt.now())} order quantity of each item by {group} in the same period last year")
                qty_ord_ly = last_year.groupBy(group, "item_id") \
                    .agg(f.sum(col("qty_ord")).alias(c[2]))

                # # 3. year-over-year sales ratio of each item per city/county
                # print(f"{str(dt.now())} year-over-year sales ratio of each item by {group}")
                # qty_ord_ly.join(qty_amt, [group, "item_id"]) \
                #     .withColumn("qty_ord_yoy", period_udf(col("qty_ord"), col("qty_ord_ly")))

                all_df = qty_amt.join(qty_amt_last, [group, "item_id"], "outer") \
                    .join(qty_ord_ly, [group, "item_id"], "outer") \
                    .na.fill(0, c)

                columns = cols_comm[i] + c
                all_df.withColumn("row", f.concat_ws("_", col(group), col("item_id"))) \
                    .join(plm_item, "item_id") \
                    .join(join, group) \
                    .withColumnRenamed("item_id", "gauge_id") \
                    .withColumnRenamed("item_name", "gauge_name") \
                    .withColumn("ciga_data_marker", f.lit(marker)) \
                    .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
            except Exception:
                tb.print_exc()

        co_co_line.unpersist()
        last_year.unpersist()
    except Exception:
        tb.print_exc()
'''Now we drop the year, month, day, hour, minute, date and time columns, as we will
recreate them from the timestamp column we built earlier'''
df_nycflights = df_nycflights. \
    drop('year'). \
    drop('month'). \
    drop('day'). \
    drop('hour'). \
    drop('minute'). \
    drop('date'). \
    drop('time')

df_nycflights.show()

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
    withColumn('year', year(df_nycflights.timestamp)). \
    withColumn('month', month(df_nycflights.timestamp)). \
    withColumn('day', dayofmonth(df_nycflights.timestamp)). \
    withColumn('hour', hour(df_nycflights.timestamp)). \
    withColumn('minute', minute(df_nycflights.timestamp))

df_nycflights.show()

'''Now a few operations on the timestamp'''
df_nycflights = df_nycflights.\
    withColumn('date_sub', date_sub(df_nycflights.timestamp, 10)). \
    withColumn('date_add', date_add(df_nycflights.timestamp, 10)). \
    withColumn('months_between', months_between(df_nycflights.timestamp, df_nycflights.timestamp))

df_nycflights.show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()
    spark.read.jdbc

    spark.range(5)\
        .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
        .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
        .select(datediff(col('date2'), col('date1')),
                months_between(col('date2'), col('date1'))).show()

    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  # Does not raise an exception

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
        .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('timestamp')))  # the column created above is named 'timestamp', not 'date'
# There are dates, which focus exclusively on calendar dates, and timestamps, which include both date and time
# - dates focus on calendar dates
# - timestamps focus on date and time
# Spark will make a best effort to correctly identify column types, including dates and timestamps, when we enable inferSchema
# Spark can be a bit particular about what format you have at any given point in time
# It's important to be explicit when parsing or converting to ensure that there are no issues in doing so; see the parsing sketch after this block
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date

dateDF = spark.range(10).alias('number')\
    .withColumn('today', current_date())\
    .withColumn('now', current_timestamp())
dateDF.createOrReplaceTempView('dateTable')
dateDF.printSchema()

dateDF.select(date_sub(col('today'), 5), date_add(col('today'), 5)).show(1)
dateDF.selectExpr(
    'date_sub(today, 5)',
    'date_add(today, 5)'
).show(1)
# select date_sub(today, 5), date_add(today, 5) from dateTable

# Another task is to take a look at the difference between two dates. We can do this with the datediff function
dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
    .select(datediff(col('week_ago'), col('today'))).show(1)
dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
    .selectExpr('datediff(week_ago, today)').show(1)
spark.sql('select datediff(week_ago, today) from (select *, date_sub(today, 7) as week_ago from dateTable)')
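# A short sketch of being explicit when parsing dates, as discussed above: to_date and
# to_timestamp with an explicit Java date pattern. The format string and sample values
# here are illustrative.
from pyspark.sql.functions import lit, to_date, to_timestamp

cleanDateDF = spark.range(1).select(
    to_date(lit('2017-12-11'), 'yyyy-dd-MM').alias('date'),                      # parsed as 2017-11-12
    to_timestamp(lit('2017-12-11 09:30:00'), 'yyyy-dd-MM HH:mm:ss').alias('ts'))
cleanDateDF.show(1, False)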
def process_sdf(sdf_drive, sdf_vehicle): sdf_join_drive_vehicle = sdf_drive.alias("drive").join(sdf_vehicle.alias("vehicle"), ["vehicle_id"]) sdf_join_drive_vehicle_fillna = sdf_join_drive_vehicle.fillna(0) sdf_drive_start_of_week = sdf_join_drive_vehicle_fillna.withColumn("week_start_date", \ (F.date_sub(F.next_day( F.from_utc_timestamp(F.col("datetime"), "America/New_York"), 'monday'), 7))) sdf_Active_horsepower = sdf_drive_start_of_week.withColumn("Active_horsepower" , (F.col("eng_load") / 255) \ * (F.col("max_torque") * F.col("rpm")) / 5252) # Horsepower utilization – Active horsepower / Max Horsepower sdf_Horsepower_utilization = sdf_Active_horsepower.withColumn("Horsepower_utilization", F.col("Active_horsepower") / F.col("max_horsepower")) # # Torque Utilization - calculated as Engine load/ 255 sdf_Torque_Utilization = sdf_Horsepower_utilization.withColumn("Torque_Utilization", F.col("eng_load") / 255) # # RPM Utilization – RPM / Maximum horsepower rpm sdf_RPM_Utilization = sdf_Torque_Utilization.withColumn("RPM_Utilization", F.col("rpm") / F.col("max_horsepower_rpm") ) sdf_engine_features = sdf_RPM_Utilization.withColumn("ft_torque_util_60pct_s", F.when((F.col("Torque_Utilization") >= 0.6) \ & (F.col("Torque_Utilization") < 0.7), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_torque_util_70pct_s", F.when((F.col("Torque_Utilization") >= 0.7) \ & (F.col("Torque_Utilization") < 0.8), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_torque_util_80pct_s", F.when((F.col("Torque_Utilization") >= 0.8) \ & (F.col("Torque_Utilization") < 0.9), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_torque_util_90pct_s", F.when((F.col("Torque_Utilization") >= 0.9) \ & (F.col("Torque_Utilization") < 1), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_horsepower_util_50pct_s", F.when((F.col("Horsepower_utilization") >= 0.5) \ & (F.col("Horsepower_utilization") < 0.6), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_horsepower_util_60pct_s", F.when((F.col("Horsepower_utilization") >= 0.6) \ & (F.col("Horsepower_utilization") < 0.7), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_horsepower_util_70pct_s", F.when((F.col("Horsepower_utilization") >= 0.7) \ & (F.col("Horsepower_utilization") < 0.8), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_horsepower_util_80pct_s", F.when((F.col("Horsepower_utilization") >= 0.8) \ & (F.col("Horsepower_utilization") < 0.9), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_rpm_util_50pct_s", F.when((F.col("RPM_Utilization") >= 0.5) \ & (F.col("RPM_Utilization") < 0.6), \ F.lit(1)).otherwise(F.lit(0))) \ .withColumn("ft_rpm_util_60pct_s", F.when((F.col("RPM_Utilization") >= 0.6) \ & (F.col("RPM_Utilization") < 0.7), \ F.lit(1)).otherwise(F.lit(0))) \ \ sdf_engine_features_total = sdf_engine_features.select("vehicle_id", "week_start_date", "datetime", \ "ft_torque_util_60pct_s", "ft_torque_util_70pct_s", "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \ "ft_horsepower_util_50pct_s", "ft_horsepower_util_60pct_s", "ft_horsepower_util_70pct_s", "ft_horsepower_util_80pct_s", \ "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s") sdf_sdf_engine_features_agg = sdf_engine_features_total.groupBy("vehicle_id", "week_start_date") \ .agg(F.sum("ft_torque_util_60pct_s").alias("ft_torque_util_60pct_s"), \ F.sum("ft_torque_util_70pct_s").alias("ft_torque_util_70pct_s"), \ F.sum("ft_torque_util_80pct_s").alias("ft_torque_util_80pct_s"), \ F.sum("ft_torque_util_90pct_s").alias("ft_torque_util_90pct_s"), \ 
F.sum("ft_horsepower_util_50pct_s").alias("ft_horsepower_util_50pct_s"), \ F.min("ft_horsepower_util_60pct_s").alias("ft_horsepower_util_60pct_s"), \ F.min("ft_horsepower_util_70pct_s").alias("ft_horsepower_util_70pct_s"), \ F.min("ft_horsepower_util_80pct_s").alias("ft_horsepower_util_80pct_s"), \ F.min("ft_rpm_util_50pct_s").alias("ft_rpm_util_50pct_s"), \ F.min("ft_rpm_util_60pct_s").alias("ft_rpm_util_60pct_s"), ) sdf_sdf_engine_features_final = sdf_sdf_engine_features_agg.select("vehicle_id", "week_start_date", \ "ft_torque_util_60pct_s", "ft_torque_util_70pct_s", "ft_torque_util_80pct_s", "ft_torque_util_90pct_s", \ "ft_horsepower_util_50pct_s", "ft_horsepower_util_60pct_s", "ft_horsepower_util_70pct_s", "ft_horsepower_util_80pct_s", \ "ft_rpm_util_50pct_s", "ft_rpm_util_60pct_s") sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.sort(F.col("vehicle_id"), F.col("week_start_date")) sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.withColumn("week_start_date", F.date_format(F.col("week_start_date"), "yyyy-MM-dd")) sdf_sdf_engine_features_final = sdf_sdf_engine_features_final.fillna(0) return sdf_sdf_engine_features_final
# In[15]: df.printSchema() # In[16]: df.show() # In[17]: df.withColumn("date_sub_10",F.date_sub("date",10)).show() # In[18]: df.withColumn("date_add_10",F.date_add("date",20)).show() # In[ ]: # difference in days between today and date columns # In[19]:
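# A hedged sketch for the empty cell above: the difference in days between today and the
# existing 'date' column (assumes df has a 'date' column and F is pyspark.sql.functions,
# as in the earlier cells).
df.withColumn("today", F.current_date())\
  .withColumn("days_between", F.datediff(F.col("today"), F.col("date")))\
  .show()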
retailOri = retail.select(col('CustomerID'), totalQty.alias('OriginalQty'))
# Spark SQL equivalent: select *, POWER((UnitPrice*Quantity), 2.0) + 5 as OriginalQty from dfTable

df = df.select(round(lit('2.5')), bround(lit('2.5')))  # from pyspark.sql.functions import lit, round, bround
# ans: 3 (round), 2 (bround)

# Pearson correlation (-1 to 1): 1 means the columns are linearly correlated, -1 means inversely related, 0 means no linear relation
from pyspark.sql.functions import corr
corrr = retailOri.select(corr('Quantity', 'UnitPrice'))

# regular expressions
from pyspark.sql.functions import regexp_replace, regexp_extract
retailexp = retail.select(regexp_replace(col('Description'), 'BLUE|ADVENT|WHITE', 'color'))
retailext = retail.select(regexp_extract(col('Description'), '(BLUE|ADVENT)', 1))

# date & time
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp
df = spark.range(5).withColumn('Today', current_date()).withColumn('now', current_timestamp())
dfdiff = df.withColumn('weekAgo', date_sub(col('Today'), 7)).withColumn('weekLater', date_add(col('Today'), 7)).select('*')
df = df.withColumn('diff', datediff(c1, c2))
dateFormat = 'yyyy-dd-MM'
dff = spark.range(1).withColumn('dates', to_date(lit("2019-20-12"), dateFormat)).select('dates')
dff = spark.range(1).withColumn('dates', to_timestamp(lit("2019-20-12"), dateFormat)).select('dates')

# working with nulls
df.na.drop('all')   # drop a row only if all of its values are null
df.na.drop('any')   # drop a row if any of its values is null
df.na.drop('all', subset=['c1', 'c2'])
# fill
fillALlCols = {'StockCode': 5, 'Description': 'It is null'}
df.na.fill(fillALlCols)
# replace
df.na.replace(['[]'], ['Its null'], 'Description')

# complex types
from pyspark.sql.functions import struct
complexDf = retail.select(struct('InvoiceNo', 'UnitPrice').alias('Complex'))
sc = SparkContext()
spark = SparkSession(sc)

dateDF = spark.range(10)\
    .withColumn("today", current_date())\
    .withColumn("now", current_timestamp())
dateDF.createTempView("dateTable")
dateDF.printSchema()

# 1 subtract or add days
print("1")
dateDF.select(
    date_sub(col("today"), 5),
    date_add(col("today"), 5))\
    .show(1)

# 2 difference between dates
print("2")
dateDF\
    .withColumn("week_ago", date_sub(col("today"), 7))\
    .select(datediff(col("week_ago"), col("today")))\
    .show(1)

# 3 months between two dates
print("3")
dateDF\
    .select(
        to_date(lit("2016-01-01")).alias("start"),
def _bin_time_stamp(self, origin: pd.Timestamp, ts_scol: Column) -> Column: sql_utils = SparkContext._active_spark_context._jvm.PythonSQLUtils origin_scol = F.lit(origin) (rule_code, n) = (self._offset.rule_code, self._offset.n ) # type: ignore[attr-defined] left_closed, right_closed = (self._closed == "left", self._closed == "right") left_labeled, right_labeled = (self._label == "left", self._label == "right") if rule_code == "A-DEC": assert (origin.month == 12 and origin.day == 31 and origin.hour == 0 and origin.minute == 0 and origin.second == 0) diff = F.year(ts_scol) - F.year(origin_scol) mod = F.lit(0) if n == 1 else (diff % n) edge_cond = (mod == 0) & (F.month(ts_scol) == 12) & (F.dayofmonth(ts_scol) == 31) edge_label = F.year(ts_scol) if left_closed and right_labeled: edge_label += n elif right_closed and left_labeled: edge_label -= n if left_labeled: non_edge_label = F.when(mod == 0, F.year(ts_scol) - n).otherwise(F.year(ts_scol) - mod) else: non_edge_label = F.when( mod == 0, F.year(ts_scol)).otherwise(F.year(ts_scol) - (mod - n)) return F.to_timestamp( F.make_date( F.when(edge_cond, edge_label).otherwise(non_edge_label), F.lit(12), F.lit(31))) elif rule_code == "M": assert (origin.is_month_end and origin.hour == 0 and origin.minute == 0 and origin.second == 0) diff = ((F.year(ts_scol) - F.year(origin_scol)) * 12 + F.month(ts_scol) - F.month(origin_scol)) mod = F.lit(0) if n == 1 else (diff % n) edge_cond = (mod == 0) & (F.dayofmonth(ts_scol) == F.dayofmonth( F.last_day(ts_scol))) truncated_ts_scol = F.date_trunc("MONTH", ts_scol) edge_label = truncated_ts_scol if left_closed and right_labeled: edge_label += sql_utils.makeInterval("MONTH", F.lit(n)._jc) elif right_closed and left_labeled: edge_label -= sql_utils.makeInterval("MONTH", F.lit(n)._jc) if left_labeled: non_edge_label = F.when( mod == 0, truncated_ts_scol - sql_utils.makeInterval("MONTH", F.lit(n)._jc), ).otherwise(truncated_ts_scol - sql_utils.makeInterval("MONTH", mod._jc)) else: non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise( truncated_ts_scol - sql_utils.makeInterval("MONTH", (mod - n)._jc)) return F.to_timestamp( F.last_day( F.when(edge_cond, edge_label).otherwise(non_edge_label))) elif rule_code == "D": assert origin.hour == 0 and origin.minute == 0 and origin.second == 0 if n == 1: # NOTE: the logic to process '1D' is different from the cases with n>1, # since hour/minute/second parts are taken into account to determine edges! 
edge_cond = ((F.hour(ts_scol) == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0)) if left_closed and left_labeled: return F.date_trunc("DAY", ts_scol) elif left_closed and right_labeled: return F.date_trunc("DAY", F.date_add(ts_scol, 1)) elif right_closed and left_labeled: return F.when(edge_cond, F.date_trunc("DAY", F.date_sub( ts_scol, 1))).otherwise( F.date_trunc("DAY", ts_scol)) else: return F.when(edge_cond, F.date_trunc("DAY", ts_scol)).otherwise( F.date_trunc("DAY", F.date_add(ts_scol, 1))) else: diff = F.datediff(end=ts_scol, start=origin_scol) mod = diff % n edge_cond = mod == 0 truncated_ts_scol = F.date_trunc("DAY", ts_scol) edge_label = truncated_ts_scol if left_closed and right_labeled: edge_label = F.date_add(truncated_ts_scol, n) elif right_closed and left_labeled: edge_label = F.date_sub(truncated_ts_scol, n) if left_labeled: non_edge_label = F.date_sub(truncated_ts_scol, mod) else: non_edge_label = F.date_sub(truncated_ts_scol, mod - n) return F.when(edge_cond, edge_label).otherwise(non_edge_label) elif rule_code in ["H", "T", "S"]: unit_mapping = {"H": "HOUR", "T": "MINUTE", "S": "SECOND"} unit_str = unit_mapping[rule_code] truncated_ts_scol = F.date_trunc(unit_str, ts_scol) diff = sql_utils.timestampDiff(unit_str, origin_scol._jc, truncated_ts_scol._jc) mod = F.lit(0) if n == 1 else (diff % F.lit(n)) if rule_code == "H": assert origin.minute == 0 and origin.second == 0 edge_cond = (mod == 0) & (F.minute(ts_scol) == 0) & (F.second(ts_scol) == 0) elif rule_code == "T": assert origin.second == 0 edge_cond = (mod == 0) & (F.second(ts_scol) == 0) else: edge_cond = mod == 0 edge_label = truncated_ts_scol if left_closed and right_labeled: edge_label += sql_utils.makeInterval(unit_str, F.lit(n)._jc) elif right_closed and left_labeled: edge_label -= sql_utils.makeInterval(unit_str, F.lit(n)._jc) if left_labeled: non_edge_label = F.when(mod == 0, truncated_ts_scol).otherwise( truncated_ts_scol - sql_utils.makeInterval(unit_str, mod._jc)) else: non_edge_label = F.when( mod == 0, truncated_ts_scol + sql_utils.makeInterval(unit_str, F.lit(n)._jc), ).otherwise(truncated_ts_scol - sql_utils.makeInterval(unit_str, (mod - n)._jc)) return F.when(edge_cond, edge_label).otherwise(non_edge_label) else: raise ValueError("Got the unexpected unit {}".format(rule_code))
df_merge.src_id.alias('id'), df_merge.src_attr.alias('attr'), lit(True).alias('is_current'), lit(False).alias('is_deleted'), df_merge.src_start_date.alias('start_date'), df_merge.src_end_date.alias('end_date')) # For records that needs to be deleted df_merge_p3 = df_merge.filter( df_merge.action == 'DELETE').select(column_names).withColumn( 'is_current', lit(False)).withColumn('is_deleted', lit(True)) # For records that needs to be expired and then inserted df_merge_p4_1 = df_merge.filter(df_merge.action == 'UPSERT').select( df_merge.src_id.alias('id'), df_merge.src_attr.alias('attr'), lit(True).alias('is_current'), lit(False).alias('is_deleted'), df_merge.src_start_date.alias('start_date'), df_merge.src_end_date.alias('end_date')) df_merge_p4_2 = df_merge.filter(df_merge.action == 'UPSERT').withColumn( 'end_date', date_sub(df_merge.src_start_date, 1)).withColumn( 'is_current', lit(False)).withColumn('is_deleted', lit(False)).select(column_names) # Union all records together df_merge_final = df_merge_p1.unionAll(df_merge_p2).unionAll( df_merge_p3).unionAll(df_merge_p4_1).unionAll(df_merge_p4_2) df_merge_final.orderBy(['id', 'start_date']).show() # At last, you can overwrite existing data using this new data frame. # ... # COMMAND ---------- DF
df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)), trim(lit(string_with_space))).show()

# regular expressions

# working with dates and timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()
dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()

# convert string to date; the default format is 'yyyy-MM-dd'
spark.range(1).select(
    to_date(lit("2019-02-01")).alias("start_date"),
    to_date(lit("2019-03-06")).alias("end_date")).select(
        datediff(col("start_date"), col("end_date"))).show()

cleanDateDF = spark.range(1).withColumn("date1", current_date())
date_format = 'yyyy-MM-dd'  # Java date pattern: lowercase 'yyyy' and 'dd', 'MM' for month
cleanDateDF.select(to_timestamp(col("date1"), date_format)).show()

# working with nulls in data
# coalesce returns the first non-null value from a set of columns; see the sketch below
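# A minimal sketch of coalesce on a toy DataFrame; the column names are illustrative.
from pyspark.sql.functions import coalesce, col, lit

null_df = spark.createDataFrame(
    [(None, "fallback1"), ("primary2", "fallback2"), (None, None)],
    ["primary", "fallback"])
# Take the first non-null value among primary, fallback, then a literal default
null_df.select(coalesce(col("primary"), col("fallback"), lit("default")).alias("value")).show()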
def udf_get_sunday(date):
    # next_day returns the first Sunday strictly after `date`; subtracting 7 days
    # therefore yields the most recent Sunday on or before `date`.
    return F.date_sub(F.next_day(date, "sunday"), 7)
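# A small illustrative check of udf_get_sunday (despite the name it is a plain column
# expression helper, not a registered UDF). Assumes an active SparkSession named `spark`
# and F as pyspark.sql.functions; the sample dates are arbitrary.
demo = spark.createDataFrame(
    [("2021-12-27",), ("2021-12-26",)], ["d"]).withColumn("d", F.to_date("d"))
# 2021-12-27 is a Monday -> 2021-12-26; 2021-12-26 is already a Sunday -> unchanged
demo.withColumn("week_sunday", udf_get_sunday(F.col("d"))).show()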
.select("Description").show(3, False) # COMMAND ---------- from pyspark.sql.functions import current_date, current_timestamp dateDF = spark.range(10)\ .withColumn("today", current_date())\ .withColumn("now", current_timestamp()) dateDF.createOrReplaceTempView("dateTable") # COMMAND ---------- from pyspark.sql.functions import date_add, date_sub dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1) # COMMAND ---------- from pyspark.sql.functions import datediff, months_between, to_date dateDF.withColumn("week_ago", date_sub(col("today"), 7))\ .select(datediff(col("week_ago"), col("today"))).show(1) dateDF.select( to_date(lit("2016-01-01")).alias("start"), to_date(lit("2017-05-22")).alias("end"))\ .select(months_between(col("start"), col("end"))).show(1) # COMMAND ----------
fileLog("bc data has been read successfully") bc = bc.orderBy('customer_ID','meter_reading_date') # discarding agreements who have only 1 meter_reading_date : Also combining the step to find max reading date per concat_agreement bc = bc.withColumn('CONCAT_AGMNT_NO',F.concat(F.col('CUSTOMER_ID'),F.col('ACCOUNT_SEQ'),F.col('AGREEMENT_SEQ')).cast(LongType())) grouped = bc.groupBy('CONCAT_AGMNT_NO').agg(count('meter_reading_date').alias('agr_count'),max('meter_reading_date').alias('Billing1_End_Date')).where(F.col('agr_count') > 1) # join it with bc to get full column details df1 = bc.alias('df1') df2 = grouped.alias('df2') bc_req = df1.join(df2,['CONCAT_AGMNT_NO']).select('df1.*',df2.Billing1_End_Date) billingcycle = bc_req # get max date for every agreement billingcycle = bc_req.groupBy('CONCAT_AGMNT_NO','customer_id','account_seq','agreement_seq').agg(max('meter_reading_date').alias('Billing1_End_Date')) # generate last 13 billing cycles billingcycle = billingcycle.withColumn('Billing1_Start_Date',date_sub(F.col('Billing1_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing2_End_Date',date_sub(F.col('Billing1_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing2_Start_Date',date_sub(F.col('Billing2_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing3_End_Date',date_sub(F.col('Billing2_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing3_Start_Date',date_sub(F.col('Billing3_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing4_End_Date',date_sub(F.col('Billing3_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing4_Start_Date',date_sub(F.col('Billing4_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing5_End_Date',date_sub(F.col('Billing4_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing5_Start_Date',date_sub(F.col('Billing5_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing6_End_Date',date_sub(F.col('Billing5_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing6_Start_Date',date_sub(F.col('Billing6_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing7_End_Date',date_sub(F.col('Billing6_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing7_Start_Date',date_sub(F.col('Billing7_End_Date'),29)) billingcycle = billingcycle.withColumn('Billing8_End_Date',date_sub(F.col('Billing7_Start_Date'),1)) billingcycle = billingcycle.withColumn('Billing8_Start_Date',date_sub(F.col('Billing8_End_Date'),29))
df2 = grouped.alias('df2') bc_req = df1.join(df2, ['CONCAT_AGMNT_NO']).select('df1.*', df2.Billing1_End_Date) billingcycle = bc_req fileLog( "counts of unique aggreements in billing cycle before joining are: " + str(billingcycle.select('CONCAT_AGMNT_NO').distinct().count())) # get max date for every agreement billingcycle = bc_req.groupBy( 'CONCAT_AGMNT_NO', 'customer_id', 'account_seq', 'agreement_seq').agg( max('meter_reading_date').alias('Billing1_End_Date')) # generate last 13 billing cycles billingcycle = billingcycle.withColumn( 'Billing1_Start_Date', date_sub(F.col('Billing1_End_Date'), 29)) billingcycle = billingcycle.withColumn( 'Billing2_End_Date', date_sub(F.col('Billing1_Start_Date'), 1)) billingcycle = billingcycle.withColumn( 'Billing2_Start_Date', date_sub(F.col('Billing2_End_Date'), 29)) billingcycle = billingcycle.withColumn( 'Next_Billing_StartDate', date_add(F.col('Billing1_End_Date'), 1)) billingcycle = billingcycle.withColumn( 'Next_Billing_EndDate', date_add(F.col('Next_Billing_StartDate'), 29)) billingcycle = billingcycle.select( 'CONCAT_AGMNT_NO', 'CUSTOMER_ID', 'ACCOUNT_SEQ', 'AGREEMENT_SEQ', 'Billing1_Start_Date', 'Billing1_End_Date', 'Billing2_Start_Date', 'Billing2_End_Date', 'Next_Billing_StartDate', 'Next_Billing_EndDate') fileLog("generated bc1, bc2 and bc14 for prediction")
pivot_dt = date(2020, 12, 1)
first_dt = date(2020, 10, 1)

# TODO create a function that constructs path2conf from the environment
path2conf = "../conf"
# uncomment if run as py
# path2conf = "conf"

spark = create_spark_session('dataset_creation',
                             n_executors=16,
                             n_cores=8,
                             executor_memory=32,
                             driver_memory=64,
                             )
partners = dict_from_file(f"{path2conf}/partners.json")

aggr = spark.table("sbx_t_team_mp_cmpn_ds.day_aggr")
aggr = aggr.where(F.col("report_dt") < last_dt)
aggr = aggr.where(F.col("client_city").isin(get_list_of_cities(partner, path2conf=path2conf)))

sales = spark.table("sbx_t_team_mp_cmpn_ds.dm_partner_sales")
sales = sales.where(F.col("partner_name") == partners[partner])
sales = sales.where((F.col("evt_dt") < last_dt) & (F.col("evt_dt") >= first_dt))
sales = sales.withColumn("target", F.lit(1))
sales = sales.withColumn("report_dt", F.date_sub('evt_dt', 3))
# sales = sales.withColumnRenamed("evt_dt", "report_dt")
sales = sales.select("epk_id", "report_dt", "target")

# TODO create a better algorithm for building the dataset
dataset = aggr.join(sales, ['epk_id', "report_dt"], how="left")
dataset.select(F.sum("target").alias('amount_of')).show()
dataset = dataset.fillna({"target": 0})
dataset = dataset.where(F.col("report_dt").isNotNull())

train = dataset.where(F.col("report_dt") < pivot_dt).sampleBy("target", fractions={1: 0.05, 0: 0.95})
# sdf2cluster(train, f'{partner}_train')
spark.sql(f"drop table if exists sbx_t_team_mp_cmpn_ds.{partner}_train")
delete_folder_from_ps(f'{partner}_train')
save2ps(train, f'{partner}_train', partition="report_dt")

oot = dataset.where(F.col("report_dt") >= pivot_dt).sampleBy("target", fractions={1: 0.03, 0: 0.97})
# sdf2cluster(oot, f'{partner}_oot')