def baseline_model(self, df_processed): cond_dvs = F.when(col("dvs_p30days") > 400, 400).otherwise(col("dvs_p30days")) stats=df_processed\ .withColumn("dvs_p30days",cond_dvs)\ .filter(col("recent_event_date")==1)\ .filter(col("event_date")>=F.date_add(F.current_date(),-15))\ .groupby(['grt_l2_cat_name'])\ .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days") ,F.stddev(col("dvs_p30days")).alias("std_deal_view") ,F.round(F.avg(col("dvs_p30days")).cast('integer')).alias("avg_deal_view") ,F.max("dvs_p30days").alias("max_deal_view")) w = Window.partitionBy(F.col('consumer_id')).orderBy( F.col('normalized_dvs_p30days').desc()) df_final=df_processed\ .filter(col("recent_event_date")==1)\ .filter(col("event_date")>=F.date_add(F.current_date(),-15))\ .join(stats,on='grt_l2_cat_name')\ .withColumn('normalized_dvs_p30days',(col('dvs_p30days')-col('avg_deal_view'))/col('std_deal_view'))\ .withColumn('normalized_dvs_p30days_rank',F.row_number().over(w)) df_micro=df_final\ .filter(col('normalized_dvs_p30days')>=0)\ .filter(col('normalized_dvs_p30days_rank')==1)\ .filter(col('grt_l2_purchaser_14d')==0) return df_micro
def age_calculator(self, columns, age_colname): """ @columns: a string of column name @age_colname: a string of the new column of age, e.g.'age' """ assert isinstance(columns, str), "Error: columns argument must be a string!" assert isinstance( age_colname, str), "Error: the name for age column argument must be a string!" df = self._df.select( columns, from_unixtime(unix_timestamp(columns, 'MM/dd/yyy')).alias('with time')) df = df.withColumn('without time', df['with time'].cast(DateType())) df = df.withColumnRenamed(columns, 'tmp') df = df.withColumn(age_colname, ( (datediff(current_date(), col('without time')).cast(FloatType())) / 365).cast(IntegerType())) self._df = self._df.join( df, col(columns) == col('tmp'), 'leftouter').drop('tmp').drop('with time').drop('without time') return self
def _preprocess(self): self._df = self._df[self._REQUIRED_COLUMNS] self._df = self._df.toDF(*self._HEADER_NAMES) self._df = self._df.withColumn("logged_datetime", to_timestamp( self._df.logged_datetime, 'yyyy/MM/dd')) self._df = self._df.withColumn("processed_datetime", current_date()) self._df = self._df.withColumn('processed_datetime', to_timestamp(self._df.processed_datetime)) self._df = self._df.withColumn( "source_port", self._df["source_port"].cast(IntegerType())) self._df = self._df.withColumn( "destination_port", self._df["destination_port"].cast(IntegerType())) self._df = self._df.withColumn( "bytes_sent", self._df["bytes_sent"].cast(LongType())) self._df = self._df.withColumn( "bytes_received", self._df["bytes_received"].cast(LongType())) self._df = self._df.withColumn( "packets_received", self._df["packets_received"].cast(LongType())) self._df = self._df.withColumn( "packets_sent", self._df["packets_sent"].cast(LongType())) self._df = self._df.withColumn( "time_elapsed", self._df["time_elapsed"].cast(LongType())) self._df = self._df.withColumn( "repeat_count", self._df["repeat_count"].cast(LongType())) self._df = self._df.withColumn("count_events", lit(1))
def ageCalculate(self, column, dateFormat, nameColAge): """ This method compute the age of clients based on their born dates. :param column Name of the column born dates column. :param dateFormat String format date of the column provided. :param nameColAge Name of the new column, the new columns is the resulting column of ages. """ # Check if column argument a string datatype: self.__assertTypeStr(column, "column") # Check if dateFormat argument a string datatype: self.__assertTypeStr(dateFormat, "dateFormat") # Asserting if column if in dataFrame: assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame" # Output format date Format = "yyyy-MM-dd" # Some SimpleDateFormat string exprs = format_number( mag( months_between( date_format( unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(nameColAge) self.__df = self.__df.withColumn(nameColAge, exprs) self.__addTransformation() # checkpoint in case return self
def process_expense_file(file_path): tempDf = spark.read.load(file_path, format="csv", header="true", escape='"') tempDf = tempDf.filter(tempDf["Tags"].isNotNull()).select( col("SubscriptionName"), col("ResourceGroup"), col("ResourceLocation"), col("ProductName"), col("PreTaxCost"), col("ResourceType"), col("UsageDateTime"), split(col("Tags"), ",").alias("tags") ) return tempDf.withColumn("costCenterNumber", lit(get_cost_center(tempDf["tags"])))\ .withColumn("team", lit(get_team(tempDf["tags"])))\ .withColumn("usageMonth", month(tempDf['UsageDateTime']))\ .withColumn("usageYear", year(tempDf['UsageDateTime']))\ .withColumn("processedDate", current_date())\ .select( col("processedDate"), col("SubscriptionName"), col("ResourceGroup"), col("ResourceLocation"), col("ProductName"), col("ResourceType"), col("costCenterNumber"), col("team"), col("usageMonth"), col("usageYear"), col("UsageDateTime"), col("PreTaxCost") ).filter("costCenterNumber is not NULL")
def put(self, df): """Writes data to final CSV file and stops spark session. Arguments: df[spark datafarme] -- spark dataframe that """ try: self.logger.info('Begin putting data to disk // {}'.format( self.etl_id)) jdbc_params = self._inst_jdbc_params() jdbc_params['dbtable'] = 'final_data' df = df.withColumn('process_date', F.current_date()) df.write.format('jdbc').options(**jdbc_params)\ .mode('overwrite')\ .save() self.logger.info('Job complete // {}'.format(self.etl_id)) self.spark_session.stop() except Exception as e: self.logger.error('{} // {}'.format(e, self.etl_id))
def getReadRDBMSDataFrame(): from_format = "dd/MMM/yyyy:HH:mm:ss" to_format = 'dd-MM-yyyy' storeIncrementalValue(0, "ignore") df = spark.read.format("csv") \ .option("inferSchema", True) \ .option("header", True) \ .load(argv[17]) df = df.filter(df.createdt.isNotNull() & (df.createdt != "null")) result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \ .withColumn("value",f.col("value").cast(f.StringType()))\ .withColumn("score",f.col("score").cast(f.StringType()))\ .withColumn("regioncode",f.col("regioncode").cast(f.StringType()))\ .withColumn("status",f.col("status").cast(f.StringType()))\ .withColumn("count",f.col("count").cast(f.StringType()))\ .withColumn("statuscode",f.col("statuscode").cast(f.StringType()))\ .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode", "status", "method", "key", "count", "type", "site", "statuscode") \ .withColumn("rdbms_current_date", f.current_date()) \ .filter(f.col("id") > getIncrementalValue()) result = result.withColumn( "id", f.col("id").cast(f.StringType())).na.fill("Not_Applicable") max_value = df.agg({"id": "max"}).collect()[0] storeIncrementalValue(max_value["max(id)"], "overwrite") result.printSchema() return result
def ageCalculate(self, column, dateFormat, nameColAge): """ This method compute the age of clients based on their born dates. :param column Name of the column born dates column. :param dateFormat String format date of the column provided. :param nameColAge Name of the new column, the new columns is the resulting column of ages. """ # Check if column argument a string datatype: self.__assertTypeStr(column, "column") # Check if dateFormat argument a string datatype: self.__assertTypeStr(dateFormat, "dateFormat") # Asserting if column if in dataFrame: assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame" # Output format date Format = "yyyy-MM-dd" # Some SimpleDateFormat string exprs = format_number( mag( months_between(date_format( unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias( nameColAge) self.__df = self.__df.withColumn(nameColAge, exprs) self.__addTransformation() # checkpoint in case return self
def county_reality_supply(): # 各区县各档位该品规上周投放量 try: print(f"{str(dt.now())} 各区县各档位该品规上周投放量") plm_item = get_plm_item(spark).select("item_id", "item_name") co_cust = get_co_cust(spark).select("cust_id", "sale_center_id", "cust_seg") area = get_area(spark) # com_id与city的映射关系 city = area.dropDuplicates(["com_id"]).select("com_id", "city") # sale_center_id与区(list)的映射关系 county = area.groupBy("sale_center_id") \ .agg(f.collect_list("county").alias("county")) \ .select("sale_center_id", "county") # 获取上周实际投放量 # cust_item_spw = spark.sql( # "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \ # .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \ # .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \ # .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \ # .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \ # .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\ # .join(co_cust,"cust_id") cust_item_spw = spark.sql( "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \ .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \ .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \ .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \ .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \ .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\ .join(co_cust,"cust_id") #需要计算的值的列名 colName = "county_gauge_week_volume" result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \ .agg(f.sum("qty_allocco").alias(colName)) columns = [ "com_id", "city", "sale_center_id", "county", "gears", "gauge_id", "gauge_name", "city", "gears_data_marker", colName ] result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \ .withColumn("gears_data_marker", f.lit("4")) \ .join(plm_item, "item_id") \ .join(city, "com_id") \ .join(county,"sale_center_id")\ .withColumnRenamed("item_id","gauge_id")\ .withColumnRenamed("item_name","gauge_name")\ .withColumnRenamed("cust_seg","gears")\ .foreachPartition(lambda x: write_hbase1(x, columns, hbase)) except Exception: tb.print_exc()
def get_crm_cust_log(spark): #crm_cust_log 变更记录表 # change_type: CO_CUST.STATUS 状态变更 # CO_CUST.CUST_SEG 档位变更 crm_cust_log = spark.sql("select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \ .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \ .withColumn("day_diff",f.datediff(f.current_date(), col("audit_date"))) \ .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \ .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31")) return crm_cust_log
def create_oracle_output(): df_hcp_match_final = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.path')) df_hcp_yesterdays_data = \ spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.path')) \ .filter(F.col('full_mk').isNotNull()) \ .filter(F.col('prop_mk').isNotNull()) \ .filter(F.col('pnr_mk').isNotNull()) join_fields = ['lastmodified_datetime', 'full_mk', 'prop_mk', 'pnr_mk', 'match_level', 'new_rate_bucket'] df_hcp_yesterday = df_hcp_yesterdays_data \ .withColumn('match_flag', F.lit('Y')) \ .join(df_hcp_match_final.select(join_fields), on=join_fields) \ .persist() df_hcp_today = df_hcp_match_final \ .withColumn('dl_last_modified_date', F.current_date()) \ .withColumn('rk', F.lit(1)) \ .withColumn('rk_f', F.lit(1)) \ .join(df_hcp_yesterday.select(join_fields + ['match_flag']), on=join_fields, how='left') \ .filter(F.col('match_flag').isNull()) # TODO: Remove redundant fields and remnants of deduping keys which only contain a single value df_hcp_oracle = \ cast_fields_for_oracle_output(df_hcp_yesterday) \ .union(cast_fields_for_oracle_output(df_hcp_today)) \ .withColumn('dwh_rk', F.row_number().over(Window .partitionBy(F.col('pnr_mk')) .orderBy(F.col('pnr_mk').desc()))) \ .filter(F.col('dwh_rk') == 1) \ .drop('dwh_rk') # TODO: Investigate partitioning of Oracle data suitable for end-user # Dump to staging path since Spark cannot simultaneously read and write to the same path dump_dataframe(df=df_hcp_oracle, path=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'), table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.table')) df_hcp_yesterday.unpersist() df_hcp_oracle = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path')) dump_dataframe(df=df_hcp_oracle, path=config.get(config_set, 'hcp.txns.new.fin.f.d2.path'), table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.table')) # Oracle table for EDM dump_dataframe(df=df_hcp_oracle .withColumnRenamed('rate_bucket', 'original_rate_bucket') .withColumnRenamed('new_rate_bucket', 'rate_bucket'), path=config.get(config_set, 'hcp.txns.to.dwh.f.path'), table_name=config.get(config_set, 'hcp.txns.to.dwh.f.table') )
def adobe_traffic_by_channel(df_traffic_by_channel): return (df_traffic_by_channel.select( df_traffic_by_channel['id'], df_traffic_by_channel['createdat'], df_traffic_by_channel['reportsuiteid'], df_traffic_by_channel['reportsuitename'], df_traffic_by_channel['server'], df_traffic_by_channel['data'], df_traffic_by_channel['hour'], df_traffic_by_channel['channel'], df_traffic_by_channel['visits'], df_traffic_by_channel['orders'], df_traffic_by_channel['ordersserialized'], df_traffic_by_channel['revenue']).withColumn( 'dataprocessamento', f.trunc(f.current_date(), 'month')))
def prepare_features(df): df = df.withColumn( 'exclam', length('review_body') - length(regexp_replace('review_body', '\!', ''))) df = df.withColumn('age', datediff(current_date(), to_date(df['review_date']))) df = df.withColumn('review_length', length(df['review_body'])) df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes']) df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)")) select_cols = df.select( ['star_rating', 'helfulness', 'age', 'review_length', 'label']).na.fill(0) return select_cols
def merge(spark, update, tableName, cols, key): """ 将DataFrame和delta表进行merge操作,insert操作要求DataFrame必须包含delta表所有的列(0.5版本) 当我们使用merge操作更新/插入delta表其中几列时,指定在DataFrame中不存在的列的值为null。 注:DataFrame中要写入delta表的列要和delta表一样 :param spark,SparkSession实例 :param update,spark DataFrame :param tableName,要更新的delta表 """ # 如果没有dt列,创建当前日期的dt列 if "dt" not in cols: update = update.withColumn("dt", f.current_date()) cols.append("dt") # 1.构建merge条件 mergeExpr = f"origin.{key}=update.{key}" print(f"merge expression:{mergeExpr}") # 2.构建更新表达式 updateExpr = {} for c in cols: updateExpr[c] = f"update.{c}" print(f"update expression:{updateExpr}") origin = DeltaTable.forPath(spark, tableName) origin_cols = origin.toDF().columns # 3.构建插入表达式 insertExpr = {} for origin_col in origin_cols: if origin_col in cols: insertExpr[origin_col] = f"update.{origin_col}" else: # 不存在,插入null值(不是字符串) insertExpr[origin_col] = "null" print(f"insert expression:{insertExpr}") # for origin_col in origin_cols: # if origin_col not in cols: # update=update.withColumn(origin_col,f.lit(None)) origin.alias("origin") \ .merge(update.alias("update"), mergeExpr) \ .whenMatchedUpdate(set=updateExpr) \ .whenNotMatchedInsert(values=insertExpr) \ .execute()
def test_current_date(self, input_df, spark_session): """Substitute the cleansed values with the current date""" cleaning_definitions = dict(status=dict( elements=["active", "inactive"], default=F.current_date())) expected_output_df = spark_session.createDataFrame([ Row(id=1, status="active"), Row(id=2, status=str(dt.date.today())), Row(id=3, status=str(dt.date.today())), Row(id=4, status="inactive"), Row(id=5, status=None), Row(id=6, status=str(dt.date.today())), ]) output_df = EnumCleaner( cleaning_definitions=cleaning_definitions).transform(input_df) assert_df_equality(expected_output_df, output_df)
def _years_between(_new_col_name, attr): _date_format = attr[0] _col_name = attr[1] return F.format_number( F.abs( F.months_between( F.date_format( F.unix_timestamp( _col_name, _date_format).cast("timestamp"), format_dt), F.current_date()) / 12), 4) \ .alias( _new_col_name)
def getRandomUserWebApiDataFrame(): final_list = [] user_set = set() results_json = dict() randomuserurl = argv[7] ALLOWED_RANDOM_USER = int(argv[8]) while True: resp = json.loads( urllib.request.urlopen(randomuserurl).read().decode('utf-8')) for randomuser in resp['results']: username = removeDigits(randomuser.get('user').get('username')) if username not in user_set: user_set.add(username) final_list.append(randomuser) if len(user_set) == ALLOWED_RANDOM_USER: break if len(user_set) == ALLOWED_RANDOM_USER: break results_json["results"] = final_list json_df = spark.read.json(sc.parallelize([results_json])) users = json_df.withColumn("results", f.explode(f.col("results"))) \ .withColumn("r_username", f.regexp_replace("results.user.username", '[0-9]', '')) \ .withColumn("dob", f.from_unixtime(f.unix_timestamp(f.col('results.user.dob').cast('timestamp')), 'dd-MM-yyyy')) \ .withColumn("user_current_date", f.current_date()) \ .select("r_username", "results.user.gender", "results.user.name.title", f.col("results.user.name.first").alias("firstname"), f.col("results.user.name.last").alias("lastname"), f.col("results.user.location.street").alias("location_street"), f.col("results.user.location.city").alias("location_city"), f.col("results.user.location.state").alias("location_state"), f.col("results.user.location.zip", ).alias("location_zip"), "results.user.email", "results.user.password", "results.user.salt", "results.user.md5", "results.user.sha1", "results.user.sha256", "results.user.registered", "dob", "results.user.phone", "results.user.cell", f.col("results.user.picture.large").alias("picture_large"), f.col("results.user.picture.medium").alias("picture_medium"), "results.user.picture.thumbnail", "user_current_date") return users
def main(inp, outp): listings = spark.read.csv(inp, header=True, schema=schema) listings = listings.drop('host_id', 'host_name', 'neighbourhood_group', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365') # listings.describe(['minimum_nights']).show() # Give a range to minimum_nights listings = listings.filter("minimum_nights > 0 AND minimum_nights <= 30") # Remove listings with last_review posted more than 2 years ago listings = listings.withColumn('curr', f.current_date()) listings = listings.withColumn( 'diff', f.datediff(f.col('curr'), f.col('last_review'))) cleaned_listings = listings.filter(listings['diff'] < 730).drop( 'curr', 'diff', 'number_of_reviews', 'last_review') ''' Coalescion is fine, as listing data only contains airbnb locations in Vancouver ''' cleaned_listings.coalesce(1).write.csv(outp, mode='overwrite', header=True)
def createAgeDateColumns(self, df): """Criar colunas de idade do cliente e data do pedido. Args: df: Dataframe Returns: Dataframe. """ log( "INFO", f"[createAgeDateColumns] Criando colunas de idade e data do pedido." ) return (df.withColumn( "data_pedido_date", date_format( from_unixtime(col('data_pedido')), "yyyy-MM-dd")).withColumn('idade', (months_between( current_date(), col('data_nascimento_cliente')) / 12).cast( IntegerType())))
def get_grade_cons_except(): print(f"{str(dt.now())} 近30天零售户档位与周边消费水平不符") try: consume_level=get_all_consume_level(spark)\ .select("city","cust_id","consume_level") cust_cluster=get_cust_cluster(spark) #只展示目前有效的零售户 去掉 过去有效 现在无效的零售户 valid_co_cust=get_valid_co_cust(spark).select("cust_id","sale_center_id") #近30天零售户档位 一个零售户近30天内多种档位,分别预警 cust_seg=spark.sql("select * from DB2_DB2INST1_CO_CUST")\ .where(f.datediff(f.current_date(),col("dt"))<=30)\ .where(col("com_id").isin(["011114305","011114306","011114302"]))\ .withColumn("cust_seg",f.regexp_replace("cust_seg","ZZ","31"))\ .select("com_id","cust_id","cust_seg")\ .join(valid_co_cust,"cust_id")\ .dropDuplicates(["cust_id","cust_seg"]) cols = {"value": "retail_rim_cons", "level1_code": "classify_level1_code", } cust_seg_cons=cust_seg.join(consume_level,"cust_id")\ .withColumnRenamed("consume_level",cols["value"])\ .join(cust_cluster,"cust_id")\ .select("city","cust_id","cust_seg","cluster_index","sale_center_id",cols["value"]) result=except_grade(cust_seg_cons,cols,["city","cust_seg","cluster_index"],[3,4,5])\ .withColumnRenamed("cust_seg","retail_grade")\ .withColumnRenamed("mean","city_grade_cons_avg") \ .withColumn(cols["level1_code"], f.lit("YJFL003")) values = [ "avg_orders_plus3", "avg_orders_minu3", "avg_orders_plus4", "avg_orders_minu4", "avg_orders_plus5", "avg_orders_minu5", "warning_level_code", "cust_id", "city", "sale_center_id", "retail_grade", "city_grade_cons_avg" ] + list(cols.values()) result.foreachPartition(lambda x:write_hbase2(x,values,hbase)) except Exception: tb.print_exc()
def execute(self): """ Method overrides the execute method of the abstract Task class. Method classifies the recipe based on the difficulty. Difficulty is classified based on the total time required for prepping and cooking the food. If sum(prep time + cooking time) > 1 Hour is as Hard. If sum(prep time + cooking time) > 30 minutes and less than an hour is as Medium If sum(prep time + cooking time) < 30 minutes as Easy As 'Unknown' for all other cases. :return: pyspark.sql.DataFrame Returns a Spark's DataFrame with the transformation for the recipe. """ return self.data_frame \ .withColumn('cookTimeInMinutes', when(lower(col('cookTime')).endswith('h'), regexp_extract('cookTime', '\d+', 0) * 60) .when(lower(col('cookTime')).endswith('m'), regexp_extract('cookTime', '\d+', 0)).otherwise("")) \ .withColumn('prepTimeInMinutes', when(lower(col('prepTime')).endswith('h'), regexp_extract('prepTime', '\d+', 0) * 60) .when(lower(col('prepTime')).endswith('m'), regexp_extract('prepTime', '\d+', 0)).otherwise("")) \ .withColumn('difficulty', when(expr('(cookTimeInMinutes + prepTimeInMinutes) > 60'), "Hard") .when(expr('(cookTimeInMinutes + prepTimeInMinutes) <= 60 and ' '(cookTimeInMinutes + prepTimeInMinutes) >= 30'), 'Medium') .when(expr('(cookTimeInMinutes + prepTimeInMinutes) < 30'), "Easy") .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes > 60'), 'Hard') .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes >= 30 and prepTimeInMinutes <= 60'), 'Medium') .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes < 30'), 'Easy') .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes > 60'), 'Hard') .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes >= 30 and cookTimeInMinutes <= 60'), 'Medium') .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes < 30'), 'Easy') .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes = ""'), 'Unknown') .otherwise("Unknown")) \ .withColumn('date_of_execution', current_date()) \ .drop('cookTimeInMinutes') \ .drop('prepTimeInMinutes')
def get_co_co_01(spark, scope: list, filter="day"): """ :param spark: SparkSession :param scope: [lower,upper] 日期过滤范围 :param filter: 过滤类别 "day","week","month" default:"day" :return: """ # 获取co_co_01 unique_kind: 90 退货 10 普通订单 pmt_status: 0 未付款 1 收款完成 co_co_01 = spark.sql( "select * from DB2_DB2INST1_CO_CO_01 where (unique_kind = 90 and pmt_status=0) or (unique_kind=10 and pmt_status=1)") \ .where(col("com_id").isin(cities)) \ .where(col("born_date").rlike("\d{8}")) \ .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \ .withColumn("today", f.current_date()) \ .withColumn("qty_sum", col("qty_sum").cast("float")) \ .withColumn("amt_sum", col("amt_sum").cast("float")) lower = scope[0] upper = scope[1] if lower >= 0 and upper >= lower: if filter in ["day", "week", "month"]: if filter == "day": co_co_01 = co_co_01.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \ .where((col("day_diff") >= lower) & (col("day_diff") <= upper)) elif filter == "week": co_co_01=co_co_01.withColumn("week_diff", week_diff("today", "born_date"))\ .where((col("week_diff") >= lower) & (col("week_diff") <= upper)) else: co_co_01 = co_co_01.withColumn("month_diff", month_diff_udf(f.year(col("born_date")), f.month(col("born_date")), f.year(col("today")), f.month(col("today")))) \ .where((col("month_diff") >= lower) & (col("month_diff") <= upper)) else: raise Exception("filter is 'day'/'week'/'month'") else: raise Exception("lower must >=0 and upper must >= lower") return co_co_01
def get_co_co_line(spark, scope: list, filter="day"): """ :param spark: SparkSession :param scope: [lower,upper] 日期过滤范围 :param filter: 过滤类别 "day","week","month" default:"day" :return: """ co_co_line = spark.sql( "select * from DB2_DB2INST1_CO_CO_LINE") \ .where(col("com_id").isin(cities))\ .where(col("born_date").rlike("\d{8}"))\ .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \ .withColumn("today", f.current_date()) \ .withColumn("qty_ord", col("qty_ord").cast("float")) \ .withColumn("price", col("price").cast("float"))\ .withColumn("qty_rsn",col("qty_rsn").cast("float")) lower = scope[0] upper = scope[1] if lower >= 0 and upper >= lower: if filter in ["day", "week", "month"]: if filter == "day": co_co_line = co_co_line.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \ .where((col("day_diff") >= lower) & (col("day_diff") <= upper)) elif filter == "week": co_co_line = co_co_line.withColumn("week_diff", week_diff("today", "born_date")) \ .where((col("week_diff") >= lower) & (col("week_diff") <= upper)) else: co_co_line = co_co_line.withColumn("month_diff", month_diff_udf(f.year(col("born_date")), f.month(col("born_date")), f.year(col("today")), f.month(col("today")))) \ .where((col("month_diff") >= lower) & (col("month_diff") <= upper)) else: raise Exception("filter is 'day'/'week'/'month'") else: raise Exception("lower must >=0 and upper must >= lower") return co_co_line
def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto): df_depara = (df_deparadepto.filter( (df_deparadepto['tabela'] == 'departamento') & (df_deparadepto['campo'] == 'iddepartamento')).select( df_deparadepto['de'], df_deparadepto['para'])) return (df_traffic_by_department.join( df_depara, df_traffic_by_department['iddepartament'] == df_depara['de'], 'left').select(df_traffic_by_department['id'], df_traffic_by_department['createdat'], df_traffic_by_department['reportsuiteid'], df_traffic_by_department['reportsuitename'], df_traffic_by_department['server'], df_traffic_by_department['data'], df_traffic_by_department['hour'], df_traffic_by_department['iddepartament'], f.trim(df_depara['para']).alias('namedepartment'), df_traffic_by_department['visits'], df_traffic_by_department['orders'], df_traffic_by_department['ordersserialized'], df_traffic_by_department['revenue']).withColumn( 'dataprocessamento', f.trunc(f.current_date(), 'month')))
def test_create(self): data = [ (1001, "Jane", "Doe", "2000-05-01", 29.0, False), (1002, "John", "Doe", "1988-05-03", 33.0, False), (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True), (2202, None, None, "1980-05-03", 45.0, False), # Add a few nulls (2235, "", "", "1980-05-03", 43.0, True) ] df = get_spark_session().createDataFrame( data, ["id", "first_name", "last_name", "dob", "age", "is_temp"]) # Ensure that a Hyper file can be created with date and timestamp columns df.withColumn("hire_date", current_date()) df.withColumn("last_updated", current_timestamp()) creator = Creator(df, 'employees', False) hyper_file_path = creator.create() assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper") tables = TestUtils.get_tables( "Extract", "/tmp/hyperleaup/employees/employees.hyper") assert (len(tables) == 1) num_rows = TestUtils.get_row_count( "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper") assert (num_rows == 5)
def main(input_dir, output_dir): spark = SparkSession.builder.getOrCreate() spark.conf.set("spark.sql.session.timeZone", "UTC") df = read_pg_dump(spark, input_dir) # apply some other transformations to obtain the final table result = ( df.withColumn("parts", F.split("table_name", "_")).withColumn( "dimension", F.from_json("dimension", DIMENSION_SCHEMA)). select( F.current_date().alias("ingest_date"), "aggregate_type", "ds_nodash", # parts[:2] form aggregate_type, this is parsed from the filename F.col("parts").getItem(2).alias("channel"), F.col("parts").getItem(3).alias("version"), # parts[-1] is ds_nodash, this is parsed from the filename "dimension.*", from_pg_array_udf("aggregate"), )) # -RECORD 0----------------------------------------------------- # aggregate_type | submission_date # channel | nightly # version | 43 # ds_nodash | 20191201 # os | Windows_NT # child | false # label | # metric | A11Y_INSTANTIATED_FLAG # osVersion | 10.0 # application | Firefox # architecture | x86 # aggregate | "[0, 2, 0, 2, 2]" result.repartition(1).write.parquet(output_dir, mode="overwrite")
jdbc_params = { "url": ut.get_mysql_jdbc_url(app_secret), "lowerBound": "1", "upperBound": "100", "dbtable": src_conf["mysql_conf"]["dbtable"], "numPartitions": "2", "partitionColumn": src_conf["mysql_conf"]["partition_column"], "user": app_secret["mysql_conf"]["username"], "password": app_secret["mysql_conf"]["password"] } txn_df = spark\ .read.format("jdbc")\ .option("driver", "com.mysql.cj.jdbc.Driver")\ .options(**jdbc_params)\ .load() \ .withColumn("ins_dt", functions.current_date()) txn_df.show() txn_df.write \ .mode('overwrite')\ .partitionBy("INS_DT") \ .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/staging/SB") print("\nWriting SB data to S3 <<") elif src == "OL": print("\nReading OL data from sftp >>") txn_df2 = spark.read.format("com.springml.spark.sftp") \ .option("host", app_secret["sftp_conf"]["hostname"]) \ .option("port", app_secret["sftp_conf"]["port"]) \ .option("username", app_secret["sftp_conf"]["username"]) \
"partitionColumn": src_conf["mysql_conf"]["partition_column"], "user": app_secret["mysql_conf"]["username"], "password": app_secret["mysql_conf"]["password"] } # print(jdbcParams) # use the ** operator/un-packer to treat a python dictionary as **kwargs print( "\nReading data from MySQL DB using SparkSession.read.format()," ) txnDF = spark\ .read.format("jdbc")\ .option("driver", "com.mysql.cj.jdbc.Driver")\ .options(**jdbc_params)\ .load()\ .withColumn('ins_date', current_date()) # add the current date to the above df and then you write it txnDF.show() txnDF\ .write.mode("append")\ .partitionBy('ins_date') \ .format("parquet") \ .save(src_loc) elif src == 'OL': # put the code here to pull rewards data from SFTP server and write it to s3 ol_txn_df = spark.read\ .format("com.springml.spark.sftp")\ .option("host", app_secret["sftp_conf"]["hostname"])\ .option("port", app_secret["sftp_conf"]["port"])\
def color_locator(column, color_string): return locate(color_string.upper(), column)\ .cast("boolean")\ .alias("is_" + color_string) selectedColumns = [color_locator(df.Description, c) for c in simpleColors] selectedColumns.append(expr("*")) # has to a be Column type df.select(*selectedColumns).where(expr("is_white OR is_red"))\ .select("Description").show(3, False) # COMMAND ---------- from pyspark.sql.functions import current_date, current_timestamp dateDF = spark.range(10)\ .withColumn("today", current_date())\ .withColumn("now", current_timestamp()) dateDF.createOrReplaceTempView("dateTable") # COMMAND ---------- from pyspark.sql.functions import date_add, date_sub dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1) # COMMAND ---------- from pyspark.sql.functions import datediff, months_between, to_date dateDF.withColumn("week_ago", date_sub(col("today"), 7))\ .select(datediff(col("week_ago"), col("today"))).show(1)
from pyspark.sql import SparkSession from pyspark.sql.functions import col, lit, to_date, current_date, \ current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour if __name__ == '__main__': spark = SparkSession.builder.appName("learning").master( "local").getOrCreate() spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show() spark.read.jdbc spark.range(5)\ .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\ .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show() spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'), to_date(lit('2019-01-05')).alias('date2'))\ .select(datediff(col('date2'), col('date1')), months_between(col('date2'), col('date1'))).show() spark.range(5).withColumn('date', to_date( lit('2019-XX-XX'))).show() #No emite excepcion spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \ .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \ .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \ .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show() spark.range(5).select(current_timestamp().alias("timestamp")).select( hour(col('date')))
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad df.select(initcap(col("Description"))).show(5) df.select(col("Description"), initcap(col("Description")), lower(col("Description")), upper(col("Description"))).show(5) string_with_space = " hello " df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)), trim(lit(string_with_space))).show() #regular expressions #working with dates, timestamps from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp dateDF = spark.range(10).withColumn("today", current_date()).withColumn( "now", current_timestamp()) dateDF.show() dateDF.select( date_add(col("today"), 5).alias("today+5"), date_sub(col("today"), 5).alias("today-5")).show() #convert string to date, default format is 'YYYY-MM-DD' spark.range(1).select( to_date(lit("2019-02-01")).alias("start_date"), to_date(lit("2019-03-06")).alias("end_date")).select( datediff(col("start_date"), col("end_date"))).show() cleanDateDF = spark.range(1).withColumn("date1", current_date()) date_format = 'YYYY-MM-DD'