def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Transforms "start_date - end_date" periods into one row per covered month,
    keyed by year and month number.

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----+-----+
    |key|year|month|
    +---+----+-----+
    |  5|2018|    1|
    +---+----+-----+
    """
    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("month", F.expr("add_months(start_date, date)")) \
        .withColumn("year", F.year("month")) \
        .withColumn("month", F.month("month")) \
        .drop('start_date', 'end_date', 'monthsDiff', 'repeat', 'val', 'date') \
        .dropDuplicates()
    return transformed_data

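# A minimal usage sketch for get_filtered_by_month (assumes an active SparkSession
# bound to `spark` and the functions import `F`); the sample row mirrors the
# docstring above.
from datetime import date

sample_df = spark.createDataFrame(
    [(5, date(2018, 1, 1), date(2018, 1, 9))],
    ["key", "start_date", "end_date"])

get_filtered_by_month(sample_df).show()
# expected, per the docstring:
# +---+----+-----+
# |key|year|month|
# +---+----+-----+
# |  5|2018|    1|
# +---+----+-----+
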
def _generate_select_expression_for_TimestampMonth(source_column, name):  # noqa: N802
    """
    Used for Anonymizing.
    Can be used to keep the age but obscure the explicit birthday.
    This custom datatype requires a :any:`pyspark.sql.types.TimestampType` column as input.
    The datetime value will be set to the first day of the month.

    Example
    -------
    >>> from pyspark.sql import Row
    >>> from datetime import datetime
    >>> from spooq.transformer import Mapper
    >>>
    >>> input_df = spark.createDataFrame(
    >>>     [Row(birthday=datetime(2019, 2, 9, 2, 45)),
    >>>      Row(birthday=None),
    >>>      Row(birthday=datetime(1988, 1, 31, 8))]
    >>> )
    >>>
    >>> mapping = [("birthday", "birthday", "TimestampMonth")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(birthday=datetime.datetime(2019, 2, 1, 0, 0)),
     Row(birthday=None),
     Row(birthday=datetime.datetime(1988, 1, 1, 0, 0))]
    """
    return F.trunc(source_column, "month").cast(T.TimestampType()).alias(name)

def getGroupedPointOwners():
    if environment == "local":
        pass
    elif environment == "cluster":
        pass
    else:
        raise AssertionError(
            "Bad environment variable (environment = {}). Should be local or cluster."
            .format(environment))

    points_df = spark.read.format("orc").load(files_directory)

    # "period" arrives as epoch milliseconds; turn it into a timestamp string.
    points_df = points_df.withColumn(
        "period",
        from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd HH:mm:ss'))

    # earlier aggregation, kept commented out:
    # points_stats = points_df \
    #     .groupBy(["period_year_month", "organisationid", "customerid", "typeid"]) \
    #     .agg(_sum("qty").alias("total_qty")).orderBy(desc("period_year_month"))

    points_df = points_df.withColumn("qty", points_df["qty"].cast(DoubleType()))

    # Truncate each period to the first day of its month and sum quantities
    # per owner (organisation, customer, type) and month.
    points_df = points_df.withColumn('month', trunc(points_df['period'], 'MM'))
    points_df = points_df.groupby(
        ['organisationid', 'customerid', 'typeid', 'month']).sum('qty')

    # Running total of the monthly sums per owner, ordered by month.
    points_df = points_df.withColumn(
        "cumulativeSum",
        _sum('sum(qty)').over(
            Window.partitionBy(['organisationid', 'customerid', 'typeid'])
            .orderBy('month')))

    # Aggregation date is the first day of the following month.
    points_df = points_df.withColumn('aggdate', add_months(points_df['month'], 1))
    points_df = points_df.withColumn('aggdate_ts', to_timestamp(points_df['aggdate']))
    points_df = points_df.withColumn('aggdate_date', points_df['month'].cast(DateType()))

    # Expose the running total as "qty" and drop the helper columns.
    points_df = points_df.withColumn("qty", points_df["cumulativeSum"])
    points_df = points_df.drop('cumulativeSum', 'sum(qty)', 'month')

    points_df.show(100)
    return points_df

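# A stripped-down sketch of the running-total window used in getGroupedPointOwners,
# on a toy DataFrame; the column names are illustrative only and an active
# SparkSession bound to `spark` is assumed.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

toy_df = spark.createDataFrame(
    [("A", "2021-01-01", 2.0), ("A", "2021-02-01", 3.0), ("B", "2021-01-01", 5.0)],
    ["owner", "month", "monthly_qty"])

# Cumulative sum per owner, ordered by month (the default window frame runs from
# unbounded preceding up to the current row).
toy_df.withColumn(
    "cumulative_qty",
    F.sum("monthly_qty").over(Window.partitionBy("owner").orderBy("month"))
).show()
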
def adobe_traffic_by_channel(df_traffic_by_channel):
    # Keeps the raw traffic-by-channel columns and stamps 'dataprocessamento'
    # with the first day of the current month.
    return (df_traffic_by_channel.select(
        df_traffic_by_channel['id'],
        df_traffic_by_channel['createdat'],
        df_traffic_by_channel['reportsuiteid'],
        df_traffic_by_channel['reportsuitename'],
        df_traffic_by_channel['server'],
        df_traffic_by_channel['data'],
        df_traffic_by_channel['hour'],
        df_traffic_by_channel['channel'],
        df_traffic_by_channel['visits'],
        df_traffic_by_channel['orders'],
        df_traffic_by_channel['ordersserialized'],
        df_traffic_by_channel['revenue']).withColumn(
            'dataprocessamento', f.trunc(f.current_date(), 'month')))

def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto):
    # de/para (from/to) lookup that maps department ids to department names.
    df_depara = (df_deparadepto.filter(
        (df_deparadepto['tabela'] == 'departamento')
        & (df_deparadepto['campo'] == 'iddepartamento')).select(
            df_deparadepto['de'], df_deparadepto['para']))

    return (df_traffic_by_department.join(
        df_depara,
        df_traffic_by_department['iddepartament'] == df_depara['de'],
        'left').select(
            df_traffic_by_department['id'],
            df_traffic_by_department['createdat'],
            df_traffic_by_department['reportsuiteid'],
            df_traffic_by_department['reportsuitename'],
            df_traffic_by_department['server'],
            df_traffic_by_department['data'],
            df_traffic_by_department['hour'],
            df_traffic_by_department['iddepartament'],
            f.trim(df_depara['para']).alias('namedepartment'),
            df_traffic_by_department['visits'],
            df_traffic_by_department['orders'],
            df_traffic_by_department['ordersserialized'],
            df_traffic_by_department['revenue']).withColumn(
                'dataprocessamento', f.trunc(f.current_date(), 'month')))

## sellers who received orders in each of the first six months of 1997

# Spark SQL code
spark.sql("""
    select s.companyName,
           count(distinct trunc(o.orderdate, "month")) as monthcount
    from orders o
    inner join order_details od on o.orderid = od.orderid
    inner join products p       on od.productid = p.productid
    inner join suppliers s      on p.supplierid = s.supplierid
    where o.orderdate between to_date("1997-01-01") and to_date("1997-06-30")
    group by s.companyName
    having monthcount = 6
""").show()

# equivalent Python code using DataFrames
orders = dataframes['orders']
order_details = dataframes['order_details']
products = dataframes['products']
suppliers = dataframes['suppliers']

joined = orders.filter(orders.OrderDate >= '1997-01-01') \
    .filter(orders.OrderDate <= '1997-06-30') \
    .join(order_details, orders.OrderID == order_details.OrderID) \
    .join(products, order_details.ProductID == products.ProductID) \
    .join(suppliers, products.SupplierID == suppliers.SupplierID) \
    .groupBy(suppliers.CompanyName) \
    .agg(F.countDistinct(F.trunc(orders.OrderDate, "month")).alias('monthcount'))

joined.filter(joined.monthcount == 6).show()

max_table_name_key = 0

# set dataframe for inserted records
df_inserted_table_name = s_df_table_name \
    .alias("silver") \
    .join(g_df_table_name.alias("gold"),
          (s_df_table_name.Forecast_Date == g_df_table_name.Forecast_Date) &
          (s_df_table_name.Snapshot_Date == g_df_table_name.Snapshot_Date) &
          (s_df_table_name.End_of_Month == g_df_table_name.End_of_Month) &
          (s_df_table_name.Sub_Region_Code == g_df_table_name.Sub_Region_Code) &
          (s_df_table_name.Snp_Seq_No == g_df_table_name.Snp_Seq_No),
          "left_anti") \
    .where(s_df_table_name.IBIOperationType != "Delete") \
    .selectExpr("NULL as MergeKey1", "NULL as MergeKey2", "NULL as MergeKey3",
                "NULL as MergeKey4", "NULL as MergeKey5",
                "silver.*",
                "silver.IBIUpdatedDate as s_IBIUpdatedDate",
                "silver.IBIUpdatedBy as s_IBIUpdatedBy")

df_inserted_table_name = df_inserted_table_name.withColumn("ValidFrom", trunc("End_of_Month", "month"))
df_inserted_table_name = df_inserted_table_name.withColumn("ValidTo", df_inserted_table_name.End_of_Month)
df_inserted_table_name = df_inserted_table_name.withColumn("IBIChangeReason", lit("New record has been created in source system."))
df_inserted_table_name = df_inserted_table_name.withColumn("IBIOperationType", lit("Insert"))
df_inserted_table_name = df_inserted_table_name.withColumn("IsDeleted", lit("false"))
df_inserted_table_name = df_inserted_table_name.withColumn("IsLatest", lit("true"))

# set dataframe for modified records
df_modified_table_name = s_df_table_name \
    .alias("silver") \
    .join(df_inserted_table_name.alias("inserted"),
          (s_df_table_name.Forecast_Date == df_inserted_table_name.Forecast_Date) &
          (s_df_table_name.Snapshot_Date == df_inserted_table_name.Snapshot_Date) &
          (s_df_table_name.End_of_Month == df_inserted_table_name.End_of_Month) &