def parse_dates(df, format):
    """
    Split the ``transaction_date`` column into date parts.

    :param df: input dataframe holding a ``transaction_date`` column
    :param format: timestamp pattern handed to ``f.to_timestamp``
    :return: dataframe with ``parsed_date``, ``year``, ``month``, ``day`` and
        ``unix_ts`` added and ``transaction_date`` dropped
    """
    # Parse the raw column once; every derived column reads from the result.
    result = df.withColumn(
        'parsed_date',
        f.to_timestamp(f.col('transaction_date'), format),
    )

    # Kept in this order so the output columns appear as year, month, day,
    # unix_ts after parsed_date.
    derived_columns = (
        ("year", f.year(f.col('parsed_date'))),
        ("month", f.month(f.col('parsed_date'))),
        ("day", f.dayofmonth(f.col('parsed_date'))),
        ("unix_ts", f.unix_timestamp('parsed_date')),
    )
    for column_name, expression in derived_columns:
        result = result.withColumn(column_name, expression)

    return result.drop("transaction_date")
# Shared test helper: builds lazy, eager (timeSeriesRDD touched), self-joined,
# cached and cached-self-joined frames, then calls assert_partition_preserving
# or assert_order_preserving on every frame after each listed transform, and
# finally unpersists the cached frame.
# NOTE(review): this line is collapsed and partly truncated (the body of
# create_dataframe and the third lambda have lost text) - restore it from the
# original file before changing the logic.
def shared_test_partition_preserving(self, func, preserve, create = None): from pyspark.sql.functions import month from tests.test_data import FORECAST_DATA flintContext = self.flintContext def create_dataframe(): return, ["time", "id", "forecast"])) if create is None: create = create_dataframe df_lazy = create() df_eager = create() df_eager.timeSeriesRDD df = create() df_joined = df.leftJoin(df, right_alias="right") df = create() df_cached = df.cache() df_cached.count() df_cached_joined = df_cached.leftJoin(df_cached, right_alias="right") partition_preserving_input_tranforms = [ lambda df: df, lambda df: df.withColumn("f2", df.forecast * 2), lambda df:"time", "id", "forecast"), lambda df: df.filter(month(df.time) == 1) ] order_preserving_input_tranforms = [ lambda df: df.orderBy("time") ] input_dfs = [df_lazy, df_eager, df_joined, df_cached, df_cached_joined] for transform in partition_preserving_input_tranforms: for input_df in input_dfs: self.assert_partition_preserving(transform(input_df), func, preserve) for transform in order_preserving_input_tranforms: for input_df in input_dfs: self.assert_order_preserving(transform(input_df), func, preserve) df_cached.unpersist()
# DataFrame walkthrough: add and rename columns, string helpers, date-part
# extraction, several filter() variants, then a listing sorted by country.
# NOTE(review): the line is collapsed and several calls have lost text (e.g.
# withColumn("Continent",) has no column expression) - confirm against the
# original. Also check df.filter('country'== 'Switzerland'): as written it
# compares two Python string literals instead of a column, and
# isin("'Switzerland'") passes a value that includes the quote characters.
#add new column new_column = df.withColumn("Continent",) #rename column name rename= df.withColumnRenamed("first","first_name") #string manypulations---->>>don't forget to from pyspark.sql import functions'email','@')) #concate string':','country','first')).collect() #extract a perticular year ,date ,time from a column'created_at')).show()'created_at')).show() #filter data df.filter(col('email').contains('')).show() df.filter('country'== 'Switzerland').show() df.filter(col('country').isin("'Switzerland'")).show() df.filter(col('first').like('T%')).show() df.filter(col('id').between(1,10)).show() #some dataframe api'country').sort('country').show()
# Accessor that applies F.month to self._data through column_op and casts the
# result to LongType.
# NOTE(review): the definition is cut off after ".alias(" in this view, so the
# alias argument and anything following it are not shown.
def month(self) -> "ks.Series": """ The month of the timestamp as January = 1 December = 12. """ return column_op(lambda c: F.month(c).cast(LongType()))( self._data).alias(
# Streaming step from tweets.`bronze` to tweets.`silver`: parses the "tweet"
# column as JSON with `schema`, explodes and lower-cases the hashtags, turns
# json.createdAt (divided by 1000) into a timestamp, adds year/month/day and
# appends the result to a Delta table using silverCheckpointPath.
# NOTE(review): the fragment starts part-way through the schema definition, and
# two column names are empty strings (col("") and the first select entry) -
# this looks like lost text; confirm against the original.
StructField("hashTags", ArrayType(StringType()), True), StructField("lang", StringType(), True), StructField("text", StringType(), True), StructField("createdAt", LongType(), True) ]) (spark.readStream.table("tweets.`bronze`").withColumn( "json", from_json(col("tweet"), schema)).filter( col("").isNotNull()).withColumn( "hashtag", explode("json.hashTags")).withColumn( "hashtag", lower(col("hashtag"))).withColumn( "createdAt", (col("json.createdAt").cast(LongType()) / 1000).cast(TimestampType())).withColumn( "year", year(col("createdAt"))).withColumn( "month", month(col("createdAt"))).withColumn( "day", dayofmonth(col("createdAt"))).select( "", "json.user", "hashtag", "json.lang", "json.text", "createdAt", "year", "month", "day").writeStream.format("delta").option( "checkpointLocation", silverCheckpointPath). outputMode("append").queryName(silverStreamName).table("tweets.`silver`")) # COMMAND ---------- # %sql select * from tweets.`silver` order by createdAt desc limit 10; # COMMAND ----------
# Date-function demo on appl_stock.csv: adds a Year column from Date, takes the
# per-year mean and renames "avg(Close)" to "Average Closing Price".
# NOTE(review): the line is collapsed and several call prefixes are missing
# (e.g. the reader call in front of 'appl_stock.csv' and the select/show
# calls). The inline Polish comments read "Working with dates" and
# "Average closing price per year".
from pyspark.sql import SparkSession spark = SparkSession.builder.appName('dates').getOrCreate() df ='appl_stock.csv', header=True, inferSchema=True) print(['Date', 'Open']).show()) # Operowanie na datach from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format)['Date'])).show()['Date'])).show()['Date'])).show() # Średnia zamykająca cena na rok['Date'])).show() newdf = df.withColumn("Year", year(df['Date'])) result = newdf.groupBy("Year").mean().select(["Year", "avg(Close)"]) new = result.withColumnRenamed("avg(Close)", "Average Closing Price") ['Year', format_number('Average Closing Price', 2).alias("Avg Close")]).show()
# Builds a zero-padded year+month partition string (part_date) from a collected
# date, adds a week_day column via date_format(..., 'E'), then splits rows by
# fund_id membership and unions the matching rows with the '2DEC' rows.
# NOTE(review): the line is collapsed; both collect() expressions have lost
# their leading call, and the trailing "def week_day(x):" has no body in this
# view.
## collecting date for partitioning date ='nav_date')).collect()[0][0] year = date.year month = date.month if month < 10: month = '0' + str(month) part_date = str(year) + str(month) ## extracting weekday column using date data = data.withColumn('week_day', f.date_format('nav_date', 'E')) sqlctxt.sql("set hive.exec.dynamic.partition.mode=nonstrict") funds = data.filter(f.month('nav_date') == 4).select('fund_id').distinct() funds.registerTempTable("fund_id") #sqlctxt.sql("insert overwrite table h011gtcsandbox.xnd_pricing_fund_id PARTITION (YYYYMM = " + str(part_date) + ") select * from fund_id") funds ='fund_id')).collect()[0][0] funds = [str(funds) for funds in funds] data_without_funds = data.where(~data.fund_id.isin(funds)) funds_miss = data.filter((data.fund_id == '2DEC')) data_with_funds = data.where(data.fund_id.isin(funds)) data = data_with_funds.unionAll(funds_miss) ## function to convert weekday from string to numerical EX: Monday as 0 so on def week_day(x):
# Transform: keep the temperature columns, parse the timestamp, derive the
# city code, normalise the column names and add the month/year partition
# columns.
transformed_temp_df = (
    cleaned_temp_df
    .select(
        "dt",
        "AverageTemperature",
        "AverageTemperatureUncertainty",
        "City",
        "Country",
        "Latitude",
        "Longitude",
    )
    .withColumn("dt", udf_parse_datetime("dt"))
    .withColumnRenamed("AverageTemperature", "avg_temp")
    .withColumnRenamed("AverageTemperatureUncertainty", "avg_temp_uncertainty")
    .withColumn("city_code", udf_map_country("country"))
    .withColumnRenamed("City", "city")
    .withColumnRenamed("Country", "country")
    .withColumnRenamed("Latitude", "latitude")
    .withColumnRenamed("Longitude", "longitude")
    .withColumnRenamed("dt", "date_time")
    .withColumn('month', month('date_time'))
    .withColumn('year', year('date_time'))
)

# Drop rows whose city_code equals the literal string 'null'.
# NOTE(review): presumably udf_map_country emits that string for unmapped
# inputs - confirm against the UDF.
transformed_temp_df = transformed_temp_df.filter(
    transformed_temp_df["city_code"] != 'null'
)

# Write: append Parquet files partitioned by city code, year and month.
(
    transformed_temp_df.write
    .partitionBy("city_code", "year", "month")
    .mode("append")
    .parquet("{}/transformed/temperature/".format(s3_bucket_name))
)
# Filters df on the year, month and day-of-month of "created_at" against
# day_ini (lower bounds) and day_fin (upper bounds) and returns the result.
# NOTE(review): both dayofmonth comparisons are cut off after the operator in
# this view, so the day bounds cannot be read here.
# NOTE(review): year, month and day are filtered independently, so for a range
# that crosses a month or year boundary (e.g. day_ini in December, row in the
# following February) the month >= day_ini.month test rejects in-range rows -
# confirm whether a single date comparison was intended.
def select_range_time(df,day_ini,day_fin): df_ret=df.filter(year("created_at")>=day_ini.year).filter(month("created_at")>=day_ini.month).filter(dayofmonth("created_at")> df_ret=df_ret.filter(year("created_at")<=day_fin.year).filter(month("created_at")<=day_fin.month).filter(dayofmonth("created_at")< return(df_ret)
# Notebook cells: aggregate min/count of id and avg of date_diff per location,
# write the frame as Parquet partitioned by end_year/end_month (derived from
# end_date), then build a two-row frame and keep rows where col1 IS NOT NULL.
# NOTE(review): the line is collapsed, and display("/tmp/sample_table")) has an
# unbalanced parenthesis - the reader call in front of the path appears to be
# missing.
# There’s an API named agg(*exprs) that takes a list of column names and expressions for the type of aggregation you’d like to compute. # You can leverage the built-in functions that mentioned above as part of the expressions for each column. # Provide the min, count, and avg and groupBy the location column. Diplay the results agg_df = df.groupBy("location").agg(F.min("id"), F.count("id"), F.avg("date_diff")) display(agg_df) # COMMAND ---------- # DBTITLE 1,I’d like to write out the DataFrames to Parquet, but would like to partition on a particular column. # You can use the following APIs to accomplish this. # Ensure the code does not create a large number of partition columns with the datasets otherwise the overhead of the metadata can cause significant slow downs. # If there is a SQL table back by this directory, you will need to call refresh table <table-name> to update the metadata prior to the query. df = df.withColumn('end_month', F.month('end_date')) df = df.withColumn('end_year', F.year('end_date')) df.write.partitionBy("end_year", "end_month").parquet("/tmp/sample_table") display("/tmp/sample_table")) # COMMAND ---------- # DBTITLE 1,How do I properly handle cases where I want to filter out NULL data? null_item_schema = StructType([ StructField("col1", StringType(), True), StructField("col2", IntegerType(), True) ]) null_df = spark.createDataFrame([("test", 1), (None, 2)], null_item_schema) display(null_df.filter("col1 IS NOT NULL")) # COMMAND ----------
# Configures a local Spark session named "SAIDI Calculator", loads user and
# password from config.yaml, reads "pw_dedupe" with the PostgreSQL driver,
# keeps product_id 7008/7009 and shows the distinct core_id count per month of
# "time".
# NOTE(review): the fragment starts mid-statement; conf.set("", "0") has an
# empty key and the jdbc and select calls have lost their prefixes.
# NOTE(review): yaml.load(config) is called without an explicit Loader and the
# open() handle is never closed - consider yaml.safe_load inside a with-block.
os.getenv("HOME") + "/.ivy2/jars/org.postgresql_postgresql-42.1.1.jar") conf.set("spark.executor.extrajavaoptions", "-Xmx15000m") conf.set("spark.executor.memory", "15g") conf.set("spark.driver.memory", "15g") conf.set("", "0") spark = SparkSession.builder \ .config(conf=conf) \ .master("local") \ .appName("SAIDI Calculator") \ .getOrCreate() config = open('config.yaml') config = yaml.load(config) #connect to the database pw_df = "jdbc:postgresql://", "pw_dedupe", properties={ "user": config['user'], "password": config['password'], "driver": "org.postgresql.Driver" }) #read the data that we care about pw_df =['core_id'], pw_df['time'], pw_df['product_id']) pw_df = pw_df.filter("product_id = 7008 OR product_id= 7009") pw_df.groupBy(month("time")).agg(countDistinct('core_id')).show()
def test_month(data_gen):
    """Run ``f.month`` over column ``a`` through ``assert_gpu_and_cpu_are_equal_collect``."""

    def month_of_a(spark):
        # Build the frame from the generator, then project only month(a).
        frame = unary_op_df(spark, data_gen)
        return frame.select(f.month(f.col('a')))

    assert_gpu_and_cpu_are_equal_collect(month_of_a)
# Notebook cells: add a Date column to orders_1, add Month and Year columns,
# then time a groupBy("Year","Month") sum of Total.
# NOTE(review): orders_3 is assigned twice in a row, so the F.month/F.year
# version is overwritten by the getM/getY one - confirm which is intended.
# NOTE(review): the final print uses Python 2 statement syntax and its tuple
# has lost its first element in this view.
# In[30]: # 2. Add a date column orders_2 = orders_1.withColumn('Date',convertToDate(orders_1['OrderDate'])) # In[31]: # In[32]: # 3. Add month and year #orders_3 = orders_2.withColumn('Month',getMonth(orders_2['Date'])).withColumn('Year',getYear(orders_2['Date'])) orders_3 = orders_2.withColumn('Month',F.month(orders_2['Date'])).withColumn('Year',F.year(orders_2['Date'])) orders_3 = orders_2.withColumn('Month',getM(orders_2['Date'])).withColumn('Year',getY(orders_2['Date'])) # In[33]: # In[34]: # 3. How many orders by month/year ? import time start_time = time.time() orders_3.groupBy("Year","Month").sum('Total').show() print "%s Elapsed : %f" % (, time.time() - start_time)
# Adds a ROW_NUMBER-based Id to collectibles, builds time_df from distinct
# non-null timestamps with Year/Month/Day/Hour/Minute/Second columns, and
# inner-joins it to glasses_df on "timestamp" once each for ACC_X, ACC_Y and
# ACC_Z with literal PersonId 1 and SourceId 3.
# NOTE(review): the line is collapsed; the combined_timestamp union chain and
# the start of the time_df expression have lost their select() prefixes.
collectibles_df.createOrReplaceTempView("collectibles") collectibles_df = spark.sql( "SELECT ROW_NUMBER() OVER(ORDER BY Collectible) as Id, * FROM collectibles" ) # Generate dim_Time feed combined_timestamp ="timestamp") \ .union("timestamp")) \ .union("timestamp")) \ .union("timestamp")) time_df ="timestamp") \ .where(col("Timestamp").isNotNull()) \ .distinct() \ .orderBy("timestamp") time_df = time_df.withColumn("Year", year(time_df["timestamp"])) \ .withColumn("Month", month(time_df["timestamp"])) \ .withColumn("Day", dayofmonth(time_df["timestamp"])) \ .withColumn("Hour", hour(time_df["timestamp"])) \ .withColumn("Minute", minute(time_df["timestamp"])) \ .withColumn("Second", second(time_df["timestamp"])) # prepare glasses activities glasses_activities_acc_x = time_df.join(glasses_df, "timestamp", how="inner") \ .select( [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_X").alias("Collectible"), "timestamp", "ACC_X"]) glasses_activities_acc_y = time_df.join(glasses_df, "timestamp", how="inner") \ .select( [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Y").alias("Collectible"), "timestamp", "ACC_Y"]) glasses_activities_acc_z = time_df.join(glasses_df, "timestamp", how="inner") \ .select( [F.lit(1).alias("PersonId"), F.lit(3).alias("SourceId"), F.lit("ACC_Z").alias("Collectible"), "timestamp", "ACC_Z"])
# Tail of a column-derivation chain (eyesopen/mouthopen parsed from the 'data'
# JSON, 'ts' replaced by 'n_ts', year/month added from 'ts'), followed by a
# repartition on 'ts' and an overwrite-mode Parquet write partitioned by
# year/month/smiling with at most 1000 records per file.
# NOTE(review): the fragment starts mid-expression, and
# .withColumn('smiling', \ is missing its value expression in this view.
) \ .withColumn('eyesopen', \ F.from_json( \ F.get_json_object('data', '$.facedetails[*].eyesopen'), \ StructType().add('confidence', DoubleType()).add('value', BooleanType()) \ ) \ ) \ .withColumn('mouthopen', \ F.from_json( \ F.get_json_object('data', '$.facedetails[*].mouthopen'), \ StructType().add('confidence', DoubleType()).add('value', BooleanType()) \ ) \ ) \ .drop('ts') \ .withColumnRenamed('n_ts', 'ts') \ .withColumn('year', F.year('ts')) \ .withColumn('month', F.month('ts')) ## Sometimes we need to distribute the data based on a specific column, higher cardinality is better. ## To see the number of spark partitions being used: df.rdd.getNumPartitions() df = df.repartition('ts') ## Finally write the data back out to S3 in partitioned Parquet format ## maxRecordsPerFile is recommended over the old method of using coalesce() df \ .withColumn('smiling', \ .write \ .option('maxRecordsPerFile', 1000) \ .partitionBy('year', 'month', 'smiling') \ .mode('overwrite') \ .parquet('s3://bucket/prefix')
