-
Notifications
You must be signed in to change notification settings - Fork 0
/
Time_date.py
78 lines (62 loc) · 3.03 KB
/
Time_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from pyspark.sql.functions import concat_ws,to_utc_timestamp
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import avg,format_number,year,month,hour,minute,dayofmonth,date_sub,date_add,datediff,months_between
spark = SparkSession. \
builder. \
appName('DF DateTime Example'). \
getOrCreate()
df_nycflights = spark.read. \
csv('/user/nycflights13.csv',inferSchema=True,header=True)
'''Concatenating year month and day then hour and minute to make a field date'''
'''We are trying to create timestamp field'''
df_nycflights = df_nycflights. \
withColumn('date',
concat_ws('-',
df_nycflights.year,
df_nycflights.month,
df_nycflights.day)). \
withColumn('time',
concat_ws(':',
df_nycflights.hour,
df_nycflights.minute))
df_nycflights.show()
df_nycflights = df_nycflights. \
withColumn('timestamp',
concat_ws(' ',
df_nycflights.date,
df_nycflights.time))
df_nycflights.show()
'''Next we will convert the StringType to TimeStampType'''
df_nycflights = df_nycflights. \
withColumn('timestamp',
to_utc_timestamp(df_nycflights.timestamp,'GMT'))
df_nycflights.show()
'''Next we make the column timestamp as first column'''
'''Note columns will return all column name'''
df_nycflights = df_nycflights. \
select(df_nycflights.columns[-1:] + df_nycflights.columns[0:-1])
df_nycflights.show()
'''Now we drop year,month,day,hour,minute,date,time columns as we will again try to create these from timestamp column that we created'''
df_nycflights = df_nycflights. \
drop('year'). \
drop('month'). \
drop('day'). \
drop('hour'). \
drop('minute'). \
drop('date'). \
drop('time')
df_nycflights.show()
'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
withColumn('year',year(df_nycflights.timestamp)). \
withColumn('month',month(df_nycflights.timestamp)). \
withColumn('day',dayofmonth(df_nycflights.timestamp)). \
withColumn('hour',hour(df_nycflights.timestamp)). \
withColumn('minute',minute(df_nycflights.timestamp))
df_nycflights.show()
'''Now few operations on timestamp '''
df_nycflights = df_nycflights.\
withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \
withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \
withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp))
df_nycflights.show()