/
data_description.py
96 lines (68 loc) · 3.11 KB
/
data_description.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import dask.dataframe as dd
import os
import time

# Everything below uses paths relative to w_dir, so switch into it first.
w_dir = 'D:/BIG DATA'
directory_load = 'Merged Files'
os.chdir(w_dir)

ccys = ['EURUSD', 'EURCHF', 'EURGBP', 'EURJPY', 'EURAUD']
years = range(2003, 2017)  # 2003 through 2016 inclusive

# Text report that accumulates one section per (currency, year); opened in
# append mode, so re-running the script adds to any existing content.
report_path = directory_load + '/' + 'Data Description.txt'

total_start_time = time.time()

# One [ccy, year, describe-table] entry per input file, in processing order.
descriptions = []
for ccy in ccys:
    for year in years:
        start_time = time.time()

        # Input files are named '<ccy>-<year>.h5'; the '*' key is passed
        # through to dask (presumably a wildcard over the keys stored in the
        # file -- confirm against the dask read_hdf documentation).
        df = dd.read_hdf(
            '{}/{}-{}.h5'.format(directory_load, ccy, year), '*')

        # Run the (potentially slow) computation before touching the report
        # file so the file handle is not held open while dask is working.
        description = df.describe().compute()
        descriptions.append([ccy, year, description])

        # Single open per iteration: write the section, then its timing.
        with open(report_path, 'a') as fout:
            fout.write('\n'.join([
                '\n\n=======================================================',
                'Ccy: ' + ccy + ' Year: ' + str(year),
                str(description)]))
            # Measured after the section is written, so the figure covers
            # load + describe + report write for this file.
            elapsed = time.time() - start_time
            fout.write('\nTime Elapsed: ' + str(np.round(elapsed, 2)))

total_elapsed = time.time() - total_start_time
print(total_elapsed)
#%%
# Each entry of `descriptions` is indexed here as entry[0] = currency pair and
# entry[2] = its describe() table. Group the tables by pair once, keeping the
# order in which they appear in `descriptions`.
tables_by_ccy = {
    pair: [entry[2] for entry in descriptions if entry[0] == pair]
    for pair in ccys
}

# Rows of the describe() table used for the min/mean/max series.
stat_rows = ['min', 'mean', 'max']

# counts[i][j]: 'count' of the 'Bid' column for ccys[i], j-th table of that pair.
counts = [
    [table.loc['count', 'Bid'] for table in tables_by_ccy[pair]]
    for pair in ccys
]

# One list per currency pair (by position in `ccys`); every element holds the
# min, mean and max of 'Bid' taken from one describe() table.
min_mean_max_usd = [table.loc[stat_rows, 'Bid'] for table in tables_by_ccy[ccys[0]]]
min_mean_max_chf = [table.loc[stat_rows, 'Bid'] for table in tables_by_ccy[ccys[1]]]
min_mean_max_gbp = [table.loc[stat_rows, 'Bid'] for table in tables_by_ccy[ccys[2]]]
min_mean_max_jpy = [table.loc[stat_rows, 'Bid'] for table in tables_by_ccy[ccys[3]]]
min_mean_max_aud = [table.loc[stat_rows, 'Bid'] for table in tables_by_ccy[ccys[4]]]
#%%
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import numpy as np
# Global matplotlib styling: 18x15 figures with 'xx-large' text for the
# legend, axis labels, titles and tick labels.
params = {'figure.figsize': (18, 15)}
params.update({
    text_setting: 'xx-large'
    for text_setting in (
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    )
})
pylab.rcParams.update(params)

# x-axis shared by every figure below.
plot_years = range(2003, 2017)

# Figure 1: yearly 'count' values, one line per currency pair. `counts` holds
# one row per pair, so it is transposed to put each pair in its own column.
plt.figure()
plt.plot(plot_years, np.array(counts).T)
#plt.title('Data Entries by Currency each Year')
plt.legend(ccys)

# Figures 2-6: min / mean / max per year, one figure per currency pair, in
# the same order as `ccys` (index 0 through 4).
for yearly_stats in (
        min_mean_max_usd,
        min_mean_max_chf,
        min_mean_max_gbp,
        min_mean_max_jpy,
        min_mean_max_aud):
    plt.figure()
    plt.plot(plot_years, np.array(yearly_stats))
    # Title left disabled; it would read
    # 'Min, Mean and Max of <currency pair> each year'.
    plt.legend(['Min', 'Mean', 'Max'])