/
DataProcessor.py
91 lines (71 loc) · 3.03 KB
/
DataProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import PeakAnalysis
def load_data(path='data/data.csv'):
    """Load paired inflation and unemployment values from a two-column CSV file.

    Every non-blank line must contain exactly two comma-separated numbers:
    the inflation value followed by the unemployment value. Blank lines
    (for example a trailing empty line at the end of the file) are skipped.

    Args:
        path: Location of the CSV file. Defaults to ``'data/data.csv'``.

    Returns:
        A tuple ``(inflation, unemployment)`` of two lists of floats, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not hold exactly two fields,
            or a field cannot be converted to float.
    """
    inflation = []
    unemployment = []
    # The context manager closes the file even when a line fails to parse.
    with open(path) as f:
        for line in f:
            # A whitespace-only line carries no record; skipping it avoids a
            # spurious unpacking error on trailing newlines.
            if not line.strip():
                continue
            inf, unemp = line.split(',')
            # float() tolerates surrounding whitespace and the line terminator.
            inflation.append(float(inf))
            unemployment.append(float(unemp))
    return inflation, unemployment
def match_peaks(inf, peak_inf, une, peak_une):
    """Pair inflation peaks with unemployment peaks that move the opposite way.

    The direction of the first inflation segment (from the first to the
    second inflation peak) is compared against successive unemployment
    segments. The first unemployment segment that moves in the opposite
    direction marks where matching starts; from there on, inflation peaks
    (taken from the beginning) are paired one-to-one with unemployment peaks.

    Args:
        inf: Inflation series, indexable by the positions in ``peak_inf``.
        peak_inf: Positions of the inflation peaks inside ``inf``.
        une: Unemployment series, indexable by the positions in ``peak_une``.
        peak_une: Positions of the unemployment peaks inside ``une``.

    Returns:
        A list of ``(inflation_peak, unemployment_peak)`` position tuples.
        The list is empty when either series has fewer than two peaks or
        when no unemployment segment moves opposite to the first inflation
        segment. Pairing stops as soon as either peak list is exhausted.
    """
    # A segment needs two peaks; without one there is no direction to compare.
    if len(peak_inf) < 2 or len(peak_une) < 2:
        return []

    is_upward = inf[peak_inf[0]] < inf[peak_inf[1]]

    # Inflation is taken to drive the change in unemployment, so the first
    # inflation segment is used to locate the first opposite unemployment
    # segment. Only len - 1 segments exist, which bounds the scan.
    first_opposite = None
    for i in range(len(peak_une) - 1):
        curr_value = une[peak_une[i]]
        next_value = une[peak_une[i + 1]]
        if is_upward and curr_value > next_value:
            first_opposite = i
            break
        if not is_upward and curr_value < next_value:
            first_opposite = i
            break

    if first_opposite is None:
        return []

    # zip stops at the shorter sequence, so a short peak_inf cannot be
    # indexed out of range.
    return list(zip(peak_inf, peak_une[first_opposite:]))
# peaks: list of integers --> each element is the position of a peak in data
# The gap between consecutive peaks is divided into num_slices (default 20) equal steps, and data is linearly interpolated at each step.
def interpolate_peaks(data, peaks: list, num_slices=20):
    """Resample ``data`` so every peak-to-peak span yields ``num_slices`` samples.

    For each pair of consecutive peaks the span between them is cut into
    ``num_slices`` equal steps. A sample is taken at every step start; when a
    step falls between two integer positions, the value is linearly
    interpolated from the two neighbouring entries of ``data``. The value at
    the final peak is appended once at the end.

    Args:
        data: Indexable sequence of numeric values.
        peaks: Integer positions of peaks inside ``data``.
        num_slices: Number of samples produced per peak-to-peak span. Must be
            non-zero when ``peaks`` holds two or more entries, because the
            span is divided by it.

    Returns:
        A list of ``(len(peaks) - 1) * num_slices + 1`` values for a positive
        ``num_slices``, or an empty list when ``peaks`` is empty.
    """
    # Without peaks there is no final peak value to append.
    if len(peaks) == 0:
        return []

    new_data = []
    for curr_peak, next_peak in zip(peaks, peaks[1:]):
        step = (next_peak - curr_peak) / num_slices
        for s in range(num_slices):
            pos = curr_peak + s * step
            smaller = int(pos)
            if pos == smaller:
                # pos sits exactly on a data position, so no interpolation
                # is needed (smaller and larger would coincide).
                new_data.append(data[smaller])
            else:
                # pos lies strictly between two positions: blend the two
                # neighbours by the fractional offset.
                larger = smaller + 1
                slope = data[larger] - data[smaller]
                new_data.append(data[smaller] + slope * (pos - smaller))
    # The loop emits only the start of each step, so the last peak is added here.
    new_data.append(data[peaks[-1]])
    return new_data
if __name__ == '__main__':
    # Read the two raw series from the default CSV file.
    inflation, unemployment = load_data()

    # Open question from the original author: why not mark the real peaks
    # and remove the ambiguous ones?
    # find_peak returns a (series, peak positions) pair; the second argument
    # is presumably a sensitivity threshold -- TODO confirm in PeakAnalysis.
    mod_inf, peak_inf = PeakAnalysis.find_peak(inflation, 0.6)
    mod_une, peak_une = PeakAnalysis.find_peak(unemployment, 0.7)

    # Pair inflation peaks with unemployment peaks, then split the pairs
    # back into one position list per series.
    peak_pairs = match_peaks(mod_inf, peak_inf, mod_une, peak_une)
    inf_positions = [pair[0] for pair in peak_pairs]
    une_positions = [pair[1] for pair in peak_pairs]

    interpolated_inf = interpolate_peaks(mod_inf, inf_positions)
    interpolated_une = interpolate_peaks(mod_une, une_positions)
    # Both series were resampled over the same number of peaks, so they
    # must line up row for row.
    assert len(interpolated_inf) == len(interpolated_une)

    # Write one "inflation,unemployment" row per resampled point.
    with open('interpolated_data.csv', 'w') as out_file:
        for inf_value, une_value in zip(interpolated_inf, interpolated_une):
            out_file.write(str(inf_value) + ',' + str(une_value) + '\n')