-
Notifications
You must be signed in to change notification settings - Fork 1
/
heterogeneous_data.py
54 lines (46 loc) · 2.69 KB
/
heterogeneous_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
'''
Created on Jan 13, 2016
@author: hanhanwu
'''
import mock_Chinese_stock_price
import cross_validation
import KNN
def rescale(data_set, scale):
    """Return a copy of ``data_set`` with every input attribute multiplied
    by its per-attribute factor.

    Each row is a dict holding an 'input' sequence and a 'price'. Only the
    first ``len(scale)`` attributes of a row are kept, so a shorter ``scale``
    truncates the inputs and a factor of 0 zeroes an attribute out. The
    'price' value is carried over unchanged and the original rows are not
    modified.
    """
    def _scale_row(row):
        values = row['input']
        return {
            'input': [values[idx] * factor for idx, factor in enumerate(scale)],
            'price': row['price'],
        }

    return [_scale_row(row) for row in data_set]
def normalization(data_set, min_max):
    """Min-max normalize the input attributes of every row.

    Each row is a dict holding an 'input' sequence and a 'price'.
    ``min_max`` is a sequence of ``(minimum, maximum)`` pairs, one per
    attribute; attribute ``i`` becomes
    ``(value - minimum) / (maximum - minimum)``. Only the first
    ``len(min_max)`` attributes of a row are kept. The 'price' value is
    carried over unchanged and the original rows are not modified.

    A pair whose minimum equals its maximum describes a zero-width range;
    such an attribute is mapped to 0.0 instead of raising
    ZeroDivisionError.

    Returns a new list of rows.
    """
    normalized_data = []
    for row in data_set:
        values = row['input']
        normalized_input = []
        for idx, bounds in enumerate(min_max):
            low, high = bounds[0], bounds[1]
            span = high - low
            if span == 0:
                # Zero-width range: the attribute cannot discriminate
                # between rows, so give it a neutral constant value.
                normalized_input.append(0.0)
            else:
                # float() keeps true division under Python 2 when every
                # operand is an int.
                normalized_input.append(float(values[idx] - low) / span)
        normalized_data.append({'input': normalized_input, 'price': row['price']})
    return normalized_data
def _report_cv_errors(data_set):
    """Cross-validate un-weighted and weighted KNN on ``data_set`` and print both errors."""
    # NOTE(review): 'trails' is the keyword spelling cross_validate is called
    # with throughout this file (presumably "trials") -- keep it in sync with
    # that function's signature.
    unweighted_error = cross_validation.cross_validate(data_set, algr=KNN.get_KNN, trails=100)
    weighted_error = cross_validation.cross_validate(data_set, algr=KNN.get_weightedKNN, trails=100)
    # Single-argument print works as both the Python 2 statement and the
    # Python 3 function; '%s %s' reproduces the space the Python 2 statement
    # put between comma-separated items.
    print('%s %s' % ('cross validation, using un-weighted KNN: ', unweighted_error))
    print('%s %s' % ('cross validation, using weighted KNN: ', weighted_error))


def main():
    """Compare KNN cross-validation error on the raw, re-scaled and
    normalized versions of a data set whose attributes have different ranges.
    """
    # Data set whose attributes have different value ranges. Per the original
    # author's notes it includes investment and employee number, whose large
    # values dominate the result unless the data is re-scaled or normalized,
    # drowning out more important attributes that have smaller values.
    heterogeneous_data = mock_Chinese_stock_price.get_stockset_various()

    print('before re-scale/normalization')
    _report_cv_errors(heterogeneous_data)

    print('after re-scale')
    # One factor per input attribute; the trailing 0 zeroes the last attribute.
    scale = [10, 10, 10, 0.00001, 0]
    _report_cv_errors(rescale(heterogeneous_data, scale))

    print('after normalization')
    # (min, max) per attribute. Only four pairs are given, so normalization
    # keeps just the first four attributes of each row.
    min_max = [(1, 10), (1, 20), (1, 50), (10000, 10000000)]
    _report_cv_errors(normalization(heterogeneous_data, min_max))
# Run the comparison only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()