-
Notifications
You must be signed in to change notification settings - Fork 0
/
kriging.py
130 lines (99 loc) · 4.36 KB
/
kriging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
from sys import argv, exit
import time,operator
from sklearn import cross_validation
def normalizeDataset(train, test):
    """Standardize the continuous fields of both datasets in place.

    The mean and standard deviation of each field are computed on ``train``
    only and then applied to ``train`` and ``test`` alike, so both sets end
    up on the same scale.

    Args:
        train: Field-indexable container (e.g. a NumPy structured array)
            holding the training rows; overwritten field by field.
        test: Container with the same fields holding the test rows;
            overwritten field by field.
    """
    continuous_fields = ('age', 'account_value', 'withdrawal_rate', 'maturity')
    for field in continuous_fields:
        train_column = train[field]
        # Statistics come from the training column before it is overwritten.
        center = np.mean(train_column)
        spread = np.std(train_column)
        train[field] = normalize(train_column, center, spread)
        test[field] = normalize(test[field], center, spread)
def normalize(arr, mu, sigma):
    """Return ``arr`` shifted by ``mu`` and then divided by ``sigma``.

    Works on scalars and, through NumPy broadcasting, on arrays.
    """
    centered = arr - mu
    return centered / sigma
def _split_features(records):
    """Return the (categorical, numerical) feature matrices of a structured array."""
    categorical = np.column_stack((records['guarantee_type'], records['gender']))
    numerical = np.column_stack((records['age'], records['account_value'],
                                 records['withdrawal_rate'], records['maturity']))
    return categorical, numerical


def _distances_to_point(categorical, numerical, cat_point, num_point, lambd=1):
    """Return the mixed distance from every row to one reference point.

    Numerical fields contribute their squared difference. Categorical fields
    contribute ``lambd`` times the bitwise XOR of their integer codes, which
    is a plain 0/1 mismatch flag only when the codes themselves are 0/1.
    """
    mismatch = lambd * np.sum(np.bitwise_xor(categorical, cat_point), axis=1)
    squared = np.sum((numerical - num_point) ** 2, axis=1)
    return np.sqrt(mismatch + squared)


def train_and_test(X_train, Y_train, X_test, Y_test, dtype, alpha, beta_percentile, cv=True):
    """Fit a kriging-style interpolator on the training rows and score it.

    A (k+1)x(k+1) linear system is built from the kernel
    ``alpha + exp(-3 * d / beta)`` over all training pairs, bordered by a row
    and column of ones with a zero corner. Its solution against
    ``[Y_train, 0]`` is then combined with the train-to-test kernel values
    to predict every test row.

    Args:
        X_train: Structured array of k training rows with the fields
            guarantee_type, gender, age, account_value, withdrawal_rate
            and maturity.
        Y_train: 1-D array of k training targets.
        X_test: Structured array of n test rows with the same fields.
        Y_test: 1-D array of n test targets.
        dtype: Unused; kept so existing callers keep working.
        alpha: Constant added to every kernel value.
        beta_percentile: Multiplier applied to the largest pairwise training
            distance to obtain the kernel range ``beta``.
        cv: When truthy return only the test RMSE; otherwise return the full
            metric tuple.

    Returns:
        ``RMSE`` when ``cv`` is truthy, else ``(APD, RPD, MAD, RMSE)`` where
        MAD and RMSE are divided by ``n + k`` and APD/RPD compare predicted
        and actual totals over test plus training targets.

    Raises:
        ValueError: If ``beta`` is not strictly positive (for example when
            all training rows coincide), which would otherwise yield NaNs.
    """
    k = X_train.shape[0]
    n = X_test.shape[0]

    train_cat, train_num = _split_features(X_train)
    test_cat, test_num = _split_features(X_test)

    # Pairwise distances between training rows, one row at a time so the
    # working memory stays O(k) per step instead of O(k * k * fields).
    train_dist = np.zeros((k, k))
    for i in range(k):
        train_dist[i] = _distances_to_point(train_cat, train_num,
                                            train_cat[i], train_num[i])

    beta = train_dist.max() * beta_percentile
    if not beta > 0:
        raise ValueError("kernel range beta must be positive, got %r" % (beta,))

    # Bordered system: kernel block, ones on the last row/column, zero corner.
    system = np.zeros((k + 1, k + 1))
    system[:k, :k] = alpha + np.exp(-3 * train_dist / beta)
    system[k, :k] = 1
    system[:k, k] = 1

    # Kernel between every training row (matrix rows) and every test row
    # (matrix columns); the last row of ones pairs with the border above.
    cross = np.zeros((k + 1, n))
    cross[k] = 1
    for j in range(k):
        test_dist = _distances_to_point(test_cat, test_num,
                                        train_cat[j], train_num[j])
        cross[j] = alpha + np.exp(-3 * test_dist / beta)

    # Solve the system directly rather than forming an explicit inverse.
    targets = np.append(Y_train, 0)
    coefficients = np.linalg.solve(system, targets)
    Y_hat = np.dot(coefficients, cross)

    residual = Y_test - Y_hat
    if cv:
        return np.sqrt(np.sum(residual ** 2) / n)

    # NOTE(review): the n + k denominator presumably counts the training rows
    # as predicted with zero error -- confirm against the intended metric.
    row_count = n + k
    RMSE = np.sqrt(np.sum(residual ** 2) / row_count)
    MAD = np.sum(np.abs(residual)) / row_count
    train_total = np.sum(Y_train)
    total = np.sum(Y_hat) + train_total
    benchmark = np.sum(Y_test) + train_total
    APD = abs(total - benchmark)
    RPD = APD / benchmark
    return APD, RPD, MAD, RMSE
def crossValidation(X_train_orig, Y_train_orig, dtype):
    """Pick the beta multiplier with the lowest mean 5-fold RMSE.

    Each candidate multiplier is scored by running ``train_and_test`` (with
    ``alpha`` fixed to 0) on every fold and averaging the returned RMSE.
    One "<multiplier>,<mean RMSE>" line is printed per candidate.

    Args:
        X_train_orig: Structured array of training rows to split into folds.
        Y_train_orig: 1-D array of the matching targets.
        dtype: Passed through unchanged to ``train_and_test``.

    Returns:
        The candidate multiplier whose mean RMSE is smallest.
    """
    n = X_train_orig.shape[0]
    n_folds = 5
    beta_percentile_arr = [0.5, 1, 1.5, 2, 2.5, 3]
    cv_result = {}
    np.random.seed(0)
    for beta_percentile in beta_percentile_arr:
        # NOTE(review): the seed is set once above and a new shuffled KFold is
        # built per candidate, so candidates are presumably scored on
        # different splits -- confirm whether that is intended.
        # NOTE(review): sklearn.cross_validation is the legacy API (removed in
        # scikit-learn 0.20 in favour of sklearn.model_selection).
        kf = cross_validation.KFold(n, n_folds=n_folds, shuffle=True)
        rmse_total = 0
        for train_index, test_index in kf:
            X_train, X_test = X_train_orig[train_index], X_train_orig[test_index]
            Y_train, Y_test = Y_train_orig[train_index], Y_train_orig[test_index]
            rmse_total += train_and_test(X_train, Y_train, X_test, Y_test,
                                         dtype, 0, beta_percentile)
        mean_rmse = rmse_total / n_folds
        cv_result[beta_percentile] = mean_rmse
        print(str(beta_percentile)+","+str(mean_rmse)+"\n")
    # min() over the keys works on Python 2 and 3 alike (dict.iteritems() is
    # Python 2 only) and keeps the first-minimum tie-breaking of a stable sort.
    return min(cv_result, key=cv_result.get)
def main():
    """Load the train/test CSVs from a folder and print the selected beta.

    Expects exactly one command-line argument: a folder containing
    ``random_train.csv`` and ``random_test.csv``. Column 0 of each file is
    read as the target and columns 2-7 as the features. The continuous
    features are standardized, then the beta multiplier is chosen by
    cross-validation on the training set and printed.

    Prints "input a folder" and exits with status 1 when the argument count
    is wrong.
    """
    if len(argv) != 2:
        print("input a folder")
        exit(1)
    folder = argv[1]
    trainingDataName = folder + "/random_train.csv"
    testingDataName = folder + "/random_test.csv"
    dtype = [('guarantee_type', '<i4'), ('gender', '<i4'), ('age', '<f8'),
             ('account_value', '<f8'), ('withdrawal_rate', '<f8'), ('maturity', '<f8')]
    feature_cols = [2, 3, 4, 5, 6, 7]
    X_train = np.genfromtxt(trainingDataName, delimiter=',', dtype=dtype, usecols=feature_cols)
    Y_train = np.loadtxt(trainingDataName, delimiter=',', usecols=[0])
    X_test = np.genfromtxt(testingDataName, delimiter=',', dtype=dtype, usecols=feature_cols)
    # Y_test is only consumed by the disabled final evaluation below.
    Y_test = np.loadtxt(testingDataName, delimiter=',', usecols=[0])
    normalizeDataset(X_train, X_test)
    # The former `start_time = time.clock()` was dropped here: its value was
    # never read and time.clock() no longer exists on Python >= 3.8.
    beta = crossValidation(X_train, Y_train, dtype)
    print(str(beta))
    # Final evaluation on the held-out test set, currently disabled:
    #start_time = time.time() #start time of the program
    #APD,RPD,MAD,RMSE = train_and_test(X_train,Y_train,X_test,Y_test,dtype,0,beta,False)
    #print(str(time.time() - start_time))
    #print(str(RPD))
    #output = np.array([APD,RPD,MAD,RMSE])
    #np.savetxt(folder+"/kriging_random_results.csv", output.reshape(1, output.shape[0]),fmt="%12.5f",delimiter=",")
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()