-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sparse_AutoEncoder.py
116 lines (88 loc) · 4.66 KB
/
Sparse_AutoEncoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
from scipy.optimize import minimize
from sparse_autoencoder import initialize
def kl_divergence(x, y):
    """Element-wise KL divergence between Bernoulli(x) and Bernoulli(y).

    Works on scalars or NumPy arrays; both arguments must lie strictly
    in (0, 1) or np.log will produce inf/nan.
    """
    on_term = x * np.log(x / y)
    off_term = (1.0 - x) * np.log((1.0 - x) / (1.0 - y))
    return on_term + off_term
class SparseAutoEncoder:
    """Single-hidden-layer sparse autoencoder trained with L-BFGS-B.

    The network maps a visible layer of size layout[1] to a hidden layer
    of size layout[0] and back.  The cost minimized is the sum of the
    squared reconstruction error, an L2 weight-decay term (weighted by
    ``lam``) and a KL-divergence sparsity penalty (weighted by ``beta``)
    that pushes each hidden unit's mean activation toward
    ``sparsity_param``.

    NOTE(review): backpropagation hard-codes the sigmoid derivative
    a * (1 - a), so ``ac_func`` is effectively assumed to be the logistic
    sigmoid even though it is passed in as a parameter.
    """

    def __init__(self, ac_func, lam, sparsity_param, beta):
        # layout = (hidden_size, visible_size); filled in by train().
        self.layout = (0, 0)
        self.weights = dict()       # weights[0]: visible->hidden, weights[1]: hidden->visible
        self.bias = dict()          # bias[0]: hidden layer, bias[1]: output layer
        self.activations = dict()   # activations[0]=input, [1]=hidden, [2]=reconstruction
        # Misspelling of "derivatives" kept for backward compatibility with
        # any external reader of this attribute; forward_back stores values
        # here but never reads them (the sigmoid derivative is recomputed
        # inline during backprop).
        self.deriatives = dict()
        self.func = ac_func                   # must return a (activation, derivative) pair
        self.sparsity_param = sparsity_param  # target mean hidden activation rho
        self.lam = lam                        # weight-decay coefficient
        self.beta = beta                      # sparsity-penalty weight
        self.sample_size = 0                  # number of training columns (samples)

    def initialize(self, train, hidden_size):
        """Randomly initialize weights and biases and return them as one
        flat parameter vector theta.

        ``train`` and ``hidden_size`` are accepted for interface
        compatibility but unused: self.layout must already be set, as
        train() does immediately before calling this.
        """
        # Symmetric uniform range r = sqrt(6)/sqrt(fan_in + fan_out + 1),
        # the standard initialization for sigmoid networks.
        r = np.sqrt(6) / np.sqrt(sum(self.layout) + 1)
        self.weights[0] = np.random.random((self.layout[0], self.layout[1])) * 2 * r - r
        self.weights[1] = np.random.random((self.layout[1], self.layout[0])) * 2 * r - r
        self.bias[0] = np.zeros(self.layout[0])
        self.bias[1] = np.zeros(self.layout[1])
        l = self.layout[0] * self.layout[1]
        # theta layout: [W0 | W1 | b0 | b1] -- must match set_theta().
        return np.concatenate((self.weights[0].reshape(l),
                               self.weights[1].reshape(l),
                               self.bias[0],
                               self.bias[1]))

    def set_theta(self, theta):
        """Unpack the flat parameter vector theta into weights and biases
        (inverse of the packing produced by initialize())."""
        l = self.layout[0] * self.layout[1]
        self.weights[0] = theta[0:l].reshape(self.layout[0], self.layout[1])
        self.weights[1] = theta[l:2 * l].reshape(self.layout[1], self.layout[0])
        self.bias[0] = theta[2 * l: 2 * l + self.layout[0]]
        self.bias[1] = theta[2 * l + self.layout[0]:]

    def forward_back(self, theta, debug=False):
        """Forward pass plus backpropagation for parameter vector theta.

        Returns the scalar cost when ``debug`` is True, otherwise the
        (cost, flat_gradient) pair expected by scipy.optimize.minimize
        with jac=True.  self.activations[0] and self.sample_size must be
        set before calling (train() does both).
        """
        self.set_theta(theta)
        # Forward pass; explicit (0, 1) order so layer 1 always consumes
        # layer 0's freshly computed activations.
        for i in (0, 1):
            z = self.weights[i].dot(self.activations[i]) \
                + np.tile(self.bias[i], (self.sample_size, 1)).transpose()
            self.activations[i + 1], self.deriatives[i + 1] = self.func(z)
        # Mean activation of each hidden unit across the batch.
        rho_est = np.sum(self.activations[1], axis=1) / self.sample_size
        rho = np.tile(self.sparsity_param, self.layout[0])
        # Cost = reconstruction error + weight decay + sparsity penalty.
        cost = np.sum((self.activations[2] - self.activations[0]) ** 2) / (2 * self.sample_size) + \
            self.lam * (np.sum(self.weights[0] ** 2) + np.sum(self.weights[1] ** 2)) / 2 + \
            self.beta * np.sum(kl_divergence(rho, rho_est))
        if debug:
            # Cost-only mode for numerical gradient checking; skip the
            # gradient work entirely (the original computed sparsity_delta
            # before this return, wasting it).
            return cost
        # Gradient of the sparsity penalty w.r.t. hidden activations,
        # broadcast across the batch.
        sparsity_delta = np.tile(-rho / rho_est + (1 - rho) / (1 - rho_est),
                                 (self.sample_size, 1)).transpose()
        # Backpropagation; a * (1 - a) is the sigmoid derivative.
        delta3 = -(self.activations[0] - self.activations[2]) \
            * self.activations[2] * (1 - self.activations[2])
        delta2 = (self.weights[1].transpose().dot(delta3) + self.beta * sparsity_delta) \
            * self.activations[1] * (1 - self.activations[1])
        w1_gradient = delta3.dot(self.activations[1].transpose()) / self.sample_size \
            + self.lam * self.weights[1]
        w0_gradient = delta2.dot(self.activations[0].transpose()) / self.sample_size \
            + self.lam * self.weights[0]
        b1_gradient = np.sum(delta3, axis=1) / self.sample_size
        b0_gradient = np.sum(delta2, axis=1) / self.sample_size
        l = self.layout[0] * self.layout[1]
        # Same [W0 | W1 | b0 | b1] layout as theta.
        grad = np.concatenate((w0_gradient.reshape(l), w1_gradient.reshape(l),
                               b0_gradient, b1_gradient))
        return cost, grad

    def train(self, train, hidden_size, debug=True):
        """Fit the autoencoder to ``train`` (features x samples array).

        With debug=True only runs the numerical gradient check and returns
        None; otherwise optimizes with L-BFGS-B, stores the solution on
        the model via set_theta and returns the optimal flat parameter
        vector (the original discarded it after printing).
        """
        self.layout = (hidden_size, train.shape[0])
        self.sample_size = train.shape[1]
        # BUG FIX: the original called the module-level `initialize`
        # imported from sparse_autoencoder, leaving this class's own
        # initializer dead code; use the instance method, which builds
        # theta from the self.layout set just above.
        theta = self.initialize(train, hidden_size)
        self.activations[0] = train
        if debug:
            self.check_gradient(theta)
            return None
        options = {'maxiter': 500, 'disp': True}
        J = lambda x: self.forward_back(theta=x, debug=False)
        result = minimize(J, theta, method='L-BFGS-B', jac=True, options=options)
        opt_theta = result.x
        self.set_theta(opt_theta)  # keep the trained parameters on the model
        print(opt_theta)
        return opt_theta

    def check_gradient(self, theta):
        """Compare the analytical gradient against a central-difference
        numerical estimate and print the relative difference, which should
        be well below 1e-9 for a correct implementation.  Returns the
        difference (the original returned nothing)."""
        grad0 = self.forward_back(theta, debug=False)[1]
        epsilon = 0.0001
        grad1 = np.zeros(theta.shape)
        print(theta.shape[0])
        for i in range(theta.shape[0]):
            # Perturb one component at a time: (J(t+e) - J(t-e)) / 2e.
            theta_epsilon_plus = np.array(theta, dtype=np.float64)
            theta_epsilon_plus[i] = theta[i] + epsilon
            theta_epsilon_minus = np.array(theta, dtype=np.float64)
            theta_epsilon_minus[i] = theta[i] - epsilon
            c1 = self.forward_back(theta_epsilon_plus, debug=True)
            c2 = self.forward_back(theta_epsilon_minus, debug=True)
            grad1[i] = (c1 - c2) / (2 * epsilon)
            if i % 1000 == 0:
                print("Computing gradient for input:", i)
        diff = np.linalg.norm(grad1 - grad0) / np.linalg.norm(grad1 + grad0)
        # BUG FIX: print the explanation before the value (the original
        # printed the "should be < 1e-9" message after the number).
        print("Norm of the difference between numerical and analytical num_grad (should be < 1e-9)\n")
        print(diff)
        return diff