def _LL_lower_bound_check(model, x, lnZ, conv_thres=0.0001, max_iter=100000):
    ''' Computes the log likelihood lower bound for x by approximating h1, h2
        by mean field estimates.

        .. seealso:: AISTATS 2009: Deep Boltzmann Machines
             http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS09_SalakhutdinovH.pdf

    :Parameters:
        model:      The model.
                    -type: Valid DBM model

        x:          Input states.
                    -type: numpy array [batch size, input dim]

        lnZ:        Logarithm of the partition function.
                    -type: float

        conv_thres: Convergence threshold for the mean field approximation.
                    -type: float

        max_iter:   Maximal number of mean field updates if the convergence
                    threshold is not reached.
                    -type: int

    :Returns:
        Log likelihood lower bound for x.
        -type: numpy array [batch size, 1]

    '''
    # Pre-calculate the activation from x since it is constant
    id1 = numx.dot(x - model.o1, model.W1)

    # Allocate arrays for the mean field estimates of h1 and h2
    d3 = numx.zeros((x.shape[0], model.hidden2_dim))
    d2 = numx.zeros((x.shape[0], model.hidden1_dim))

    # Run the mean field estimation until convergence or until the maximal
    # number of iterations is reached
    for i in range(x.shape[0]):
        # Initialize mu3 with its mean
        d3_temp = numx.copy(model.o3)
        d2_temp = 0.0
        d2_new = Sigmoid.f(id1[i, :]
                           + numx.dot(d3_temp - model.o3, model.W2.T)
                           + model.b2)
        d3_new = Sigmoid.f(numx.dot(d2_new - model.o2, model.W2) + model.b3)
        step = 0
        while ((numx.max(numx.abs(d2_new - d2_temp)) > conv_thres
                or numx.max(numx.abs(d3_new - d3_temp)) > conv_thres)
               and step < max_iter):
            step += 1
            d2_temp = d2_new
            d3_temp = d3_new
            d2_new = Sigmoid.f(id1[i, :]
                               + numx.dot(d3_new - model.o3, model.W2.T)
                               + model.b2)
            d3_new = Sigmoid.f(numx.dot(d2_new - model.o2, model.W2) + model.b3)
        d2[i] = numx.clip(d2_new, 0.0000000000000001,
                          0.9999999999999999).reshape(1, model.hidden1_dim)
        d3[i] = numx.clip(d3_new, 0.0000000000000001,
                          0.9999999999999999).reshape(1, model.hidden2_dim)

    # Return the negative energy of the states minus lnZ plus the entropy of
    # the mean field distribution over h1 and h2
    return (-model.energy(x, d2, d3) - lnZ
            - numx.sum(d2 * numx.log(d2) + (1.0 - d2) * numx.log(1.0 - d2),
                       axis=1).reshape(x.shape[0], 1)
            - numx.sum(d3 * numx.log(d3) + (1.0 - d3) * numx.log(1.0 - d3),
                       axis=1).reshape(x.shape[0], 1))
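# Standalone sketch of the mean field fixed point iteration used in
# _LL_lower_bound_check, written with plain numpy on toy, randomly initialized
# parameters (the sizes and values below are illustrative assumptions, not part
# of the library):
#   mu2 <- sigmoid((x - o1) W1 + (mu3 - o3) W2^T + b2)
#   mu3 <- sigmoid((mu2 - o2) W2 + b3)
import numpy as np


def _sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


rng = np.random.RandomState(0)
v_dim, h1_dim, h2_dim = 6, 4, 3                      # toy layer sizes (assumption)
x_toy = rng.randint(0, 2, (1, v_dim)).astype(np.float64)
W1_toy = 0.01 * rng.randn(v_dim, h1_dim)
W2_toy = 0.01 * rng.randn(h1_dim, h2_dim)
b2_toy, b3_toy = np.zeros((1, h1_dim)), np.zeros((1, h2_dim))
o1_toy, o2_toy, o3_toy = (np.full((1, v_dim), 0.5),
                          np.full((1, h1_dim), 0.5),
                          np.full((1, h2_dim), 0.5))

mu3 = np.copy(o3_toy)
for _ in range(100):                                 # bounded number of updates
    mu2 = _sigmoid(np.dot(x_toy - o1_toy, W1_toy)
                   + np.dot(mu3 - o3_toy, W2_toy.T) + b2_toy)
    mu3_new = _sigmoid(np.dot(mu2 - o2_toy, W2_toy) + b3_toy)
    if np.max(np.abs(mu3_new - mu3)) < 1e-4:         # converged
        mu3 = mu3_new
        break
    mu3 = mu3_new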
def sample(self, activation):
    ''' This function samples states from the activation.

    :Parameters:
        activation: Pre- and post-synaptic activation.
                    -type: list len(2) of numpy arrays [batch_size, input dim]

    :Returns:
        Sampled states.
        -type: numpy array [batch_size, input dim]

    '''
    # Add Gaussian noise with standard deviation sqrt(Sigmoid.f(activation[1]))
    # and truncate the result to the interval [0, 1]
    activation3 = numx.maximum(0.0,
                               activation[1]
                               + numx.random.randn(activation[1].shape[0],
                                                   activation[1].shape[1])
                               * numx.sqrt(Sigmoid.f(activation[1])))
    activation3 = numx.minimum(1.0, activation3)
    return activation3
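# Standalone illustration of the sampling rule in sample() above, using plain
# numpy on a toy activation array (the toy values are assumptions, not library
# data): Gaussian noise with standard deviation sqrt(sigmoid(act)) is added to
# the activation and the result is truncated to the interval [0, 1].
import numpy as np


def _sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


rng = np.random.RandomState(0)
act_toy = rng.randn(5, 3)                            # toy activation (assumption)
noisy = act_toy + rng.randn(*act_toy.shape) * np.sqrt(_sigmoid(act_toy))
states = np.minimum(1.0, np.maximum(0.0, noisy))     # truncate to [0, 1]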
def activation(self, bottom_up_states, top_down_states, bottom_up_pre=None,
               top_down_pre=None):
    ''' Calculates the pre- and post-synaptic activation.

    :Parameters:
        bottom_up_states: Activation coming from the previous layer.
                          -type: numpy array [batch_size, input dim]

        top_down_states:  Activation coming from the next layer.
                          -type: numpy array [batch_size, input dim]

        bottom_up_pre:    Pre-activation coming from the previous layer or None.
                          If given, this pre-activation is used to avoid
                          re-calculations.
                          -type: None or numpy array [batch_size, input dim]

        top_down_pre:     Pre-activation coming from the next layer or None.
                          If given, this pre-activation is used to avoid
                          re-calculations.
                          -type: None or numpy array [batch_size, input dim]

    :Returns:
        Post- and pre-synaptic activation for this layer.
        -type: numpy array [batch_size, input dim]

    '''
    pre_act = 0.0
    if self.input_weight_layer is not None:
        if bottom_up_pre is None:
            pre_act += self.input_weight_layer.propagate_up(bottom_up_states)
        else:
            pre_act += bottom_up_pre
    if self.output_weight_layer is not None:
        if top_down_pre is None:
            pre_act += self.output_weight_layer.propagate_down(top_down_states)
        else:
            pre_act += top_down_pre
    pre_act += self.bias
    return Sigmoid.f(pre_act), pre_act
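# Self-contained sketch of how activation() composes the pre- and post-synaptic
# activation, using plain numpy instead of the weight-layer objects (the toy
# matrices W_in_toy / W_out_toy stand in for input_weight_layer.propagate_up and
# output_weight_layer.propagate_down and are assumptions for illustration only).
import numpy as np


def _sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


rng = np.random.RandomState(0)
batch, in_dim, lay_dim, out_dim = 2, 6, 4, 3         # toy sizes (assumption)
W_in_toy = 0.1 * rng.randn(in_dim, lay_dim)
W_out_toy = 0.1 * rng.randn(lay_dim, out_dim)
bias_toy = np.zeros((1, lay_dim))

bottom_up = rng.rand(batch, in_dim)
top_down = rng.rand(batch, out_dim)

pre_act = (np.dot(bottom_up, W_in_toy)               # bottom-up contribution
           + np.dot(top_down, W_out_toy.T)           # top-down contribution
           + bias_toy)
post_act = _sigmoid(pre_act)                         # post-synaptic activation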
def train(self, data, epsilon, k=[3, 1], offset_typ='DDD', meanfield=False):
    ''' Performs one update step on a batch of data. The negative phase draws
        samples via self.sampler from a BinaryBinaryRBM assembled from the DBM
        parameters.

    :Parameters:
        data:       Batch of training data.
                    -type: numpy array [batch_size, input dim]

        epsilon:    Learning rates for W1, W2, b1, b2, b3, o1, o2, o3.
                    -type: list len(8) of floats

        k:          Number of positive phase updates (k[0]) and negative phase
                    sampling steps (k[1]).
                    -type: list len(2) of int

        offset_typ: Offset type per layer, each either 'D' (data mean),
                    'A' (average of data and model mean) or 'M' (model mean).
                    -type: string len(3)

        meanfield:  False for Gibbs sampling in the positive phase, True for
                    k[0] mean field updates, or a float convergence threshold
                    for mean field updates until convergence.
                    -type: bool or float

    '''
    # Positive phase: estimate h1, h2 given the data
    id1 = numx.dot(data - self.model.o1, self.model.W1)
    d3 = numx.copy(self.model.o3)
    d2 = numx.copy(self.model.o2)
    if meanfield == False:
        # Gibbs sampling of the hidden units given the data
        for _ in range(k[0]):
            d2 = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3, self.model.W2.T)
                           + self.model.b2)
            d2 = self.model.dtype(d2 > numx.random.random(d2.shape))
            d3 = Sigmoid.f(numx.dot(d2 - self.model.o2, self.model.W2)
                           + self.model.b3)
            d3 = self.model.dtype(d3 > numx.random.random(d3.shape))
    else:
        if meanfield == True:
            # Fixed number of mean field updates
            for _ in range(k[0]):
                d2 = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3,
                                              self.model.W2.T) + self.model.b2)
                d3 = Sigmoid.f(numx.dot(d2 - self.model.o2, self.model.W2)
                               + self.model.b3)
        else:
            # Mean field updates until the threshold 'meanfield' is reached
            d2_new = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3,
                                              self.model.W2.T) + self.model.b2)
            d3_new = Sigmoid.f(numx.dot(d2_new - self.model.o2, self.model.W2)
                               + self.model.b3)
            while numx.max(numx.abs(d2_new - d2)) > meanfield \
                    or numx.max(numx.abs(d3_new - d3)) > meanfield:
                d2 = d2_new
                d3 = d3_new
                d2_new = Sigmoid.f(id1 + numx.dot(d3_new - self.model.o3,
                                                  self.model.W2.T)
                                   + self.model.b2)
                d3_new = Sigmoid.f(numx.dot(d2_new - self.model.o2,
                                            self.model.W2) + self.model.b3)
            d2 = d2_new
            d3 = d3_new

    # Negative phase: sample from an equivalent RBM built from the DBM parameters
    self.sampler.model = RBM_MODEL.BinaryBinaryRBM(
        number_visibles=self.model.input_dim + self.model.hidden2_dim,
        number_hiddens=self.model.hidden1_dim,
        data=None,
        initial_weights=numx.vstack((self.model.W1, self.model.W2.T)),
        initial_visible_bias=numx.hstack((self.model.b1, self.model.b3)),
        initial_hidden_bias=self.model.b2,
        initial_visible_offsets=numx.hstack((self.model.o1, self.model.o3)),
        initial_hidden_offsets=self.model.o2)

    if isinstance(self.sampler, RBM_SAMPLER.GibbsSampler):
        sample = self.sampler.sample(numx.hstack((data, d3)))
    else:
        sample = self.sampler.sample(self.batch_size, k[1])
    self.m2 = self.sampler.model.probability_h_given_v(sample)
    self.m1 = sample[:, 0:self.model.input_dim]
    self.m3 = sample[:, self.model.input_dim:]

    # Estimate new offsets ('D' = data mean, 'M' = model mean, 'A' = average)
    new_o1 = 0
    if offset_typ[0] == 'D':
        new_o1 = data.mean(axis=0)
    if offset_typ[0] == 'A':
        new_o1 = (self.m1.mean(axis=0) + data.mean(axis=0)) / 2.0
    if offset_typ[0] == 'M':
        new_o1 = self.m1.mean(axis=0)

    new_o2 = 0
    if offset_typ[1] == 'D':
        new_o2 = d2.mean(axis=0)
    if offset_typ[1] == 'A':
        new_o2 = (self.m2.mean(axis=0) + d2.mean(axis=0)) / 2.0
    if offset_typ[1] == 'M':
        new_o2 = self.m2.mean(axis=0)

    new_o3 = 0
    if offset_typ[2] == 'D':
        new_o3 = d3.mean(axis=0)
    if offset_typ[2] == 'A':
        new_o3 = (self.m3.mean(axis=0) + d3.mean(axis=0)) / 2.0
    if offset_typ[2] == 'M':
        new_o3 = self.m3.mean(axis=0)

    # Reparameterize the biases so that shifting the offsets leaves the
    # modelled distribution unchanged
    self.model.b1 += epsilon[6] * numx.dot(new_o2 - self.model.o2,
                                           self.model.W1.T)
    self.model.b2 += epsilon[5] * numx.dot(new_o1 - self.model.o1,
                                           self.model.W1) \
        + epsilon[7] * numx.dot(new_o3 - self.model.o3, self.model.W2.T)
    self.model.b3 += epsilon[6] * numx.dot(new_o2 - self.model.o2,
                                           self.model.W2)

    # Shift the offsets
    self.model.o1 = (1.0 - epsilon[5]) * self.model.o1 + epsilon[5] * new_o1
    self.model.o2 = (1.0 - epsilon[6]) * self.model.o2 + epsilon[6] * new_o2
    self.model.o3 = (1.0 - epsilon[7]) * self.model.o3 + epsilon[7] * new_o3

    # Calculate the centered gradients
    dW1 = (numx.dot((data - self.model.o1).T, d2 - self.model.o2)
           - numx.dot((self.m1 - self.model.o1).T, self.m2 - self.model.o2))
    dW2 = (numx.dot((d2 - self.model.o2).T, d3 - self.model.o3)
           - numx.dot((self.m2 - self.model.o2).T, self.m3 - self.model.o3))
    db1 = numx.sum(data - self.m1, axis=0).reshape(1, self.model.input_dim)
    db2 = numx.sum(d2 - self.m2, axis=0).reshape(1, self.model.hidden1_dim)
    db3 = numx.sum(d3 - self.m3, axis=0).reshape(1, self.model.hidden2_dim)

    # Update the model parameters
    self.model.W1 += epsilon[0] / self.batch_size * dW1
    self.model.W2 += epsilon[1] / self.batch_size * dW2
    self.model.b1 += epsilon[2] / self.batch_size * db1
    self.model.b2 += epsilon[3] / self.batch_size * db2
    self.model.b3 += epsilon[4] / self.batch_size * db3
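# Numerical sketch of the centered gradient computed at the end of train(),
# using toy arrays and plain numpy (all sizes and values are assumptions):
# data-dependent statistics (data, d2, d3) minus model statistics (m1, m2, m3),
# with every term centered around the corresponding offset o1, o2, o3.
import numpy as np

rng = np.random.RandomState(0)
batch, v_dim, h1_dim, h2_dim = 4, 6, 5, 3            # toy sizes (assumption)
data_toy = rng.randint(0, 2, (batch, v_dim)).astype(np.float64)
d2_toy, d3_toy = rng.rand(batch, h1_dim), rng.rand(batch, h2_dim)   # positive phase
m1_toy, m2_toy, m3_toy = (rng.rand(batch, v_dim),
                          rng.rand(batch, h1_dim),
                          rng.rand(batch, h2_dim))                  # negative phase
o1_toy, o2_toy, o3_toy = (data_toy.mean(axis=0),
                          d2_toy.mean(axis=0),
                          d3_toy.mean(axis=0))

dW1_toy = (np.dot((data_toy - o1_toy).T, d2_toy - o2_toy)
           - np.dot((m1_toy - o1_toy).T, m2_toy - o2_toy))
dW2_toy = (np.dot((d2_toy - o2_toy).T, d3_toy - o3_toy)
           - np.dot((m2_toy - o2_toy).T, m3_toy - o3_toy))
db1_toy = np.sum(data_toy - m1_toy, axis=0)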
def train(self, data, epsilon, k=[3, 1], offset_typ='DDD', meanfield=False):
    ''' Performs one update step on a batch of data. The negative phase runs
        k[1] Gibbs sampling steps on the stored model states self.m1, self.m2
        and self.m3.

    :Parameters:
        data:       Batch of training data.
                    -type: numpy array [batch_size, input dim]

        epsilon:    Learning rates for W1, W2, b1, b2, b3, o1, o2, o3.
                    -type: list len(8) of floats

        k:          Number of positive phase updates (k[0]) and negative phase
                    sampling steps (k[1]).
                    -type: list len(2) of int

        offset_typ: Offset type per layer, each either 'D' (data mean),
                    'A' (average of data and model mean) or 'M' (model mean).
                    -type: string len(3)

        meanfield:  False for Gibbs sampling in the positive phase, True for
                    k[0] mean field updates, or a float convergence threshold
                    for mean field updates until convergence.
                    -type: bool or float

    '''
    # Positive phase: estimate h1, h2 given the data
    id1 = numx.dot(data - self.model.o1, self.model.W1)
    d3 = numx.copy(self.model.o3)
    d2 = 0.0
    if meanfield == False:
        # Gibbs sampling of the hidden units given the data
        for _ in range(k[0]):
            d3 = self.model.dtype(d3 > numx.random.random(d3.shape))
            d2 = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3, self.model.W2.T)
                           + self.model.b2)
            d2 = self.model.dtype(d2 > numx.random.random(d2.shape))
            d3 = Sigmoid.f(numx.dot(d2 - self.model.o2, self.model.W2)
                           + self.model.b3)
    else:
        if meanfield == True:
            # Fixed number of mean field updates
            for _ in range(k[0]):
                d2 = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3,
                                              self.model.W2.T) + self.model.b2)
                d3 = Sigmoid.f(numx.dot(d2 - self.model.o2, self.model.W2)
                               + self.model.b3)
        else:
            # Mean field updates until the threshold 'meanfield' is reached
            d2_new = Sigmoid.f(id1 + numx.dot(d3 - self.model.o3,
                                              self.model.W2.T) + self.model.b2)
            d3_new = Sigmoid.f(numx.dot(d2_new - self.model.o2, self.model.W2)
                               + self.model.b3)
            while numx.max(numx.abs(d2_new - d2)) > meanfield \
                    or numx.max(numx.abs(d3_new - d3)) > meanfield:
                d2 = d2_new
                d3 = d3_new
                d2_new = Sigmoid.f(id1 + numx.dot(d3_new - self.model.o3,
                                                  self.model.W2.T)
                                   + self.model.b2)
                d3_new = Sigmoid.f(numx.dot(d2_new - self.model.o2,
                                            self.model.W2) + self.model.b3)
            d2 = d2_new
            d3 = d3_new

    # Negative phase: k[1] Gibbs sampling steps on the model states
    for _ in range(k[1]):
        self.m2 = Sigmoid.f(numx.dot(self.m1 - self.model.o1, self.model.W1)
                            + numx.dot(self.m3 - self.model.o3,
                                       self.model.W2.T)
                            + self.model.b2)
        self.m2 = self.model.dtype(self.m2 > numx.random.random(self.m2.shape))
        self.m1 = Sigmoid.f(numx.dot(self.m2 - self.model.o2, self.model.W1.T)
                            + self.model.b1)
        self.m1 = self.model.dtype(self.m1 > numx.random.random(self.m1.shape))
        self.m3 = Sigmoid.f(numx.dot(self.m2 - self.model.o2, self.model.W2)
                            + self.model.b3)
        self.m3 = self.model.dtype(self.m3 > numx.random.random(self.m3.shape))

    # Estimate new offsets ('D' = data mean, 'M' = model mean, 'A' = average)
    new_o1 = 0
    if offset_typ[0] == 'D':
        new_o1 = data.mean(axis=0)
    if offset_typ[0] == 'A':
        new_o1 = (self.m1.mean(axis=0) + data.mean(axis=0)) / 2.0
    if offset_typ[0] == 'M':
        new_o1 = self.m1.mean(axis=0)

    new_o2 = 0
    if offset_typ[1] == 'D':
        new_o2 = d2.mean(axis=0)
    if offset_typ[1] == 'A':
        new_o2 = (self.m2.mean(axis=0) + d2.mean(axis=0)) / 2.0
    if offset_typ[1] == 'M':
        new_o2 = self.m2.mean(axis=0)

    new_o3 = 0
    if offset_typ[2] == 'D':
        new_o3 = d3.mean(axis=0)
    if offset_typ[2] == 'A':
        new_o3 = (self.m3.mean(axis=0) + d3.mean(axis=0)) / 2.0
    if offset_typ[2] == 'M':
        new_o3 = self.m3.mean(axis=0)

    # Reparameterize the biases so that shifting the offsets leaves the
    # modelled distribution unchanged
    self.model.b1 += epsilon[6] * numx.dot(new_o2 - self.model.o2,
                                           self.model.W1.T)
    self.model.b2 += epsilon[5] * numx.dot(new_o1 - self.model.o1,
                                           self.model.W1) \
        + epsilon[7] * numx.dot(new_o3 - self.model.o3, self.model.W2.T)
    self.model.b3 += epsilon[6] * numx.dot(new_o2 - self.model.o2,
                                           self.model.W2)

    # Shift the offsets
    self.model.o1 = (1.0 - epsilon[5]) * self.model.o1 + epsilon[5] * new_o1
    self.model.o2 = (1.0 - epsilon[6]) * self.model.o2 + epsilon[6] * new_o2
    self.model.o3 = (1.0 - epsilon[7]) * self.model.o3 + epsilon[7] * new_o3

    # Calculate the centered gradients
    dW1 = (numx.dot((data - self.model.o1).T, d2 - self.model.o2)
           - numx.dot((self.m1 - self.model.o1).T, self.m2 - self.model.o2))
    dW2 = (numx.dot((d2 - self.model.o2).T, d3 - self.model.o3)
           - numx.dot((self.m2 - self.model.o2).T, self.m3 - self.model.o3))
    db1 = numx.sum(data - self.m1, axis=0).reshape(1, self.model.input_dim)
    db2 = numx.sum(d2 - self.m2, axis=0).reshape(1, self.model.hidden1_dim)
    db3 = numx.sum(d3 - self.m3, axis=0).reshape(1, self.model.hidden2_dim)

    # Update the model parameters
    self.model.W1 += epsilon[0] / self.batch_size * dW1
    self.model.W2 += epsilon[1] / self.batch_size * dW2
    self.model.b1 += epsilon[2] / self.batch_size * db1
    self.model.b2 += epsilon[3] / self.batch_size * db2
    self.model.b3 += epsilon[4] / self.batch_size * db3
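# Small numpy sketch of the bias re-parameterization used by both trainers
# (toy parameters, all names and sizes are assumptions): when the offset o2 is
# shifted towards new_o2 with rate `shift` (epsilon[6] above), the neighbouring
# biases b1 and b3 are adjusted so that the centered energy still represents
# the same distribution.
import numpy as np

rng = np.random.RandomState(0)
v_dim, h1_dim, h2_dim = 6, 5, 3                      # toy sizes (assumption)
W1_toy = 0.1 * rng.randn(v_dim, h1_dim)
W2_toy = 0.1 * rng.randn(h1_dim, h2_dim)
b1_toy, b3_toy = np.zeros((1, v_dim)), np.zeros((1, h2_dim))
o2_toy = np.full((1, h1_dim), 0.5)
new_o2_toy = rng.rand(1, h1_dim)
shift = 0.01                                         # plays the role of epsilon[6]

b1_toy = b1_toy + shift * np.dot(new_o2_toy - o2_toy, W1_toy.T)  # compensate in b1
b3_toy = b3_toy + shift * np.dot(new_o2_toy - o2_toy, W2_toy)    # compensate in b3
o2_toy = (1.0 - shift) * o2_toy + shift * new_o2_toy             # shift the offset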