def use_learner(self, example):
    output = np.zeros((self.hidden_size))
    mllin.product_matrix_vector(self.W, example, self.hidden_act)
    self.hidden_act += self.c
    mlnonlin.sigmoid(self.hidden_act, output)
    return [output]
def cost(self, outputs, example):
    hidden = outputs[0]
    mllin.product_matrix_vector(self.W.T, hidden, self.neg_input_act)
    self.neg_input_act += self.b
    mlnonlin.sigmoid(self.neg_input_act, self.neg_input_prob)
    return [np.sum((example - self.neg_input_prob)**2)]
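# Example sketch (hypothetical): what the use_learner/cost pair above
# computes, in plain numpy, assuming mllin.product_matrix_vector(M, v, out)
# stores M.dot(v) in out and mlnonlin.sigmoid(x, out) stores 1/(1+exp(-x))
# in out. All names and values below are made up for illustration.
def _example_sigmoid_reconstruction_cost():
    import numpy as np
    rng = np.random.RandomState(1234)
    W = 0.1 * rng.randn(3, 5)   # hidden_size x input_size
    b = np.zeros(5)             # input (visible) biases
    c = np.zeros(3)             # hidden biases
    example = rng.rand(5)
    hidden = 1. / (1. + np.exp(-(np.dot(W, example) + c)))  # use_learner
    recon = 1. / (1. + np.exp(-(np.dot(W.T, hidden) + b)))  # cost, fprop part
    return np.sum((example - recon)**2)                     # squared error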
def compute_document_representation(self, word_counts_sparse):
    self.input[:] = 0
    self.input[word_counts_sparse[1]] = word_counts_sparse[0]
    output = np.zeros((self.hidden_size,))
    mllin.product_matrix_vector(self.W, self.input, self.hidden_act)
    self.hidden_act += self.c * self.input.sum()
    mlnonlin.sigmoid(self.hidden_act, output)
    return output
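# Example sketch (hypothetical): the (word_counts_sparse[0],
# word_counts_sparse[1]) pair above is a (values, indices) encoding of a
# bag of words; densifying it is a single fancy-indexing assignment.
# Vocabulary size and counts below are made up.
def _example_densify_word_counts():
    import numpy as np
    counts = np.array([2., 1., 3.])   # word_counts_sparse[0]
    indices = np.array([0, 2, 5])     # word_counts_sparse[1]
    dense = np.zeros(6)
    dense[indices] = counts           # same trick as self.input[...] above
    return dense                      # -> [2., 0., 1., 0., 0., 3.]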
def use_learner(self, example):
    self.input[:] = 0
    self.input[example[1]] = example[0]
    output = np.zeros((self.hidden_size))
    mllin.product_matrix_vector(self.W, self.input, self.hidden_act)
    self.hidden_act += self.c * self.input.sum()
    mlnonlin.sigmoid(self.hidden_act, output)
    return [output]
def cost(self, outputs, example):
    hidden = outputs[0]
    self.input[:] = 0
    self.input[example[1]] = example[0]
    mllin.product_matrix_vector(self.W.T, hidden, self.neg_input_act)
    self.neg_input_act += self.b
    mlnonlin.softmax(self.neg_input_act, self.neg_input_prob)
    return [np.sum((self.input - self.input.sum() * self.neg_input_prob)**2)]
def bprop(self, target):
    """
    Computes the loss derivatives with respect to all parameters
    times the current learning rate. It assumes that
    ``self.fprop(input)`` was called first. All the derivatives are
    put in their corresponding object attributes (i.e. ``self.d*``).
    """
    self.doutput_act[:] = self.output
    self.doutput_act[target] -= 1
    self.doutput_act *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.dd[:] = self.doutput_act
    for k in range(self.n_k_means):
        c = self.cluster_indices[k]
        idx = c + k * self.n_clusters
        mllin.outer(self.doutput_act, self.layers[k], self.dVs[idx])
        mllin.product_matrix_vector(self.Vs[idx].T, self.doutput_act, self.dlayers[k])
        if self.activation_function == 'sigmoid':
            mlnonlin.dsigmoid(self.layers[k], self.dlayers[k], self.dlayer_acts[k])
        elif self.activation_function == 'tanh':
            mlnonlin.dtanh(self.layers[k], self.dlayers[k], self.dlayer_acts[k])
        elif self.activation_function == 'reclin':
            mlnonlin.dreclin(self.layers[k], self.dlayers[k], self.dlayer_acts[k])
        else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
        self.dcs[idx][:] = self.dlayer_acts[k]
        mllin.outer(self.dlayer_acts[k], self.input, self.dWs[idx])

    if self.autoencoder_regularization != 0:
        self.dae_doutput_act[:] = self.dae_output
        self.dae_doutput_act[:] -= self.input
        self.dae_doutput_act *= 2 * self.autoencoder_regularization * self.learning_rate / (1. + self.decrease_constant * self.n_updates)
        self.dae_dd[:] = self.dae_doutput_act
        for k in range(self.n_k_means):
            c = self.cluster_indices[k]
            idx = c + k * self.n_clusters
            mllin.outer(self.dae_doutput_act, self.dae_layers[k], self.dae_dWsT[idx])
            self.dWs[idx] += self.dae_dWsT[idx].T
            mllin.product_matrix_vector(self.Ws[idx], self.dae_doutput_act, self.dae_dlayers[k])
            if self.activation_function == 'sigmoid':
                mlnonlin.dsigmoid(self.dae_layers[k], self.dae_dlayers[k], self.dae_dlayer_acts[k])
            elif self.activation_function == 'tanh':
                mlnonlin.dtanh(self.dae_layers[k], self.dae_dlayers[k], self.dae_dlayer_acts[k])
            elif self.activation_function == 'reclin':
                mlnonlin.dreclin(self.dae_layers[k], self.dae_dlayers[k], self.dae_dlayer_acts[k])
            else:
                raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
            self.dcs[idx] += self.dae_dlayer_acts[k]
            mllin.outer(self.dae_dlayer_acts[k], self.dae_input, self.dae_dWs[idx])
            self.dWs[idx] += self.dae_dWs[idx]
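# Example sketch (hypothetical): every gradient above is pre-multiplied by
# the decayed learning rate learning_rate / (1 + decrease_constant *
# n_updates), so step sizes shrink roughly like 1/t. Constants are made up.
def _example_decayed_learning_rate():
    learning_rate = 0.1
    decrease_constant = 1e-3
    return [learning_rate / (1. + decrease_constant * t)
            for t in (0, 1000, 10000)]  # -> [0.1, 0.05, ~0.00909]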
def use_learner(self, example):
    self.input[self.input_order] = example
    output = np.zeros((self.input_size))
    recact = np.zeros((self.input_size))

    # fprop
    mllin.product_matrix_vector(self.W, self.input, recact)
    recact += self.b
    mlnonlin.sigmoid(recact, output)
    return [output, recact]
def update_learner(self, example):
    self.layers[0][:] = example[0]

    # fprop
    for h in range(self.n_hidden_layers):
        mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
        self.layer_acts[h + 1] += self.cs[h]
        mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])
    mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
    self.output_act += self.d
    mlnonlin.softmax(self.output_act, self.output)

    # bprop
    self.doutput_act[:] = self.output
    self.doutput_act[example[1]] -= 1
    self.doutput_act *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.dd[:] = self.doutput_act
    mllin.outer(self.doutput_act, self.layers[-1], self.dU)
    mllin.product_matrix_vector(self.U.T, self.doutput_act, self.dlayers[-1])
    mlnonlin.dsigmoid(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])
    for h in range(self.n_hidden_layers - 1, -1, -1):
        self.dcs[h][:] = self.dlayer_acts[h + 1]
        mllin.outer(self.dlayer_acts[h + 1], self.layers[h], self.dWs[h])
        mllin.product_matrix_vector(self.Ws[h].T, self.dlayer_acts[h + 1], self.dlayers[h])
        mlnonlin.dsigmoid(self.layers[h], self.dlayers[h], self.dlayer_acts[h])

    # update
    self.U -= self.dU
    self.d -= self.dd
    for h in range(self.n_hidden_layers - 1, -1, -1):
        self.Ws[h] -= self.dWs[h]
        self.cs[h] -= self.dcs[h]
    self.n_updates += 1
def use_learner(self, example):
    output = np.zeros((self.n_classes))
    self.layers[0][:] = example[0]

    # fprop
    for h in range(self.n_hidden_layers):
        mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
        self.layer_acts[h + 1] += self.cs[h]
        mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])
    mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
    self.output_act += self.d
    mlnonlin.softmax(self.output_act, output)
    return [output.argmax(), output]
def update_learner(self, example):
    self.layers[0][:] = example[0]

    # fprop
    for h in range(self.n_hidden_layers):
        mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
        self.layer_acts[h + 1] += self.cs[h]
        if self.activation_function == 'sigmoid':
            mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])
        elif self.activation_function == 'tanh':
            mlnonlin.tanh(self.layer_acts[h + 1], self.layers[h + 1])
        elif self.activation_function == 'reclin':
            mlnonlin.reclin(self.layer_acts[h + 1], self.layers[h + 1])
        else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
    mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
    self.output_act += self.d
    mlnonlin.softmax(self.output_act, self.output)

    # bprop
    self.doutput_act[:] = self.output
    self.doutput_act[example[1]] -= 1
    self.doutput_act *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.dd[:] = self.doutput_act
    mllin.outer(self.doutput_act, self.layers[-1], self.dU)
    mllin.product_matrix_vector(self.U.T, self.doutput_act, self.dlayers[-1])
    if self.activation_function == 'sigmoid':
        mlnonlin.dsigmoid(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])
    elif self.activation_function == 'tanh':
        mlnonlin.dtanh(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])
    elif self.activation_function == 'reclin':
        mlnonlin.dreclin(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])
    else:
        raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
    for h in range(self.n_hidden_layers - 1, -1, -1):
        self.dcs[h][:] = self.dlayer_acts[h + 1]
        mllin.outer(self.dlayer_acts[h + 1], self.layers[h], self.dWs[h])
        mllin.product_matrix_vector(self.Ws[h].T, self.dlayer_acts[h + 1], self.dlayers[h])
        if self.activation_function == 'sigmoid':
            mlnonlin.dsigmoid(self.layers[h], self.dlayers[h], self.dlayer_acts[h])
        elif self.activation_function == 'tanh':
            mlnonlin.dtanh(self.layers[h], self.dlayers[h], self.dlayer_acts[h])
        elif self.activation_function == 'reclin':
            mlnonlin.dreclin(self.layers[h], self.dlayers[h], self.dlayer_acts[h])
        else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')

    # update
    self.U -= self.dU
    self.d -= self.dd
    for h in range(self.n_hidden_layers - 1, -1, -1):
        self.Ws[h] -= self.dWs[h]
        self.cs[h] -= self.dcs[h]
    self.n_updates += 1
def fprop(self, input):
    """
    Computes the output given some input. Puts the result in ``self.output``.
    """
    self.input[:] = input
    self.output_act[:] = self.d
    for k in range(self.n_k_means):
        if self.n_k_means_inputs == self.input_size:
            c = self.clusterings[k].compute_cluster(self.input)
        else:
            c = self.clusterings[k].compute_cluster(self.input[self.k_means_subset_inputs[k]])
        idx = c + k * self.n_clusters
        self.cluster_indices[k] = c
        mllin.product_matrix_vector(self.Ws[idx], self.input, self.layer_acts[k])
        self.layer_acts[k] += self.cs[idx]
        if self.activation_function == 'sigmoid':
            mlnonlin.sigmoid(self.layer_acts[k], self.layers[k])
        elif self.activation_function == 'tanh':
            mlnonlin.tanh(self.layer_acts[k], self.layers[k])
        elif self.activation_function == 'reclin':
            mlnonlin.reclin(self.layer_acts[k], self.layers[k])
        else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
        mllin.product_matrix_vector(self.Vs[idx], self.layers[k], self.output_acts[k])
        self.output_act += self.output_acts[k]
    mlnonlin.softmax(self.output_act, self.output)

    if self.autoencoder_regularization != 0:
        self.dae_input[:] = input
        self.rng.shuffle(self.input_idx)
        self.dae_input[self.input_idx[:int(self.autoencoder_missing_fraction * self.input_size)]] = 0
        self.dae_output_act[:] = self.dae_d
        for k in range(self.n_k_means):
            idx = self.cluster_indices[k] + k * self.n_clusters
            mllin.product_matrix_vector(self.Ws[idx], self.dae_input, self.dae_layer_acts[k])
            self.dae_layer_acts[k] += self.cs[idx]
            if self.activation_function == 'sigmoid':
                mlnonlin.sigmoid(self.dae_layer_acts[k], self.dae_layers[k])
            elif self.activation_function == 'tanh':
                mlnonlin.tanh(self.dae_layer_acts[k], self.dae_layers[k])
            elif self.activation_function == 'reclin':
                mlnonlin.reclin(self.dae_layer_acts[k], self.dae_layers[k])
            else:
                raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
            mllin.product_matrix_vector(self.Ws[idx].T, self.dae_layers[k], self.dae_output_acts[k])
            self.dae_output_act += self.dae_output_acts[k]
        self.dae_output[:] = self.dae_output_act
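# Example sketch (hypothetical): the repeated if/elif chains over
# activation_function above could be collapsed into a dispatch table chosen
# once. The stand-in functions below mimic the (input, output) calling
# convention of mlnonlin.sigmoid/tanh/reclin; they are not the mlnonlin
# implementations themselves.
def _example_activation_dispatch():
    import numpy as np

    def sigmoid(x, out):
        out[:] = 1. / (1. + np.exp(-x))

    def tanh(x, out):
        out[:] = np.tanh(x)

    def reclin(x, out):
        out[:] = np.maximum(0, x)

    activations = {'sigmoid': sigmoid, 'tanh': tanh, 'reclin': reclin}
    x = np.array([-1., 0., 1.])
    out = np.zeros(3)
    activations['reclin'](x, out)  # replaces one if/elif chain
    return out                     # -> [0., 0., 1.]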
def update_learner(self, example):
    self.input[self.input_order] = example

    # fprop
    mllin.product_matrix_vector(self.W, self.input, self.recact)
    self.recact += self.b
    mlnonlin.sigmoid(self.recact, self.rec)

    # bprop
    np.subtract(self.rec, self.input, self.drec)
    self.db[:] = self.drec
    mllin.outer(self.drec, self.input, self.dW)
    self.dW *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.db *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.W -= self.dW
    self.b -= self.db
    self.W.ravel()[self.utri_index] = 0  # Setting the upper triangle back to 0
    self.n_updates += 1
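# Example sketch (hypothetical): self.utri_index above presumably holds the
# flat indices of W's upper triangle (diagonal included), so zeroing them
# after each update keeps the model autoregressive: unit i only receives
# input from units that precede it in input_order. One way to build and
# apply such an index:
def _example_upper_triangle_mask():
    import numpy as np
    d = 4
    W = np.ones((d, d))
    utri_index = np.ravel_multi_index(np.triu_indices(d), (d, d))
    W.ravel()[utri_index] = 0  # strictly lower-triangular connectivity
    return W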
def use_learner(self, example):
    output = np.zeros((self.n_classes))
    self.layers[0][:] = example[0]

    # fprop
    for h in range(self.n_hidden_layers):
        mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
        self.layer_acts[h + 1] += self.cs[h]
        if self.activation_function == 'sigmoid':
            mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])
        elif self.activation_function == 'tanh':
            mlnonlin.tanh(self.layer_acts[h + 1], self.layers[h + 1])
        elif self.activation_function == 'reclin':
            mlnonlin.reclin(self.layer_acts[h + 1], self.layers[h + 1])
        else:
            raise ValueError('activation_function must be either \'sigmoid\', \'tanh\' or \'reclin\'')
    mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
    self.output_act += self.d
    mlnonlin.softmax(self.output_act, output)
    return [output.argmax(), output]
def update_learner(self, example):
    self.input[:] = 0
    self.input[example[1]] = example[0]
    n_words = int(self.input.sum())

    # Performing CD-k
    mllin.product_matrix_vector(self.W, self.input, self.hidden_act)
    self.hidden_act += self.c * n_words
    mlnonlin.sigmoid(self.hidden_act, self.hidden_prob)
    self.neg_hidden_prob[:] = self.hidden_prob
    for k in range(self.k_contrastive_divergence_steps):
        if self.mean_field:
            self.hidden[:] = self.neg_hidden_prob
        else:
            np.less(self.rng.rand(self.hidden_size), self.neg_hidden_prob, self.hidden)
        mllin.product_matrix_vector(self.W.T, self.hidden, self.neg_input_act)
        self.neg_input_act += self.b
        mlnonlin.softmax(self.neg_input_act, self.neg_input_prob)
        if self.mean_field:
            self.neg_input[:] = n_words * self.neg_input_prob
        else:
            self.neg_input[:] = self.rng.multinomial(n_words, self.neg_input_prob)
        mllin.product_matrix_vector(self.W, self.neg_input, self.neg_hidden_act)
        self.neg_hidden_act += self.c * n_words
        mlnonlin.sigmoid(self.neg_hidden_act, self.neg_hidden_prob)

    mllin.outer(self.hidden_prob, self.input, self.deltaW)
    mllin.outer(self.neg_hidden_prob, self.neg_input, self.neg_stats)
    self.deltaW -= self.neg_stats
    np.subtract(self.input, self.neg_input, self.deltab)
    np.subtract(self.hidden_prob, self.neg_hidden_prob, self.deltac)

    self.deltaW *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.deltab *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.deltac *= n_words * self.learning_rate / (1. + self.decrease_constant * self.n_updates)

    self.W += self.deltaW
    self.b += self.deltab
    self.c += self.deltac
    self.n_updates += 1
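# Example sketch (hypothetical): in the CD-k loop above the visible layer
# is a multinomial over the vocabulary, so the negative phase redraws
# n_words words from neg_input_prob (or uses their expectation
# n_words * neg_input_prob under mean-field). Numbers below are made up.
def _example_multinomial_negative_phase():
    import numpy as np
    rng = np.random.RandomState(1234)
    neg_input_prob = np.array([0.5, 0.3, 0.2])
    n_words = 10
    sample = rng.multinomial(n_words, neg_input_prob)  # stochastic visible
    mean_field = n_words * neg_input_prob              # deterministic visible
    return sample, mean_field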
def update_learner(self, example):
    self.input[:] = example

    # Performing CD-1
    mllin.product_matrix_vector(self.W, self.input, self.hidden_act)
    self.hidden_act += self.c
    mlnonlin.sigmoid(self.hidden_act, self.hidden_prob)
    np.less(self.rng.rand(self.hidden_size), self.hidden_prob, self.hidden)
    mllin.product_matrix_vector(self.W.T, self.hidden, self.neg_input_act)
    self.neg_input_act += self.b
    mlnonlin.sigmoid(self.neg_input_act, self.neg_input_prob)
    np.less(self.rng.rand(self.input_size), self.neg_input_prob, self.neg_input)
    mllin.product_matrix_vector(self.W, self.neg_input, self.neg_hidden_act)
    self.neg_hidden_act += self.c
    mlnonlin.sigmoid(self.neg_hidden_act, self.neg_hidden_prob)

    mllin.outer(self.hidden_prob, self.input, self.deltaW)
    mllin.outer(self.neg_hidden_prob, self.neg_input, self.neg_stats)
    self.deltaW -= self.neg_stats
    np.subtract(self.input, self.neg_input, self.deltab)
    np.subtract(self.hidden_prob, self.neg_hidden_prob, self.deltac)

    self.deltaW *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.deltab *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.deltac *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)

    self.W += self.deltaW
    self.b += self.deltab
    self.c += self.deltac

    if self.l1_regularization > 0:
        self.W *= (np.abs(self.W) > (self.l1_regularization * self.learning_rate / (1. + self.decrease_constant * self.n_updates)))
    self.n_updates += 1
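# Example sketch (hypothetical): a compact numpy-only restatement of the
# CD-1 update above for a binary RBM, with np.dot/np.outer standing in for
# the mllin routines. All sizes and values are made up; W is
# hidden_size x input_size as in the code above.
def _example_cd1_update(lr=0.05):
    import numpy as np
    rng = np.random.RandomState(1234)
    W = 0.1 * rng.randn(3, 5)
    b, c = np.zeros(5), np.zeros(3)
    v0 = (rng.rand(5) < 0.5).astype(float)                    # training case
    h_prob = 1. / (1. + np.exp(-(np.dot(W, v0) + c)))         # positive phase
    h = (rng.rand(3) < h_prob).astype(float)
    v_prob = 1. / (1. + np.exp(-(np.dot(W.T, h) + b)))
    v1 = (rng.rand(5) < v_prob).astype(float)                 # negative sample
    h1_prob = 1. / (1. + np.exp(-(np.dot(W, v1) + c)))
    W += lr * (np.outer(h_prob, v0) - np.outer(h1_prob, v1))  # CD-1 step
    b += lr * (v0 - v1)
    c += lr * (h_prob - h1_prob)
    return W, b, c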
def cond_probs(self, y_set, gamma_set):
    """
    Given the set of gamma variables, outputs the set of probabilities
    p(y_t | y_{t-1}, ... , y_1, gamma_{t-1}, ... , gamma_1).
    """
    # Note (HUGO): this function should probably be implemented in C
    # to make it much faster, since it requires for loops.

    # Setting variables with friendlier names
    d_y = self.input_size
    d_z = self.latent_size
    A = self.A
    C = self.C
    Sigma = self.Sigma
    E = self.E

    cond_probs = []
    map_probs = []
    laplace_probs = []
    y_pred = []
    z_n_z_n_post_sum = zeros((d_z, d_z))

    # Temporary variables, to avoid memory allocation
    vec_d_z = zeros(d_z)
    vec_d_z2 = zeros(d_z)
    vec_d_y = zeros(d_y)
    mat_d_z_d_z = zeros((d_z, d_z))
    mat_d_z_d_z2 = zeros((d_z, d_z))
    eye_d_z = eye(d_z)
    mat_times_C_trans = zeros((d_z, d_y))
    pred = zeros(d_y)
    cov_pred = zeros((d_y, d_y))
    A_gamma = zeros((d_z, d_z))
    E_gamma = zeros((d_z, d_z))
    K = zeros((d_z, d_y))
    KC = zeros((d_z, d_z))
    J = zeros((d_z, d_z))
    A_times_prev_mu = zeros(d_z)
    Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
    Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
    Af_d_z_d_z = zeros((d_z, d_z), order='fortran')  # math.linalg.solve(...)
    Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
    pivots_d_y = zeros((d_y), dtype='i', order='fortran')
    pivots_d_z = zeros((d_z), dtype='i', order='fortran')
    z_n_z_n_post = zeros((d_z, d_z))
    next_z_n_z_n_post = zeros((d_z, d_z))
    log_det_diff2_log_gamma = 0

    for y_t, gamma_t in zip(y_set, gamma_set):
        T = len(y_t)
        cond_probs_t = zeros(T)
        map_probs_t = zeros(T)
        laplace_probs_t = zeros(T)
        y_pred_t = zeros((T, d_y))
        mu_kalman_t = zeros((T, d_z))      # Filtering mus
        E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
        mu_post_t = zeros((T, d_z))
        E_post_t = zeros((T, d_z, d_z))
        P_t = zeros((T - 1, d_z, d_z))

        # Forward pass

        # Initialization at n = 0
        A_times_prev_mu[:] = 0
        multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
        pred[:] = 0
        product_matrix_matrix(C, mat_times_C_trans, cov_pred)
        cov_pred += Sigma
        solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
        vec_d_y[:] = y_t[0]
        vec_d_y -= pred
        product_matrix_vector(K, vec_d_y, mu_kalman_t[0])
        product_matrix_matrix(K, C, KC)
        mat_d_z_d_z[:] = eye_d_z
        mat_d_z_d_z -= KC
        multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])
        cond_probs_t[0] = self.multivariate_norm_log_pdf(y_t[0], pred, cov_pred)
        y_pred_t[0] = pred

        # from n=1 to T-1
        for n in xrange(T - 1):
            divide(1., E, vec_d_z)
            divide(1., gamma_t[n + 1], vec_d_z2)
            vec_d_z += vec_d_z2
            divide(1., vec_d_z, vec_d_z2)
            setdiag(E_gamma, vec_d_z2)
            divide(E, gamma_t[n + 1], vec_d_z)
            vec_d_z += 1
            divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

            P_tn = P_t[n]
            product_matrix_matrix(E_kalman_t[n], A_gamma.T, mat_d_z_d_z)
            product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
            P_tn += E_gamma
            product_matrix_vector(A_gamma, mu_kalman_t[n], A_times_prev_mu)
            product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
            product_matrix_vector(C, A_times_prev_mu, pred)
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
            vec_d_y[:] = y_t[n + 1]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
            mu_kalman_t[n + 1] += A_times_prev_mu
            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
            # To ensure symmetry
            E_kalman_t[n + 1] = mat_d_z_d_z2
            E_kalman_t[n + 1] += mat_d_z_d_z2.T
            E_kalman_t[n + 1] /= 2

            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]

            # Compute last step statistics
            outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]

            # Update cumulative statistics
            z_n_z_n_post_sum += z_n_z_n_post

            cond_probs_t[n + 1] = self.multivariate_norm_log_pdf(y_t[n + 1], pred, cov_pred)
            y_pred_t[n + 1] = pred

        # Backward pass
        pred[:] = 0
        cov_pred[:] = 0
        for n in xrange(T - 2, -1, -1):
            next_z_n_z_n_post[:] = z_n_z_n_post
            divide(E, gamma_t[n + 1], vec_d_z)
            vec_d_z += 1
            divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)
            P_tn = P_t[n]
            solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z, pivots_d_z)
            product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
            product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)
            vec_d_z *= -1
            vec_d_z += mu_post_t[n + 1]
            product_matrix_vector(J, vec_d_z, mu_post_t[n])
            mu_post_t[n] += mu_kalman_t[n]
            mat_d_z_d_z[:] = E_post_t[n + 1]
            mat_d_z_d_z -= P_tn
            product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
            product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
            # To ensure symmetry
            E_post_t[n] = E_kalman_t[n]
            E_post_t[n] += mat_d_z_d_z
            E_post_t[n] += E_kalman_t[n].T
            E_post_t[n] += mat_d_z_d_z.T
            E_post_t[n] /= 2

            outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
            z_n_z_n_post += E_post_t[n]

            dummy = self.compute_gamma(A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
            log_prior_gamma = self.log_prior_gamma(gamma_t[n + 1])
            log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[n + 1])
            log_det_diff2_log_gamma = self.log_det_diff2_log_gamma(A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])
            map_probs_t[n + 1] = cond_probs_t[n + 1] + log_prior_gamma
            laplace_probs_t[n + 1] = cond_probs_t[n + 1] + log_prior_log_gamma + d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma

        gamma_t[0] = (diag(z_n_z_n_post) + 2 * self.gamma_prior_beta) / (2 * self.gamma_prior_alpha + 3)
        log_prior_gamma = self.log_prior_gamma(gamma_t[0])
        log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[0])
        log_det_diff2_log_gamma = sum((z_n_z_n_post / 2 + self.gamma_prior_beta) / gamma_t[0])
        map_probs_t[0] = cond_probs_t[0] + log_prior_gamma
        laplace_probs_t[0] = cond_probs_t[0] + log_prior_log_gamma + d_z * log(2 * pi) / 2 - 0.5 * log_det_diff2_log_gamma

        cond_probs += [cond_probs_t]
        map_probs += [map_probs_t]
        laplace_probs += [laplace_probs_t]
        y_pred += [y_pred_t]

    return cond_probs, map_probs, laplace_probs, y_pred
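# Example sketch (hypothetical): cond_probs relies on
# self.multivariate_norm_log_pdf(y, mean, cov). A standalone version of
# that quantity, log N(y; m, S) =
# -0.5 * (d*log(2*pi) + log|S| + (y-m)^T S^-1 (y-m)):
def _example_multivariate_norm_log_pdf(y, mean, cov):
    import numpy as np
    d = len(y)
    diff = y - mean
    _, logdet = np.linalg.slogdet(cov)
    maha = np.dot(diff, np.linalg.solve(cov, diff))
    return -0.5 * (d * np.log(2 * np.pi) + logdet + maha)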
def EM_step(self, y_set, gamma_set, training=False, return_mu_post=False):
    """
    Computes the posterior statistics and outputs the M step estimates
    of the parameters. Also outputs the non-parametric, sparsity
    inducing variances gamma_t. Optionally, can output the posterior
    means of the latent state variables.
    """
    # Setting variables with friendlier names
    d_y = self.input_size
    d_z = self.latent_size
    A = self.A
    C = self.C
    Sigma = self.Sigma
    E = self.E

    # Variables for estimating new parameters
    A_new = zeros((d_z, d_z))
    C_new = zeros((d_y, d_z))
    z_n_z_n_1_post_sum = zeros((d_z, d_z))
    z_n_z_n_post_sum = zeros((d_z, d_z))
    A_new_denums = zeros((d_z, d_z, d_z))
    y_n_z_n_post_sum = zeros((d_y, d_z))

    # Temporary variables, to avoid memory allocation
    vec_d_z = zeros(d_z)
    vec_d_z2 = zeros(d_z)
    vec_d_y = zeros(d_y)
    mat_d_z_d_z = zeros((d_z, d_z))
    mat_d_z_d_z2 = zeros((d_z, d_z))
    eye_d_z = eye(d_z)
    mat_times_C_trans = zeros((d_z, d_y))
    pred = zeros(d_y)
    cov_pred = zeros((d_y, d_y))
    A_gamma = zeros((d_z, d_z))
    E_gamma = zeros((d_z, d_z))
    K = zeros((d_z, d_y))
    KC = zeros((d_z, d_z))
    J = zeros((d_z, d_z))
    A_times_prev_mu = zeros(d_z)
    Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
    Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
    Af_d_z_d_z = zeros((d_z, d_z), order='fortran')  # math.linalg.solve(...)
    Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
    pivots_d_y = zeros((d_y), dtype='i', order='fortran')
    pivots_d_z = zeros((d_z), dtype='i', order='fortran')
    z_n_z_n_1_post = zeros((d_z, d_z))
    z_n_z_n_post = zeros((d_z, d_z))
    weighted_z_n_z_n_post = zeros((d_z, d_z, d_z))
    next_z_n_z_n_post = zeros((d_z, d_z))
    y_n_z_n_post = zeros((d_y, d_z))

    if training == True:
        max_Esteps = self.max_Esteps
        last_Esteps = self.last_Esteps
    else:
        max_Esteps = self.max_test_Esteps
        last_Esteps = self.max_test_Esteps

    Esteps = 0
    have_A_denum = False
    get_A_denum = False
    finished = False
    while not finished:
        T_sum = 0
        gamma_mean_diff = 0
        z_n_z_n_1_post_sum[:] = 0
        z_n_z_n_post_sum[:] = 0
        y_n_z_n_post_sum[:] = 0
        A_new_denums[:] = 0

        Esteps += 1
        if Esteps == max_Esteps:
            get_A_denum = True
            finished = True
        elif Esteps >= last_Esteps:
            get_A_denum = True

        if return_mu_post:
            mu_post = []

        for y_t, gamma_t in zip(y_set, gamma_set):
            T = len(y_t)
            T_sum += T
            mu_kalman_t = zeros((T, d_z))      # Filtering mus
            E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
            mu_post_t = zeros((T, d_z))
            E_post_t = zeros((T, d_z, d_z))
            P_t = zeros((T - 1, d_z, d_z))

            # Forward pass

            # Initialization at n = 0
            A_times_prev_mu[:] = 0
            multiply(C.T, reshape(gamma_t[0], (-1, 1)), mat_times_C_trans)
            pred[:] = 0
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
            vec_d_y[:] = y_t[0]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[0])
            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            multiply(mat_d_z_d_z, gamma_t[0], E_kalman_t[0])

            # from n=1 to T-1
            for n in xrange(T - 1):
                divide(1., E, vec_d_z)
                divide(1., gamma_t[n + 1], vec_d_z2)
                vec_d_z += vec_d_z2
                divide(1., vec_d_z, vec_d_z2)
                setdiag(E_gamma, vec_d_z2)
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)

                P_tn = P_t[n]
                product_matrix_matrix(E_kalman_t[n], A_gamma.T, mat_d_z_d_z)
                product_matrix_matrix(A_gamma, mat_d_z_d_z, P_tn)
                P_tn += E_gamma
                product_matrix_vector(A_gamma, mu_kalman_t[n], A_times_prev_mu)
                product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
                product_matrix_vector(C, A_times_prev_mu, pred)
                product_matrix_matrix(C, mat_times_C_trans, cov_pred)
                cov_pred += Sigma
                solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
                vec_d_y[:] = y_t[n + 1]
                vec_d_y -= pred
                product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
                mu_kalman_t[n + 1] += A_times_prev_mu
                product_matrix_matrix(K, C, KC)
                mat_d_z_d_z[:] = eye_d_z
                mat_d_z_d_z -= KC
                product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
                # To ensure symmetry
                E_kalman_t[n + 1] = mat_d_z_d_z2
                E_kalman_t[n + 1] += mat_d_z_d_z2.T
                E_kalman_t[n + 1] /= 2

            mu_post_t[-1] = mu_kalman_t[-1]
            E_post_t[-1] = E_kalman_t[-1]

            # Compute last step statistics
            outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
            z_n_z_n_post += E_post_t[-1]
            outer(y_t[-1], mu_post_t[-1], y_n_z_n_post)

            # Update cumulative statistics
            z_n_z_n_post_sum += z_n_z_n_post
            y_n_z_n_post_sum += y_n_z_n_post

            # Backward pass
            pred[:] = 0
            cov_pred[:] = 0
            for n in xrange(T - 2, -1, -1):
                next_z_n_z_n_post[:] = z_n_z_n_post
                divide(E, gamma_t[n + 1], vec_d_z)
                vec_d_z += 1
                divide(A, reshape(vec_d_z, (-1, 1)), A_gamma)
                P_tn = P_t[n]
                solve(P_tn.T, A_gamma, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z, pivots_d_z)
                product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
                product_matrix_vector(A_gamma, mu_kalman_t[n], vec_d_z)
                vec_d_z *= -1
                vec_d_z += mu_post_t[n + 1]
                product_matrix_vector(J, vec_d_z, mu_post_t[n])
                mu_post_t[n] += mu_kalman_t[n]
                mat_d_z_d_z[:] = E_post_t[n + 1]
                mat_d_z_d_z -= P_tn
                product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
                product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
                # To ensure symmetry
                E_post_t[n] = E_kalman_t[n]
                E_post_t[n] += mat_d_z_d_z
                E_post_t[n] += E_kalman_t[n].T
                E_post_t[n] += mat_d_z_d_z.T
                E_post_t[n] /= 2

                # Compute posterior statistics
                product_matrix_matrix(J, E_post_t[n + 1], z_n_z_n_1_post)
                outer(mu_post_t[n + 1], mu_post_t[n], mat_d_z_d_z)
                z_n_z_n_1_post += mat_d_z_d_z
                outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
                z_n_z_n_post += E_post_t[n]
                outer(y_t[n], mu_post_t[n], y_n_z_n_post)

                # Update cumulative statistics
                z_n_z_n_1_post_sum += z_n_z_n_1_post
                z_n_z_n_post_sum += z_n_z_n_post
                y_n_z_n_post_sum += y_n_z_n_post

                gamma_mean_diff += self.compute_gamma(A, E, z_n_z_n_post, next_z_n_z_n_post, gamma_t[n + 1])

                if get_A_denum == True:
                    # Compute the denominator of the A update,
                    # which requires d_z matrices of size (d_z,d_z)
                    # (i.e. d_z different weighted sums of the
                    # z_n_z_n_post matrices)
                    add(gamma_t[n + 1], E, vec_d_z)
                    divide(gamma_t[n + 1], vec_d_z, vec_d_z2)
                    multiply(reshape(z_n_z_n_post, (1, d_z, d_z)),
                             reshape(vec_d_z2, (d_z, 1, 1)),
                             weighted_z_n_z_n_post)
                    A_new_denums += weighted_z_n_z_n_post
                    have_A_denum = True

            new_gamma = (diag(z_n_z_n_post) + 2 * self.gamma_prior_beta) / (2 * self.gamma_prior_alpha + 3)
            gamma_mean_diff += sum((gamma_t[0] - new_gamma)**2) / d_z
            gamma_t[0] = new_gamma

        gamma_mean_diff /= T_sum
        if gamma_mean_diff < self.gamma_change_tolerance:
            if training == True:
                if have_A_denum == True:
                    finished = True
                    self.last_Esteps = Esteps
                else:
                    get_A_denum = True
            else:
                finished = True
        elif gamma_mean_diff <= 10 * self.gamma_change_tolerance and training == True:
            get_A_denum = True

        if self.verbose:
            print gamma_mean_diff, max_Esteps, Esteps

        if return_mu_post:
            mu_post += [mu_post_t]

    # Compute the M step estimates of the parameters
    if training == True:
        for i in xrange(d_z):
            solve(A_new_denums[i] + eye(d_z) * self.latent_transition_matrix_regularizer,
                  z_n_z_n_1_post_sum[i:(i + 1)].T, A_new[i:(i + 1)].T)
        solve(z_n_z_n_post_sum + eye_d_z * self.emission_matrix_regularizer,
              y_n_z_n_post_sum.T, C_new.T)

    if return_mu_post:
        return (A_new, C_new), gamma_set, mu_post
    else:
        return (A_new, C_new), gamma_set
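# Example sketch (hypothetical): the gamma_t[0] update above,
# (diag(stats) + 2*beta) / (2*alpha + 3), appears to be the closed-form
# stationary point of an inverse-gamma(alpha, beta) prior combined with the
# posterior second moments of z_0. A toy evaluation with made-up numbers,
# where alpha/beta play the role of gamma_prior_alpha/gamma_prior_beta:
def _example_gamma_map_update():
    import numpy as np
    stats_diag = np.array([4.0, 1.0, 0.25])  # diag of z_n_z_n_post
    alpha, beta = 1.0, 0.5
    return (stats_diag + 2 * beta) / (2 * alpha + 3)  # -> [1., 0.4, 0.25]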
def EM_step(self, y_set, return_mu_post=False):
    """
    Computes the posterior statistics and outputs the M step estimates
    of the parameters. The set of probabilities
    p(y_t | y_{t-1}, ... , y_1) is also given.
    """
    # Setting variables with friendlier names
    d_y = self.input_size
    d_z = self.latent_size
    mu_zero = self.mu_zero
    V_zero = self.V_zero
    A = self.A
    C = self.C
    Sigma = self.Sigma
    E = self.E

    # Variables for estimating new parameters
    A_new = zeros((d_z, d_z))
    C_new = zeros((d_y, d_z))
    E_new = zeros((d_z, d_z))
    Sigma_new = zeros((d_y, d_y))
    mu_zero_new = zeros((d_z))
    V_zero_new = zeros((d_z, d_z))
    z_n_z_n_1_post_sum = zeros((d_z, d_z))
    z_n_z_n_post_sum = zeros((d_z, d_z))
    z_n_z_n_post_sum_no_last = zeros((d_z, d_z))
    z_n_z_n_post_sum_no_first = zeros((d_z, d_z))
    z_n_z_n_post_sum_first = zeros((d_z, d_z))
    outer_z_n_z_n_post_sum_first = zeros((d_z, d_z))
    z_n_post_sum_first = zeros((d_z))
    y_n_z_n_post_sum = zeros((d_y, d_z))
    y_n_y_n_sum = zeros((d_y, d_y))
    cond_probs = []

    # Temporary variables, to avoid memory allocation
    vec_d_z = zeros(d_z)
    vec_d_y = zeros(d_y)
    mat_d_z_d_z = zeros((d_z, d_z))
    mat_d_z_d_z2 = zeros((d_z, d_z))
    eye_d_z = eye(d_z)
    mat_times_C_trans = zeros((d_z, d_y))
    pred = zeros(d_y)
    cov_pred = zeros((d_y, d_y))
    K = zeros((d_z, d_y))
    KC = zeros((d_z, d_z))
    J = zeros((d_z, d_z))
    A_times_prev_mu = zeros(d_z)
    Af_d_y_d_y = zeros((d_y, d_y), order='fortran')  # Temporary variables
    Bf_d_y_d_z = zeros((d_y, d_z), order='fortran')  # for calls to
    Af_d_z_d_z = zeros((d_z, d_z), order='fortran')  # math.linalg.solve(...)
    Bf_d_z_d_z = zeros((d_z, d_z), order='fortran')
    pivots_d_y = zeros((d_y), dtype='i', order='fortran')
    pivots_d_z = zeros((d_z), dtype='i', order='fortran')
    z_n_z_n_1_post = zeros((d_z, d_z))
    z_n_z_n_post = zeros((d_z, d_z))
    y_n_z_n_post = zeros((d_y, d_z))
    y_n_y_n = zeros((d_y, d_y))

    T_sum = 0
    if return_mu_post:
        mu_post = []

    for y_t in y_set:
        T = len(y_t)
        T_sum += T
        mu_kalman_t = zeros((T, d_z))      # Filtering mus
        E_kalman_t = zeros((T, d_z, d_z))  # Filtering Es
        mu_post_t = zeros((T, d_z))        # Posterior mus (could be removed and computed once)
        E_post_t = zeros((T, d_z, d_z))    # Posterior Es (could be removed and computed once)
        P_t = zeros((T - 1, d_z, d_z))
        cond_probs_t = zeros(T)

        # Forward pass

        # Initialization at n = 0
        A_times_prev_mu[:] = 0
        product_matrix_matrix(V_zero, C.T, mat_times_C_trans)
        product_matrix_vector(C, mu_zero, pred)
        product_matrix_matrix(C, mat_times_C_trans, cov_pred)
        cov_pred += Sigma
        solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
        vec_d_y[:] = y_t[0]
        vec_d_y -= pred
        product_matrix_vector(K, vec_d_y, mu_kalman_t[0])
        mu_kalman_t[0] += mu_zero
        product_matrix_matrix(K, C, KC)
        mat_d_z_d_z[:] = eye_d_z
        mat_d_z_d_z -= KC
        product_matrix_matrix(mat_d_z_d_z, V_zero, E_kalman_t[0])
        cond_probs_t[0] = self.multivariate_norm_log_pdf(y_t[0], pred, cov_pred)

        # from n=1 to T-1
        for n in xrange(T - 1):
            P_tn = P_t[n]
            product_matrix_matrix(E_kalman_t[n], A.T, mat_d_z_d_z)
            product_matrix_matrix(A, mat_d_z_d_z, P_tn)
            P_tn += E
            product_matrix_vector(A, mu_kalman_t[n], A_times_prev_mu)
            product_matrix_matrix(P_tn, C.T, mat_times_C_trans)
            product_matrix_vector(C, A_times_prev_mu, pred)
            product_matrix_matrix(C, mat_times_C_trans, cov_pred)
            cov_pred += Sigma
            solve(cov_pred, mat_times_C_trans.T, K.T, Af_d_y_d_y, Bf_d_y_d_z, pivots_d_y)
            vec_d_y[:] = y_t[n + 1]
            vec_d_y -= pred
            product_matrix_vector(K, vec_d_y, mu_kalman_t[n + 1])
            mu_kalman_t[n + 1] += A_times_prev_mu
            product_matrix_matrix(K, C, KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            product_matrix_matrix(mat_d_z_d_z, P_tn, mat_d_z_d_z2)
            # To ensure symmetry
            E_kalman_t[n + 1] = mat_d_z_d_z2
            E_kalman_t[n + 1] += mat_d_z_d_z2.T
            E_kalman_t[n + 1] /= 2

            cond_probs_t[n + 1] = self.multivariate_norm_log_pdf(y_t[n + 1], pred, cov_pred)

        mu_post_t[-1] = mu_kalman_t[-1]
        E_post_t[-1] = E_kalman_t[-1]

        # Compute last step statistics
        outer(mu_post_t[-1], mu_post_t[-1], z_n_z_n_post)
        z_n_z_n_post += E_post_t[-1]
        outer(y_t[-1], mu_post_t[-1], y_n_z_n_post)
        outer(y_t[-1], y_t[-1], y_n_y_n)

        # Update cumulative statistics
        z_n_z_n_post_sum += z_n_z_n_post
        z_n_z_n_post_sum_no_first += z_n_z_n_post
        y_n_z_n_post_sum += y_n_z_n_post
        y_n_y_n_sum += y_n_y_n

        # Backward pass
        pred[:] = 0
        cov_pred[:] = 0
        for n in xrange(T - 2, -1, -1):
            P_tn = P_t[n]
            solve(P_tn.T, A, mat_d_z_d_z, Af_d_z_d_z, Bf_d_z_d_z, pivots_d_z)
            product_matrix_matrix(E_kalman_t[n], mat_d_z_d_z.T, J)
            product_matrix_vector(A, mu_kalman_t[n], vec_d_z)
            vec_d_z *= -1
            vec_d_z += mu_post_t[n + 1]
            product_matrix_vector(J, vec_d_z, mu_post_t[n])
            mu_post_t[n] += mu_kalman_t[n]
            mat_d_z_d_z[:] = E_post_t[n + 1]
            mat_d_z_d_z -= P_tn
            product_matrix_matrix(mat_d_z_d_z, J.T, mat_d_z_d_z2)
            product_matrix_matrix(J, mat_d_z_d_z2, mat_d_z_d_z)
            # To ensure symmetry
            E_post_t[n] = E_kalman_t[n]
            E_post_t[n] += mat_d_z_d_z
            E_post_t[n] += E_kalman_t[n].T
            E_post_t[n] += mat_d_z_d_z.T
            E_post_t[n] /= 2

            # Compute posterior statistics
            product_matrix_matrix(J, E_post_t[n + 1], z_n_z_n_1_post)
            outer(mu_post_t[n + 1], mu_post_t[n], mat_d_z_d_z)
            z_n_z_n_1_post += mat_d_z_d_z
            outer(mu_post_t[n], mu_post_t[n], z_n_z_n_post)
            z_n_z_n_post += E_post_t[n]
            outer(y_t[n], mu_post_t[n], y_n_z_n_post)
            outer(y_t[n], y_t[n], y_n_y_n)

            # Update cumulative statistics
            z_n_z_n_1_post_sum += z_n_z_n_1_post
            z_n_z_n_post_sum += z_n_z_n_post
            if n > 0:
                z_n_z_n_post_sum_no_first += z_n_z_n_post
            else:
                z_n_z_n_post_sum_first += z_n_z_n_post
                z_n_post_sum_first += mu_post_t[n]
                outer(mu_post_t[n], mu_post_t[n], mat_d_z_d_z)
                outer_z_n_z_n_post_sum_first += mat_d_z_d_z
            z_n_z_n_post_sum_no_last += z_n_z_n_post
            y_n_z_n_post_sum += y_n_z_n_post
            y_n_y_n_sum += y_n_y_n

        cond_probs += [cond_probs_t]
        if return_mu_post:
            mu_post += [mu_post_t]

    # Compute the M step estimates of the parameters
    # A_new = dot(z_n_z_n_1_post_sum, inv(z_n_z_n_post_sum_no_last +
    #             eye_d_z*self.latent_transition_matrix_regularizer))
    solve(z_n_z_n_post_sum_no_last + eye_d_z * self.latent_transition_matrix_regularizer,
          z_n_z_n_1_post_sum.T, A_new.T)
    # C_new = dot(y_n_z_n_post_sum, inv(z_n_z_n_post_sum +
    #             eye_d_z*self.input_transition_matrix_regularizer))
    solve(z_n_z_n_post_sum + eye_d_z * self.input_transition_matrix_regularizer,
          y_n_z_n_post_sum.T, C_new.T)

    E_new[:] = z_n_z_n_post_sum_no_first
    z_n_z_n_1_A_T = dot(z_n_z_n_1_post_sum, A_new.T)
    E_new -= z_n_z_n_1_A_T.T
    E_new -= z_n_z_n_1_A_T
    # There is an error in Bishop's equation: the transpose on A is missing
    E_new += dot(A_new, dot(z_n_z_n_post_sum_no_last, A_new.T))
    E_new += eye_d_z * self.latent_covariance_matrix_regularizer
    E_new /= T_sum - len(y_set)

    Sigma_new[:] = y_n_y_n_sum
    C_z_n_y_n = dot(C_new, y_n_z_n_post_sum.T)
    Sigma_new -= C_z_n_y_n
    Sigma_new -= C_z_n_y_n.T
    # There is an error in Bishop's equation: the transpose on C is missing
    Sigma_new += dot(C_new, dot(z_n_z_n_post_sum, C_new.T))  # ... idem
    Sigma_new += eye(d_y) * self.input_covariance_matrix_regularizer
    Sigma_new /= T_sum

    mu_zero_new[:] = z_n_post_sum_first
    mu_zero_new /= len(y_set)
    V_zero_new[:] = z_n_z_n_post_sum_first
    V_zero_new -= outer_z_n_z_n_post_sum_first
    V_zero_new /= len(y_set)

    if return_mu_post:
        return (A_new, C_new, E_new, Sigma_new, mu_zero_new, V_zero_new), cond_probs, mu_post
    else:
        return (A_new, C_new, E_new, Sigma_new, mu_zero_new, V_zero_new), cond_probs
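# Example sketch (hypothetical): the forward passes above are a Kalman
# filter written with preallocated buffers and in-place BLAS-style calls.
# A readable numpy restatement of one predict/update step, with A, C, E,
# Sigma as in the code (latent transition, emission, latent and observation
# noise covariances):
def _example_kalman_step(mu, V, y, A, C, E, Sigma):
    import numpy as np
    # predict
    mu_pred = np.dot(A, mu)
    P = np.dot(A, np.dot(V, A.T)) + E
    # update
    S = np.dot(C, np.dot(P, C.T)) + Sigma       # innovation covariance
    K = np.linalg.solve(S.T, np.dot(C, P.T)).T  # Kalman gain P C^T S^-1
    mu_new = mu_pred + np.dot(K, y - np.dot(C, mu_pred))
    V_new = np.dot(np.eye(len(mu)) - np.dot(K, C), P)
    return mu_new, 0.5 * (V_new + V_new.T)      # symmetrize, as above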
def cond_probs(self,y_set,gamma_set):
    """
    Given the set of gamma variables, outputs the set of probabilities
    p(y_t | y_{t-1}, ... , y_1, gamma_{t-1}, ... , gamma_1).
    """
    # Note (HUGO): this function should probably be implemented in C
    # to make it much faster, since it requires for loops.

    # Setting variables with friendlier names
    d_y = self.input_size
    d_z = self.latent_size
    A = self.A
    C = self.C
    Sigma = self.Sigma
    E = self.E

    cond_probs = []
    map_probs = []
    laplace_probs = []
    y_pred = []
    z_n_z_n_post_sum = zeros((d_z,d_z))

    # Temporary variables, to avoid memory allocation
    vec_d_z = zeros(d_z)
    vec_d_z2 = zeros(d_z)
    vec_d_y = zeros(d_y)
    mat_d_z_d_z = zeros((d_z,d_z))
    mat_d_z_d_z2 = zeros((d_z,d_z))
    eye_d_z = eye(d_z)
    mat_times_C_trans = zeros((d_z,d_y))
    pred = zeros(d_y)
    cov_pred = zeros((d_y,d_y))
    A_gamma = zeros((d_z,d_z))
    E_gamma = zeros((d_z,d_z))
    K = zeros((d_z,d_y))
    KC = zeros((d_z,d_z))
    J = zeros((d_z,d_z))
    A_times_prev_mu = zeros(d_z)
    Af_d_y_d_y = zeros((d_y,d_y),order='fortran')  # Temporary variables
    Bf_d_y_d_z = zeros((d_y,d_z),order='fortran')  # for calls to
    Af_d_z_d_z = zeros((d_z,d_z),order='fortran')  # math.linalg.solve(...)
    Bf_d_z_d_z = zeros((d_z,d_z),order='fortran')
    pivots_d_y = zeros((d_y),dtype='i',order='fortran')
    pivots_d_z = zeros((d_z),dtype='i',order='fortran')
    z_n_z_n_post = zeros((d_z,d_z))
    next_z_n_z_n_post = zeros((d_z,d_z))
    log_det_diff2_log_gamma = 0

    for y_t,gamma_t in zip(y_set,gamma_set):
        T = len(y_t)
        cond_probs_t = zeros(T)
        map_probs_t = zeros(T)
        laplace_probs_t = zeros(T)
        y_pred_t = zeros((T,d_y))
        mu_kalman_t = zeros((T,d_z))     # Filtering mus
        E_kalman_t = zeros((T,d_z,d_z))  # Filtering Es
        mu_post_t = zeros((T,d_z))
        E_post_t = zeros((T,d_z,d_z))
        P_t = zeros((T-1,d_z,d_z))

        # Forward pass
        # Initialization at n = 0
        A_times_prev_mu[:] = 0
        multiply(C.T,reshape(gamma_t[0],(-1,1)),mat_times_C_trans)
        pred[:] = 0
        product_matrix_matrix(C,mat_times_C_trans,cov_pred)
        cov_pred += Sigma
        solve(cov_pred,mat_times_C_trans.T,K.T,Af_d_y_d_y,Bf_d_y_d_z,pivots_d_y)
        vec_d_y[:] = y_t[0]
        vec_d_y -= pred
        product_matrix_vector(K,vec_d_y,mu_kalman_t[0])
        product_matrix_matrix(K,C,KC)
        mat_d_z_d_z[:] = eye_d_z
        mat_d_z_d_z -= KC
        multiply(mat_d_z_d_z,gamma_t[0],E_kalman_t[0])
        cond_probs_t[0] = self.multivariate_norm_log_pdf(y_t[0],pred,cov_pred)
        y_pred_t[0] = pred

        # from n=1 to T-1
        for n in xrange(T-1):
            divide(1.,E,vec_d_z)
            divide(1.,gamma_t[n+1],vec_d_z2)
            vec_d_z += vec_d_z2
            divide(1.,vec_d_z,vec_d_z2)
            setdiag(E_gamma,vec_d_z2)
            divide(E,gamma_t[n+1],vec_d_z)
            vec_d_z += 1
            divide(A,reshape(vec_d_z,(-1,1)),A_gamma)

            P_tn = P_t[n]
            product_matrix_matrix(E_kalman_t[n],A_gamma.T,mat_d_z_d_z)
            product_matrix_matrix(A_gamma,mat_d_z_d_z,P_tn)
            P_tn += E_gamma
            product_matrix_vector(A_gamma,mu_kalman_t[n],A_times_prev_mu)
            product_matrix_matrix(P_tn,C.T,mat_times_C_trans)
            product_matrix_vector(C,A_times_prev_mu,pred)
            product_matrix_matrix(C,mat_times_C_trans,cov_pred)
            cov_pred += Sigma
            solve(cov_pred,mat_times_C_trans.T,K.T,Af_d_y_d_y,Bf_d_y_d_z,pivots_d_y)
            vec_d_y[:] = y_t[n+1]
            vec_d_y -= pred
            product_matrix_vector(K,vec_d_y,mu_kalman_t[n+1])
            mu_kalman_t[n+1] += A_times_prev_mu
            product_matrix_matrix(K,C,KC)
            mat_d_z_d_z[:] = eye_d_z
            mat_d_z_d_z -= KC
            product_matrix_matrix(mat_d_z_d_z,P_tn,mat_d_z_d_z2)
            # To ensure symmetry
            E_kalman_t[n+1] = mat_d_z_d_z2
            E_kalman_t[n+1] += mat_d_z_d_z2.T
            E_kalman_t[n+1] /= 2
            cond_probs_t[n+1] = self.multivariate_norm_log_pdf(y_t[n+1],pred,cov_pred)
            y_pred_t[n+1] = pred
        #print y_t, y_pred_t

        mu_post_t[-1] = mu_kalman_t[-1]
        E_post_t[-1] = E_kalman_t[-1]

        # Compute last step statistics
        outer(mu_post_t[-1],mu_post_t[-1],z_n_z_n_post)
        z_n_z_n_post += E_post_t[-1]

        # Update cumulative statistics
        z_n_z_n_post_sum += z_n_z_n_post

        # Backward pass
        pred[:] = 0
        cov_pred[:] = 0
        for n in xrange(T-2,-1,-1):
            next_z_n_z_n_post[:] = z_n_z_n_post
            divide(E,gamma_t[n+1],vec_d_z)
            vec_d_z += 1
            divide(A,reshape(vec_d_z,(-1,1)),A_gamma)

            P_tn = P_t[n]
            solve(P_tn.T,A_gamma,mat_d_z_d_z,Af_d_z_d_z,Bf_d_z_d_z,pivots_d_z)
            product_matrix_matrix(E_kalman_t[n],mat_d_z_d_z.T,J)
            product_matrix_vector(A_gamma,mu_kalman_t[n],vec_d_z)
            vec_d_z *= -1
            vec_d_z += mu_post_t[n+1]
            product_matrix_vector(J,vec_d_z,mu_post_t[n])
            mu_post_t[n] += mu_kalman_t[n]

            mat_d_z_d_z[:] = E_post_t[n+1]
            mat_d_z_d_z -= P_tn
            product_matrix_matrix(mat_d_z_d_z,J.T,mat_d_z_d_z2)
            product_matrix_matrix(J,mat_d_z_d_z2,mat_d_z_d_z)
            # To ensure symmetry
            E_post_t[n] = E_kalman_t[n]
            E_post_t[n] += mat_d_z_d_z
            E_post_t[n] += E_kalman_t[n].T
            E_post_t[n] += mat_d_z_d_z.T
            E_post_t[n] /= 2

            outer(mu_post_t[n],mu_post_t[n],z_n_z_n_post)
            z_n_z_n_post += E_post_t[n]

            dummy = self.compute_gamma(A,E,z_n_z_n_post,next_z_n_z_n_post,gamma_t[n+1])
            log_prior_gamma = self.log_prior_gamma(gamma_t[n+1])
            #print log_prior_gamma
            log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[n+1])
            log_det_diff2_log_gamma = self.log_det_diff2_log_gamma(A,E,z_n_z_n_post,next_z_n_z_n_post,gamma_t[n+1])
            map_probs_t[n+1] = cond_probs_t[n+1]+log_prior_gamma
            laplace_probs_t[n+1] = cond_probs_t[n+1]+log_prior_log_gamma+d_z*log(2*pi)/2-0.5*log_det_diff2_log_gamma

        gamma_t[0] = (diag(z_n_z_n_post)+2*self.gamma_prior_beta)/(2*self.gamma_prior_alpha+3)
        log_prior_gamma = self.log_prior_gamma(gamma_t[0])
        log_prior_log_gamma = self.log_prior_log_gamma(gamma_t[0])
        log_det_diff2_log_gamma = sum((z_n_z_n_post/2+self.gamma_prior_beta)/gamma_t[0])
        map_probs_t[0] = cond_probs_t[0]+log_prior_gamma
        laplace_probs_t[0] = cond_probs_t[0]+log_prior_log_gamma+d_z*log(2*pi)/2-0.5*log_det_diff2_log_gamma

        cond_probs += [cond_probs_t]
        map_probs += [map_probs_t]
        laplace_probs += [laplace_probs_t]
        y_pred += [y_pred_t]

    return cond_probs, map_probs, laplace_probs, y_pred
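# Illustration only (not used above): cond_probs_t[n] is the Gaussian
# log-density log N(y_n ; pred, cov_pred). A minimal NumPy sketch of what
# multivariate_norm_log_pdf(y,mean,cov) must compute, assuming a dense,
# positive-definite covariance (the sketch name is hypothetical):
def _multivariate_norm_log_pdf_sketch(y, mean, cov):
    import numpy as np
    d = y.shape[0]
    diff = y - mean
    sign, logdet = np.linalg.slogdet(cov)
    # Mahalanobis term computed with a solve, avoiding an explicit inverse
    maha = diff.dot(np.linalg.solve(cov, diff))
    return -0.5 * (d * np.log(2 * np.pi) + logdet + maha)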
def update_learner(self, example):
    # apply example to the inputs
    self.layers[0][:] = example[0]

    # forward propagation: compute activation values of all units
    # hidden layers
    for h in range(self.n_hidden_layers):
        mllin.product_matrix_vector(self.Ws[h], self.layers[h], self.layer_acts[h + 1])
        self.layer_acts[h + 1] += self.cs[h]
        mlnonlin.sigmoid(self.layer_acts[h + 1], self.layers[h + 1])

    # output layer
    mllin.product_matrix_vector(self.U, self.layers[-1], self.output_act)
    self.output_act += self.d
    mlnonlin.softmax(self.output_act, self.output)

    # back propagation: compute delta errors and updates to weights and biases
    # TA:begin
    if self.cost_function == 'CE':
        # cross-entropy with softmax: delta = y - t
        self.doutput_act[:] = self.output
        self.doutput_act[example[1]] -= 1
    elif self.cost_function == 'SSE':
        y = self.output.copy()
        t = np.zeros(np.shape(y))
        t[example[1]] = 1
        c = np.size(y)  # nr of classes
        # delta error at output layer:
        # dE/da_j = sum_i (y_i - t_i) * y_i * (delta_ij - y_j)
        T2 = np.tile(np.array([(y - t) * y]).T, [1, c])
        T3 = np.eye(c, c) - np.tile(y, [c, 1])
        self.doutput_act[:] = np.sum(T2 * T3, axis=0)
    elif self.cost_function == 'EXP':
        y = self.output.copy()
        t = np.zeros(np.shape(y))
        t[example[1]] = 1
        c = np.size(y)  # nr of classes
        # T1 = 2*exp(SSE/tau) is the outer factor from differentiating the
        # cost tau*exp(SSE/tau) with respect to the output activations
        T1 = 2 * np.exp(np.sum(np.square(y - t)) / self.tau)
        T2 = np.tile(np.array([(y - t) * y]).T, [1, c])
        T3 = np.eye(c, c) - np.tile(y, [c, 1])
        # delta error at output layer
        self.doutput_act[:] = T1 * np.sum(T2 * T3, axis=0)
    # TA:end

    self.doutput_act *= self.learning_rate / (1. + self.decrease_constant * self.n_updates)
    self.dd[:] = self.doutput_act
    mllin.outer(self.doutput_act, self.layers[-1], self.dU)
    mllin.product_matrix_vector(self.U.T, self.doutput_act, self.dlayers[-1])
    # The description and argument names of dsigmoid() are unclear. In
    # practice, dsigmoid(s,dx,ds) computes s*(1-s)*dx element-wise and puts
    # the result in ds. [TA]
    mlnonlin.dsigmoid(self.layers[-1], self.dlayers[-1], self.dlayer_acts[-1])

    for h in range(self.n_hidden_layers - 1, -1, -1):
        self.dcs[h][:] = self.dlayer_acts[h + 1]
        mllin.outer(self.dlayer_acts[h + 1], self.layers[h], self.dWs[h])
        mllin.product_matrix_vector(self.Ws[h].T, self.dlayer_acts[h + 1], self.dlayers[h])
        mlnonlin.dsigmoid(self.layers[h], self.dlayers[h], self.dlayer_acts[h])

    # TA: always update output weights and biases; update hidden weights and
    # biases only when they are not frozen
    self.U -= self.dU
    self.d -= self.dd
    if not self.freeze_Ws_cs:
        for h in range(self.n_hidden_layers - 1, -1, -1):
            self.Ws[h] -= self.dWs[h]
            self.cs[h] -= self.dcs[h]
    # else:
    #     # update only highest hidden layer
    #     h = self.n_hidden_layers - 1
    #     self.Ws[h] -= self.dWs[h]
    #     self.cs[h] -= self.dcs[h]

    self.n_updates += 1
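# Illustration only (not used by update_learner above): for a softmax output y
# and one-hot target t, the SSE branch builds the output delta
# dE/da_j = sum_i (y_i - t_i) y_i (delta_ij - y_j) with O(c^2) tiled
# temporaries. The identical delta can be computed in O(c); sketch with a
# hypothetical helper name:
def _sse_output_delta_sketch(y, t):
    import numpy as np
    g = (y - t) * y           # g_i = (y_i - t_i) * y_i
    return g - y * g.sum()    # delta_j = g_j - y_j * sum_i g_i
# e.g. np.allclose(_sse_output_delta_sketch(y, t), np.sum(T2 * T3, axis=0))
# holds for the T2, T3 defined in the SSE branch.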