    def __init__(self, config_dictionary): #completed
        """Variables for the neural network:
        feature_file_name - file the features are read from
        required_variables - variables required to run each mode
        all_variables - all valid variables for each mode"""
        self.feature_file_name = self.default_variable_define(config_dictionary, 'feature_file_name', arg_type='string')
        self.features, self.feature_sequence_lens = self.read_feature_file()
        self.model = Bidirectional_RNNLM_Weight()
        self.output_name = self.default_variable_define(config_dictionary, 'output_name', arg_type='string')

        self.required_variables = dict()
        self.all_variables = dict()
        self.required_variables['train'] = ['mode', 'feature_file_name', 'output_name']
        self.all_variables['train'] = self.required_variables['train'] + ['label_file_name', 'num_hiddens', 'weight_matrix_name',
                                                                          'initial_weight_max', 'initial_weight_min', 'initial_bias_max', 'initial_bias_min',
                                                                          'save_each_epoch',
                                                                          'do_pretrain', 'pretrain_method', 'pretrain_iterations',
                                                                          'pretrain_learning_rate', 'pretrain_batch_size',
                                                                          'do_backprop', 'backprop_method', 'backprop_batch_size', 'l2_regularization_const',
                                                                          'num_epochs', 'num_line_searches', 'armijo_const', 'wolfe_const',
                                                                          'steepest_learning_rate', 'momentum_rate',
                                                                          'conjugate_max_iterations', 'conjugate_const_type',
                                                                          'truncated_newton_num_cg_epochs', 'truncated_newton_init_damping_factor',
                                                                          'krylov_num_directions', 'krylov_num_batch_splits', 'krylov_num_bfgs_epochs', 'second_order_matrix',
                                                                          'krylov_use_hessian_preconditioner', 'krylov_eigenvalue_floor_const',
                                                                          'fisher_preconditioner_floor_val', 'use_fisher_preconditioner',
                                                                          'structural_damping_const',
                                                                          'validation_feature_file_name', 'validation_label_file_name']
        self.required_variables['test'] = ['mode', 'feature_file_name', 'weight_matrix_name', 'output_name']
        self.all_variables['test'] = self.required_variables['test'] + ['label_file_name']
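    # Illustrative sketch of a 'train'-mode configuration dictionary. Only 'mode',
    # 'feature_file_name', and 'output_name' are required (see required_variables above);
    # the remaining keys come from all_variables['train'], and every value shown here is
    # hypothetical; the actual type and format of each entry is determined by
    # default_variable_define.
    #
    #   config_dictionary = {'mode': 'train',
    #                        'feature_file_name': 'train_features.txt',   # hypothetical path
    #                        'output_name': 'bidirectional_rnnlm',        # hypothetical path
    #                        'num_hiddens': 100,
    #                        'do_backprop': True,
    #                        'steepest_learning_rate': [0.1, 0.05, 0.025]}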
    def backprop_adagrad_single_batch(self):
        print "Starting backprop using adagrad"

        adagrad_weight = Bidirectional_RNNLM_Weight()
        adagrad_weight.init_zero_weights(self.model.get_architecture())

        buffer_weight = Bidirectional_RNNLM_Weight()
        buffer_weight.init_zero_weights(self.model.get_architecture())

        fudge_factor = 1.0
        adagrad_weight = adagrad_weight + fudge_factor

        gradient = Bidirectional_RNNLM_Weight()
        gradient.init_zero_weights(self.model.get_architecture())

        if self.validation_feature_file_name is not None:
            cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model)
            print "cross-entropy before steepest descent is", cross_entropy
            print "perplexity is", perplexity
            if self.l2_regularization_const > 0.0:
                print "regularized loss is", loss
            print "number correctly classified is", num_correct, "of", num_examples

        # excluded_keys = {'bias':['0'], 'weights':[]}
        # frame_table = np.cumsum(self.feature_sequence_lens)
        for epoch_num in range(len(self.steepest_learning_rate)):
            print "At epoch", epoch_num + 1, "of", len(self.steepest_learning_rate), "with learning rate", self.steepest_learning_rate[epoch_num]
            start_frame = 0
            end_frame = 0
            cross_entropy = 0.0
            num_examples = 0
            # if hasattr(self, 'momentum_rate'):
            #     momentum_rate = self.momentum_rate[epoch_num]
            #     print "momentum is", momentum_rate
            # else:
            #     momentum_rate = 0.0
            for batch_index, feature_sequence_len in enumerate(self.feature_sequence_lens):
                end_frame = start_frame + feature_sequence_len
                batch_features = self.features[:feature_sequence_len, batch_index]
                batch_labels = self.labels[start_frame:end_frame, 1]
                # print ""
                # print batch_index
                # print batch_features
                # print batch_labels
                cur_xent = self.calculate_gradient_single_batch(batch_features, batch_labels, gradient, return_cross_entropy=True, check_gradient=False)
                # print self.model.norm()
                # print gradient.norm()
                if self.l2_regularization_const > 0.0:
                    buffer_weight.assign_weights(self.model)
                    buffer_weight *= self.l2_regularization_const
                    gradient += buffer_weight
                buffer_weight.assign_weights(gradient)
                # print gradient.init_hiddens
                buffer_weight **= 2.0
                adagrad_weight += buffer_weight
                # print adagrad_weight.init_hiddens
                buffer_weight.assign_weights(adagrad_weight)
                buffer_weight **= 0.5
                # print buffer_weight.init_hiddens
                gradient /= buffer_weight
                # print gradient.init_hiddens
                cross_entropy += cur_xent
                per_done = float(batch_index) / self.num_sequences * 100
                sys.stdout.write("\r                                                                 \r") #clear line
                sys.stdout.write("\r%.1f%% done " % per_done), sys.stdout.flush()
                ppp = cross_entropy / end_frame
                sys.stdout.write("train X-ent: %f " % ppp), sys.stdout.flush()
                gradient *= -self.steepest_learning_rate[epoch_num]
                self.model += gradient #/ batch_size
                # if momentum_rate > 0.0:
                #     prev_step *= momentum_rate
                #     self.model += prev_step
                # prev_step.assign_weights(gradient)
                # prev_step *= -self.steepest_learning_rate[epoch_num]
                start_frame = end_frame

            if self.validation_feature_file_name is not None:
                cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model)
                print "cross-entropy at the end of the epoch is", cross_entropy
                print "perplexity is", perplexity
                if self.l2_regularization_const > 0.0:
                    print "regularized loss is", loss
                print "number correctly classified is", num_correct, "of", num_examples

            sys.stdout.write("\r100.0% done \r")
            sys.stdout.write("\r                                                                 \r") #clear line
            if self.save_each_epoch:
                self.model.write_weights(''.join([self.output_name, '_epoch_', str(epoch_num + 1)]))
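    # The inner loop above implements a per-parameter AdaGrad update. A minimal sketch of
    # the same update on a single plain numpy array (Bidirectional_RNNLM_Weight overloads
    # the arithmetic operators, so these steps apply element-wise to every weight matrix,
    # bias, and initial hidden state):
    #
    #   accumulator = np.ones_like(theta)                 # zeros plus fudge_factor
    #   ...
    #   accumulator += gradient ** 2
    #   theta -= learning_rate * gradient / np.sqrt(accumulator)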
    def calculate_gradient_single_batch(self, batch_inputs, batch_labels, gradient_weights, hiddens_forward=None, hiddens_backward=None,
                                        outputs=None, check_gradient=False, model=None, l2_regularization_const=0.0, return_cross_entropy=False):
        #need to check regularization
        #calculate gradient with particular Neural Network model. If None is specified, will use current weights (i.e., self.model)
        batch_size = batch_labels.size
        if model is None:
            model = self.model
        if hiddens_forward is None or hiddens_backward is None or outputs is None:
            outputs, hiddens_forward, hiddens_backward = self.forward_pass_single_batch(batch_inputs, model, return_hiddens=True)
        #derivative of log(cross-entropy softmax)
        batch_indices = np.arange(batch_size)
        gradient_weights *= 0.0
        backward_inputs = outputs
        if return_cross_entropy:
            cross_entropy = -np.sum(np.log2(backward_inputs[batch_indices, batch_labels]))
        backward_inputs[batch_indices, batch_labels] -= 1.0

        np.sum(backward_inputs, axis=0, out=gradient_weights.bias['output'][0])
        np.dot(hiddens_forward.T, backward_inputs, out=gradient_weights.weights['hidden_output_forward'])
        pre_nonlinearity_hiddens_forward = np.dot(backward_inputs[batch_size - 1, :], model.weights['hidden_output_forward'].T)
        pre_nonlinearity_hiddens_forward *= hiddens_forward[batch_size - 1, :]
        pre_nonlinearity_hiddens_forward *= 1 - hiddens_forward[batch_size - 1, :]

        np.dot(hiddens_backward.T, backward_inputs, out=gradient_weights.weights['hidden_output_backward'])
        pre_nonlinearity_hiddens_backward = np.dot(backward_inputs[0, :], model.weights['hidden_output_backward'].T)
        pre_nonlinearity_hiddens_backward *= hiddens_backward[0, :]
        pre_nonlinearity_hiddens_backward *= 1 - hiddens_backward[0, :]

        if batch_size > 1:
            gradient_weights.weights['visible_hidden'][batch_inputs[batch_size - 1]] += pre_nonlinearity_hiddens_forward
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[batch_size - 2, :], pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_backward
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[1, :], pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward

        for index in range(batch_size - 2):
            backward_index = batch_size - 2 - index
            forward_index = index + 1
            pre_nonlinearity_hiddens_forward = ((np.dot(backward_inputs[backward_index, :], model.weights['hidden_output_forward'].T)
                                                 + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T))
                                                * hiddens_forward[backward_index, :] * (1 - hiddens_forward[backward_index, :]))
            pre_nonlinearity_hiddens_backward = ((np.dot(backward_inputs[forward_index, :], model.weights['hidden_output_backward'].T)
                                                  + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T))
                                                 * hiddens_backward[forward_index, :] * (1 - hiddens_backward[forward_index, :]))
            gradient_weights.weights['visible_hidden'][batch_inputs[backward_index]] += pre_nonlinearity_hiddens_forward #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[backward_index - 1, :], pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][batch_inputs[forward_index]] += pre_nonlinearity_hiddens_backward #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[forward_index + 1, :], pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward

        if batch_size > 1:
            pre_nonlinearity_hiddens_forward = ((np.dot(backward_inputs[0, :], model.weights['hidden_output_forward'].T)
                                                 + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T))
                                                * hiddens_forward[0, :] * (1 - hiddens_forward[0, :]))
            pre_nonlinearity_hiddens_backward = ((np.dot(backward_inputs[-1, :], model.weights['hidden_output_backward'].T)
                                                  + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T))
                                                 * hiddens_backward[-1, :] * (1 - hiddens_backward[-1, :]))

        gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_forward # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_forward'] += np.outer(model.init_hiddens['forward'], pre_nonlinearity_hiddens_forward) #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward
        gradient_weights.init_hiddens['forward'][0] = np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T)

        gradient_weights.weights['visible_hidden'][batch_inputs[-1]] += pre_nonlinearity_hiddens_backward # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_backward'] += np.outer(model.init_hiddens['backward'], pre_nonlinearity_hiddens_backward) #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward
        gradient_weights.init_hiddens['backward'][0] = np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T)

        # gradient_weights = self.backward_pass(backward_inputs, hiddens, batch_inputs, model)
        backward_inputs[batch_indices, batch_labels] += 1.0

        gradient_weights /= batch_size

        if l2_regularization_const > 0.0:
            gradient_weights += model * l2_regularization_const
        if return_cross_entropy and not check_gradient:
            return cross_entropy
        # if not check_gradient:
        #     if not return_cross_entropy:
        #         if l2_regularization_const > 0.0:
        #             return gradient_weights / batch_size + model * l2_regularization_const
        #         return gradient_weights / batch_size
        #     else:
        #         if l2_regularization_const > 0.0:
        #             return gradient_weights / batch_size + model * l2_regularization_const, cross_entropy
        #         return gradient_weights / batch_size, cross_entropy
        ### below block checks gradient... only to be used if you think the gradient is incorrectly calculated ##############
        else:
            gradient_weights *= batch_size
            if l2_regularization_const > 0.0:
                gradient_weights += model * (l2_regularization_const * batch_size)
            sys.stdout.write("\r                                                                 \r")
            print "checking gradient..."
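            # The block below is a finite-difference gradient check: each parameter is
            # perturbed by +/- epsilon in turn, the sequence loss -sum(log p(label)) is
            # recomputed with forward_pass_single_batch, and the central difference
            #   (loss(theta + epsilon) - loss(theta - epsilon)) / (2 * epsilon)
            # is stored in finite_difference_model for comparison against the analytic
            # gradient printed afterwards.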
            finite_difference_model = Bidirectional_RNNLM_Weight()
            finite_difference_model.init_zero_weights(self.model.get_architecture(), verbose=False)

            direction = Bidirectional_RNNLM_Weight()
            direction.init_zero_weights(self.model.get_architecture(), verbose=False)
            epsilon = 1E-5

            print "at initial hiddens"
            for key in direction.init_hiddens.keys():
                for index in range(direction.init_hiddens[key].size):
                    direction.init_hiddens[key][0][index] = epsilon
                    forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                    finite_difference_model.init_hiddens[key][0][index] = (forward_loss - backward_loss) / (2 * epsilon)
                    direction.init_hiddens[key][0][index] = 0.0
            for key in direction.bias.keys():
                print "at bias key", key
                for index in range(direction.bias[key].size):
                    direction.bias[key][0][index] = epsilon
                    #print direction.norm()
                    forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                    finite_difference_model.bias[key][0][index] = (forward_loss - backward_loss) / (2 * epsilon)
                    direction.bias[key][0][index] = 0.0
            for key in direction.weights.keys():
                print "at weight key", key
                for index0 in range(direction.weights[key].shape[0]):
                    for index1 in range(direction.weights[key].shape[1]):
                        direction.weights[key][index0][index1] = epsilon
                        forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                        backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                        finite_difference_model.weights[key][index0][index1] = (forward_loss - backward_loss) / (2 * epsilon)
                        direction.weights[key][index0][index1] = 0.0

            print "calculated gradient for forward initial hiddens"
            print gradient_weights.init_hiddens['forward']
            print "finite difference approximation for forward initial hiddens"
            print finite_difference_model.init_hiddens['forward']
            print "calculated gradient for backward initial hiddens"
            print gradient_weights.init_hiddens['backward']
            print "finite difference approximation for backward initial hiddens"
            print finite_difference_model.init_hiddens['backward']
            print "calculated gradient for forward hidden bias"
            print gradient_weights.bias['hidden_forward']
            print "finite difference approximation for forward hidden bias"
            print finite_difference_model.bias['hidden_forward']
            print "calculated gradient for backward hidden bias"
            print gradient_weights.bias['hidden_backward']
            print "finite difference approximation for backward hidden bias"
            print finite_difference_model.bias['hidden_backward']
            print "calculated gradient for output bias"
            print gradient_weights.bias['output']
            print "finite difference approximation for output bias"
            print finite_difference_model.bias['output']
            print "calculated gradient for visible_hidden layer"
            print gradient_weights.weights['visible_hidden']
            print "finite difference approximation for visible_hidden layer"
            print finite_difference_model.weights['visible_hidden']
            print np.sum((finite_difference_model.weights['visible_hidden'] - gradient_weights.weights['visible_hidden'])**2)
            print "calculated gradient for hidden_hidden_forward layer"
            print gradient_weights.weights['hidden_hidden_forward']
            print "finite difference approximation for hidden_hidden_forward layer"
            print finite_difference_model.weights['hidden_hidden_forward']
            print "calculated gradient for hidden_hidden_backward layer"
            print gradient_weights.weights['hidden_hidden_backward']
            print "finite difference approximation for hidden_hidden_backward layer"
            print finite_difference_model.weights['hidden_hidden_backward']
            print "calculated gradient for hidden_output_forward layer"
            print gradient_weights.weights['hidden_output_forward']
            print "finite difference approximation for hidden_output_forward layer"
            print finite_difference_model.weights['hidden_output_forward']
            print "calculated gradient for hidden_output_backward layer"
            print gradient_weights.weights['hidden_output_backward']
            print "finite difference approximation for hidden_output_backward layer"
            print finite_difference_model.weights['hidden_output_backward']
            sys.exit()
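    # Usage sketch: this mirrors how backprop_adagrad_single_batch above calls this method
    # for one sequence (the variable names are illustrative only).
    #
    #   gradient = Bidirectional_RNNLM_Weight()
    #   gradient.init_zero_weights(self.model.get_architecture())
    #   cur_xent = self.calculate_gradient_single_batch(batch_features, batch_labels, gradient,
    #                                                   return_cross_entropy=True, check_gradient=False)
    #   # gradient now holds the cross-entropy gradient for the sequence, averaged over its frames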