    def __init__(self, config_dictionary):  #completed
        """Set up the bidirectional RNN language model from a configuration dictionary.

        feature_file_name - file the features are read from
        required_variables - variables required to run the system in each mode
        all_variables - all valid variables for each mode ('train' or 'test')"""
        self.feature_file_name = self.default_variable_define(
            config_dictionary, 'feature_file_name', arg_type='string')
        self.features, self.feature_sequence_lens = self.read_feature_file()
        self.model = Bidirectional_RNNLM_Weight()
        self.output_name = self.default_variable_define(config_dictionary,
                                                        'output_name',
                                                        arg_type='string')

        self.required_variables = dict()
        self.all_variables = dict()
        self.required_variables['train'] = [
            'mode', 'feature_file_name', 'output_name'
        ]
        self.all_variables['train'] = self.required_variables['train'] + [
            'label_file_name', 'num_hiddens', 'weight_matrix_name',
            'initial_weight_max', 'initial_weight_min', 'initial_bias_max',
            'initial_bias_min', 'save_each_epoch', 'do_pretrain',
            'pretrain_method', 'pretrain_iterations', 'pretrain_learning_rate',
            'pretrain_batch_size', 'do_backprop', 'backprop_method',
            'backprop_batch_size', 'l2_regularization_const', 'num_epochs',
            'num_line_searches', 'armijo_const', 'wolfe_const',
            'steepest_learning_rate', 'momentum_rate',
            'conjugate_max_iterations', 'conjugate_const_type',
            'truncated_newton_num_cg_epochs',
            'truncated_newton_init_damping_factor', 'krylov_num_directions',
            'krylov_num_batch_splits', 'krylov_num_bfgs_epochs',
            'second_order_matrix', 'krylov_use_hessian_preconditioner',
            'krylov_eigenvalue_floor_const', 'fisher_preconditioner_floor_val',
            'use_fisher_preconditioner', 'structural_damping_const',
            'validation_feature_file_name', 'validation_label_file_name'
        ]
        self.required_variables['test'] = [
            'mode', 'feature_file_name', 'weight_matrix_name', 'output_name'
        ]
        self.all_variables['test'] = self.required_variables['test'] + [
            'label_file_name'
        ]
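
    # A minimal sketch (hypothetical file names and values) of the kind of
    # config_dictionary this constructor expects in training mode; only
    # 'mode', 'feature_file_name', and 'output_name' are required:
    #
    #   config_dictionary = {'mode': 'train',
    #                        'feature_file_name': 'train.features',
    #                        'output_name': 'rnnlm_weights',
    #                        'num_hiddens': 100,
    #                        'steepest_learning_rate': [0.1, 0.05, 0.025]}
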
    def backprop_adagrad_single_batch(self):
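        """Train the model one sequence at a time with AdaGrad.

        For each parameter w with gradient g the update is, roughly,
            accum += g ** 2
            w -= learning_rate * g / sqrt(accum)
        where accum starts at a fudge factor so the division is well defined.
        """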
        print "Starting backprop using adagrad"
        adagrad_weight = Bidirectional_RNNLM_Weight()
        adagrad_weight.init_zero_weights(self.model.get_architecture())

        buffer_weight = Bidirectional_RNNLM_Weight()
        buffer_weight.init_zero_weights(self.model.get_architecture())

        fudge_factor = 1.0
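        # start the squared-gradient accumulator at the fudge factor so the
        # first division by its square root is well defined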
        adagrad_weight = adagrad_weight + fudge_factor
        gradient = Bidirectional_RNNLM_Weight()
        gradient.init_zero_weights(self.model.get_architecture())
        if self.validation_feature_file_name is not None:
            cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                self.validation_features, self.validation_labels,
                self.validation_fsl, self.model)
            print "cross-entropy before steepest descent is", cross_entropy
            print "perplexity is", perplexity
            if self.l2_regularization_const > 0.0:
                print "regularized loss is", loss
            print "number correctly classified is", num_correct, "of", num_examples

#        excluded_keys = {'bias':['0'], 'weights':[]}
#        frame_table = np.cumsum(self.feature_sequence_lens)
        for epoch_num in range(len(self.steepest_learning_rate)):
            print "At epoch", epoch_num + 1, "of", len(
                self.steepest_learning_rate
            ), "with learning rate", self.steepest_learning_rate[epoch_num]
            start_frame = 0
            end_frame = 0
            cross_entropy = 0.0
            num_examples = 0
            #            if hasattr(self, 'momentum_rate'):
            #                momentum_rate = self.momentum_rate[epoch_num]
            #                print "momentum is", momentum_rate
            #            else:
            #                momentum_rate = 0.0
            for batch_index, feature_sequence_len in enumerate(
                    self.feature_sequence_lens):
                end_frame = start_frame + feature_sequence_len
                batch_features = self.features[:feature_sequence_len,
                                               batch_index]
                batch_labels = self.labels[start_frame:end_frame, 1]
                #                print ""
                #                print batch_index
                #                print batch_features
                #                print batch_labels
                cur_xent = self.calculate_gradient_single_batch(
                    batch_features,
                    batch_labels,
                    gradient,
                    return_cross_entropy=True,
                    check_gradient=False)
                #                print self.model.norm()
                #                print gradient.norm()
                if self.l2_regularization_const > 0.0:
                    buffer_weight.assign_weights(self.model)
                    buffer_weight *= self.l2_regularization_const
                    gradient += buffer_weight
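                # AdaGrad scaling: add the squared gradient to the accumulator,
                # then divide the gradient elementwise by sqrt(accumulator)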
                buffer_weight.assign_weights(gradient)
                #                print gradient.init_hiddens
                buffer_weight **= 2.0
                adagrad_weight += buffer_weight
                #                print adagrad_weight.init_hiddens
                buffer_weight.assign_weights(adagrad_weight)
                buffer_weight **= 0.5
                #                print buffer_weight.init_hiddens
                gradient /= buffer_weight
                #                print gradient.init_hiddens
                cross_entropy += cur_xent
                per_done = float(batch_index) / self.num_sequences * 100
                sys.stdout.write(
                    "\r                                                                \r"
                )  # clear the progress line
                sys.stdout.write("\r%.1f%% done " % per_done)
                sys.stdout.flush()
                avg_cross_entropy = cross_entropy / end_frame
                sys.stdout.write("train X-ent: %f " % avg_cross_entropy)
                sys.stdout.flush()
                gradient *= -self.steepest_learning_rate[epoch_num]
                self.model += gradient  #/ batch_size
                #                if momentum_rate > 0.0:
                #                    prev_step *= momentum_rate
                #                    self.model += prev_step
                #                prev_step.assign_weights(gradient)
                #                prev_step *= -self.steepest_learning_rate[epoch_num]

                start_frame = end_frame

            if self.validation_feature_file_name is not None:
                cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                    self.validation_features, self.validation_labels,
                    self.validation_fsl, self.model)
                print "cross-entropy at the end of the epoch is", cross_entropy
                print "perplexity is", perplexity
                if self.l2_regularization_const > 0.0:
                    print "regularized loss is", loss
                print "number correctly classified is", num_correct, "of", num_examples

            sys.stdout.write("\r100.0% done \r")
            sys.stdout.write(
                "\r                                                                \r"
            )  #clear line
            if self.save_each_epoch:
                self.model.write_weights(''.join(
                    [self.output_name, '_epoch_',
                     str(epoch_num + 1)]))

    def calculate_gradient_single_batch(self,
                                        batch_inputs,
                                        batch_labels,
                                        gradient_weights,
                                        hiddens_forward=None,
                                        hiddens_backward=None,
                                        outputs=None,
                                        check_gradient=False,
                                        model=None,
                                        l2_regularization_const=0.0,
                                        return_cross_entropy=False):
        # TODO: double-check the regularization handling
        # Calculate the gradient for a particular model; if model is None, the
        # current weights (self.model) are used.
        batch_size = batch_labels.size
        if model is None:
            model = self.model
        if hiddens_forward is None or hiddens_backward is None or outputs is None:
            outputs, hiddens_forward, hiddens_backward = self.forward_pass_single_batch(
                batch_inputs, model, return_hiddens=True)
        #derivative of log(cross-entropy softmax)
        batch_indices = np.arange(batch_size)
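        # zero the gradient buffer in place; the caller reuses it across batches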
        gradient_weights *= 0.0
        backward_inputs = outputs

        if return_cross_entropy:
            cross_entropy = -np.sum(
                np.log2(backward_inputs[batch_indices, batch_labels]))
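        # derivative of softmax + cross-entropy w.r.t. the pre-softmax outputs
        # is (softmax output - one-hot label): subtract 1 at the label indices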
        backward_inputs[batch_indices, batch_labels] -= 1.0

        np.sum(backward_inputs, axis=0, out=gradient_weights.bias['output'][0])

        np.dot(hiddens_forward.T,
               backward_inputs,
               out=gradient_weights.weights['hidden_output_forward'])
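        # error signal entering the last hidden state of the forward chain; the
        # h * (1 - h) factors are the derivative of the logistic nonlinearity
        # (the backward chain below is handled symmetrically)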
        pre_nonlinearity_hiddens_forward = np.dot(
            backward_inputs[batch_size - 1, :],
            model.weights['hidden_output_forward'].T)
        pre_nonlinearity_hiddens_forward *= hiddens_forward[batch_size - 1, :]
        pre_nonlinearity_hiddens_forward *= 1 - hiddens_forward[batch_size -
                                                                1, :]

        np.dot(hiddens_backward.T,
               backward_inputs,
               out=gradient_weights.weights['hidden_output_backward'])
        pre_nonlinearity_hiddens_backward = np.dot(
            backward_inputs[0, :], model.weights['hidden_output_backward'].T)
        pre_nonlinearity_hiddens_backward *= hiddens_backward[0, :]
        pre_nonlinearity_hiddens_backward *= 1 - hiddens_backward[0, :]

        if batch_size > 1:
            gradient_weights.weights['visible_hidden'][batch_inputs[
                batch_size - 1]] += pre_nonlinearity_hiddens_forward
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(
                hiddens_forward[batch_size - 2, :],
                pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][
                0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][
                batch_inputs[0]] += pre_nonlinearity_hiddens_backward
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(
                hiddens_backward[1, :], pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][
                0] += pre_nonlinearity_hiddens_backward

        for index in range(batch_size - 2):
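            # interior time steps: the forward chain's error is propagated from
            # the end of the sequence toward the start, the backward chain's
            # error from the start toward the end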
            backward_index = batch_size - 2 - index
            forward_index = index + 1
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[backward_index, :],
                        model.weights['hidden_output_forward'].T) +
                 np.dot(pre_nonlinearity_hiddens_forward,
                        model.weights['hidden_hidden_forward'].T)) *
                hiddens_forward[backward_index, :] *
                (1 - hiddens_forward[backward_index, :]))

            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[forward_index, :],
                        model.weights['hidden_output_backward'].T) +
                 np.dot(pre_nonlinearity_hiddens_backward,
                        model.weights['hidden_hidden_backward'].T)) *
                hiddens_backward[forward_index, :] *
                (1 - hiddens_backward[forward_index, :]))

            gradient_weights.weights['visible_hidden'][batch_inputs[
                backward_index]] += pre_nonlinearity_hiddens_forward  #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(
                hiddens_forward[backward_index - 1, :],
                pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][
                0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][batch_inputs[
                forward_index]] += pre_nonlinearity_hiddens_backward  #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(
                hiddens_backward[forward_index + 1, :],
                pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][
                0] += pre_nonlinearity_hiddens_backward

        if batch_size > 1:
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[0, :],
                        model.weights['hidden_output_forward'].T) +
                 np.dot(pre_nonlinearity_hiddens_forward,
                        model.weights['hidden_hidden_forward'].T)) *
                hiddens_forward[0, :] * (1 - hiddens_forward[0, :]))
            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[-1, :],
                        model.weights['hidden_output_backward'].T) +
                 np.dot(pre_nonlinearity_hiddens_backward,
                        model.weights['hidden_hidden_backward'].T)) *
                hiddens_backward[-1, :] * (1 - hiddens_backward[-1, :]))

        gradient_weights.weights['visible_hidden'][batch_inputs[
            0]] += pre_nonlinearity_hiddens_forward  # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_forward'] += np.outer(
            model.init_hiddens['forward'], pre_nonlinearity_hiddens_forward
        )  #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_forward'][
            0] += pre_nonlinearity_hiddens_forward
        gradient_weights.init_hiddens['forward'][0] = np.dot(
            pre_nonlinearity_hiddens_forward,
            model.weights['hidden_hidden_forward'].T)

        gradient_weights.weights['visible_hidden'][batch_inputs[
            -1]] += pre_nonlinearity_hiddens_backward  # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_backward'] += np.outer(
            model.init_hiddens['backward'], pre_nonlinearity_hiddens_backward
        )  #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_backward'][
            0] += pre_nonlinearity_hiddens_backward
        gradient_weights.init_hiddens['backward'][0] = np.dot(
            pre_nonlinearity_hiddens_backward,
            model.weights['hidden_hidden_backward'].T)
        #        gradient_weights = self.backward_pass(backward_inputs, hiddens, batch_inputs, model)
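        # undo the earlier in-place subtraction so the shared outputs array is
        # left unchanged for the caller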
        backward_inputs[batch_indices, batch_labels] += 1.0
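        # average the accumulated gradient over the number of time steps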
        gradient_weights /= batch_size

        if l2_regularization_const > 0.0:
            gradient_weights += model * l2_regularization_const
        if return_cross_entropy and not check_gradient:
            return cross_entropy
#        if not check_gradient:
#            if not return_cross_entropy:
#                if l2_regularization_const > 0.0:
#                    return gradient_weights / batch_size + model * l2_regularization_const
#                return gradient_weights / batch_size
#            else:
#                if l2_regularization_const > 0.0:
#                    return gradient_weights / batch_size + model * l2_regularization_const, cross_entropy
#                return gradient_weights / batch_size, cross_entropy

### the block below checks the gradient with finite differences; use it only if you suspect the analytic gradient is wrong ###
        else:
            gradient_weights *= batch_size
            if l2_regularization_const > 0.0:
                gradient_weights += model * (l2_regularization_const *
                                             batch_size)
            sys.stdout.write(
                "\r                                                                \r"
            )
            print "checking gradient..."
            finite_difference_model = Bidirectional_RNNLM_Weight()
            finite_difference_model.init_zero_weights(
                self.model.get_architecture(), verbose=False)

            direction = Bidirectional_RNNLM_Weight()
            direction.init_zero_weights(self.model.get_architecture(),
                                        verbose=False)
            epsilon = 1E-5
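            # central finite differences: perturb one parameter at a time by
            # +/- epsilon and approximate the derivative of the loss as
            # (loss(w + eps) - loss(w - eps)) / (2 * eps)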
            print "at initial hiddens"
            for key in direction.init_hiddens.keys():
                for index in range(direction.init_hiddens[key].size):
                    direction.init_hiddens[key][0][index] = epsilon
                    forward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model +
                                direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model -
                                direction)[batch_indices, batch_labels]))
                    finite_difference_model.init_hiddens[key][0][index] = (
                        forward_loss - backward_loss) / (2 * epsilon)
                    direction.init_hiddens[key][0][index] = 0.0
            for key in direction.bias.keys():
                print "at bias key", key
                for index in range(direction.bias[key].size):
                    direction.bias[key][0][index] = epsilon
                    #print direction.norm()
                    forward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model +
                                direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model -
                                direction)[batch_indices, batch_labels]))
                    finite_difference_model.bias[key][0][index] = (
                        forward_loss - backward_loss) / (2 * epsilon)
                    direction.bias[key][0][index] = 0.0
            for key in direction.weights.keys():
                print "at weight key", key
                for index0 in range(direction.weights[key].shape[0]):
                    for index1 in range(direction.weights[key].shape[1]):
                        direction.weights[key][index0][index1] = epsilon
                        forward_loss = -np.sum(
                            np.log(
                                self.forward_pass_single_batch(
                                    batch_inputs, model=model +
                                    direction)[batch_indices, batch_labels]))
                        backward_loss = -np.sum(
                            np.log(
                                self.forward_pass_single_batch(
                                    batch_inputs, model=model -
                                    direction)[batch_indices, batch_labels]))
                        finite_difference_model.weights[key][index0][
                            index1] = (forward_loss -
                                       backward_loss) / (2 * epsilon)
                        direction.weights[key][index0][index1] = 0.0

            print "calculated gradient for forward initial hiddens"
            print gradient_weights.init_hiddens['forward']
            print "finite difference approximation for forward initial hiddens"
            print finite_difference_model.init_hiddens['forward']

            print "calculated gradient for backward initial hiddens"
            print gradient_weights.init_hiddens['backward']
            print "finite difference approximation for backward initial hiddens"
            print finite_difference_model.init_hiddens['backward']

            print "calculated gradient for forward hidden bias"
            print gradient_weights.bias['hidden_forward']
            print "finite difference approximation for forward hidden bias"
            print finite_difference_model.bias['hidden_forward']

            print "calculated gradient for backward hidden bias"
            print gradient_weights.bias['hidden_backward']
            print "finite difference approximation for backward hidden bias"
            print finite_difference_model.bias['hidden_backward']

            print "calculated gradient for output bias"
            print gradient_weights.bias['output']
            print "finite difference approximation for output bias"
            print finite_difference_model.bias['output']

            print "calculated gradient for visible_hidden layer"
            print gradient_weights.weights['visible_hidden']
            print "finite difference approximation for visible_hidden layer"
            print finite_difference_model.weights['visible_hidden']
            print np.sum((finite_difference_model.weights['visible_hidden'] -
                          gradient_weights.weights['visible_hidden'])**2)

            print "calculated gradient for hidden_hidden_forward layer"
            print gradient_weights.weights['hidden_hidden_forward']
            print "finite difference approximation for hidden_hidden_forward layer"
            print finite_difference_model.weights['hidden_hidden_forward']

            print "calculated gradient for hidden_hidden_backward layer"
            print gradient_weights.weights['hidden_hidden_backward']
            print "finite difference approximation for hidden_hidden_backward layer"
            print finite_difference_model.weights['hidden_hidden_backward']

            print "calculated gradient for hidden_output_forward layer"
            print gradient_weights.weights['hidden_output_forward']
            print "finite difference approximation for hidden_output_forward layer"
            print finite_difference_model.weights['hidden_output_forward']

            print "calculated gradient for hidden_output_backward layer"
            print gradient_weights.weights['hidden_output_backward']
            print "finite difference approximation for hidden_output_backward layer"
            print finite_difference_model.weights['hidden_output_backward']

            sys.exit()