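# NOTE (editorial): this excerpt uses numpy, scipy.io (aliased `sp`, for
# loadmat), scipy.special.expit, and the standard-library modules below, so
# these imports are assumed. Bidirectional_RNNLM_Weight and Vector_Math are
# defined elsewhere in this package; the commented import is a guessed path,
# adjust it to match the repo layout.
import copy
import sys

import numpy as np
import scipy.io as sp
from scipy.special import expit

# from rnnlm_weight_classes import Bidirectional_RNNLM_Weight, Vector_Math

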
class Bidirectional_Recurrent_Neural_Network_Language_Model(object, Vector_Math):
    """features are stored in format max_seq_len x nseq x nvis,
    where max_seq_len is the maximum number of observations per sequence
    and nseq is the number of sequences
    weights are stored as nvis x nhid at feature level
    biases are stored as 1 x nhid
    rbm_type is either rbm_gaussian_bernoulli, rbm_bernoulli_bernoulli, logistic"""

    def __init__(self, config_dictionary):  # completed
        """variables for Neural Network: feature_file_name (read from)
        required_variables - required variables for running system
        all_variables - all valid variables for each type"""
        self.feature_file_name = self.default_variable_define(config_dictionary, 'feature_file_name',
                                                              arg_type='string')
        self.features, self.feature_sequence_lens = self.read_feature_file()
        self.model = Bidirectional_RNNLM_Weight()
        self.output_name = self.default_variable_define(config_dictionary, 'output_name', arg_type='string')

        self.required_variables = dict()
        self.all_variables = dict()
        self.required_variables['train'] = ['mode', 'feature_file_name', 'output_name']
        self.all_variables['train'] = self.required_variables['train'] + [
            'label_file_name', 'num_hiddens', 'weight_matrix_name',
            'initial_weight_max', 'initial_weight_min', 'initial_bias_max', 'initial_bias_min',
            'save_each_epoch',
            'do_pretrain', 'pretrain_method', 'pretrain_iterations',
            'pretrain_learning_rate', 'pretrain_batch_size',
            'do_backprop', 'backprop_method', 'backprop_batch_size', 'l2_regularization_const',
            'num_epochs', 'num_line_searches', 'armijo_const', 'wolfe_const',
            'steepest_learning_rate', 'momentum_rate',
            'conjugate_max_iterations', 'conjugate_const_type',
            'truncated_newton_num_cg_epochs', 'truncated_newton_init_damping_factor',
            'krylov_num_directions', 'krylov_num_batch_splits', 'krylov_num_bfgs_epochs',
            'second_order_matrix', 'krylov_use_hessian_preconditioner', 'krylov_eigenvalue_floor_const',
            'fisher_preconditioner_floor_val', 'use_fisher_preconditioner',
            'structural_damping_const',
            'validation_feature_file_name', 'validation_label_file_name']
        self.required_variables['test'] = ['mode', 'feature_file_name', 'weight_matrix_name', 'output_name']
        self.all_variables['test'] = self.required_variables['test'] + ['label_file_name']

    def dump_config_vals(self):
        no_attr_key = list()
        print "********************************************************************************"
        print "Neural Network configuration is as follows:"

        for key in self.all_variables[self.mode]:
            if hasattr(self, key):
                print key, "=", getattr(self, key)
            else:
                no_attr_key.append(key)

        print "********************************************************************************"
        print "Undefined keys are as follows:"
        for key in no_attr_key:
            print key, "not set"
        print "********************************************************************************"

    def default_variable_define(self, config_dictionary, config_key, arg_type='string',
                                default_value=None, error_string=None, exit_if_no_default=True,
                                acceptable_values=None):
        # arg_type is one of: int, float, string, int_comma_string, float_comma_string, boolean
        try:
            if arg_type == 'int_comma_string':
                return self.read_config_comma_string(config_dictionary[config_key], needs_int=True)
            elif arg_type == 'float_comma_string':
                return self.read_config_comma_string(config_dictionary[config_key], needs_int=False)
            elif arg_type == 'int':
                return int(config_dictionary[config_key])
            elif arg_type == 'float':
                return float(config_dictionary[config_key])
            elif arg_type == 'string':
                return config_dictionary[config_key]
            elif arg_type == 'boolean':
                if config_dictionary[config_key] in ('False', '0', 'F'):
                    return False
                elif config_dictionary[config_key] in ('True', '1', 'T'):
                    return True
                else:
                    print config_dictionary[config_key], "is not valid for boolean type... Acceptable values are True, False, 1, 0, T, or F... Exiting now"
                    sys.exit()
            else:
                print arg_type, "is not a valid type, arg_type can be either int, float, string, int_comma_string, float_comma_string, or boolean... exiting now"
                sys.exit()
        except KeyError:
            if error_string is not None:
                print error_string
            else:
                print "No", config_key, "defined,",
            if default_value is None and exit_if_no_default:
                print "since", config_key, "must be defined... exiting now"
                sys.exit()
            else:
                if acceptable_values is not None and (default_value not in acceptable_values):
                    print default_value, "is not an acceptable input, acceptable inputs are", acceptable_values, "... Exiting now"
                    sys.exit()
                if error_string is None:
                    print "setting", config_key, "to", default_value
                return default_value

    def read_feature_file(self, feature_file_name=None):  # completed
        if feature_file_name is None:
            feature_file_name = self.feature_file_name
        try:
            feature_data = sp.loadmat(feature_file_name)
            features = feature_data['features'].astype(np.int32)
            sequence_len = feature_data['feature_sequence_lengths']
            sequence_len = np.reshape(sequence_len, (sequence_len.size,))
            return features, sequence_len  # in MATLAB format
        except IOError:
            print "Unable to open", feature_file_name, "... Exiting now"
            sys.exit()

    def read_label_file(self, label_file_name=None):  # completed
        """label file is a two-column file in the form
        sent_id label_1
        sent_id label_2
        ...
        """
        if label_file_name is None:
            label_file_name = self.label_file_name
        try:
            label_data = sp.loadmat(label_file_name)['labels'].astype(np.int32)
            return label_data  # in MATLAB format
        except IOError:
            print "Unable to open", label_file_name, "... Exiting now"
            sys.exit()

    def batch_size(self, feature_sequence_lens):
        return np.sum(feature_sequence_lens)

    def read_config_comma_string(self, input_string, needs_int=False):
        # parses strings like '3*0.1,0.05' into [0.1, 0.1, 0.1, 0.05]
        output_list = []
        for elem in input_string.split(','):
            if '*' in elem:
                elem_list = elem.split('*')
                if needs_int:
                    output_list.extend([int(elem_list[1])] * int(elem_list[0]))
                else:
                    output_list.extend([float(elem_list[1])] * int(elem_list[0]))
            else:
                if needs_int:
                    output_list.append(int(elem))
                else:
                    output_list.append(float(elem))
        return output_list

    def levenshtein_string_edit_distance(self, string1, string2):  # completed
        dist = dict()
        string1_len = len(string1)
        string2_len = len(string2)

        for idx in range(-1, string1_len + 1):
            dist[(idx, -1)] = idx + 1
        for idx in range(-1, string2_len + 1):
            dist[(-1, idx)] = idx + 1

        for idx1 in range(string1_len):
            for idx2 in range(string2_len):
                if string1[idx1] == string2[idx2]:
                    cost = 0
                else:
                    cost = 1
                dist[(idx1, idx2)] = min(
                    dist[(idx1 - 1, idx2)] + 1,  # deletion
                    dist[(idx1, idx2 - 1)] + 1,  # insertion
                    dist[(idx1 - 1, idx2 - 1)] + cost,  # substitution
                )
                if idx1 and idx2 and string1[idx1] == string2[idx2 - 1] and string1[idx1 - 1] == string2[idx2]:
                    dist[(idx1, idx2)] = min(dist[(idx1, idx2)],
                                             dist[(idx1 - 2, idx2 - 2)] + cost)  # transposition

        return dist[(string1_len - 1, string2_len - 1)]

    def check_keys(self, config_dictionary):  # completed
        print "Checking config keys...",
        exit_flag = False

        config_dictionary_keys = config_dictionary.keys()

        if self.mode == 'train':
            correct_mode = 'train'
            incorrect_mode = 'test'
        elif self.mode == 'test':
            correct_mode = 'test'
            incorrect_mode = 'train'

        for req_var in self.required_variables[correct_mode]:
            if req_var not in config_dictionary_keys:
                print req_var, "needs to be set for", correct_mode, "but is not."
                if not exit_flag:
                    print "Because of above error, will exit after checking rest of keys"
                    exit_flag = True

        for var in config_dictionary_keys:
            if var not in self.all_variables[correct_mode]:
                print var, "in the config file given is not a valid key for", correct_mode
                if var in self.all_variables[incorrect_mode]:
                    print "but", var, "is a valid key for", incorrect_mode, "so either the mode or key is incorrect"
                else:
                    string_distances = np.array([self.levenshtein_string_edit_distance(var, string2)
                                                 for string2 in self.all_variables[correct_mode]])
                    print "perhaps you meant ***", self.all_variables[correct_mode][np.argmin(string_distances)], \
                        "\b*** (levenshtein string edit distance", np.min(string_distances), \
                        "\b) instead of ***", var, "\b***?"
                if not exit_flag:
                    print "Because of above error, will exit after checking rest of keys"
                    exit_flag = True

        if exit_flag:
            print "Exiting now"
            sys.exit()
        else:
            print "seems copacetic"

    def check_labels(self):  # want to prune non-contiguous labels, might be expensive
        # TODO: check sent ids to make sure sequences are good
        print "Checking labels..."
        if len(self.labels.shape) != 2:
            print "labels need to be in (n_samples, 2) format and the shape of labels is", self.labels.shape, "... Exiting now"
            sys.exit()
        if self.labels.shape[0] != sum(self.feature_sequence_lens):
            print "Number of examples in feature file:", sum(self.feature_sequence_lens), \
                "does not equal number of labels in label file,", self.labels.shape[0], "... Exiting now"
            sys.exit()
        # if [i for i in np.unique(self.labels)] != range(np.max(self.labels) + 1):
        #     print "Labels need to be in the form 0,1,2,....,n ... Exiting now"
        #     sys.exit()
        # label_counts = np.bincount(np.ravel(self.labels[:, 1]))
        # print "distribution of labels is:"
        # for x in range(len(label_counts)):
        #     print "#", x, "\b's:", label_counts[x]
        print "labels seem copacetic"

    def forward_layer(self, inputs, weights, biases, weight_type,
                      secondary_inputs=None, secondary_weights=None):  # completed
        if weight_type == 'logistic':
            return self.softmax(self.weight_matrix_multiply(inputs, weights, biases)
                                + np.dot(secondary_inputs, secondary_weights))
        elif weight_type == 'rbm_gaussian_bernoulli' or weight_type == 'rbm_bernoulli_bernoulli':
            return self.sigmoid(weights[(inputs), :]
                                + self.weight_matrix_multiply(secondary_inputs, secondary_weights, biases))
        # 'linear' added to test finite differences calculation for pearlmutter forward pass
        elif weight_type == 'linear':  # only used for the logistic layer
            return self.weight_matrix_multiply(inputs, weights, biases) + np.dot(secondary_inputs, secondary_weights)
        else:
            print "weight_type", weight_type, "is not a valid layer type.",
            print "Valid layer types are", self.model.valid_layer_types, "Exiting now..."
            sys.exit()

    def forward_pass_single_batch(self, inputs, model=None, return_hiddens=False, linear_output=False):
        """forward pass for a single sequence, mainly for speed in this case"""
        if model is None:
            model = self.model
        num_observations = inputs.size

        # forward chain: seed time 0 with the initial hidden state
        hiddens_forward = model.weights['visible_hidden'][(inputs), :]
        hiddens_forward[:1, :] += self.weight_matrix_multiply(model.init_hiddens['forward'],
                                                              model.weights['hidden_hidden_forward'],
                                                              model.bias['hidden_forward'])
        expit(hiddens_forward[0, :], hiddens_forward[0, :])  # in-place sigmoid

        # backward chain: seed the last time step with the initial hidden state
        hiddens_backward = model.weights['visible_hidden'][(inputs), :]
        hiddens_backward[-1:, :] += self.weight_matrix_multiply(model.init_hiddens['backward'],
                                                                model.weights['hidden_hidden_backward'],
                                                                model.bias['hidden_backward'])
        expit(hiddens_backward[-1, :], hiddens_backward[-1, :])  # in-place sigmoid

        for time_step in range(1, num_observations):
            hiddens_forward[time_step:time_step + 1, :] += self.weight_matrix_multiply(
                hiddens_forward[time_step - 1:time_step, :],
                model.weights['hidden_hidden_forward'], model.bias['hidden_forward'])
            expit(hiddens_forward[time_step, :], hiddens_forward[time_step, :])  # sigmoid
            hiddens_backward[num_observations - time_step - 1:num_observations - time_step, :] += \
                self.weight_matrix_multiply(
                    hiddens_backward[num_observations - time_step:num_observations - time_step + 1, :],
                    model.weights['hidden_hidden_backward'], model.bias['hidden_backward'])
            expit(hiddens_backward[num_observations - time_step - 1, :],
                  hiddens_backward[num_observations - time_step - 1, :])  # sigmoid

        outputs = self.forward_layer(hiddens_forward, model.weights['hidden_output_forward'],
                                     model.bias['output'], model.weight_type['hidden_output'],
                                     hiddens_backward, model.weights['hidden_output_backward'])
        if return_hiddens:
            return outputs, hiddens_forward, hiddens_backward
        else:
            del hiddens_forward, hiddens_backward
            return outputs

    def forward_pass(self, inputs, feature_sequence_lens, model=None, return_hiddens=False, linear_output=False):
        """forward pass each layer starting with feature level
        inputs in the form n_max_obs x n_seq x n_vis"""
        raise ValueError("forward_pass() not implemented yet")
        # NOTE: everything below the raise above is unreachable legacy code
        # carried over from the unidirectional model; it still references the
        # undefined name `hiddens` and the weight keys 'hidden_hidden' and
        # 'hidden_output', which do not exist in the bidirectional model.
        if model is None:
            model = self.model
        architecture = self.model.get_architecture()
        max_sequence_observations = inputs.shape[0]
        num_sequences = inputs.shape[1]
        num_hiddens = architecture[1]
        num_outs = architecture[2]
        hiddens_forward = np.zeros((max_sequence_observations, num_sequences, num_hiddens))
        hiddens_backward = np.zeros((max_sequence_observations, num_sequences, num_hiddens))
        outputs = np.zeros((max_sequence_observations, num_sequences, num_outs))
        # propagate hiddens
        hiddens_forward[0, :, :] = self.forward_layer(inputs[0, :], model.weights['visible_hidden'],
                                                      model.bias['hidden'], model.weight_type['visible_hidden'],
                                                      model.init_hiddens['forward'],
                                                      model.weights['hidden_hidden_forward'])
        hiddens_backward[0, :, :] = self.forward_layer(inputs[0, :], model.weights['visible_hidden'],
                                                       model.bias['hidden'], model.weight_type['visible_hidden'],
                                                       model.init_hiddens['backward'],
                                                       model.weights['hidden_hidden_backward'])
        if linear_output:
            outputs[0, :, :] = self.forward_layer(hiddens_forward[0, :, :], model.weights['hidden_output'],
                                                  model.bias['output'], 'linear')
        else:
            outputs[0, :, :] = self.forward_layer(hiddens[0, :, :], model.weights['hidden_output'],
                                                  model.bias['output'], model.weight_type['hidden_output'])
        for sequence_index in range(1, max_sequence_observations):
            sequence_input = inputs[sequence_index, :]
            hiddens[sequence_index, :, :] = self.forward_layer(sequence_input, model.weights['visible_hidden'],
                                                               model.bias['hidden'],
                                                               model.weight_type['visible_hidden'],
                                                               hiddens[sequence_index - 1, :, :],
                                                               model.weights['hidden_hidden'])
            if linear_output:
                outputs[sequence_index, :, :] = self.forward_layer(hiddens[sequence_index, :, :],
                                                                   model.weights['hidden_output'],
                                                                   model.bias['output'], 'linear')
            else:
                outputs[sequence_index, :, :] = self.forward_layer(hiddens[sequence_index, :, :],
                                                                   model.weights['hidden_output'],
                                                                   model.bias['output'],
                                                                   model.weight_type['hidden_output'])
            # find the observations where the sequence has ended,
            # and then zero out hiddens and outputs, so nothing horrible happens during backprop, etc.
            zero_input = np.where(feature_sequence_lens <= sequence_index)
            hiddens[sequence_index, zero_input, :] = 0.0
            outputs[sequence_index, zero_input, :] = 0.0
        if return_hiddens:
            return outputs, hiddens
        else:
            del hiddens
            return outputs

    def flatten_output(self, output, feature_sequence_lens=None):
        """outputs in the form of max_obs_seq x n_seq x n_outs
        get converted to form n_data x n_outs, so we can calculate
        classification accuracy and cross-entropy"""
        if feature_sequence_lens is None:
            feature_sequence_lens = self.feature_sequence_lens
        num_outs = output.shape[2]
        flat_output = np.zeros((self.batch_size(feature_sequence_lens), num_outs))
        cur_index = 0
        for seq_index, num_obs in enumerate(feature_sequence_lens):
            flat_output[cur_index:cur_index + num_obs, :] = copy.deepcopy(output[:num_obs, seq_index, :])
            cur_index += num_obs
        return flat_output

    def calculate_log_perplexity(self, output, flat_labels):  # completed, expensive, should be compiled
        """calculates summed base-2 log loss (log perplexity) with flat labels"""
        return -np.sum(np.log2(np.clip(output, a_min=1E-12, a_max=1.0))[np.arange(flat_labels.shape[0]),
                                                                        flat_labels[:, 1]])

    def calculate_cross_entropy(self, output, flat_labels):  # completed, expensive, should be compiled
        """calculates cross-entropy (natural log) with flat labels"""
        return -np.sum(np.log(np.clip(output, a_min=1E-12, a_max=1.0))[np.arange(flat_labels.shape[0]),
                                                                       flat_labels[:, 1]])

    def calculate_classification_accuracy(self, flat_output, labels):  # completed, possibly expensive
        prediction = flat_output.argmax(axis=1).reshape(labels.shape)
        classification_accuracy = sum(prediction == labels) / float(labels.size)
        return classification_accuracy[0]
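    # Editorial note: calculate_log_perplexity above returns a summed base-2
    # log loss; if desired, corpus perplexity can be recovered from it as
    # 2.0 ** (log_perplexity / num_tokens), with num_tokens the number of
    # scored labels. This is a mathematical identity, not a claim about how
    # the rest of this module reports perplexity.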
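    # The method below implements the standard adagrad recipe: the
    # elementwise squared gradient is accumulated, G_t = G_{t-1} + g_t ** 2
    # (seeded with fudge_factor so the first division is well-defined), and
    # the parameter step is -learning_rate * g_t / sqrt(G_t).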
    def backprop_adagrad_single_batch(self):
        print "Starting backprop using adagrad"
        adagrad_weight = Bidirectional_RNNLM_Weight()
        adagrad_weight.init_zero_weights(self.model.get_architecture())
        buffer_weight = Bidirectional_RNNLM_Weight()
        buffer_weight.init_zero_weights(self.model.get_architecture())
        fudge_factor = 1.0
        adagrad_weight = adagrad_weight + fudge_factor  # seed the accumulator so we never divide by zero
        gradient = Bidirectional_RNNLM_Weight()
        gradient.init_zero_weights(self.model.get_architecture())

        if self.validation_feature_file_name is not None:
            cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                self.validation_features, self.validation_labels, self.validation_fsl, self.model)
            print "cross-entropy before steepest descent is", cross_entropy
            print "perplexity is", perplexity
            if self.l2_regularization_const > 0.0:
                print "regularized loss is", loss
            print "number correctly classified is", num_correct, "of", num_examples

        for epoch_num in range(len(self.steepest_learning_rate)):
            print "At epoch", epoch_num + 1, "of", len(self.steepest_learning_rate), \
                "with learning rate", self.steepest_learning_rate[epoch_num]
            start_frame = 0
            end_frame = 0
            cross_entropy = 0.0
            num_examples = 0
            # if hasattr(self, 'momentum_rate'):
            #     momentum_rate = self.momentum_rate[epoch_num]
            #     print "momentum is", momentum_rate
            # else:
            #     momentum_rate = 0.0
            for batch_index, feature_sequence_len in enumerate(self.feature_sequence_lens):
                end_frame = start_frame + feature_sequence_len
                batch_features = self.features[:feature_sequence_len, batch_index]
                batch_labels = self.labels[start_frame:end_frame, 1]
                cur_xent = self.calculate_gradient_single_batch(batch_features, batch_labels, gradient,
                                                                return_cross_entropy=True, check_gradient=False)
                if self.l2_regularization_const > 0.0:
                    buffer_weight.assign_weights(self.model)
                    buffer_weight *= self.l2_regularization_const
                    gradient += buffer_weight
                # adagrad update: accumulate the squared gradient, then scale
                # the step by the inverse square root of the accumulator
                buffer_weight.assign_weights(gradient)
                buffer_weight **= 2.0
                adagrad_weight += buffer_weight
                buffer_weight.assign_weights(adagrad_weight)
                buffer_weight **= 0.5
                gradient /= buffer_weight
                cross_entropy += cur_xent
                per_done = float(batch_index) / self.num_sequences * 100
                sys.stdout.write("\r" + " " * 64 + "\r")  # clear line
                sys.stdout.write("\r%.1f%% done " % per_done)
                sys.stdout.flush()
                ppp = cross_entropy / end_frame
                sys.stdout.write("train X-ent: %f " % ppp)
                sys.stdout.flush()
                gradient *= -self.steepest_learning_rate[epoch_num]
                self.model += gradient  # / batch_size
                # if momentum_rate > 0.0:
                #     prev_step *= momentum_rate
                #     self.model += prev_step
                # prev_step.assign_weights(gradient)
                # prev_step *= -self.steepest_learning_rate[epoch_num]
                start_frame = end_frame
            if self.validation_feature_file_name is not None:
                cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                    self.validation_features, self.validation_labels, self.validation_fsl, self.model)
                print "cross-entropy at the end of the epoch is", cross_entropy
                print "perplexity is", perplexity
                if self.l2_regularization_const > 0.0:
                    print "regularized loss is", loss
                print "number correctly classified is", num_correct, "of", num_examples
            sys.stdout.write("\r100.0% done \r")
            sys.stdout.write("\r" + " " * 64 + "\r")  # clear line
            if self.save_each_epoch:
                self.model.write_weights(''.join([self.output_name, '_epoch_', str(epoch_num + 1)]))
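    # Backprop through time, as implemented below: the softmax error is
    # e_t = p_t - y_t, and the hidden deltas follow the usual recurrence
    # delta_t = (W_ho^T e_t + W_hh^T delta_{t+1}) * h_t * (1 - h_t)
    # for the forward chain, mirrored in time for the backward chain.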
    def calculate_gradient_single_batch(self, batch_inputs, batch_labels, gradient_weights,
                                        hiddens_forward=None, hiddens_backward=None, outputs=None,
                                        check_gradient=False, model=None, l2_regularization_const=0.0,
                                        return_cross_entropy=False):
        # TODO: need to check regularization
        # calculate gradient with a particular Neural Network model. If None
        # is specified, will use current weights (i.e., self.model)
        batch_size = batch_labels.size
        if model is None:
            model = self.model
        if hiddens_forward is None or hiddens_backward is None or outputs is None:
            outputs, hiddens_forward, hiddens_backward = self.forward_pass_single_batch(
                batch_inputs, model, return_hiddens=True)

        # derivative of log(cross-entropy softmax)
        batch_indices = np.arange(batch_size)
        gradient_weights *= 0.0
        backward_inputs = outputs
        if return_cross_entropy:
            cross_entropy = -np.sum(np.log2(backward_inputs[batch_indices, batch_labels]))
        backward_inputs[batch_indices, batch_labels] -= 1.0

        np.sum(backward_inputs, axis=0, out=gradient_weights.bias['output'][0])
        np.dot(hiddens_forward.T, backward_inputs, out=gradient_weights.weights['hidden_output_forward'])
        pre_nonlinearity_hiddens_forward = np.dot(backward_inputs[batch_size - 1, :],
                                                  model.weights['hidden_output_forward'].T)
        pre_nonlinearity_hiddens_forward *= hiddens_forward[batch_size - 1, :]
        pre_nonlinearity_hiddens_forward *= 1 - hiddens_forward[batch_size - 1, :]

        np.dot(hiddens_backward.T, backward_inputs, out=gradient_weights.weights['hidden_output_backward'])
        pre_nonlinearity_hiddens_backward = np.dot(backward_inputs[0, :],
                                                   model.weights['hidden_output_backward'].T)
        pre_nonlinearity_hiddens_backward *= hiddens_backward[0, :]
        pre_nonlinearity_hiddens_backward *= 1 - hiddens_backward[0, :]

        if batch_size > 1:
            gradient_weights.weights['visible_hidden'][batch_inputs[batch_size - 1]] += \
                pre_nonlinearity_hiddens_forward
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[batch_size - 2, :],
                                                                          pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward
            gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_backward
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[1, :],
                                                                           pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward

        for index in range(batch_size - 2):
            backward_index = batch_size - 2 - index
            forward_index = index + 1
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[backward_index, :], model.weights['hidden_output_forward'].T)
                 + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T))
                * hiddens_forward[backward_index, :] * (1 - hiddens_forward[backward_index, :]))
            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[forward_index, :], model.weights['hidden_output_backward'].T)
                 + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T))
                * hiddens_backward[forward_index, :] * (1 - hiddens_backward[forward_index, :]))
            gradient_weights.weights['visible_hidden'][batch_inputs[backward_index]] += \
                pre_nonlinearity_hiddens_forward
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[backward_index - 1, :],
                                                                          pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward
            gradient_weights.weights['visible_hidden'][batch_inputs[forward_index]] += \
                pre_nonlinearity_hiddens_backward
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[forward_index + 1, :],
                                                                           pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward

        if batch_size > 1:
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[0, :], model.weights['hidden_output_forward'].T)
                 + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T))
                * hiddens_forward[0, :] * (1 - hiddens_forward[0, :]))
            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[-1, :], model.weights['hidden_output_backward'].T)
                 + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T))
                * hiddens_backward[-1, :] * (1 - hiddens_backward[-1, :]))
        # contribution of the first (forward) / last (backward) time step,
        # which connects to the learned initial hidden states
        gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_forward
        gradient_weights.weights['hidden_hidden_forward'] += np.outer(model.init_hiddens['forward'],
                                                                      pre_nonlinearity_hiddens_forward)
        gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward
        gradient_weights.init_hiddens['forward'][0] = np.dot(pre_nonlinearity_hiddens_forward,
                                                             model.weights['hidden_hidden_forward'].T)
        gradient_weights.weights['visible_hidden'][batch_inputs[-1]] += pre_nonlinearity_hiddens_backward
        gradient_weights.weights['hidden_hidden_backward'] += np.outer(model.init_hiddens['backward'],
                                                                       pre_nonlinearity_hiddens_backward)
        gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward
        gradient_weights.init_hiddens['backward'][0] = np.dot(pre_nonlinearity_hiddens_backward,
                                                              model.weights['hidden_hidden_backward'].T)

        backward_inputs[batch_indices, batch_labels] += 1.0  # restore outputs in place
        gradient_weights /= batch_size
        if l2_regularization_const > 0.0:
            gradient_weights += model * l2_regularization_const
        if return_cross_entropy and not check_gradient:
            return cross_entropy
        # below block checks the gradient... only to be used if you think the
        # gradient is incorrectly calculated
        else:
            gradient_weights *= batch_size
            if l2_regularization_const > 0.0:
                gradient_weights += model * (l2_regularization_const * batch_size)
            sys.stdout.write("\r" + " " * 64 + "\r")
            print "checking gradient..."
            finite_difference_model = Bidirectional_RNNLM_Weight()
            finite_difference_model.init_zero_weights(self.model.get_architecture(), verbose=False)
            direction = Bidirectional_RNNLM_Weight()
            direction.init_zero_weights(self.model.get_architecture(), verbose=False)
            epsilon = 1E-5
            print "at initial hiddens"
            for key in direction.init_hiddens.keys():
                for index in range(direction.init_hiddens[key].size):
                    direction.init_hiddens[key][0][index] = epsilon
                    forward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                        batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                        batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                    finite_difference_model.init_hiddens[key][0][index] = \
                        (forward_loss - backward_loss) / (2 * epsilon)
                    direction.init_hiddens[key][0][index] = 0.0
            for key in direction.bias.keys():
                print "at bias key", key
                for index in range(direction.bias[key].size):
                    direction.bias[key][0][index] = epsilon
                    forward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                        batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                        batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                    finite_difference_model.bias[key][0][index] = (forward_loss - backward_loss) / (2 * epsilon)
                    direction.bias[key][0][index] = 0.0
            for key in direction.weights.keys():
                print "at weight key", key
                for index0 in range(direction.weights[key].shape[0]):
                    for index1 in range(direction.weights[key].shape[1]):
                        direction.weights[key][index0][index1] = epsilon
                        forward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                            batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                        backward_loss = -np.sum(np.log(self.forward_pass_single_batch(
                            batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                        finite_difference_model.weights[key][index0][index1] = \
                            (forward_loss - backward_loss) / (2 * epsilon)
                        direction.weights[key][index0][index1] = 0.0
            print "calculated gradient for forward initial hiddens"
            print gradient_weights.init_hiddens['forward']
            print "finite difference approximation for forward initial hiddens"
            print finite_difference_model.init_hiddens['forward']
            print "calculated gradient for backward initial hiddens"
            print gradient_weights.init_hiddens['backward']
            print "finite difference approximation for backward initial hiddens"
            print finite_difference_model.init_hiddens['backward']
            print "calculated gradient for forward hidden bias"
            print gradient_weights.bias['hidden_forward']
            print "finite difference approximation for forward hidden bias"
            print finite_difference_model.bias['hidden_forward']
            print "calculated gradient for backward hidden bias"
            print gradient_weights.bias['hidden_backward']
            print "finite difference approximation for backward hidden bias"
            print finite_difference_model.bias['hidden_backward']
            print "calculated gradient for output bias"
            print gradient_weights.bias['output']
            print "finite difference approximation for output bias"
            print finite_difference_model.bias['output']
            print "calculated gradient for visible_hidden layer"
            print gradient_weights.weights['visible_hidden']
            print "finite difference approximation for visible_hidden layer"
            print finite_difference_model.weights['visible_hidden']
            print np.sum((finite_difference_model.weights['visible_hidden']
                          - gradient_weights.weights['visible_hidden']) ** 2)
            print "calculated gradient for hidden_hidden_forward layer"
            print gradient_weights.weights['hidden_hidden_forward']
            print "finite difference approximation for hidden_hidden_forward layer"
            print finite_difference_model.weights['hidden_hidden_forward']
            print "calculated gradient for hidden_hidden_backward layer"
            print gradient_weights.weights['hidden_hidden_backward']
            print "finite difference approximation for hidden_hidden_backward layer"
            print finite_difference_model.weights['hidden_hidden_backward']
            print "calculated gradient for hidden_output_forward layer"
            print gradient_weights.weights['hidden_output_forward']
            print "finite difference approximation for hidden_output_forward layer"
            print finite_difference_model.weights['hidden_output_forward']
            print "calculated gradient for hidden_output_backward layer"
            print gradient_weights.weights['hidden_output_backward']
            print "finite difference approximation for hidden_output_backward layer"
            print finite_difference_model.weights['hidden_output_backward']
            sys.exit()
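# Hedged usage note: to sanity-check the analytic gradient on one short
# sequence, a caller could exercise the (slow) finite-difference branch as
# sketched here; `trainer` is a hypothetical, already-constructed instance of
# the class above, and `scratch` a zeroed Bidirectional_RNNLM_Weight:
#
#     scratch = Bidirectional_RNNLM_Weight()
#     scratch.init_zero_weights(trainer.model.get_architecture())
#     trainer.calculate_gradient_single_batch(trainer.features[:3, 0],
#                                             trainer.labels[:3, 1], scratch,
#                                             check_gradient=True)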
def backprop_adagrad_single_batch(self): print "Starting backprop using adagrad" adagrad_weight = Bidirectional_RNNLM_Weight() adagrad_weight.init_zero_weights(self.model.get_architecture()) buffer_weight = Bidirectional_RNNLM_Weight() buffer_weight.init_zero_weights(self.model.get_architecture()) fudge_factor = 1.0 adagrad_weight = adagrad_weight + fudge_factor gradient = Bidirectional_RNNLM_Weight() gradient.init_zero_weights(self.model.get_architecture()) if self.validation_feature_file_name is not None: cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model) print "cross-entropy before steepest descent is", cross_entropy print "perplexity is", perplexity if self.l2_regularization_const > 0.0: print "regularized loss is", loss print "number correctly classified is", num_correct, "of", num_examples # excluded_keys = {'bias':['0'], 'weights':[]} # frame_table = np.cumsum(self.feature_sequence_lens) for epoch_num in range(len(self.steepest_learning_rate)): print "At epoch", epoch_num+1, "of", len(self.steepest_learning_rate), "with learning rate", self.steepest_learning_rate[epoch_num] start_frame = 0 end_frame = 0 cross_entropy = 0.0 num_examples = 0 # if hasattr(self, 'momentum_rate'): # momentum_rate = self.momentum_rate[epoch_num] # print "momentum is", momentum_rate # else: # momentum_rate = 0.0 for batch_index, feature_sequence_len in enumerate(self.feature_sequence_lens): end_frame = start_frame + feature_sequence_len batch_features = self.features[:feature_sequence_len, batch_index] batch_labels = self.labels[start_frame:end_frame,1] # print "" # print batch_index # print batch_features # print batch_labels cur_xent = self.calculate_gradient_single_batch(batch_features, batch_labels, gradient, return_cross_entropy = True, check_gradient = False) # print self.model.norm() # print gradient.norm() if self.l2_regularization_const > 0.0: buffer_weight.assign_weights(self.model) buffer_weight *= self.l2_regularization_const gradient += buffer_weight buffer_weight.assign_weights(gradient) # print gradient.init_hiddens buffer_weight **= 2.0 adagrad_weight += buffer_weight # print adagrad_weight.init_hiddens buffer_weight.assign_weights(adagrad_weight) buffer_weight **= 0.5 # print buffer_weight.init_hiddens gradient /= buffer_weight # print gradient.init_hiddens cross_entropy += cur_xent per_done = float(batch_index)/self.num_sequences*100 sys.stdout.write("\r \r") #clear line sys.stdout.write("\r%.1f%% done " % per_done), sys.stdout.flush() ppp = cross_entropy / end_frame sys.stdout.write("train X-ent: %f " % ppp), sys.stdout.flush() gradient *= -self.steepest_learning_rate[epoch_num] self.model += gradient #/ batch_size # if momentum_rate > 0.0: # prev_step *= momentum_rate # self.model += prev_step # prev_step.assign_weights(gradient) # prev_step *= -self.steepest_learning_rate[epoch_num] start_frame = end_frame if self.validation_feature_file_name is not None: cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model) print "cross-entropy at the end of the epoch is", cross_entropy print "perplexity is", perplexity if self.l2_regularization_const > 0.0: print "regularized loss is", loss print "number correctly classified is", num_correct, "of", num_examples sys.stdout.write("\r100.0% done \r") sys.stdout.write("\r \r") #clear line if 
self.save_each_epoch: self.model.write_weights(''.join([self.output_name, '_epoch_', str(epoch_num+1)]))
def calculate_gradient_single_batch(self, batch_inputs, batch_labels, gradient_weights, hiddens_forward = None, hiddens_backward = None, outputs = None, check_gradient=False, model=None, l2_regularization_const = 0.0, return_cross_entropy = False): #need to check regularization #calculate gradient with particular Neural Network model. If None is specified, will use current weights (i.e., self.model) batch_size = batch_labels.size if model is None: model = self.model if hiddens_forward is None or hiddens_backward is None or outputs is None: outputs, hiddens_forward, hiddens_backward = self.forward_pass_single_batch(batch_inputs, model, return_hiddens=True) #derivative of log(cross-entropy softmax) batch_indices = np.arange(batch_size) gradient_weights *= 0.0 backward_inputs = outputs if return_cross_entropy: cross_entropy = -np.sum(np.log2(backward_inputs[batch_indices, batch_labels])) backward_inputs[batch_indices, batch_labels] -= 1.0 np.sum(backward_inputs, axis=0, out = gradient_weights.bias['output'][0]) np.dot(hiddens_forward.T, backward_inputs, out = gradient_weights.weights['hidden_output_forward']) pre_nonlinearity_hiddens_forward = np.dot(backward_inputs[batch_size-1,:], model.weights['hidden_output_forward'].T) pre_nonlinearity_hiddens_forward *= hiddens_forward[batch_size-1,:] pre_nonlinearity_hiddens_forward *= 1 - hiddens_forward[batch_size-1,:] np.dot(hiddens_backward.T, backward_inputs, out = gradient_weights.weights['hidden_output_backward']) pre_nonlinearity_hiddens_backward = np.dot(backward_inputs[0,:], model.weights['hidden_output_backward'].T) pre_nonlinearity_hiddens_backward *= hiddens_backward[0,:] pre_nonlinearity_hiddens_backward *= 1 - hiddens_backward[0,:] if batch_size > 1: gradient_weights.weights['visible_hidden'][batch_inputs[batch_size-1]] += pre_nonlinearity_hiddens_forward gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[batch_size-2,:], pre_nonlinearity_hiddens_forward) gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_backward gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[1,:], pre_nonlinearity_hiddens_backward) gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward for index in range(batch_size-2): backward_index = batch_size - 2 - index forward_index = index + 1 pre_nonlinearity_hiddens_forward = ((np.dot(backward_inputs[backward_index,:], model.weights['hidden_output_forward'].T) + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T)) * hiddens_forward[backward_index,:] * (1 - hiddens_forward[backward_index,:])) pre_nonlinearity_hiddens_backward = ((np.dot(backward_inputs[forward_index,:], model.weights['hidden_output_backward'].T) + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T)) * hiddens_backward[forward_index,:] * (1 - hiddens_backward[forward_index,:])) gradient_weights.weights['visible_hidden'][batch_inputs[backward_index]] += pre_nonlinearity_hiddens_forward #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens) gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[backward_index-1,:], pre_nonlinearity_hiddens_forward) gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward gradient_weights.weights['visible_hidden'][batch_inputs[forward_index]] += pre_nonlinearity_hiddens_backward #+= 
        if batch_size > 1:
            pre_nonlinearity_hiddens_forward = ((np.dot(backward_inputs[0, :], model.weights['hidden_output_forward'].T)
                                                 + np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T))
                                                * hiddens_forward[0, :] * (1 - hiddens_forward[0, :]))
            pre_nonlinearity_hiddens_backward = ((np.dot(backward_inputs[-1, :], model.weights['hidden_output_backward'].T)
                                                  + np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T))
                                                 * hiddens_backward[-1, :] * (1 - hiddens_backward[-1, :]))
        gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_forward
        gradient_weights.weights['hidden_hidden_forward'] += np.outer(model.init_hiddens['forward'], pre_nonlinearity_hiddens_forward)
        gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward
        gradient_weights.init_hiddens['forward'][0] = np.dot(pre_nonlinearity_hiddens_forward, model.weights['hidden_hidden_forward'].T)
        gradient_weights.weights['visible_hidden'][batch_inputs[-1]] += pre_nonlinearity_hiddens_backward
        gradient_weights.weights['hidden_hidden_backward'] += np.outer(model.init_hiddens['backward'], pre_nonlinearity_hiddens_backward)
        gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward
        gradient_weights.init_hiddens['backward'][0] = np.dot(pre_nonlinearity_hiddens_backward, model.weights['hidden_hidden_backward'].T)

        #restore the softmax outputs, then convert the summed gradient to a per-example average
        backward_inputs[batch_indices, batch_labels] += 1.0
        gradient_weights /= batch_size
        if l2_regularization_const > 0.0:
            gradient_weights += model * l2_regularization_const
        if not check_gradient:
            if return_cross_entropy:
                return cross_entropy
            return

        ### below block checks gradient... only to be used if you think the gradient is incorrectly calculated ##############
        #undo the averaging (and the L2 term) so the analytic gradient matches the summed,
        #unregularized loss that the finite difference below differentiates
        if l2_regularization_const > 0.0:
            gradient_weights -= model * l2_regularization_const
        gradient_weights *= batch_size
        sys.stdout.write("\r                                                  \r")
        print "checking gradient..."
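        #Central-difference check (for reference): each scalar parameter theta_i is perturbed
        #by +/- epsilon and the analytic gradient is compared against
        #    (L(theta + epsilon * e_i) - L(theta - epsilon * e_i)) / (2 * epsilon)
        #where L is the summed negative log likelihood of this batch, so the truncation
        #error of the approximation is O(epsilon ** 2).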
        finite_difference_model = Bidirectional_RNNLM_Weight()
        finite_difference_model.init_zero_weights(self.model.get_architecture(), verbose=False)
        direction = Bidirectional_RNNLM_Weight()
        direction.init_zero_weights(self.model.get_architecture(), verbose=False)
        epsilon = 1E-5
        print "at initial hiddens"
        for key in direction.init_hiddens.keys():
            for index in range(direction.init_hiddens[key].size):
                direction.init_hiddens[key][0][index] = epsilon
                forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                finite_difference_model.init_hiddens[key][0][index] = (forward_loss - backward_loss) / (2 * epsilon)
                direction.init_hiddens[key][0][index] = 0.0
        for key in direction.bias.keys():
            print "at bias key", key
            for index in range(direction.bias[key].size):
                direction.bias[key][0][index] = epsilon
                #print direction.norm()
                forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                finite_difference_model.bias[key][0][index] = (forward_loss - backward_loss) / (2 * epsilon)
                direction.bias[key][0][index] = 0.0
        for key in direction.weights.keys():
            print "at weight key", key
            for index0 in range(direction.weights[key].shape[0]):
                for index1 in range(direction.weights[key].shape[1]):
                    direction.weights[key][index0][index1] = epsilon
                    forward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model + direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(np.log(self.forward_pass_single_batch(batch_inputs, model=model - direction)[batch_indices, batch_labels]))
                    finite_difference_model.weights[key][index0][index1] = (forward_loss - backward_loss) / (2 * epsilon)
                    direction.weights[key][index0][index1] = 0.0
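        #With epsilon = 1E-5 in double precision the two values should typically agree to
        #several significant digits (standard central-difference behavior); a visibly large
        #squared difference in the dumps below points at a bug in that gradient term.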
        print "calculated gradient for forward initial hiddens"
        print gradient_weights.init_hiddens['forward']
        print "finite difference approximation for forward initial hiddens"
        print finite_difference_model.init_hiddens['forward']
        print "calculated gradient for backward initial hiddens"
        print gradient_weights.init_hiddens['backward']
        print "finite difference approximation for backward initial hiddens"
        print finite_difference_model.init_hiddens['backward']
        print "calculated gradient for forward hidden bias"
        print gradient_weights.bias['hidden_forward']
        print "finite difference approximation for forward hidden bias"
        print finite_difference_model.bias['hidden_forward']
        print "calculated gradient for backward hidden bias"
        print gradient_weights.bias['hidden_backward']
        print "finite difference approximation for backward hidden bias"
        print finite_difference_model.bias['hidden_backward']
        print "calculated gradient for output bias"
        print gradient_weights.bias['output']
        print "finite difference approximation for output bias"
        print finite_difference_model.bias['output']
        print "calculated gradient for visible_hidden layer"
        print gradient_weights.weights['visible_hidden']
        print "finite difference approximation for visible_hidden layer"
        print finite_difference_model.weights['visible_hidden']
        print np.sum((finite_difference_model.weights['visible_hidden'] - gradient_weights.weights['visible_hidden']) ** 2)
        print "calculated gradient for hidden_hidden_forward layer"
        print gradient_weights.weights['hidden_hidden_forward']
        print "finite difference approximation for hidden_hidden_forward layer"
        print finite_difference_model.weights['hidden_hidden_forward']
        print "calculated gradient for hidden_hidden_backward layer"
        print gradient_weights.weights['hidden_hidden_backward']
        print "finite difference approximation for hidden_hidden_backward layer"
        print finite_difference_model.weights['hidden_hidden_backward']
        print "calculated gradient for hidden_output_forward layer"
        print gradient_weights.weights['hidden_output_forward']
        print "finite difference approximation for hidden_output_forward layer"
        print finite_difference_model.weights['hidden_output_forward']
        print "calculated gradient for hidden_output_backward layer"
        print gradient_weights.weights['hidden_output_backward']
        print "finite difference approximation for hidden_output_backward layer"
        print finite_difference_model.weights['hidden_output_backward']
        sys.exit()
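    #Usage sketch (hypothetical driver code -- `net`, `seq`, `labels`, and `learning_rate`
    #are illustrative names, not part of this class): the gradient buffer is preallocated
    #because calculate_gradient_single_batch writes into it in place.
    #
    #    gradient = Bidirectional_RNNLM_Weight()
    #    gradient.init_zero_weights(net.model.get_architecture(), verbose=False)
    #    xent = net.calculate_gradient_single_batch(seq, labels, gradient, return_cross_entropy=True)
    #    net.model = net.model - gradient * learning_rate  #one steepest descent step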