def train(self, features, labels):
    # hold out part of the data as a validation set for pruning
    vs_size = int(0.8 * features.rows)
    vs_features = Matrix(features, 0, 0, vs_size, features.cols)
    vs_labels = Matrix(labels, 0, 0, vs_size, labels.cols)
    print(f'Holding out {vs_size} instances for validation set')
    train_features = Matrix(features, vs_size, 0, features.rows - vs_size,
                            features.cols)
    train_labels = Matrix(labels, vs_size, 0, labels.rows - vs_size,
                          labels.cols)
    # self.build_tree_for_instances(train_features, train_labels)
    self.build_tree_for_instances(features, labels)
    self.prune_tree_using_instances(vs_features, vs_labels)
    print(self.num_nodes)
    print(self.root.traverse())
    print(self.root)
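# prune_tree_using_instances() is not shown here; the sketch below is one
# plausible reduced-error pruning pass, assuming each node predicts its
# majority label whenever its children dict is empty (as in split() below).
# accuracy_on_validation is a hypothetical closure that scores the whole
# tree on the held-out instances.
def reduced_error_prune(node, accuracy_on_validation):
    # post-order traversal: prune the subtrees first
    for child in node.children.values():
        if child.children:
            reduced_error_prune(child, accuracy_on_validation)
    before = accuracy_on_validation()
    saved = node.children
    node.children = {}  # temporarily collapse this node to a leaf
    if accuracy_on_validation() >= before:
        return          # keep the prune: validation accuracy did not drop
    node.children = saved  # otherwise restore the subtree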
def get_label_list(self, labels):
    # expand a single column of class indices into one 0/1 indicator
    # matrix per output class
    label_list = []
    for _ in range(self.output_classes):
        label_list.append(Matrix(labels, 0, 0, labels.rows, labels.cols))
    for row_num in range(labels.rows):
        output_class = int(labels.row(row_num)[0])
        for index in range(len(label_list)):
            label_list[index].set(row_num, 0,
                                  1 if output_class == index else 0)
    return label_list
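# A standalone illustration of the same one-hot expansion using numpy
# (hypothetical data; the toolkit's Matrix class is not needed for this):
import numpy as np

example_labels = np.array([0, 2, 1, 2])        # class index per instance
num_classes = 3
one_hot = np.eye(num_classes)[example_labels]  # one indicator row per instance
# one_hot[:, k] matches the 0/1 column that label_list[k] holds above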
def train(self, features, labels):
    self.setup_network(features.cols, labels.value_count(0))
    # hold out a validation set for early stopping
    vs_size = int(self.validation_set_proportion * features.rows)
    vs_features = Matrix(features, 0, 0, vs_size, features.cols)
    vs_labels = Matrix(labels, 0, 0, vs_size, labels.cols)
    print(f'Holding out {vs_size} instances for validation set')
    train_features = Matrix(features, vs_size, 0, features.rows - vs_size,
                            features.cols)
    train_labels = Matrix(labels, vs_size, 0, labels.rows - vs_size,
                          labels.cols)
    stagnant_epochs = 0
    epochs = 0
    best_vs_mse = float('inf')
    # stop when validation MSE has not improved for 40 consecutive epochs
    while stagnant_epochs < 40:
        this_train_mse = 0
        for i in range(train_features.rows):
            inputs = train_features.row(i)
            target = train_labels.row(i)[0]
            self.calc_set_out(inputs)
            self.output_layer.set_target(target)
            self.calc_deltas()
            self.update_weights()
            this_train_mse += self.se_for_instance()
        this_train_mse = this_train_mse / train_features.rows
        epochs += 1
        this_vs_mse, this_vs_accy = self.calc_mse_and_accy(vs_features,
                                                           vs_labels)
        if this_vs_mse < best_vs_mse:
            stagnant_epochs = 0
            best_vs_mse = this_vs_mse
        else:
            stagnant_epochs += 1
        train_features.shuffle(train_labels)
    self.write_out(
        f'{epochs},{self.hidden_layers[-1].num_nb_nodes},'
        f'{this_train_mse},{this_vs_mse},')
    print(f'{epochs} epochs elapsed in training')
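# The stopping rule above, isolated: keep training until the validation MSE
# has failed to improve for `patience` consecutive epochs. train_one_epoch
# and validation_mse are hypothetical stand-ins for the network calls used
# in train().
def train_with_patience(train_one_epoch, validation_mse, patience=40):
    best_mse = float('inf')
    stagnant = epochs = 0
    while stagnant < patience:
        train_one_epoch()
        epochs += 1
        mse = validation_mse()
        if mse < best_mse:
            best_mse, stagnant = mse, 0   # improvement: reset the counter
        else:
            stagnant += 1
    return epochs, best_mse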
def split(self, features, labels):
    # if the class is pure for this node, stop
    if labels.count_values_present(0) == 1:
        self.output = labels.get(0, 0)
        return
    elif features.cols == 0:
        # no more features to split on
        self.output = labels.most_common_value(0)
        return
    # find the lowest-entropy attribute (highest information gain)
    self.att_idx = self.highest_info_gain_att(features, labels)
    self.att_name = features.attr_name(self.att_idx)
    self.children_names = features.enum_to_str[self.att_idx]
    # split on each value of that attribute
    features_for_value = {}
    labels_for_value = {}
    # iterate over rows and build the data for the child nodes
    for i in range(features.rows):
        row = features.row(i)
        label = labels.row(i)
        value = row[self.att_idx]
        new_row = copy.deepcopy(row)
        del new_row[self.att_idx]
        try:
            features_for_value[value].data.append(new_row)
            labels_for_value[value].data.append(label)
        except KeyError:
            # first time this value is seen: build child matrices with the
            # split attribute's metadata removed
            new_attr_names = copy.deepcopy(features.attr_names)
            del new_attr_names[self.att_idx]
            new_str_to_enum = copy.deepcopy(features.str_to_enum)
            del new_str_to_enum[self.att_idx]
            new_enum_to_str = copy.deepcopy(features.enum_to_str)
            del new_enum_to_str[self.att_idx]
            features_for_value[value] = Matrix()
            features_for_value[value].data = [new_row]
            features_for_value[value].str_to_enum = new_str_to_enum
            features_for_value[value].enum_to_str = new_enum_to_str
            features_for_value[value].attr_names = new_attr_names
            labels_for_value[value] = Matrix()
            labels_for_value[value].data = [label]
    for value in sorted(features_for_value.keys()):
        weight = features_for_value[value].rows / features.rows
        n = Node(weight, labels.most_common_value(0))
        self.children[value] = n
        n.split(features_for_value[value], labels_for_value[value])
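# highest_info_gain_att() is not shown; this is the usual ID3 computation,
# sketched on plain Python lists (one feature column plus the label column).
# The attribute chosen above is the argmax over columns of info_gain.
import math
from collections import Counter

def entropy(labels):
    n = len(labels)
    return -sum((c / n) * math.log2(c / n) for c in Counter(labels).values())

def info_gain(feature_col, labels):
    n = len(labels)
    remainder = 0.0
    for value in set(feature_col):
        subset = [lab for f, lab in zip(feature_col, labels) if f == value]
        remainder += len(subset) / n * entropy(subset)
    return entropy(labels) - remainder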
def train(self, features, labels):
    """
    :type features: Matrix
    :type labels: Matrix
    """
    # LAZY LEARNER: just remember the training data;
    # a value_count of 0 means the label column is continuous
    self.regression = labels.value_count(0) == 0
    if self.use_weighted_columns:
        # learn per-column weights from a perceptron trained on a copy
        new_feats = Matrix(features, 0, 0, features.rows, features.cols)
        self.perceptron = Perceptron()
        self.perceptron.train(new_feats, labels)
        self.weights = [(1 if np.isnan(value) else value)
                        for value in self.perceptron.weights]
    self.feature_types = self.get_semantic_types(features)
    self.training_features = features
    self.training_labels = labels
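# train() above only stores the data (lazy learning); the work happens at
# prediction time. A minimal k-NN predict over plain numpy arrays, using
# the optional per-column weights learned above (all names hypothetical):
import numpy as np

def knn_predict(train_X, train_y, x, k=3, col_weights=None, regression=False):
    w = np.ones(train_X.shape[1]) if col_weights is None else np.asarray(col_weights)
    dists = np.sqrt((((train_X - x) ** 2) * w).sum(axis=1))
    nearest = np.argsort(dists)[:k]
    if regression:
        return train_y[nearest].mean()        # mean of the k nearest labels
    values, counts = np.unique(train_y[nearest], return_counts=True)
    return values[np.argmax(counts)]          # majority vote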
def get_bootstrap_datasets(self, features, labels):
    # copy so the shuffle does not affect the originals
    new_features = Matrix(matrix=features, row_start=0, col_start=0,
                          row_count=features.rows, col_count=features.cols)
    new_labels = Matrix(matrix=labels, row_start=0, col_start=0,
                        row_count=labels.rows, col_count=labels.cols)
    new_features.shuffle(buddy=new_labels)
    # take a random sample of the dataset for this particular tree
    train_size = int(self.percent_strap * new_features.rows)
    bootstrap_features = Matrix(new_features, 0, 0, train_size,
                                new_features.cols - 1)
    bootstrap_labels = Matrix(new_labels, 0, new_labels.cols - 1,
                              train_size, 1)
    return bootstrap_features, bootstrap_labels
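# Note that the sampling above is without replacement (shuffle, then take
# the first train_size rows). A classical bootstrap draws *with*
# replacement; for comparison, a minimal numpy version (hypothetical
# arrays, not the Matrix class):
import numpy as np

def bootstrap_sample(X, y, rng=None):
    rng = rng or np.random.default_rng()
    idx = rng.integers(0, len(X), size=len(X))  # n draws with replacement
    return X[idx], y[idx]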
def make_plot():
    data = Matrix()
    data.load_arff("../datasets/linear.arff")
    features = Matrix(data, 0, 0, data.rows, data.cols - 1)
    labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
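# The plotting code itself is not shown above; a minimal matplotlib sketch
# of what plotting the single feature against the label could look like
# (assumed, not the original):
import matplotlib.pyplot as plt

def plot_linear(features, labels):
    xs = [features.get(r, 0) for r in range(features.rows)]
    ys = [labels.get(r, 0) for r in range(labels.rows)]
    plt.scatter(xs, ys)
    plt.xlabel(features.attr_name(0))
    plt.ylabel("label")
    plt.show()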
def main(self, learner, learner_name, file_name, seed, train=True):
    # Evaluation settings are hardcoded in this variant.
    # Evaluation method (training | static <test_ARFF_file> |
    # random <%_for_training> | cross <num_folds>)
    eval_method = "random"
    eval_parameter = .7
    # boolean: print the confusion matrix and per-class accuracy
    print_confusion_matrix = False
    # boolean: use normalized data
    normalize = False
    random.seed(seed)

    # load the ARFF file
    data = Matrix()
    data.load_arff(file_name)
    if normalize:
        print("Using normalized data")
        data.normalize()

    # print some stats
    print("\nDataset name: {}\n"
          "Number of instances: {}\n"
          "Number of attributes: {}\n"
          "Learning algorithm: {}\n"
          "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                           learner_name, eval_method))

    if eval_method == "training":
        print("Calculating accuracy on training set...")
        features = Matrix(data, 0, 0, data.rows, data.cols - 1)
        labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
        confusion = Matrix()
        start_time = time.time()
        if train:
            learner.train(features, labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        accuracy = learner.measure_accuracy(features, labels, confusion)
        print("Training set accuracy: " + str(accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "static":
        print("Calculating accuracy on separate test set...")
        test_data = Matrix(arff=eval_parameter)
        if normalize:
            test_data.normalize()
        print("Test set name: {}".format(eval_parameter))
        print("Number of test instances: {}".format(test_data.rows))
        features = Matrix(data, 0, 0, data.rows, data.cols - 1)
        labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
        start_time = time.time()
        learner.train(features, labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        train_accuracy = learner.measure_accuracy(features, labels)
        print("Training set accuracy: {}".format(train_accuracy))
        test_features = Matrix(test_data, 0, 0, test_data.rows,
                               test_data.cols - 1)
        test_labels = Matrix(test_data, 0, test_data.cols - 1,
                             test_data.rows, 1)
        confusion = Matrix()
        test_accuracy = learner.measure_accuracy(test_features, test_labels,
                                                 confusion)
        print("Test set accuracy: {}".format(test_accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "random":
        print("Calculating accuracy on a random hold-out set...")
        train_percent = float(eval_parameter)
        if train_percent < 0 or train_percent > 1:
            raise Exception(
                "Percentage for random evaluation must be between 0 and 1")
        print("Percentage used for training: {}".format(train_percent))
        print("Percentage used for testing: {}".format(1 - train_percent))
        data.shuffle()
        train_size = int(train_percent * data.rows)
        train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
        train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)
        test_features = Matrix(data, train_size, 0, data.rows - train_size,
                               data.cols - 1)
        test_labels = Matrix(data, train_size, data.cols - 1,
                             data.rows - train_size, 1)
        start_time = time.time()
        learner.train(train_features, train_labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        train_accuracy = learner.measure_accuracy(train_features, train_labels)
        print("Training set accuracy: {}".format(train_accuracy))
        confusion = Matrix()
        test_accuracy = learner.measure_accuracy(test_features, test_labels,
                                                 confusion)
        print("Test set accuracy: {}".format(test_accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "cross":
        print("Calculating accuracy using cross-validation...")
        folds = int(eval_parameter)
        if folds <= 0:
            raise Exception("Number of folds must be greater than 0")
        print("Number of folds: {}".format(folds))
        reps = 1
        sum_accuracy = 0.0
        elapsed_time = 0.0
        for j in range(reps):
            data.shuffle()
            for i in range(folds):
                begin = int(i * data.rows / folds)
                end = int((i + 1) * data.rows / folds)
                train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                train_labels = Matrix(data, 0, data.cols - 1, begin, 1)
                test_features = Matrix(data, begin, 0, end - begin,
                                       data.cols - 1)
                test_labels = Matrix(data, begin, data.cols - 1,
                                     end - begin, 1)
                train_features.add(data, end, 0, data.cols - 1)
                train_labels.add(data, end, data.cols - 1, 1)
                start_time = time.time()
                learner.train(train_features, train_labels)
                elapsed_time += time.time() - start_time
                accuracy = learner.measure_accuracy(test_features,
                                                    test_labels)
                sum_accuracy += accuracy
                print("Rep={}, Fold={}, Accuracy={}".format(j, i, accuracy))
        elapsed_time /= (reps * folds)
        print("Average time to train (in seconds): {}".format(elapsed_time))
        print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))
    else:
        raise Exception(
            "Unrecognized evaluation method '{}'".format(eval_method))
    if train:
        return learner.w
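# The fold boundaries used by the "cross" branch, shown standalone: fold i
# covers rows [i*rows/folds, (i+1)*rows/folds), so every row is used even
# when rows is not evenly divisible by folds.
def fold_bounds(rows, folds):
    return [(int(i * rows / folds), int((i + 1) * rows / folds))
            for i in range(folds)]

# fold_bounds(10, 3) -> [(0, 3), (3, 6), (6, 10)]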
def train(self, features, labels):
    """
    :type features: Matrix
    :type labels: Matrix
    """
    print("The learning rate for this model is {}\nwith momentum {}".format(
        self.learning_rate, self.momentum))
    features_bias = Matrix(features, 0, 0, features.rows, features.cols)
    features_bias = self.add_bias_to_features(features_bias)

    #### Prepare Validation Set ####
    if self.validation_set:
        features_bias.shuffle(buddy=labels)
        test_size = int(.1 * features_bias.rows)
        train_size = features_bias.rows - test_size
        # the bias column was already added above, so it is not added again;
        # the held-out rows are the ones *after* the training rows, so the
        # two sets do not overlap
        train_features = Matrix(features_bias, 0, 0, train_size,
                                features_bias.cols)
        train_labels = Matrix(labels, 0, 0, train_size, labels.cols)
        test_features = Matrix(features_bias, train_size, 0, test_size,
                               features_bias.cols)
        test_labels = Matrix(labels, train_size, 0, test_size, labels.cols)

    ##### Setup Weights #####
    self.output_classes = len(set(labels.col(labels.cols - 1)))
    # output layer: one row per class, one column per hidden node plus bias
    self.output_layer = np.random.uniform(
        low=self.min, high=self.max,
        size=(self.output_classes, self.nodes_per_layer + 1))
    # input layer: one row per hidden node, one column per input plus bias
    self.input_layer = np.random.uniform(
        low=self.min, high=self.max,
        size=(self.nodes_per_layer, features_bias.cols + 1))
    self.num_output_layers = labels.cols
    last_row_num = features_bias.rows - 1
    self._is_nominal_output = labels.value_count(0) != 0
    self.best_inputs = self.input_layer
    self.best_hidden = self.hidden_layers
    self.best_output = self.output_layer
    if not self.validation_set:
        train_features = features_bias
        test_features = features_bias
        train_labels = labels
        test_labels = labels

    # start learning
    while self._is_still_learning(self):
        print(" #### On Epoch Number {}".format(self.epoch_count))
        train_features.shuffle(buddy=train_labels)
        for row_num in range(train_features.rows):
            row = train_features.row(row_num)
            output = self._feed_forward(row)
            self._back_propagate(
                output, train_labels.row(row_num), row,
                self.batch_norm_enabled,
                (self.batch_norm_enabled and row_num == last_row_num))
        # test on the validation set
        accuracy_for_epoch: float = self.measure_accuracy(
            test_features, test_labels, MSE=self.MSE)
        accuracy_for_epoch_train: float = self.measure_accuracy(
            train_features, train_labels, MSE=self.MSE)
        if self.epoch_count > 1:
            # snapshot the weights whenever this epoch beats the best so far
            # (lower is better for MSE, higher is better for accuracy)
            best_so_far = (min if self.MSE else max)(
                self.accuracy_hash.values())
            improved = (accuracy_for_epoch < best_so_far if self.MSE
                        else accuracy_for_epoch > best_so_far)
            if improved:
                self.best_inputs = self.input_layer
                self.best_hidden = self.hidden_layers
                self.best_output = self.output_layer
        self.accuracy_hash[self.epoch_count] = accuracy_for_epoch
        self.accuracy_hash_train[self.epoch_count] = accuracy_for_epoch_train
        self.epoch_count += 1

    best_epoch = (min if self.MSE else max)(self.accuracy_hash.items(),
                                            key=lambda x: x[1])
    print("The best accuracy on the validation set was {}".format(best_epoch))
    self.input_layer = self.best_inputs
    self.hidden_layers = self.best_hidden
    self.output_layer = self.best_output
    final_vs: float = self.measure_accuracy(test_features, test_labels,
                                            MSE=self.MSE)
    final_ts: float = self.measure_accuracy(train_features, train_labels,
                                            MSE=self.MSE)
    print("The final best for VS: {} and for TS: {}".format(
        final_vs, final_ts))
    return
def main(self):
    # parse the command-line arguments
    args = self.parser().parse_args()
    file_name = args.arff
    learner_name = args.L
    eval_method = args.E[0]
    eval_parameter = args.E[1] if len(args.E) > 1 else None
    print_confusion_matrix = args.verbose
    normalize = args.normalize
    # use a seed for deterministic results, if provided (makes debugging easier)
    random.seed(args.seed)

    # load the model
    learner = self.get_learner(learner_name)

    # load the ARFF file
    data = Matrix()
    data.load_arff(file_name)
    if normalize:
        print("Using normalized data")
        data.normalize()

    # print some stats
    print("\nDataset name: {}\n"
          "Number of instances: {}\n"
          "Number of attributes: {}\n"
          "Learning algorithm: {}\n"
          "Evaluation method: {}\n".format(file_name, data.rows, data.cols,
                                           learner_name, eval_method))

    if eval_method == "training":
        print("Calculating accuracy on training set...")
        features = Matrix(data, 0, 0, data.rows, data.cols - 1)
        labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
        confusion = Matrix()
        start_time = time.time()
        learner.train(features, labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        accuracy = learner.measure_accuracy(features, labels, confusion)
        print("Training set accuracy: " + str(accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "static":
        print("Calculating accuracy on separate test set...")
        test_data = Matrix(arff=eval_parameter)
        if normalize:
            test_data.normalize()
        print("Test set name: {}".format(eval_parameter))
        print("Number of test instances: {}".format(test_data.rows))
        features = Matrix(data, 0, 0, data.rows, data.cols - 1)
        labels = Matrix(data, 0, data.cols - 1, data.rows, 1)
        start_time = time.time()
        learner.train(features, labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        train_accuracy = learner.measure_accuracy(features, labels)
        print("Training set accuracy: {}".format(train_accuracy))
        test_features = Matrix(test_data, 0, 0, test_data.rows,
                               test_data.cols - 1)
        test_labels = Matrix(test_data, 0, test_data.cols - 1,
                             test_data.rows, 1)
        confusion = Matrix()
        test_accuracy = learner.measure_accuracy(test_features, test_labels,
                                                 confusion)
        print("Test set accuracy: {}".format(test_accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "random":
        print("Calculating accuracy on a random hold-out set...")
        train_percent = float(eval_parameter)
        if train_percent < 0 or train_percent > 1:
            raise Exception(
                "Percentage for random evaluation must be between 0 and 1")
        print("Percentage used for training: {}".format(train_percent))
        print("Percentage used for testing: {}".format(1 - train_percent))
        data.shuffle()
        train_size = int(train_percent * data.rows)
        train_features = Matrix(data, 0, 0, train_size, data.cols - 1)
        train_labels = Matrix(data, 0, data.cols - 1, train_size, 1)
        test_features = Matrix(data, train_size, 0, data.rows - train_size,
                               data.cols - 1)
        test_labels = Matrix(data, train_size, data.cols - 1,
                             data.rows - train_size, 1)
        start_time = time.time()
        learner.train(train_features, train_labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(elapsed_time))
        train_accuracy = learner.measure_accuracy(train_features, train_labels)
        print("Training set accuracy: {}".format(train_accuracy))
        confusion = Matrix()
        test_accuracy = learner.measure_accuracy(test_features, test_labels,
                                                 confusion)
        print("Test set accuracy: {}".format(test_accuracy))
        if print_confusion_matrix:
            print("\nConfusion matrix: (Row=target value, Col=predicted value)")
            confusion.print()
            print("")
    elif eval_method == "cross":
        print("Calculating accuracy using cross-validation...")
        folds = int(eval_parameter)
        if folds <= 0:
            raise Exception("Number of folds must be greater than 0")
        print("Number of folds: {}".format(folds))
        reps = 1
        sum_accuracy = 0.0
        elapsed_time = 0.0
        for j in range(reps):
            data.shuffle()
            for i in range(folds):
                begin = int(i * data.rows / folds)
                end = int((i + 1) * data.rows / folds)
                train_features = Matrix(data, 0, 0, begin, data.cols - 1)
                train_labels = Matrix(data, 0, data.cols - 1, begin, 1)
                test_features = Matrix(data, begin, 0, end - begin,
                                       data.cols - 1)
                test_labels = Matrix(data, begin, data.cols - 1,
                                     end - begin, 1)
                train_features.add(data, end, 0, data.cols - 1)
                train_labels.add(data, end, data.cols - 1, 1)
                start_time = time.time()
                learner.train(train_features, train_labels)
                elapsed_time += time.time() - start_time
                accuracy = learner.measure_accuracy(test_features,
                                                    test_labels)
                sum_accuracy += accuracy
                print("Rep={}, Fold={}, Accuracy={}".format(j, i, accuracy))
        elapsed_time /= (reps * folds)
        print("Average time to train (in seconds): {}".format(elapsed_time))
        print("Mean accuracy={}".format(sum_accuracy / (reps * folds)))
    else:
        raise Exception(
            "Unrecognized evaluation method '{}'".format(eval_method))
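# parser() is referenced above but not shown; a sketch consistent with the
# attributes main() reads (flag spellings are inferred, not the original):
import argparse

def parser():
    p = argparse.ArgumentParser(description="ML toolkit runner")
    p.add_argument("arff", help="path to the ARFF data file")
    p.add_argument("-L", required=True, help="learning algorithm to run")
    p.add_argument("-E", nargs="+", required=True,
                   help="evaluation method and its parameter, e.g. -E random 0.7")
    p.add_argument("--verbose", action="store_true",
                   help="print the confusion matrix")
    p.add_argument("--normalize", action="store_true",
                   help="normalize the data before training")
    p.add_argument("--seed", help="random seed for deterministic results")
    return p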
def get_training_sets(self, features, labels):
    features_bias = Matrix(features, 0, 0, features.rows, features.cols)
    # features_bias = self.add_bias_to_features(features_bias)

    #### Prepare Validation Set ####
    if self.validation_set:
        features_bias.shuffle(buddy=labels)
        test_size = int(.4 * features_bias.rows)
        train_size = features_bias.rows - test_size
        train_features = Matrix(features_bias, 0, 0, train_size,
                                features_bias.cols)
        train_labels = Matrix(labels, 0, 0, train_size, labels.cols)
        # hold out the rows *after* the training rows so the two sets
        # do not overlap
        test_features = Matrix(features_bias, train_size, 0, test_size,
                               features_bias.cols)
        test_labels = Matrix(labels, train_size, 0, test_size, labels.cols)
        train_features = train_features.return_pandas_df()
        train_labels = train_labels.return_pandas_df()
        test_features = test_features.return_pandas_df()
        test_labels = test_labels.return_pandas_df()
    if not self.validation_set:
        features_bias = features_bias.return_pandas_df()
        labels = labels.return_pandas_df()
        train_features = features_bias
        test_features = features_bias
        train_labels = labels
        test_labels = labels
    return test_features, test_labels, train_features, train_labels
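# Usage sketch (hypothetical caller; fit/score stand in for whatever learner
# consumes these frames). Note the return order puts the held-out frames first:
#
#     test_X, test_y, train_X, train_y = learner.get_training_sets(features, labels)
#     learner.fit(train_X, train_y)
#     print(learner.score(test_X, test_y))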