def Q4():  # decision trees
    """Fit CART trees of several max depths on the synthetic data and plot them.

    Draws the decision boundaries of every fitted tree, then plots the
    training and validation error as a function of the maximum depth.
    """
    syn_data = get_syn_data()
    X_test, X_train, X_val, y_test, y_train, y_val = (
        syn_data[0], syn_data[1], syn_data[2],
        syn_data[3], syn_data[4], syn_data[5])

    D = [3, 6, 8, 10, 12]
    learned_classifiers = []
    training_error = []
    validation_error = []
    for depth in D:
        classifier = decision_tree.DecisionTree(depth)
        classifier.train(X_train, y_train)
        learned_classifiers.append(classifier)
        training_error.append(classifier.error(X_train, y_train))
        validation_error.append(classifier.error(X_val, y_val))

    plot_decisions(D, learned_classifiers, X_train, y_train,
                   "CART DT on SynData")

    # One curve per error type, both against the max depth.
    curves = (
        (training_error, 'training error', 'magenta'),
        (validation_error, 'validation error', 'deepskyblue'),
    )
    for errors, curve_label, curve_color in curves:
        plt.plot(D, errors, label=curve_label, color=curve_color)
    plt.title('CART DT error on SynData as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('Max Depth')
    plt.ylabel('Error')
    plt.show()
def fit(self, X, y, max_depth=15):
    '''
    Fit every tree of the ensemble on a bootstrap sample of the data.

    Resolves the symbolic settings ('all' / 'auto') of ``n_samples`` and
    ``n_features`` against the shape of ``X`` (clipping values that exceed
    it), validates them, and then trains ``n_trees`` decision trees, each on
    a sample of rows drawn with replacement.

    Raises ValueError when the resolved sample/feature counts are not
    positive or fewer than two trees are requested.
    '''
    # remember the distinct labels seen during training
    self.labels = np.unique(y)

    # resolve the number of rows drawn per tree
    row_count = len(X)
    if self.n_samples == 'all' or self.n_samples > row_count:
        self.n_samples = row_count

    # resolve the number of features considered per tree
    column_count = X.shape[1]
    if self.n_features == 'auto':
        self.n_features = int(math.sqrt(column_count))
    elif self.n_features == 'all' or self.n_features > column_count:
        self.n_features = column_count

    # sanity check of the resolved settings
    settings_ok = (self.n_samples > 0 and self.n_features > 0
                   and self.n_trees >= 2)
    if not settings_ok:
        raise ValueError('There is an error in your input values')

    # grow the ensemble
    self.trees = []
    row_ids = np.arange(row_count)
    for _ in range(self.n_trees):
        # bootstrap sample (rows may repeat)
        chosen = np.random.choice(row_ids, self.n_samples, replace=True)
        member = decision_tree.DecisionTree(max_depth,
                                            max_features=self.n_features)
        member.fit(X[chosen], y[chosen])
        self.trees.append(member)
def check_decision_tree():
    """Grade a decision-tree submission and return a score in {0, 0.5, 1.0}.

    Half of the score comes from the accuracies stored in
    ``decision_tree.json`` (both ``train_accu`` and ``test_accu`` must be at
    least 0.8); the other half comes from training the submitted
    ``decision_tree.DecisionTree`` on a tiny fixed dataset and comparing its
    predictions with the expected ones.

    Returns 0 as soon as either part cannot be evaluated at all (missing or
    malformed results file, submission that fails to import or run).
    """
    try:
        # Context manager so the results file is always closed.
        with open('decision_tree.json', 'r') as results_file:
            results = json.load(results_file)
        if results['test_accu'] >= 0.8 and results['train_accu'] >= 0.8:
            score_results = 0.5
        else:
            score_results = 0
    except (IOError, OSError, ValueError, KeyError, TypeError):
        # Missing file, invalid JSON, missing key or non-numeric accuracy.
        return 0

    test_features = [[0, 0], [0, 0], [0, 1], [0, 0], [1, 0],
                     [1, 0], [1, 1], [1, 1], [1, 1]]
    test_labels = [0, 0, 0, 1, 1, 1, 0, 0, 1]
    expected_predictions = [0, 0, 0, 0, 1, 1, 0, 0, 0]
    try:
        import decision_tree
        test_tree = decision_tree.DecisionTree()
        test_tree.train(test_features, test_labels)
        predictions = test_tree.predict(test_features)
        if predictions == expected_predictions:
            score_tree = 0.5
        else:
            score_tree = 0
    except (Exception, SystemExit):
        # The submission is untrusted code: it may raise anything or even
        # call sys.exit(). A Ctrl-C from the grader must still propagate,
        # hence no bare ``except:``.
        return 0

    return round(score_results + score_tree, 1)
def run_k_folds_custom_dt(corpus, ys, k):
    """Run k-fold cross validation of the custom decision tree on a corpus.

    For every fold the training texts are turned into n-gram features
    (reduced with the SVD returned by ``generate_ngrams``), the tree is fit
    on at most the first 10000 training rows, and the held-out fold is
    classified row by row. A confusion matrix is shown per fold and the
    per-fold and mean accuracies are printed.

    corpus -- the documents to classify
    ys     -- the label of each document
    k      -- number of folds
    """
    x_folds, y_folds = get_folds(corpus, ys, k)
    classifier = decision_tree.DecisionTree()
    overall_accuracy = 0
    for i in range(0, k):
        train_xs, test_xs, train_ys, test_ys = get_train_and_test(
            x_folds, y_folds, i, k)
        train_xs, svd, transform = generate_ngrams(train_xs, 1, 2, 50000, True)
        # Project the held-out fold with the vectorizer/SVD fit on the
        # training folds.
        matrix = svd.transform(transform.transform(test_xs))

        classifier.fit(train_xs[0:10000, :], np.array(train_ys[0:10000]))

        num_correct = 0
        predict = np.zeros((matrix.shape[0], 1))
        for z, entry in enumerate(matrix):
            predict[z] = classifier.predict(entry)
            if predict[z] == test_ys[z]:
                num_correct += 1

        # confusion_matrix expects (y_true, y_pred); rows are then the true
        # labels, which is what the axis labels below claim.
        cm = confusion_matrix(test_ys, predict.ravel())
        plt.matshow(cm)
        plt.colorbar()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.show()

        current_accuracy = float(num_correct) / len(test_ys)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%s :  %s" % (i, current_accuracy))
        overall_accuracy += current_accuracy

    overall_accuracy /= float(k)
    # The original formatted an undefined name `rep` plus the accuracy into
    # one "%f" placeholder, which could never succeed.
    print("Overall: %f" % overall_accuracy)
def test_same_class_tree_default_params():
    """A default tree fit on the trivial split has a leaf root predicting 1."""
    features, _, targets, _ = _trivial_split()
    model = tree.DecisionTree()
    model.fit(features, targets)

    root = model.root
    assert root.is_leaf
    assert root.prediction == 1
def Q4():  # decision trees
    """Select the tree depth and the bagging size on the validation set.

    1. Trains a decision tree for each candidate max depth, plots the
       training/validation error and the decision boundaries, and prints the
       best depth together with its test error.
    2. Trains bagged ensembles of that depth for B = 5, 10, ..., 100, plots
       the validation error, and prints the best B and its test error.

    Relies on the module-level data splits (X_train, y_train, X_val, y_val,
    X_test, y_test) and on the plotting names already in scope in this file.
    """
    val_error = []
    train_error = []
    sample = [3, 6, 8, 10, 12]
    for samp in sample:
        dt = dta.DecisionTree(samp)
        dt.train(X_train, y_train)
        train_error.append(dt.error(X_train, y_train))
        val_error.append(dt.error(X_val, y_val))
    plot(sample, train_error)
    plot(sample, val_error)
    xlabel("samp")
    ylabel("error rate")
    legend(["train error", "validation error"], loc=5)
    show()

    for index, samp in enumerate(sample):
        dt = dta.DecisionTree(samp)
        dt.train(X_train, y_train)
        subplot(2, 3, index + 1)
        decision_boundaries(dt, X_train, y_train, "samp = " + str(samp))
    pause(8)

    # First depth achieving the minimal validation error.
    best_d = sample[int(np.argmin(val_error))]
    print(best_d)
    dt = dta.DecisionTree(best_d)
    dt.train(X_train, y_train)
    print(dt.error(X_test, y_test))

    # Bagging:
    bag_sizes = list(range(5, 105, 5))
    val_error = []
    for B in bag_sizes:
        print("B: " + str(B))
        bag = bagging.Bagging(dta.DecisionTree, B, best_d)
        bag.train(X_train, y_train)
        val_error.append(bag.error(X_val, y_val))
    plot(bag_sizes, val_error)
    xlabel("B")
    ylabel("validation error rate")
    show()

    # val_error[i] belongs to bag_sizes[i]; the original added a stray
    # "+ 5" to the index, selecting the wrong B (or raising IndexError
    # whenever the minimum was among the last five candidates).
    best_b = bag_sizes[int(np.argmin(val_error))]
    print("best b: ", best_b)
    bag = bagging.Bagging(dta.DecisionTree, best_b, best_d)
    bag.train(X_train, y_train)
    print(bag.error(X_test, y_test))
def predict(self, data, roots):
    """Return the majority vote (0 or 1) of the given tree roots for ``data``.

    Every root casts one vote through ``DecisionTree.predict``; label 1 wins
    ties.
    """
    voter = decision_tree.DecisionTree(self.maxDepth)
    votes = [0, 0]
    for root in roots:
        votes[voter.predict(data, root)] += 1
    return 0 if votes[0] > votes[1] else 1
def train(self, data, labels):
    """Train ``self.trees`` trees and return the list of their results.

    Before each tree the data and labels are reshuffled and only the first
    half is used for training; the tree is told to use the integer part of
    sqrt(number of features) as its ``m`` argument.
    """
    builder = decision_tree.DecisionTree(self.maxDepth)
    half = len(data) // 2
    feature_budget = np.sqrt(len(data[0]))

    forest = []
    for _ in range(self.trees):
        data, labels = shuffle(data, labels)
        trained = builder.train(data[:half], labels[:half],
                                m=int(feature_budget))
        forest.append(trained)
    return forest
def main():
    """Fit a decision tree on the training data and print its test accuracy."""
    train_x, train_y = get_data()
    model = dt.DecisionTree([0, 1, 2, 3, 4, 5])
    model.fit(train_x, train_y)

    test_x, test_y = get_test()
    hits = 0
    seen = 0
    for position, sample in enumerate(test_x):
        seen += 1
        guess = model.predict(sample)
        if guess == test_y[position]:
            hits += 1
    # fraction of test samples classified correctly
    print(hits / seen)
def setUp(self):
    """
    Prepares the fixture: loads the `train_dataset2` config, builds the
    Dataset (numeric attributes included) and creates an untrained tree
    using the Gini Gain criterion.
    """
    self.criterion = criteria.GiniGain
    config_dir = os.path.join('.', 'data', 'train_dataset2')
    self.config = dataset.load_config(config_dir)
    cfg = self.config
    self.data = dataset.Dataset(
        cfg["filepath"],
        cfg["key attrib index"],
        cfg["class attrib index"],
        cfg["split char"],
        cfg["missing value string"],
        load_numeric=True)
    self.decision_tree = decision_tree.DecisionTree(self.criterion)
def dt():
    """Build a decision tree on the training file and evaluate it.

    Prints the elapsed wall-clock time and returns whatever
    ``tree.inference`` reports for the test frame.
    """
    began = time.time()
    train_frame, discrete_info, continuous_info = preprocess.read_data(
        train_filename, discrete_keys, continuous_keys)
    test_frame, _test_discrete, _test_continuous = preprocess.read_data(
        test_filename, discrete_keys, continuous_keys)

    attributes = discrete_keys + continuous_keys
    model = decision_tree.DecisionTree(
        train_frame, attributes, discrete_info, continuous_info, 'y')
    model.build()
    error_rate = model.inference(test_frame)

    finished = time.time()
    print("Time cost:", finished - began)
    return error_rate
def fit(self, data, targets):
    """
    Builds the ensemble: ``self.dec_num`` decision trees, each constructed
    from its own sample produced by ``self.generate_train``.

    Keyword Arguments
    data - the arrays which describe the pacman scene
    targets - the move associated with each array
    """
    self.trees = []
    # One freshly drawn training sample per tree.
    for _ in range(self.dec_num):
        sample, sample_targets = self.generate_train(data, targets)
        self.trees.append(decision_tree.DecisionTree(sample, sample_targets))
def test_predict_proba_on_digits_dataset():
    """Class probabilities predicted for one digits sample must sum to 1."""
    digits = load_digits()
    split = train_test_split(
        digits['data'], digits['target'], test_size=0.2, random_state=17)
    X_train, X_test, y_train, y_test = split

    model = tree.DecisionTree(criterion='gini', max_depth=3)
    model.fit(X_train, y_train)

    first_row_proba = model.predict_proba(X_train[0:1])[0]
    np.testing.assert_almost_equal(first_row_proba.sum(), 1.0, 3)
def __init__(self, input_data, number_trees):
    """
    Builds a random forest: a list of decision trees, each grown on its own
    sample of the input rows drawn with replacement.

    :param input_data: pandas data frame
    :param number_trees: int, must be odd and positive
    :raises ValueError: if number_trees is even or not positive
    """
    if number_trees <= 0 or number_trees % 2 == 0:
        raise ValueError("Number of trees must be an odd positive integer")

    # Sample size: the largest per-column count of non-missing values.
    sample_size = max(input_data.count())
    self.trees = []
    for _ in range(number_trees):
        resampled = input_data.sample(sample_size, replace=True)
        self.trees.append(dt.DecisionTree(resampled, random_subset=True))
def main():
    """Part I: evaluate the three classifiers with a cross-validation split.

    Reads 'dataset.txt', splits it into a training and a test set, runs the
    decision tree, KNN and naive Bayes models, writes the tree to 'tree.txt'
    and hands all results to the accuracy report.
    """
    # ---- Read the data -----------------------------------------------------
    data_set, features, num_of_features = read_data.read_from_train_file(
        'dataset.txt')

    # ---- Cross-validation split --------------------------------------------
    # The helper is asked for 5 chunks and returns a train/test pair.
    train, test = k_cross_validation.data_cross_validation(5, data_set)

    # The feature dictionaries are initialised from the training set only.
    utility.create_feature_dictionaries(features, train)

    # ---- Decision tree -----------------------------------------------------
    tree_model = decision_tree.DecisionTree(num_of_features,
                                            utility.all_feature_types)
    feature_names = list(utility.all_feature_types.keys())
    default_class = tree_model.majority_classification(train)
    tree_root = tree_model.create_tree_root(train, feature_names,
                                            default_class, 0)
    tree = decision_tree.Tree(tree_root)
    # Classify the held-out set.
    tree_results = tree_model.classify(test, tree)
    # Persist the textual form of the tree.
    tree_string = tree.create_tree_string(tree_root)
    with open("tree.txt", 'w') as tree_file:
        tree_file.write(tree_string)

    # ---- KNN ---------------------------------------------------------------
    knn_model = k_nearest_neightbors.KNearestNeighbors(5, num_of_features)
    knn_results = knn_model.classify(train, test)

    # ---- Naive Bayes -------------------------------------------------------
    bayes_model = naive_bayes.NaiveBayes(num_of_features)
    bayes_results = bayes_model.classify(train, test)

    # ---- Accuracy ----------------------------------------------------------
    # Compares every model's results with the test set and writes a report.
    accuracy.accuracy(test, tree_results, knn_results, bayes_results)
def main():
    """Part II: train on 'train.txt', classify 'test.txt', report to 'output.txt'.

    Runs the decision tree, KNN and naive Bayes models; the tree's textual
    form followed by a newline is written to 'output.txt' before the
    accuracy results are added by ``accuracy.accuracy_output``.
    """
    # ---- Read the data -----------------------------------------------------
    train, features, num_of_features = read_data.read_from_train_file(
        'train.txt')
    test = read_data.read_from_test_file('test.txt')

    # Initialise the feature dictionaries.
    utility.create_feature_dictionaries(features, train)

    # ---- Decision tree -----------------------------------------------------
    tree_model = decision_tree.DecisionTree(num_of_features,
                                            utility.all_feature_types)
    feature_names = list(utility.all_feature_types.keys())
    default_class = tree_model.majority_classification(train)
    tree_root = tree_model.create_tree_root(train, feature_names,
                                            default_class, 0)
    tree = decision_tree.Tree(tree_root)
    tree_results = tree_model.classify(test, tree)
    tree_string = tree.create_tree_string(tree_root)
    with open("output.txt", 'w') as report:
        # The trailing newline separates the tree from the accuracy results.
        report.writelines([tree_string, '\n'])

    # ---- KNN ---------------------------------------------------------------
    knn_model = k_nearest_neightbors.KNearestNeighbors(5, num_of_features)
    knn_results = knn_model.classify(train, test)

    # ---- Naive Bayes -------------------------------------------------------
    bayes_model = naive_bayes.NaiveBayes(num_of_features)
    bayes_results = bayes_model.classify(train, test)

    # ---- Accuracy ----------------------------------------------------------
    accuracy.accuracy_output(test, tree_results, knn_results, bayes_results,
                             "output.txt")
def setUp(self):
    """
    Prepares the fixture: loads the `train_dataset1` config, builds the
    Dataset without numeric attributes and trains a depth-1 Gini Gain tree
    on every sample with the stop conditions disabled.
    """
    import criteria
    self.criterion = criteria.GiniGain
    config_dir = os.path.join('.', 'data', 'train_dataset1')
    self.config = dataset.load_config(config_dir)
    cfg = self.config
    self.data = dataset.Dataset(
        cfg["filepath"],
        cfg["key attrib index"],
        cfg["class attrib index"],
        cfg["split char"],
        cfg["missing value string"],
        load_numeric=False)
    self.decision_tree = decision_tree.DecisionTree(self.criterion)

    every_sample = list(range(self.data.num_samples))
    self.decision_tree.train(
        self.data,
        every_sample,
        max_depth=1,
        min_samples_per_node=1,
        use_stop_conditions=False,
        max_p_value_chi_sq=None)
def main(args):
    """Build the classifier selected on the command line and run it.

    :param args: parsed arguments; reads ``file``, ``method``,
        ``ab_count_cutoff``, ``culture_cutoff``, ``analysis_type``,
        ``medication_file`` and the selector flags ``average``, ``tree``
        and ``perceptron`` (checked in that order).
    :raises ValueError: if none of the selector flags is set. Previously
        this fell through to ``classifier.run()`` with ``classifier``
        unbound and crashed with an UnboundLocalError.
    """
    filename = args.file
    method = args.method

    print("-STARTING-\n")
    print("Using")
    print("Ab cutoff: {}".format(args.ab_count_cutoff))
    print("Culture cutoff: {}".format(args.culture_cutoff))
    print("Method: {}".format(method))
    print("Analysis type: {}".format(args.analysis_type))
    print()

    if args.average:
        # NOTE(review): this keyword is spelled ...RESISTENCE_LIST here but
        # ...RESISTANCE_LIST for the other classifiers - confirm it matches
        # the signature of average.Average.
        classifier = average.Average(
            filename,
            CULTURE_SIZE_CUTOFF=args.culture_cutoff,
            AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
            ESBL_AB_RESISTENCE_LIST=ESBL_AB_RESISTANCE_LIST,
            RELEVANT_MO_LIST=RELEVANT_MO_LIST)
    # SVM support (args.svm) is currently disabled.
    elif args.tree:
        classifier = decision_tree.DecisionTree(
            filename,
            CULTURE_SIZE_CUTOFF=args.culture_cutoff,
            AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
            ESBL_AB_RESISTANCE_LIST=ESBL_AB_RESISTANCE_LIST,
            RELEVANT_MO_LIST=RELEVANT_MO_LIST,
            testmode=method,
            analysis_type=args.analysis_type,
            medication_file=args.medication_file)
    elif args.perceptron:
        classifier = perceptron.Perceptron(
            filename,
            CULTURE_SIZE_CUTOFF=args.culture_cutoff,
            AB_CULTURE_COUNT_CUTOFF=args.ab_count_cutoff,
            ESBL_AB_RESISTANCE_LIST=ESBL_AB_RESISTANCE_LIST,
            RELEVANT_MO_LIST=RELEVANT_MO_LIST,
            testmode=method,
            analysis_type=args.analysis_type,
            medication_file=args.medication_file)
    else:
        raise ValueError(
            "No classifier selected: set one of 'average', 'tree' or "
            "'perceptron'")

    classifier.run()
def fit(self, x, y):
    """
    Fit the model on the data

    Grows ``self.n_trees`` decision trees, each on a bootstrap sample of
    the data, and stores them in ``self.forest``.

    When unset, ``n_bootstrap`` defaults to min(250, len(x)) for data sets
    of 1 to 1000 rows and to 500 otherwise, and ``max_features`` defaults
    to ceil(sqrt(number of feature columns)).

    :param x: (Dataframe) Feature data; it is not modified
    :param y: (array) Dependent variable
    """
    if not self.n_bootstrap:
        n_rows = len(x)
        self.n_bootstrap = min(250, n_rows) if 0 < n_rows <= 1000 else 500

    features = x.columns
    if not self.max_features:
        self.max_features = m.ceil(len(features) ** 0.5)

    # Work on a copy so the caller's frame does not silently gain a
    # 'dependent' column.
    data = x.copy()
    data['dependent'] = y

    self.forest = []
    for _ in range(self.n_trees):
        data_bs = bootstrap(data, self.n_bootstrap)
        tree = decision_tree.DecisionTree(
            data_bs[features],
            data_bs.dependent,
            max_features=self.max_features,
            max_depth=self.max_depth,
            min_samples=self.min_samples)
        # Append instead of assigning to self.forest[i]: index assignment
        # on the empty list raised IndexError for the very first tree.
        self.forest.append(tree)
        tree.grow(True)
def test_wrong_value_of_max_depth():
    """Constructing a tree with max_depth=0 must raise ValueError."""
    pytest.raises(ValueError, tree.DecisionTree, max_depth=0)
import decision_tree
import csv
import random

# Feature rows: every cell of the cleaned CSV is parsed as an integer.
data = []
with open("hw5_titanic_dist/cleaned_data.csv") as census_file:
    for row in csv.reader(census_file):
        data.append([int(cell) for cell in row])

# Labels: the first column of each row, parsed as an integer.
labels = []
with open("hw5_titanic_dist/cleaned_data_labels.csv") as census_file:
    for row in csv.reader(census_file):
        labels.append(int(row[0]))

# Train a tree built with argument 2 and dump its top two levels:
# the three split rules first, then the four grandchild labels.
test = decision_tree.DecisionTree(2)
root = test.train(data, labels)

print(root.split_rule)
print(root.left.split_rule)
print(root.right.split_rule)

print(root.left.left.label)
print(root.left.right.label)
print(root.right.left.label)
print(root.right.right.label)
def Q5():  # spam data
    """Tune AdaBoost and a decision tree on the spam data and report vault error.

    1. Loads 'SpamData/spam.data', maps the 0 labels to -1 and sets aside a
       random "vault" of 1536 samples for the final evaluation.
    2. Runs 5-fold cross validation on the remaining samples for every
       number of AdaBoost rounds in T and every tree depth in D.
    3. Plots the mean validation error per parameter value, with the
       standard deviation over the folds as error bars.
    4. Retrains both models with the best parameter on the complete
       training set and prints their error on the vault.
    """
    T = [5, 50, 100, 200, 500, 1000]
    D = [5, 8, 10, 12, 15, 18]
    num_folds = 5
    vault_size = 1536

    # get spam data
    spam_data = np.loadtxt('SpamData/spam.data')
    # change label values of 0 to -1
    spam_data[spam_data[:, -1] == 0, -1] = -1

    # get vault data and train data
    np.random.shuffle(spam_data)
    vault_index = np.random.choice(len(spam_data), vault_size, replace=False)
    in_vault = np.zeros(len(spam_data), dtype=bool)
    in_vault[vault_index] = True
    train_data = spam_data[~in_vault]
    vault_data = spam_data[vault_index]

    # Use 5-fold cross validation to pick T and d
    split = int(len(train_data) / num_folds)
    folds = np.split(train_data, [split * j for j in range(1, num_folds)])
    data_sets = split_data_to_folds(folds)

    # errors[fold, parameter index]. The original accumulated by fold index
    # only, which mixed the errors of all parameter values together.
    adaboost_errors = np.zeros((num_folds, len(T)))
    dt_errors = np.zeros((num_folds, len(D)))
    for i in range(num_folds):
        fold_train, fold_validation = data_sets[i][0], data_sets[i][1]
        # the last column holds the label
        X_train, y_train = fold_train[:, :-1], fold_train[:, -1]
        X_validation = fold_validation[:, :-1]
        y_validation = fold_validation[:, -1]

        for j, t in enumerate(T):
            ada_boost = adaboost.AdaBoost(tools.DecisionStump, t)
            ada_boost.train(X_train, y_train)
            adaboost_errors[i, j] = ada_boost.error(X_validation,
                                                    y_validation)
        for j, d in enumerate(D):
            dt = decision_tree.DecisionTree(d)
            dt.train(X_train, y_train)
            dt_errors[i, j] = dt.error(X_validation, y_validation)

    # mean / spread of the validation error over the folds
    adaboost_mean = adaboost_errors.mean(axis=0)
    adaboost_std = adaboost_errors.std(axis=0)
    dt_mean = dt_errors.mean(axis=0)
    dt_std = dt_errors.std(axis=0)

    # Best parameter = lowest mean validation error. The original compared
    # an error against a previously stored parameter value.
    best_T = T[int(np.argmin(adaboost_mean))]
    best_d = D[int(np.argmin(dt_mean))]

    plt.errorbar(T, adaboost_mean, yerr=adaboost_std, capsize=3,
                 label='validation error', color='magenta')
    plt.title('validation error on SpamData for adaBoost as function of T')
    plt.legend(loc='best')
    plt.xlabel('T')
    plt.ylabel('Error')
    plt.show()

    plt.errorbar(D, dt_mean, yerr=dt_std, capsize=3,
                 label='validation error', color='magenta')
    plt.title('validation error on SpamData for DT as function of max depth')
    plt.legend(loc='best')
    plt.xlabel('max depth')
    plt.ylabel('Error')
    plt.show()

    # Train classifiers using the chosen parameter values, using the
    # complete training set.
    X_train, y_train = train_data[:, :-1], train_data[:, -1]
    X_vault, y_vault = vault_data[:, :-1], vault_data[:, -1]

    ada_boost = adaboost.AdaBoost(tools.DecisionStump, best_T)
    ada_boost.train(X_train, y_train)
    vault_adaboost_error = ada_boost.error(X_vault, y_vault)

    dt = decision_tree.DecisionTree(best_d)
    dt.train(X_train, y_train)
    vault_dt_error = dt.error(X_vault, y_vault)

    # str() is required: concatenating str and float raised TypeError.
    print("vault_adaboost_error= " + str(vault_adaboost_error))
    print("vault_dt_error= " + str(vault_dt_error))
def run(dataset_name, train_dataset, criterion, min_num_samples_allowed,
        max_depth, num_trials, starting_seed, num_folds, is_stratified,
        use_numeric_attributes, use_chi_sq_test, max_p_value_chi_sq,
        output_file_descriptor, output_split_char=',', seed=None):
    """Runs `num_trials` cross-validation experiments with `num_folds` folds each.

    When `seed` is given both random generators are seeded once, up front;
    otherwise every trial is seeded from the `RANDOM_SEEDS` table. The
    training and classification statistics of every trial are passed to
    `save_trial_info` together with `output_file_descriptor`.
    """
    seed_was_given = seed is not None
    if seed_was_given:
        random.seed(seed)
        np.random.seed(seed)

    for trial_number in range(num_trials):
        seed_number = trial_number + starting_seed
        print('*'*80)
        print('STARTING TRIAL #{} USING SEED #{}'.format(
            trial_number + 1, seed_number))
        print()

        if not seed_was_given:
            trial_seed = RANDOM_SEEDS[seed_number - 1]
            random.seed(trial_seed)
            np.random.seed(trial_seed)

        tree = decision_tree.DecisionTree(criterion=criterion)

        timer_start = timeit.default_timer()
        (_,
         correct_with_unknown,
         correct_without_unknown,
         _,
         _,
         _,
         unknown_count,
         _,
         _,
         pruned_nodes_per_fold,
         depth_per_fold,
         nodes_per_fold,
         valid_attributes_in_root,
         valid_nominal_attributes_in_root,
         valid_numeric_attributes_in_root,
         root_attribute_num_values,
         num_trivial_splits,
         trivial_accuracy_percentage) = tree.cross_validate(
             curr_dataset=train_dataset,
             num_folds=num_folds,
             max_depth=max_depth,
             min_samples_per_node=min_num_samples_allowed,
             is_stratified=is_stratified,
             print_tree=False,
             print_samples=False,
             use_stop_conditions=use_chi_sq_test,
             max_p_value_chi_sq=max_p_value_chi_sq)
        total_time_taken = timeit.default_timer() - timer_start

        # Accuracies are percentages of the whole dataset; the second one
        # leaves the unknown samples out of the denominator.
        total_samples = train_dataset.num_samples
        accuracy_with_missing_values = (
            100.0 * correct_with_unknown / total_samples)
        try:
            accuracy_without_missing_values = (
                100.0 * correct_without_unknown
                / (total_samples - unknown_count))
        except ZeroDivisionError:
            accuracy_without_missing_values = None
        percentage_unknown = 100.0 * unknown_count / total_samples

        # Average / max / min number of values of the root attribute.
        if not root_attribute_num_values:
            root_values_avg = root_values_max = root_values_min = None
        else:
            root_values_avg = np.mean(root_attribute_num_values)
            root_values_max = np.amax(root_attribute_num_values)
            root_values_min = np.amin(root_attribute_num_values)

        save_trial_info(
            dataset_name,
            train_dataset.num_samples,
            seed_number,
            criterion.name,
            max_depth,
            num_folds,
            is_stratified,
            use_numeric_attributes,
            min_num_samples_allowed,
            decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS,
            decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS,
            use_chi_sq_test,
            max_p_value_chi_sq,
            decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE,
            np.mean(valid_attributes_in_root),
            np.mean(valid_nominal_attributes_in_root),
            np.mean(valid_numeric_attributes_in_root),
            total_time_taken,
            trivial_accuracy_percentage,
            accuracy_with_missing_values,
            accuracy_without_missing_values,
            unknown_count,
            percentage_unknown,
            root_values_avg,
            root_values_max,
            root_values_min,
            num_trivial_splits,
            np.mean(nodes_per_fold),
            np.amax(nodes_per_fold),
            np.amin(nodes_per_fold),
            np.mean(depth_per_fold),
            np.amax(depth_per_fold),
            np.amin(depth_per_fold),
            np.mean(pruned_nodes_per_fold),
            output_split_char,
            output_file_descriptor)
def main():
    """Command-line entry point for the author-identification models.

    Usage:
        <script> train            interactively train a perceptron ('p') or
                                  a decision tree ('d') on the built-in data
                                  set and optionally classify a text file
        <script> predict <file>   classify <file> with the perceptron
                                  weights stored in 'model_perceptron.txt'

    Every data row is [author, f1, ..., f5] with author 'ACD' (Arthur Conan
    Doyle) or 'HM' (Herman Melville). Exits with status 1 on a missing or
    unknown argument.
    """
    # Local import: the stdlib shuffling helpers are used explicitly so the
    # block does not depend on which `shuffle` the file imported.
    import random

    data_set = [['ACD', 0.0231, 1.157, 0.919, 93.061, 0.0917],
                ['ACD', 0.0296, 1.1183, 0.9356, 80.9492, 0.0681],
                ['ACD', 0.0471, 1.3537, 1.0208, 108.7305, 0.091],
                ['ACD', 0.0165, 1.2621, 1.1879, 116.3081, 0.1154],
                ['ACD', 0.0236, 1.117, 0.8673, 77.9446, 0.066],
                ['ACD', 0.008, 1.413, 1.0474, 102.6556, 0.07],
                ['ACD', 0.0267, 1.4068, 1.1244, 107.5716, 0.0734],
                ['ACD', 0.0838, 1.1258, 1.0406, 100.2574, 0.0474],
                ['ACD', 0.0225, 1.2126, 0.9824, 98.885, 0.0928],
                ['ACD', 0.0639, 2.1101, 1.2162, 137.5727, 0.159],
                ['ACD', 0.0021, 0.8333, 0.7004, 68.5042, 0.0464],
                ['ACD', 0.0208, 1.5963, 1.0204, 142.5501, 0.1329],
                ['HM', 0.461, 2.1225, 1.5204, 133.2334, 0.0623],
                ['HM', 0.2118, 1.5373, 1.2326, 99.011, 0.0808],
                ['HM', 0.2308, 2.3465, 1.3419, 106.459, 0.0548],
                ['HM', 0.5372, 2.171, 1.8759, 135.6919, 0.0602],
                ['HM', 0.318, 2.1527, 1.1671, 130.0122, 0.0651],
                ['HM', 0.2434, 2.3092, 1.6817, 179.5259, 0.1192],
                ['HM', 0.4191, 1.5634, 0.8894, 117.2704, 0.0265],
                ['HM', 0.5952, 2.6538, 1.5957, 152.4041, 0.0752],
                ['HM', 0.3963, 2.0715, 1.2956, 124.8764, 0.094],
                ['HM', 0.1638, 1.8827, 1.0938, 105.0277, 0.0384],
                ['HM', 0.2752, 3.0803, 1.6789, 146.2936, 0.0803],
                ['HM', 0.4227, 1.6529, 0.8303, 84.3475, 0.0399]]

    if len(sys.argv) <= 1:
        print("Please enter argument 'train' or 'predict'. ")
        sys.exit(1)

    mode = sys.argv[1]
    if mode != "train" and mode != "predict":
        print("Unknown argument, please enter 'predict' or 'train'")
        sys.exit(1)

    if mode == "train":
        # Train your model
        model = input(
            "Which model would you like to train ? Perceptron(p) or Decision Tree(d): "
        )
        if model != "p" and model != "d":
            print("Sorry! Wrong argument")
        elif model == "p":
            # The perceptron needs numeric labels: ACD -> 1, HM -> 0.
            perceptron_data = copy.deepcopy(data_set)
            for data_point in perceptron_data:
                if "ACD" in data_point[0]:
                    data_point[0] = 1
                elif "HM" in data_point[0]:
                    data_point[0] = 0
            random.shuffle(perceptron_data)
            weights = pt.train_perceptron(perceptron_data, 0.01, 20000)
            predict = input(
                "A perceptron has been trained. Would you like to make a prediction?(y/n) "
            )
            if predict == "y":
                filename = input(
                    "Please enter the name of the file containing text for author identification: "
                )
                data_value = fp.process(filename, "NA")
                prediction = pt.predict(data_value, weights)
                if int(prediction) == 1:
                    print("Author is Arthur Conan Doyle.")
                elif int(prediction) == 0:
                    print("Author is Herman Melville.")
        elif model == "d":
            max_depth = int(
                input(
                    "Please enter the maximum depth of the decision tree: "
                ))
            entropy_cutoff = float(
                input(
                    "Please enter the entropy cutoff of the decision tree(ideal is 0.0): "
                ))
            print("Training a decision tree on training data...")
            # The original passed the return value of an in-place shuffle
            # (None) as the training data; sample() returns a new shuffled
            # list and leaves data_set untouched.
            training_rows = random.sample(data_set, len(data_set))
            tree = dt.DecisionTree(training_rows, ["ACD", "HM"], max_depth,
                                   entropy_cutoff)
            predict = input(
                "The decision tree has been trained. Would you like to make a prediction?(y/n) "
            )
            if predict == "y":
                filename = input(
                    "Please enter the name of the file containing text for author identification: "
                )
                data_value = fp.process(filename, "NA")
                # Descend until a node carrying a final label is reached.
                node = tree
                while node.FINAL_LABEL == "":
                    if data_value[node.att_index] <= node.threshold:
                        node = node.left
                    else:
                        # A plain else (instead of a second comparison)
                        # guarantees progress even for NaN values, which
                        # previously made this loop spin forever.
                        node = node.right
                if node.FINAL_LABEL == "ACD":
                    print("The author is Arthur Conan Doyle")
                else:
                    print("The author is Herman Melville")
    else:  # mode == "predict"
        if len(sys.argv) < 3:
            # Previously an unhandled IndexError.
            print("Please enter the name of the file to classify after 'predict'")
            sys.exit(1)
        filename = sys.argv[2]
        print("Predicting using an existing model: ")
        # The stored model is one line of comma-separated weights.
        with open("model_perceptron.txt", "r") as model_file:
            line = model_file.readline().split(",")
        weights = [float(weight) for weight in line]
        data_value = fp.process(filename, "NA")
        prediction = pt.predict(data_value, weights)
        if int(prediction) == 1:
            print("Author is Arthur Conan Doyle.")
        elif int(prediction) == 0:
            print("Author is Herman Melville")
def test_default():
    """A tree built without arguments exposes the documented defaults."""
    model = tree.DecisionTree()
    observed = (model.max_depth, model.min_samples_split, model.criterion)
    assert observed == (np.inf, 2, 'gini')
import numpy as np from sklearn.metrics import accuracy_score import json import data_loader import decision_tree # load data X_train, X_test, y_train, y_test = data_loader.discrete_2D_iris_dataset() # set classifier dTree = decision_tree.DecisionTree() # training dTree.train(X_train, y_train) y_est_train = dTree.predict(X_train) train_accu = accuracy_score(y_est_train, y_train) print('train_accu', train_accu) # testing y_est_test = dTree.predict(X_test) test_accu = accuracy_score(y_est_test, y_test) print('test_accu', test_accu) # print dTree.print_tree() # save json.dump({ 'train_accu': train_accu, 'test_accu': test_accu
import decision_tree as dt

# Column names of the rows below (kept for reference only; the tree is
# trained without them).
header = ['color', 'size', 'shape', 'label']

# Each training row is [color, size, shape, label].
train_data = [
    ['Green', 3, 'round', 'Apple'],
    ['Red', 3, 'round', 'Apple'],
    ['Purple', 1, 'round', 'Grape'],
    ['Purple', 1, 'round', 'Grape'],
    ['Yellow', 3, 'round', 'Lemon'],
    ['Yellow', 3, 'long', 'Banana'],
]

# Build the decision tree and train it on the rows above.
myTree = dt.DecisionTree()
myTree.fit(train_data)

# An unlabeled sample that is expected to be classified as a grape.
# NOTE(review): 'purple' is lower-case here while the training rows use
# 'Purple' - confirm that the tree compares values case-insensitively.
test_data = ['purple', 1, 'round']
result = myTree.predict(test_data)

# Show the predicted label.
print(result)
def _run_fold(dataset_name, curr_dataset, criterion, trial_number,
              min_num_samples_allowed, max_depth, num_folds, is_stratified,
              use_numeric_attributes, use_chi_sq_test, max_p_value_chi_sq,
              num_samples, original_valid_nominal_attributes,
              original_valid_numeric_attributes, training_samples_indices,
              validation_sample_indices, output_file_descriptor,
              output_split_char=','):
    """Trains and tests one tree per attribute for a single fold and saves the results.

    For every attribute flagged as valid in either
    `original_valid_nominal_attributes` or `original_valid_numeric_attributes`,
    the dataset's validity lists are overwritten so that only that attribute
    is enabled, a tree is trained on `training_samples_indices` and tested on
    `validation_sample_indices`, and the resulting statistics are recorded.
    Afterwards the (accuracy, criterion value) pairs are passed to
    `_count_inversions_and_ties` and one `save_info` call is made per
    recorded attribute.

    Side effect: `curr_dataset.valid_nominal_attribute` and
    `curr_dataset.valid_numeric_attribute` are replaced and not restored here.

    NOTE(review): `fold_number` is neither a parameter nor assigned in this
    function. Presumably it is a module-level name; otherwise the first
    statement raises NameError. Confirm against the caller.
    """
    print('\nFold #{}'.format(fold_number + 1))
    print_information_per_attrib = {
    }  # ...[attrib_index] = print_information
    accuracy_criterion_value = [
    ]  # ...[...] = (accuracy_with_missing_values, criterion_value)

    # A single tree object is reused for every attribute.
    tree = decision_tree.DecisionTree(criterion)

    num_attributes = len(original_valid_nominal_attributes)
    for (attrib_index,
         (is_valid_nominal_attrib,
          is_valid_numeric_attrib)) in enumerate(
              zip(original_valid_nominal_attributes,
                  original_valid_numeric_attributes)):
        # Attributes valid in neither list are skipped.
        if not is_valid_nominal_attrib and not is_valid_numeric_attrib:
            continue
        # Let's pretend only the current attribute is valid.
        print()
        print('Current attribute: {} ({})'.format(
            curr_dataset.attrib_names[attrib_index], attrib_index))
        curr_dataset.valid_nominal_attribute = [False] * num_attributes
        curr_dataset.valid_nominal_attribute[
            attrib_index] = is_valid_nominal_attrib
        curr_dataset.valid_numeric_attribute = [False] * num_attributes
        curr_dataset.valid_numeric_attribute[
            attrib_index] = is_valid_numeric_attrib

        # Attributes with no entries in attrib_int_to_value are skipped.
        num_values = len(curr_dataset.attrib_int_to_value[attrib_index])
        if not num_values:
            continue

        # Without an explicit max_depth the limit is derived from the
        # number of classes: 1 + ceil(log2(num_classes)).
        if max_depth is None:
            curr_max_depth_allowed = 1 + math.ceil(
                math.log2(curr_dataset.num_classes))
        else:
            curr_max_depth_allowed = max_depth

        start_time = timeit.default_timer()
        ((_, num_correct_classifications_w_unkown,
          num_correct_classifications_wo_unkown, _, _, _, num_unkown, _),
         curr_max_depth_found, _,
         curr_num_nodes_prunned) = tree.train_and_test(
             curr_dataset,
             training_samples_indices,
             validation_sample_indices,
             max_depth=curr_max_depth_allowed,
             min_samples_per_node=min_num_samples_allowed,
             use_stop_conditions=use_chi_sq_test,
             max_p_value_chi_sq=max_p_value_chi_sq)
        total_time_taken = timeit.default_timer() - start_time

        # Skip attributes that the trained root node no longer marks as
        # valid.
        if (not tree.get_root_node().valid_nominal_attribute[attrib_index]
                and not tree.get_root_node(
                ).valid_numeric_attribute[attrib_index]):
            continue
        # A root whose node_split has no criterion_value (AttributeError)
        # is skipped as well.
        try:
            curr_criterion_value = tree.get_root_node(
            ).node_split.criterion_value
        except AttributeError:
            continue

        trivial_accuracy = tree.get_trivial_accuracy(
            validation_sample_indices)
        # Accuracies are percentages over the validation samples; the
        # second one excludes the unknown samples from the denominator.
        accuracy_with_missing_values = (
            100.0 * num_correct_classifications_w_unkown /
            len(validation_sample_indices))
        try:
            accuracy_without_missing_values = (
                100.0 * num_correct_classifications_wo_unkown /
                (len(validation_sample_indices) - num_unkown))
        except ZeroDivisionError:
            accuracy_without_missing_values = None
        percentage_unkown = 100.0 * num_unkown / len(
            validation_sample_indices)
        curr_num_nodes = tree.get_root_node().get_num_nodes()

        # The order of this list matters: it is unpacked positionally into
        # the save_info call below.
        print_information_per_attrib[attrib_index] = [
            curr_criterion_value, curr_max_depth_allowed, num_values,
            total_time_taken, trivial_accuracy,
            accuracy_with_missing_values, accuracy_without_missing_values,
            num_unkown, percentage_unkown, curr_num_nodes,
            curr_max_depth_found, curr_num_nodes_prunned
        ]
        accuracy_criterion_value.append(
            (accuracy_with_missing_values, curr_criterion_value))

    (num_inversions, num_ties,
     num_correct) = _count_inversions_and_ties(accuracy_criterion_value)

    # Only attributes that produced a record count as valid here.
    num_valid_attributes = len(print_information_per_attrib)
    num_valid_numeric_attributes = sum(
        original_valid_numeric_attributes[attrib_index]
        for attrib_index in print_information_per_attrib)
    num_valid_nominal_attributes = num_valid_attributes - num_valid_numeric_attributes

    # One save_info call per recorded attribute, in attribute-index order.
    for attrib_index in sorted(print_information_per_attrib):
        save_info(dataset_name, use_numeric_attributes,
                  curr_dataset.attrib_names[attrib_index],
                  original_valid_numeric_attributes[attrib_index],
                  num_samples, trial_number + 1, criterion.name, num_folds,
                  fold_number + 1, is_stratified, min_num_samples_allowed,
                  use_chi_sq_test, max_p_value_chi_sq, num_attributes,
                  num_valid_attributes, num_valid_nominal_attributes,
                  num_valid_numeric_attributes, num_inversions, num_ties,
                  num_correct, *print_information_per_attrib[attrib_index],
                  output_file_descriptor, output_split_char)
def run(dataset_name, train_dataset, num_training_samples, criterion,
        min_num_samples_allowed, max_depth, num_trials, starting_seed,
        use_numeric_attributes, use_chi_sq_test, max_p_value_chi_sq,
        output_file_descriptor, output_split_char=',', seed=None):
    """Runs `num_trials` experiments, each one randomly selecting
    `num_training_samples` samples to use for training and testing the tree in
    the rest of the dataset. The training and classification information of
    each trial is handed to `save_trial_info`, together with
    `output_file_descriptor` and `output_split_char`.

    Seeding: when `seed` is given, `random` and `np.random` are seeded once
    before the first trial. Otherwise every trial reseeds both generators with
    `RANDOM_SEEDS[trial_number + starting_seed - 1]`.

    If the trained root node has a single non-empty class or no valid nominal
    attribute, the indices are reshuffled and the tree is retrained. Once
    `MAX_RANDOM_TRIES` attempts have been counted the function prints a notice
    and returns immediately, without running the remaining trials.

    Returns:
        None.
    """

    def _shuffle_and_split():
        # Shuffle in place and split into (training, rest-of-dataset) indices.
        random.shuffle(training_samples_indices)
        return (training_samples_indices[:num_training_samples],
                training_samples_indices[num_training_samples:])

    def _train_and_time(tree, training_indices):
        # Train `tree` and measure the wall-clock time of the whole call.
        start_time = timeit.default_timer()
        (time_taken_prunning, num_nodes_prunned) = tree.train(
            curr_dataset=train_dataset,
            training_samples_indices=training_indices,
            max_depth=max_depth,
            min_samples_per_node=min_num_samples_allowed,
            use_stop_conditions=use_chi_sq_test,
            max_p_value_chi_sq=max_p_value_chi_sq)
        total_time_taken = timeit.default_timer() - start_time
        return (total_time_taken, time_taken_prunning, num_nodes_prunned)

    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    training_samples_indices = list(range(train_dataset.num_samples))
    for trial_number in range(num_trials):
        print('*' * 80)
        print('STARTING TRIAL #{} USING SEED #{}'.format(
            trial_number + 1, starting_seed + trial_number))
        print()

        if seed is None:
            trial_seed = RANDOM_SEEDS[trial_number + starting_seed - 1]
            random.seed(trial_seed)
            np.random.seed(trial_seed)

        (curr_training_samples_indices,
         curr_test_samples_indices) = _shuffle_and_split()

        tree = decision_tree.DecisionTree(criterion=criterion)

        # First let's train the tree and save the training information
        (total_time_taken,
         time_taken_prunning,
         num_nodes_prunned) = _train_and_time(tree,
                                              curr_training_samples_indices)

        num_random_tries = 1
        # Resample while the training set has only one non-empty class or the
        # root node has no valid nominal attribute.
        while (sorted(tree.get_root_node().class_index_num_samples)[-2] == 0
               or sum(tree.get_root_node().valid_nominal_attribute) == 0):
            num_random_tries += 1
            if num_random_tries == MAX_RANDOM_TRIES:
                print(
                    'Already did {} random generation, none worked (only one class or no valid'
                    ' attribute).'.format(MAX_RANDOM_TRIES))
                print('Will skip to the next test.')
                return None
            # Retries use the same train/rest split as the first attempt, so
            # every trial is evaluated on the full held-out remainder.
            (curr_training_samples_indices,
             curr_test_samples_indices) = _shuffle_and_split()
            (total_time_taken,
             time_taken_prunning,
             num_nodes_prunned) = _train_and_time(
                 tree, curr_training_samples_indices)

        num_valid_nominal_attributes = sum(
            tree.get_root_node().valid_nominal_attribute)
        time_taken_tree = total_time_taken - time_taken_prunning

        # Time to test this tree's classification and save the classification
        # information
        trivial_accuracy = tree.get_trivial_accuracy(curr_test_samples_indices)
        (_,
         num_correct_classifications_w_unkown,
         num_correct_classifications_wo_unkown,
         _,
         _,
         _,
         num_unkown,
         _) = tree.test(curr_test_samples_indices)

        num_test_samples = len(curr_test_samples_indices)
        accuracy_with_missing_values = (
            100.0 * num_correct_classifications_w_unkown / num_test_samples)
        try:
            accuracy_without_missing_values = (
                100.0 * num_correct_classifications_wo_unkown
                / (num_test_samples - num_unkown))
        except ZeroDivisionError:
            # Every test sample was counted in `num_unkown`.
            accuracy_without_missing_values = None
        percentage_unkown = 100.0 * num_unkown / num_test_samples

        num_nodes_found = tree.get_root_node().get_num_nodes()
        max_depth_found = tree.get_root_node().get_max_depth()

        save_trial_info(
            dataset_name,
            train_dataset.num_samples,
            num_training_samples,
            trial_number + starting_seed,
            use_numeric_attributes,
            criterion.name,
            max_depth,
            min_num_samples_allowed,
            decision_tree.USE_MIN_SAMPLES_SECOND_LARGEST_CLASS,
            decision_tree.MIN_SAMPLES_SECOND_LARGEST_CLASS,
            use_chi_sq_test,
            max_p_value_chi_sq,
            decision_tree.MIN_SAMPLES_IN_SECOND_MOST_FREQUENT_VALUE,
            num_valid_nominal_attributes,
            total_time_taken,
            time_taken_tree,
            time_taken_prunning,
            trivial_accuracy,
            accuracy_with_missing_values,
            accuracy_without_missing_values,
            num_unkown,
            percentage_unkown,
            num_nodes_found,
            max_depth_found,
            num_nodes_prunned,
            output_split_char,
            output_file_descriptor)
def test_wrong_criterion():
    """Constructing a DecisionTree with an unknown criterion raises ValueError."""
    unknown_criterion = 'non-existed'
    with pytest.raises(ValueError):
        tree.DecisionTree(criterion=unknown_criterion)