def main(argv):
    """Train and evaluate a Naive Bayes or TAN model from command-line arguments.

    Both datasets are loaded through ``file_reader.get_dataset_from_file``;
    the selected model is then built from the training dataset and evaluated
    on the test dataset, and the elapsed time is logged at debug level.

    Args:
        argv: Exactly three items: the training file path, the test file
            path, and the model selector ('n' for Naive Bayes, 't' for TAN).

    Raises:
        AssertionError: If ``argv`` does not contain exactly three items or
            the selector is neither 'n' nor 't'.
    """
    usage = (" Please provide correct arguments as follows : "
             "python bayes.py <train file path> <test file path> <n|t>")
    # Raise explicitly instead of using `assert` so the checks still run under
    # `python -O`; AssertionError is kept so callers see the same exception type.
    if len(argv) != 3:
        raise AssertionError(usage)
    if argv[2] not in ('n', 't'):
        raise AssertionError(usage)

    train_dataset_file_path, test_dataset_file_path = argv[0], argv[1]
    is_naive_bayes_reqd = argv[2] == 'n'

    # time.clock() no longer exists on Python 3.8+. Keep using it wherever it
    # is available (unchanged timing semantics) and fall back otherwise.
    timer = getattr(time, "clock", None) or time.perf_counter

    logger.debug("Loading the training dataset ..")
    train_dataset = file_reader.get_dataset_from_file(train_dataset_file_path)

    logger.debug("Loading the test dataset ..")
    test_dataset = file_reader.get_dataset_from_file(test_dataset_file_path)

    start = timer()
    if is_naive_bayes_reqd:
        logger.debug("Generating the naive bayes model using training dataset ..")
        naive_bayes_model = naive_bayes.Naivebayes(train_dataset)

        logger.debug("Evaulating the test dataset on Naive Bayes Model ..")
        naive_bayes.evaluate_naive_bayes_model(naive_bayes_model, test_dataset)

        logger.debug("Time taken for running Naive Bayes Model is " + str(timer() - start) + " s ")
    else:
        logger.debug("Generating the TAN model using training dataset ..")
        tan_model = tan.TAN(train_dataset)

        logger.debug("Evaulating the test dataset on TAN model ..")
        tan.evaluate_tan_model(tan_model, test_dataset)

        logger.debug("Time taken for running TAN Model is " + str(timer() - start) + " s ")
def get_learning_curve_with_number_of_instances():
    """Plot average test-set accuracy of Naive Bayes and TAN vs. training-set size.

    For each training-set size (25, 50 and 100) a sample of the training data
    is drawn with ``get_stratified_random_data_set`` the configured number of
    times (4, 4 and 1 respectively). Each sample is used to build a Naive
    Bayes model and a TAN model, both of which are evaluated on the test
    dataset. The accuracies, scaled by 100, are averaged per size, logged,
    and plotted to ``graphs/hw3_accuracy_vs_size_run2.png``.

    The datasets are read from ``TRAIN_DATASET_FILE_PATH`` and
    ``TEST_DATASET_FILE_PATH``. The function takes no arguments and returns
    nothing.
    """
    import os  # local import: only needed to prepare the output directory

    train_dataset = file_reader.get_dataset_from_file(TRAIN_DATASET_FILE_PATH)
    test_dataset = file_reader.get_dataset_from_file(TEST_DATASET_FILE_PATH)

    # Training-set size -> number of samples drawn at that size.
    training_set_sizes_repetitions = {25: 4, 50: 4, 100: 1}
    training_data_set_sizes = sorted(training_set_sizes_repetitions)

    avg_nb_test_data_accuracy, avg_tan_test_data_accuracy = [], []

    for size in training_data_set_sizes:
        num_repetitions = training_set_sizes_repetitions[size]
        nb_test_set_accuracies, tan_test_set_accuracies = [], []

        for _ in range(num_repetitions):
            stratified_training_data_set = get_stratified_random_data_set(train_dataset, size)

            # Test for Naive Bayes Model
            naive_bayes_model = naive_bayes.Naivebayes(stratified_training_data_set)
            test_set_accuracy = naive_bayes.evaluate_naive_bayes_model(naive_bayes_model, test_dataset)
            nb_test_set_accuracies.append(test_set_accuracy*100)
            logger.info("Naive Bayes accuracy : " + str(test_set_accuracy*100) + " % ")

            # Test for TAN model
            tan_model = tan.TAN(stratified_training_data_set)
            test_set_accuracy = tan.evaluate_tan_model(tan_model, test_dataset)
            tan_test_set_accuracies.append(test_set_accuracy*100)
            logger.info("TAN accuracy : " + str(test_set_accuracy*100) + " % ")

        # NOTE(review): dividing by Decimal presumably relies on the evaluate
        # functions returning Decimal/int accuracies -- confirm.
        avg_nb_test_data_accuracy.append(
            sum(nb_test_set_accuracies)/Decimal(len(nb_test_set_accuracies)))
        avg_tan_test_data_accuracy.append(
            sum(tan_test_set_accuracies)/Decimal(len(tan_test_set_accuracies)))

    # Report the summary through the same logger as the per-run results so
    # both follow the same logging configuration.
    logger.info("\n\n Training data set sizes : " + str(training_data_set_sizes))
    logger.info("\n\n NB test set accuracies : " + str(avg_nb_test_data_accuracy))
    logger.info("\n\n TAN test set accuracies : " + str(avg_tan_test_data_accuracy))

    # Plot the graph
    plt.figure()
    plt.plot(training_data_set_sizes, avg_nb_test_data_accuracy, label="Naive Bayes", marker='H')
    plt.plot(training_data_set_sizes, avg_tan_test_data_accuracy, label="TAN", marker='H')
    plt.xlabel("Training set size")
    plt.ylabel("Test set accuracy (%)")
    plt.title("Test accuracy vs Training set size")
    plt.xlim(0, 101)
    plt.ylim(0, 100)
    plt.grid(True)
    plt.legend(loc="lower right")

    # Make sure the output directory exists so the figure is not lost after
    # all the models have already been trained and evaluated.
    output_path = "graphs/hw3_accuracy_vs_size_run2.png"
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    plt.savefig(output_path)
def main(argv):
    """Learn a decision tree from a training file and evaluate it on a test file.

    The tree is learned with ``dtree.learn_dtree``, printed with
    ``dtree.print_dtree`` and then evaluated on the test dataset with
    ``dtree.test_dtree``.

    Args:
        argv: Exactly three items: the training file path, the test file
            path, and the leaf threshold (converted with ``int``).

    Raises:
        AssertionError: If ``argv`` does not contain exactly three items.
        ValueError: If the leaf threshold cannot be converted to an integer.
    """
    # Raise explicitly instead of using `assert` so the check still runs under
    # `python -O`; AssertionError is kept so callers see the same exception type.
    if len(argv) != 3:
        raise AssertionError(" Please provide correct arguments as follows : python dt-learn.py <train file path> <test file path> <leaf threshold>")

    train_dataset_file_path, test_dataset_file_path = argv[0], argv[1]
    leaf_threshold = int(argv[2])

    # 1) load the training data set
    train_dataset = file_reader.get_dataset_from_file(train_dataset_file_path)

    # 2) generate a decision tree using training data set
    training_dataset_dtree = dtree.learn_dtree(train_dataset, leaf_threshold)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("\n\n=================== DECISION TREE =====================================\n\n")
    dtree.print_dtree(training_dataset_dtree, train_dataset.output.values, " ")

    # 3) load the test data set
    test_dataset = file_reader.get_dataset_from_file(test_dataset_file_path)

    # 4) evaluate the decision tree using the test data set
    print("\n\n================== TEST DATA SET EVALUATION ==========================\n\n")
    dtree.test_dtree(training_dataset_dtree, test_dataset)