示例#1
0
def main(argv):
    """Build a Naive Bayes or TAN model from training data and evaluate it.

    Both datasets are loaded first; the model selected by the mode flag is
    then built from the training dataset and evaluated on the test dataset.
    The elapsed time for the build + evaluation is written to the debug log.

    Args:
        argv: Exactly three command-line arguments (program name excluded):
            the training file path, the test file path, and a mode flag,
            'n' for Naive Bayes or 't' for TAN.

    Raises:
        ValueError: If argv does not hold exactly three items, or the mode
            flag is neither 'n' nor 't'.
    """
    usage = " Please provide correct arguments as follows : python bayes.py <train file path> <test file path> <n|t>"

    # Validate explicitly rather than with assert: assert statements are
    # removed when Python runs with -O, which would let bad input through
    # (any flag other than 'n' would silently select TAN).
    if len(argv) != 3:
        raise ValueError(usage)
    train_dataset_file_path, test_dataset_file_path, mode = argv
    if mode not in ('n', 't'):
        raise ValueError(usage)

    logger.debug("Loading the training dataset ..")
    train_dataset = file_reader.get_dataset_from_file(train_dataset_file_path)

    logger.debug("Loading the test dataset ..")
    test_dataset = file_reader.get_dataset_from_file(test_dataset_file_path)

    # time.clock() was removed in Python 3.8; use time.perf_counter() when the
    # interpreter has it and fall back to time.clock() only on older ones.
    timer = getattr(time, "perf_counter", None) or time.clock

    if mode == 'n':
        start = timer()
        logger.debug("Generating the naive bayes model using training dataset ..")
        naive_bayes_model = naive_bayes.Naivebayes(train_dataset)

        logger.debug("Evaulating the test dataset on Naive Bayes Model ..")
        naive_bayes.evaluate_naive_bayes_model(naive_bayes_model, test_dataset)

        logger.debug("Time taken for running Naive Bayes Model is " + str(timer() - start) + " s ")
    else:
        start = timer()
        logger.debug("Generating the TAN model using training dataset ..")
        tan_model = tan.TAN(train_dataset)

        logger.debug("Evaulating the test dataset on TAN model ..")
        tan.evaluate_tan_model(tan_model, test_dataset)

        logger.debug("Time taken for running TAN Model is " + str(timer() - start) + " s ")
示例#2
0
def get_learning_curve_with_number_of_instances():
    """Plot average test-set accuracy of Naive Bayes and TAN per training size.

    Loads the training and test datasets from TRAIN_DATASET_FILE_PATH and
    TEST_DATASET_FILE_PATH. For every configured training-set size it draws
    the configured number of samples with get_stratified_random_data_set,
    builds a Naive Bayes and a TAN model from each sample, evaluates both on
    the full test dataset, and averages the accuracies (as percentages) per
    size. The averages are logged and plotted, and the figure is saved to
    "graphs/hw3_accuracy_vs_size_run2.png".
    """
    train_dataset = file_reader.get_dataset_from_file(TRAIN_DATASET_FILE_PATH)
    test_dataset = file_reader.get_dataset_from_file(TEST_DATASET_FILE_PATH)

    # Training-set size -> number of samples drawn (and models built) at that size.
    training_set_sizes_repetitions = {25: 4, 50: 4, 100: 1}
    training_data_set_sizes = sorted(training_set_sizes_repetitions)

    avg_nb_test_data_accuracy, avg_tan_test_data_accuracy = [], []
    for size in training_data_set_sizes:
        num_repetitions = training_set_sizes_repetitions[size]

        nb_test_set_accuracies, tan_test_set_accuracies = [], []
        for _ in range(num_repetitions):
            stratified_training_data_set = get_stratified_random_data_set(train_dataset, size)

            # Test for Naive Bayes Model
            naive_bayes_model = naive_bayes.Naivebayes(stratified_training_data_set)
            test_set_accuracy = naive_bayes.evaluate_naive_bayes_model(naive_bayes_model, test_dataset)
            nb_test_set_accuracies.append(test_set_accuracy*100)
            logger.info("Naive Bayes accuracy : " + str(test_set_accuracy*100) + " % ")

            # Test for TAN model
            tan_model = tan.TAN(stratified_training_data_set)
            test_set_accuracy = tan.evaluate_tan_model(tan_model, test_dataset)
            tan_test_set_accuracies.append(test_set_accuracy*100)
            logger.info("TAN accuracy : " + str(test_set_accuracy*100) + " % ")

        # NOTE(review): dividing by a Decimal only works if the evaluate_*
        # functions return Decimal-compatible numbers (a float would raise
        # TypeError) -- confirm against their implementations.
        avg_nb_test_data_accuracy.append(sum(nb_test_set_accuracies)/Decimal(len(nb_test_set_accuracies)))
        avg_tan_test_data_accuracy.append(sum(tan_test_set_accuracies)/Decimal(len(tan_test_set_accuracies)))

    # Report through the module logger, like the per-run messages above,
    # instead of the root logger (logging.info), so that all output of this
    # function shares one logger configuration.
    logger.info("\n\n Training data set sizes : " + str(training_data_set_sizes))
    logger.info("\n\n NB test set accuracies : " + str(avg_nb_test_data_accuracy))
    logger.info("\n\n TAN test set accuracies : " + str(avg_tan_test_data_accuracy))

    # Plot the graph
    plt.figure()

    plt.plot(training_data_set_sizes, avg_nb_test_data_accuracy, label="Naive Bayes", marker='H')
    plt.plot(training_data_set_sizes, avg_tan_test_data_accuracy, label="TAN", marker='H')

    plt.xlabel("Training set size")
    plt.ylabel("Test set accuracy (%)")
    plt.title("Test accuracy vs Training set size")

    plt.xlim(0, 101)
    plt.ylim(0, 100)

    plt.grid(True)
    plt.legend(loc="lower right")

    plt.savefig("graphs/hw3_accuracy_vs_size_run2.png")
示例#3
0
def main(argv):
    """Learn a decision tree from training data, print it, and test it.

    Args:
        argv: Exactly three command-line arguments (program name excluded):
            the training file path, the test file path, and the leaf
            threshold, which is converted with int().

    Raises:
        ValueError: If argv does not hold exactly three items, or the leaf
            threshold is not an integer literal.
    """
    # Validate explicitly rather than with assert: assert statements are
    # removed when Python runs with -O.
    if len(argv) != 3:
        raise ValueError(" Please provide correct arguments as follows : python dt-learn.py <train file path> <test file path> <leaf threshold>")
    train_dataset_file_path, test_dataset_file_path, leaf_threshold_arg = argv
    leaf_threshold = int(leaf_threshold_arg)

    # 1) load the training data set
    train_dataset = file_reader.get_dataset_from_file(train_dataset_file_path)

    # 2) generate a decision tree using training data set
    training_dataset_dtree = dtree.learn_dtree(train_dataset, leaf_threshold)

    # A single parenthesised string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("\n\n=================== DECISION TREE =====================================\n\n")
    dtree.print_dtree(training_dataset_dtree, train_dataset.output.values, " ")

    # 3) load the test data set
    test_dataset = file_reader.get_dataset_from_file(test_dataset_file_path)

    # 4) evaluate the decision tree using the test data set
    print("\n\n================== TEST DATA SET EVALUATION ==========================\n\n")
    dtree.test_dtree(training_dataset_dtree, test_dataset)