def reduced_error_prune(root, val_data):
    """Prunes nodes from a decision tree through reduced error pruning. Iterates through nodes
    and removes nodes which, when removed, cause the decision tree classifier to perform better
    on the validation data. Performance on validation data is compared using f1-score.

    Arguments:
    root --- The root node of the decision tree.
    val_data --- The validation data set.
    """
    y_true = get_labels(val_data)
    y_pred = [decision_tree_classify(item, root) for item in val_data]
    base_score = f1_score(y_true, y_pred)
    _reduced_error_prune(root, root, base_score, val_data, y_true)
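
# A minimal sketch of what the _reduced_error_prune helper might look like.
# The node attributes used here (is_leaf, label, majority_label, children)
# are assumptions, not necessarily the tree API used above: prune bottom-up,
# keeping any prune that does not hurt the validation f1-score.
def _reduced_error_prune(node, root, best_score, val_data, y_true):
    if node.is_leaf:
        return best_score
    for child in node.children.values():
        best_score = _reduced_error_prune(child, root, best_score, val_data, y_true)
    # Tentatively collapse this node into a leaf predicting its majority label.
    node.is_leaf, node.label = True, node.majority_label
    y_pred = [decision_tree_classify(item, root) for item in val_data]
    pruned_score = f1_score(y_true, y_pred)
    if pruned_score >= best_score:
        return pruned_score  # the prune helped (or tied); keep it
    node.is_leaf, node.label = False, None  # the prune hurt; revert it
    return best_score
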
def tune_max_depth(training_data, val_data):
    """Gets the f_scores for decision trees with different maximum depths on the validation data.

    Arguments:
    training_data, val_data --- The training and validation data sets.
    """
    depths, scores = [], []
    y_true = get_labels(val_data)
    for depth in range(2, 14):
        root = build_decision_tree(training_data, depth)
        y_pred = [decision_tree_classify(item, root) for item in val_data]
        depths.append(depth)
        scores.append(f1_score(y_true, y_pred))

    return depths, scores
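
# Example usage (a sketch; load_data and the adult.* paths come from the
# driver script in the later examples):
#
#     training_data = load_data('data/adult.data')
#     val_data = load_data('data/adult.val')
#     depths, scores = tune_max_depth(training_data, val_data)
#     best_depth = depths[scores.index(max(scores))]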
Example #3
def get_clean_data():
    x, y = load_training_data()

    # Clean up the images, then reshape each from 1x4096 to 64x64x1
    x = threshold(x)
    x = remove_dots(x)
    x = x.reshape(-1, 64, 64, 1)

    test_split = 0.1
    np.random.seed(113)
    indices = np.arange(len(x))
    np.random.shuffle(indices)
    _, num_to_index, _ = get_labels()
    x = x[indices]
    y = y[indices]
    y = [num_to_index[yi] for yi in y.tolist()]

    split = int(len(x) * (1 - test_split))
    x_train = np.array(x[:split])
    y_train = np.array(y[:split])
    x_test = np.array(x[split:])
    y_test = np.array(y[split:])
    return (x_train, y_train), (x_test, y_test)
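
# threshold and remove_dots are called above but not shown. Plausible sketches
# under the assumption that images are grayscale 1x4096 rows and that "dots"
# are small connected noise components; the cutoffs and the use of
# scipy.ndimage are assumptions, not the original implementation.
from scipy import ndimage

def threshold_sketch(x, cutoff=220):
    # Binarize: keep bright (ink) pixels, zero out everything else.
    return (x > cutoff).astype(np.uint8) * 255

def remove_dots_sketch(x, min_size=20):
    # Drop connected components smaller than min_size pixels from each image.
    out = x.reshape(-1, 64, 64).copy()
    for img in out:
        labeled, n = ndimage.label(img)
        sizes = ndimage.sum(img > 0, labeled, range(1, n + 1))
        for comp, size in enumerate(sizes, start=1):
            if size < min_size:
                img[labeled == comp] = 0
    return out.reshape(x.shape)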
Example #4
def main():
    args = parse_args()

    data = load_data('data/adult.data')
    test_data = load_data('data/adult.test2')
    val_data = load_data('data/adult.val')

    if args.depth_plot:
        print('Calculating f1-scores for different depths...')
        depths, scores = dt.tune_max_depth(data, val_data)
        plt.plot(depths, scores)
        plt.ylabel('F1-score')
        plt.xlabel('Maximum Depth')
        plt.show()
        return

    baseline_tree = dt.build_decision_tree(
        data, max_depth=1, forced_attribute=args.baseline_attribute)
    print('Building decision tree...')
    dt_start = time.time()
    if args.depth is not None:
        tree = dt.build_decision_tree(data, max_depth=args.depth)
    else:
        tree = dt.build_decision_tree(data)

    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])

    if args.rep:
        print('Pruning decision tree (reduced error)...')
        dtre_start = time.time()
        dt.reduced_error_prune(tree, val_data)
        print('Decision tree pruned (reduced error) in ' +
              str(time.time() - dtre_start) + ' s.')
        dtre_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [tree])
    elif args.csp:
        print('Pruning decision tree (chi-square)...')
        dtcs_start = time.time()
        dt.chi_square_prune(tree)
        print('Decision tree pruned (chi-square) in ' +
              str(time.time() - dtcs_start) + ' s.')
        dtcs_metrics = compute_metrics(dt.decision_tree_classify, test_data,
                                       [tree])

    y_train = get_labels(data)
    y_test = get_labels(test_data)

    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]
    feature_names = features[2]
    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = LogisticRegression(solver='sag').fit(X_train, y_train)

    print('Logistic regression model built in ' + str(time.time() - lr_start) +
          ' s.')

    if args.lr_top is not None:
        print('Top weighted features in logistic regression model: ' +
              str(get_lr_top_weights(lr_model, args.lr_top, feature_names)[0]))
    if args.lr_bot is not None:
        print(
            'Top negatively weighted features in logistic regression model: ' +
            str(get_lr_top_weights(lr_model, args.lr_bot, feature_names)[1]))

    lr_pred = lr_model.predict(X_test)

    weights = perceptron.perceptron(X_train, y_train, 10)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    perceptron_metrics = (
        sum(yt == yp for yt, yp in zip(y_test, perceptron_pred)) / len(test_data),
        precision_score(y_test, perceptron_pred),
        recall_score(y_test, perceptron_pred),
        f1_score(y_test, perceptron_pred),
    )
    lr_metrics = (
        sum(yt == yp for yt, yp in zip(y_test, lr_pred)) / len(test_data),
        precision_score(y_test, lr_pred),
        recall_score(y_test, lr_pred),
        f1_score(y_test, lr_pred),
    )

    print('Baseline:')
    print('Accuracy: ' + str(baseline_metrics[0]))
    print('Precision: ' + str(baseline_metrics[1]))
    print('Recall: ' + str(baseline_metrics[2]))
    print('F1 Score: ' + str(baseline_metrics[3]))

    print('\nDecision Tree:')
    print('Accuracy: ' + str(dt_metrics[0]))
    print('Precision: ' + str(dt_metrics[1]))
    print('Recall: ' + str(dt_metrics[2]))
    print('F1 Score: ' + str(dt_metrics[3]))

    if args.rep:
        print('\nDecision Tree (w/ reduced error pruning):')
        print('Accuracy: ' + str(dtre_metrics[0]))
        print('Precision: ' + str(dtre_metrics[1]))
        print('Recall: ' + str(dtre_metrics[2]))
        print('F1 Score: ' + str(dtre_metrics[3]))
    elif args.csp:
        print('\nDecision Tree (w/ chi-square pruning):')
        print('Accuracy: ' + str(dtcs_metrics[0]))
        print('Precision: ' + str(dtcs_metrics[1]))
        print('Recall: ' + str(dtcs_metrics[2]))
        print('F1 Score: ' + str(dtcs_metrics[3]))

    print('\nPerceptron:')
    print('Accuracy: ' + str(perceptron_metrics[0]))
    print('Precision: ' + str(perceptron_metrics[1]))
    print('Recall: ' + str(perceptron_metrics[2]))
    print('F1 Score: ' + str(perceptron_metrics[3]))

    print('\nLogistic Regression:')
    print('Accuracy: ' + str(lr_metrics[0]))
    print('Precision: ' + str(lr_metrics[1]))
    print('Recall: ' + str(lr_metrics[2]))
    print('F1 Score: ' + str(lr_metrics[3]))

    if args.plot:
        metrics_baseline = tuple(baseline_metrics[:4])
        metrics_dt = tuple(dt_metrics[:4])
        metrics_perceptron = tuple(perceptron_metrics[:4])
        metrics_lr = tuple(lr_metrics[:4])
        metrics_dtre, metrics_dtcs = None, None
        if args.rep:
            metrics_dtre = tuple(dtre_metrics[:4])
        elif args.csp:
            metrics_dtcs = tuple(dtcs_metrics[:4])
        plot_metrics(metrics_baseline, metrics_dt, metrics_perceptron,
                     metrics_lr, metrics_dtre, metrics_dtcs)
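
# compute_metrics is used above but not shown; a minimal sketch, assuming it
# returns (accuracy, precision, recall, f1) for a classifier called with each
# item plus the extra positional arguments (here, the tree).
def compute_metrics_sketch(classify_fn, test_data, extra_args):
    y_true = get_labels(test_data)
    y_pred = [classify_fn(item, *extra_args) for item in test_data]
    accuracy = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)
    return (accuracy, precision_score(y_true, y_pred),
            recall_score(y_true, y_pred), f1_score(y_true, y_pred))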
Example #5
def main():
    data = load_data('data/adult.data')
    baseline_tree = dt.build_decision_tree(data, max_depth=1)
    print('Building decision tree...')
    dt_start = time.time()
    tree = dt.build_decision_tree(data)
    print('Decision tree built in ' + str(time.time() - dt_start) + ' s.')

    test_data = load_data('data/adult.val')
    baseline_metrics = compute_metrics(dt.decision_tree_classify, test_data, [baseline_tree])
    dt_metrics = compute_metrics(dt.decision_tree_classify, test_data, [tree])
    
    y_train = get_labels(data)
    y_test = get_labels(test_data)

    features = extract_features(data, test_data)
    X_train = features[0]
    X_test = features[1]

    print('Building logistic regression model...')
    lr_start = time.time()
    lr_model = build_lr_model(X_train, y_train)
    print('Logistic regression model built in ' + str(time.time() - lr_start) + ' s.')

    lr_pred = lr_model.predict(X_test)

    # Perceptron (from-scratch implementation)
    weights = perceptron.perceptron(X_train, y_train, 6)
    perceptron_pred = perceptron.perceptron_test(X_test, weights)

    # scikit-learn's Perceptron
    perceptron_ski = build_perceptron_ski(X_train, y_train)
    y_percep_pred = perceptron_ski.predict(X_test)
    '''
    Result:
    Accuracy: 0.8032061912658928
    Precision: 0.5655369538587178
    Recall: 0.7202288091523661
    F1 Score: 0.6335773101555352
    '''

    # Gaussian Naive Bayes
    naive_bayes_model = build_naive_bayes(X_train, y_train)
    y_naive_bayes_pred = naive_bayes_model.predict(X_test)

    '''
    Result:
    Accuracy: 0.48473680977826916
    Precision: 0.3092619027626165
    Recall: 0.9576183047321893
    F1 Score: 0.4675341161536021
    '''


    print('Baseline:')
    print('Accuracy: ' + str(baseline_metrics[0]))
    print('Precision: ' + str(baseline_metrics[1]))
    print('Recall: ' + str(baseline_metrics[2]))
    print('F1 Score: ' + str(baseline_metrics[3]))
    
    print('\nDecision Tree:')
    print('Accuracy: ' + str(dt_metrics[0]))
    print('Precision: ' + str(dt_metrics[1]))
    print('Recall: ' + str(dt_metrics[2]))
    print('F1 Score: ' + str(dt_metrics[3]))

    print('\nLogistic Regression:')
    print('Accuracy: ' + str([y_test[i] == lr_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, lr_pred)))
    print('Recall: ' + str(recall_score(y_test, lr_pred)))
    print('F1 Score: ' + str(f1_score(y_test, lr_pred)))

    print('\nPerceptron:')
    print('Accuracy: ' + str([y_test[i] == perceptron_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, perceptron_pred)))
    print('Recall: ' + str(recall_score(y_test, perceptron_pred)))
    print('F1 Score: ' + str(f1_score(y_test, perceptron_pred)))

    print('\nPerceptron (scikit-learn):')
    print('Accuracy: ' + str([y_test[i] == y_percep_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, y_percep_pred)))
    print('Recall: ' + str(recall_score(y_test, y_percep_pred)))
    print('F1 Score: ' + str(f1_score(y_test, y_percep_pred)))

    print('\nNaive Bayes (scikit-learn):')
    print('Accuracy: ' + str([y_test[i] == y_naive_bayes_pred[i] for i in range(len(y_test))].count(True) / len(test_data)))
    print('Precision: ' + str(precision_score(y_test, y_naive_bayes_pred)))
    print('Recall: ' + str(recall_score(y_test, y_naive_bayes_pred)))
    print('F1 Score: ' + str(f1_score(y_test, y_naive_bayes_pred)))

    print("\nCross Validation")
Example #6
args = parser.parse_args()

# set the testing data transform
img_transform = transform.Compose([
    transform.Resize((300, 300)),
    transform.ToTensor(),
    transform.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
])

# load the testing model and data
network = torch.load('./model/' + args.test_model)
print(network)
test_data = load_data.Load_testdata(transform=img_transform)
test_load = Data.DataLoader(dataset=test_data, batch_size=1, shuffle=False)
labels = load_data.get_labels()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
network.to(device)
print(device)

# start testing the model
def test_model():
    ans = []
    ids = []
    network.eval()
    with torch.no_grad():
        for data in test_load:
            x, y = data
            x = x.to(device)
            outputs = network(x)
            predict = torch.max(outputs.data, 1)[1]
            # The original snippet ends at the line above; collecting the
            # results is an assumed completion (y is taken to be the sample id).
            ids.append(y[0] if isinstance(y, (list, tuple)) else y.item())
            ans.append(predict.item())
    return ids, ans
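
# A hedged usage sketch: run test_model() and write one "Id,Label" row per test
# image. The output filename, and treating `labels` as an index-to-name lookup,
# are assumptions.
if __name__ == '__main__':
    ids, preds = test_model()
    with open('predictions.csv', 'w') as f:
        print('Id,Label', file=f)
        for sample_id, pred in zip(ids, preds):
            print(str(sample_id) + ',' + str(labels[pred]), file=f)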
Example #7
print("Removing dots...")
#x_test = remove_dots(x_test)

# (Debug) tile the first 16 cleaned 64x64 images into a 256x256 preview:
#new_im = np.zeros((256, 256))
#r = 0
#for i in range(0, 256,64):
#    for j in range(0, 256, 64):
#        # paste the image at location i,j
#        new_im[i:i+64, j:j+64] = x_test[r]
#        r += 1
#
#new_im = scipy.ndimage.zoom(new_im, 4, order=0)
#scipy.misc.imsave('sample_clean.jpg', new_im)

#scipy.misc.imshow(new_im) # to visualize only
#quit()

print("Generating predictions...")
x_test = x_test.reshape(-1, 64, 64, 1)
predictions = model.predict(x_test)

labels, _, _ = get_labels()
labels = list(labels)

print("Writing predictions to " + output_path + "...")
with open(output_path, 'w') as output:
    print("Id,Label", file=output)
    for i, prediction in enumerate(predictions):
        print(str(i + 1) + "," + str(labels[np.argmax(prediction)]),
              file=output)
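
# get_labels() (used here and in get_clean_data earlier) is not shown; a
# minimal sketch consistent with its three return values: the ordered class
# labels, a label-to-index map, and the inverse map. Deriving the classes from
# load_training_data() is an assumption.
def get_labels_sketch():
    _, y = load_training_data()
    class_labels = sorted(set(y.tolist()))
    num_to_index = {lab: i for i, lab in enumerate(class_labels)}
    index_to_num = {i: lab for i, lab in enumerate(class_labels)}
    return class_labels, num_to_index, index_to_num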