Пример #1
0
def all_ks_accuracies(df, dependent_variable='Cookie Type'):
    """Return the leave-one-out accuracy for every k from 1 to len(df) - 1.

    For each k, every row is held out in turn, a
    KNearestNeighborsClassifier(k=k) is fitted on the remaining rows, and
    the held-out row is classified. A line reading 'correct' or
    'incorrect' followed by the row number is printed for every
    classification.

    Args:
        df: Table of observations. It must support len(), .iloc[i],
            .index and .drop(label) -- presumably a pandas DataFrame;
            confirm against the caller. It is not modified.
        dependent_variable: Name of the column holding the class label.
            Defaults to 'Cookie Type', the column this function used to
            hard-code.

    Returns:
        A list of floats; element i is the fraction of rows classified
        correctly with k = i + 1.
    """
    n_rows = len(df)
    # Collect results in a local list; the name was previously never
    # initialised inside the function.
    accuracies = []

    # Looping through all values of k.
    for k_value in range(1, n_rows):
        num_correct = 0
        # Looping through all the rows of the dataframe.
        for row_num in range(n_rows):
            row = df.iloc[row_num]
            # Build the training set without the held-out row. drop()
            # returns a new object, so the caller's df keeps every row
            # and len(df) stays constant across iterations.
            training_df = df.drop(df.index[row_num])

            # Fit the reduced dataframe to KNearestNeighborsClassifier.
            knn = KNearestNeighborsClassifier(k=k_value)
            knn.fit(training_df, dependent_variable=dependent_variable)

            # Classify the held-out row and compare with its true label.
            if knn.classify(row) == row[dependent_variable]:
                num_correct += 1
                print('correct', row_num)
            else:
                print('incorrect', row_num)
        accuracies.append(num_correct / n_rows)
    return accuracies
def leave_one_out_cross_validation_accuracy(df, dependent_variable, k):
    """Return the leave-one-out cross-validation accuracy of a k-NN model.

    Each row of df is held out once. A KNearestNeighborsClassifier(k) is
    fitted on get_df_without_row(df, row) and asked to classify the
    held-out row's values for every column except dependent_variable.

    Args:
        df: Object exposing to_array() and a data_dict mapping each
            column name to an indexable sequence of values.
        dependent_variable: Name of the column holding the class label.
        k: Value passed to the KNearestNeighborsClassifier constructor.

    Returns:
        Number of correct predictions divided by the number of rows.
    """
    row_count = len(df.to_array())
    hits = 0

    for held_out in range(row_count):
        # Train on every row except the one currently held out.
        model = KNearestNeighborsClassifier(k)
        model.fit(get_df_without_row(df, held_out), dependent_variable)

        # Values of the held-out row, with the label column left out.
        features = {}
        for column, values in df.data_dict.items():
            if column != dependent_variable:
                features[column] = values[held_out]

        predicted = model.classify(features)
        actual = df.data_dict[dependent_variable][held_out]
        if predicted == actual:
            hits += 1

    return hits / row_count
def classifying(dataset, k=5, n_folds=5):
    """Run n-fold cross-validation of a k-NN classifier and print results.

    The dataset is split with DatasetUtils.cross_validation_split. Each
    fold in turn is the test set while the remaining folds are
    concatenated into the training set. One accuracy per fold is computed
    with Utils.get_accuracy from (predicted, item) pairs.

    Args:
        dataset: Data handed to DatasetUtils.cross_validation_split.
        k: Second argument of the KNearestNeighborsClassifier constructor.
        n_folds: Number of folds requested from the split helper.

    Prints the per-fold accuracies, their mean, and a blank line.
    Returns None.
    """
    folds = DatasetUtils.cross_validation_split(dataset, n_folds)
    fold_accuracies = []

    for held_out_fold in folds:
        # Copy the fold list, take out the held-out fold, then flatten
        # the remaining folds into a single training list.
        remaining_folds = folds[:]
        remaining_folds.remove(held_out_fold)
        training_items = sum(remaining_folds, [])

        model = KNearestNeighborsClassifier(training_items, k)

        # Pair every prediction with the item it was made for.
        predictions = []
        for item in held_out_fold:
            predictions.append((model.classify(item), item))

        fold_accuracies.append(Utils.get_accuracy(predictions))

    formatted = ['{:.3}'.format(value) for value in fold_accuracies]
    print('Accuracies: {}'.format(', '.join(formatted)))
    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    print('Mean Accuracy: {:.3}'.format(mean_accuracy))
    print()
# Cookie recipes: one row per recipe, the label first and then the four
# ingredient portions in the order given by `columns`.
df = DataFrame.from_array(
    [
        ['Shortbread', 0.14, 0.14, 0.28, 0.44],
        ['Shortbread', 0.10, 0.18, 0.28, 0.44],
        ['Shortbread', 0.12, 0.10, 0.33, 0.45],
        ['Shortbread', 0.10, 0.25, 0.25, 0.40],
        ['Sugar', 0.00, 0.10, 0.40, 0.50],
        ['Sugar', 0.00, 0.20, 0.40, 0.40],
        ['Sugar', 0.10, 0.08, 0.35, 0.47],
        ['Sugar', 0.00, 0.05, 0.30, 0.65],
        ['Fortune', 0.20, 0.00, 0.40, 0.40],
        ['Fortune', 0.25, 0.10, 0.30, 0.35],
        ['Fortune', 0.22, 0.15, 0.50, 0.13],
        ['Fortune', 0.15, 0.20, 0.35, 0.30],
        ['Fortune', 0.22, 0.00, 0.40, 0.38],
    ],
    columns=[
        'Cookie Type',
        'Portion Eggs',
        'Portion Butter',
        'Portion Sugar',
        'Portion Flour',
    ],
)

# Fit a 5-nearest-neighbours classifier that predicts the cookie type.
knn = KNearestNeighborsClassifier(k=5)
knn.fit(df, dependent_variable='Cookie Type')

# Unlabelled recipe to compare against the fitted rows.
observation = {
    'Portion Eggs': 0.10,
    'Portion Butter': 0.15,
    'Portion Sugar': 0.30,
    'Portion Flour': 0.45,
}

print(knn.compute_distances(observation).to_array())
# Returns a dataframe representation of the following array:

# [[0.047, 'Shortbread'],
#  [0.037, 'Shortbread'],
#  [0.062, 'Shortbread'],
#  [0.122, 'Shortbread'],
     ['Shortbread', 0.10, 0.25, 0.25, 0.40], ['Sugar', 0.00, 0.10, 0.40, 0.50],
     ['Sugar', 0.00, 0.20, 0.40, 0.40], ['Sugar', 0.02, 0.08, 0.45, 0.45],
     ['Sugar', 0.10, 0.15, 0.35, 0.40], ['Sugar', 0.10, 0.08, 0.35, 0.47],
     ['Sugar', 0.00, 0.05, 0.30, 0.65], ['Fortune', 0.20, 0.00, 0.40, 0.40],
     ['Fortune', 0.25, 0.10, 0.30, 0.35], ['Fortune', 0.22, 0.15, 0.50, 0.13],
     ['Fortune', 0.15, 0.20, 0.35, 0.30], ['Fortune', 0.22, 0.00, 0.40, 0.38],
     ['Shortbread', 0.05, 0.12, 0.28, 0.55],
     ['Shortbread', 0.14, 0.27, 0.31, 0.28],
     ['Shortbread', 0.15, 0.23, 0.30, 0.32],
     ['Shortbread', 0.20, 0.10, 0.30, 0.40]],
    columns=[
        'Cookie Type', 'Portion Eggs', 'Portion Butter', 'Portion Sugar',
        'Portion Flour'
    ])

# Baseline: leave-one-out cross-validation accuracy with k=5.
knn = KNearestNeighborsClassifier(df, prediction_column='Cookie Type', k=5)

cv = LeaveOneOutCrossValidator(knn, df, prediction_column='Cookie Type')

assert cv.accuracy() == 0.7894736842105263

# Sweep k and record the leave-one-out accuracy for each value.
accuracies = []
for k in range(1, len(df.to_array()) - 1):
    # Build the classifier the same way as the k=5 baseline above. The
    # previous bare positional call put k in the first argument slot,
    # which the baseline call uses for the dataframe.
    # NOTE(review): the constructor signature is not visible in this
    # file -- confirm that (df, prediction_column, k) is the intended form.
    knn = KNearestNeighborsClassifier(
        df, prediction_column='Cookie Type', k=k)
    cv = LeaveOneOutCrossValidator(knn, df, prediction_column='Cookie Type')
    accuracies.append(cv.accuracy())

answers = [
    0.5789473684210527,
    0.5789473684210527,  #(Updated!)
    0.5789473684210527,
Пример #6
0
# Fit the tree held in max_depth_10_decision_tree, run it over
# testing_dataframe through get_classifications, and print each result
# on its own line.
# NOTE(review): a ''' delimiter follows immediately below. If an earlier
# ''' outside this excerpt is still open, these lines are part of a
# commented-out string rather than live code -- confirm.
max_depth_10_decision_tree.fit()
max_depth_10_decision_tree_classifications = get_classifications(
    max_depth_10_decision_tree, testing_dataframe)
for row in max_depth_10_decision_tree_classifications:
    print(row)
'''
print('\n')
max_depth_3_random_decision_tree = RandomForest(number_of_random_trees = 100, class_name='Survived', max_depth = 5)
max_depth_3_random_decision_tree.fit(dataframe, features=[column for column in dataframe.columns if column != 'Survived'])
max_depth_3_random_decision_tree_classifications = get_classifications(max_depth_3_random_decision_tree, testing_dataframe)
for row in max_depth_3_random_decision_tree_classifications:
    print(row)

print('\n')
max_depth_5_random_decision_tree = RandomForest(number_of_random_trees = 100, class_name='Survived', max_depth = 10)
max_depth_5_random_decision_tree.fit(dataframe, features=[column for column in dataframe.columns if column != 'Survived'])
max_depth_5_random_decision_tree_classifications = get_classifications(max_depth_5_random_decision_tree, testing_dataframe)
for row in max_depth_5_random_decision_tree_classifications:
    print(row)
print('\n')

'''
# Fit a KNearestNeighborsClassifier (constructor defaults) on `dataframe`
# with 'Survived' as the second fit() argument, then classify
# testing_dataframe.
# NOTE(review): this calls `classifications(...)`, while the surrounding
# code calls `get_classifications(...)` -- confirm both helpers exist.
k_nearest_neighbors_classifier = KNearestNeighborsClassifier()
k_nearest_neighbors_classifier.fit(dataframe, 'Survived')
k_nearest_neighbors_classifications = classifications(
    k_nearest_neighbors_classifier, testing_dataframe)
# Accuracy is the share of entries equal to True in the result; this
# raises ZeroDivisionError if the result is empty.
print(
    'k_nearest_neighbors_classifier accuracy',
    k_nearest_neighbors_classifications.count(True) /
    len(k_nearest_neighbors_classifications))
Пример #7
0
     ['Fortune', 0.15, 0.20, 0.35, 0.30], ['Fortune', 0.22, 0.00, 0.40, 0.38],
     ['Shortbread', 0.05, 0.12, 0.28, 0.55],
     ['Shortbread', 0.14, 0.27, 0.31, 0.28],
     ['Shortbread', 0.15, 0.23, 0.30, 0.32],
     ['Shortbread', 0.20, 0.10, 0.30, 0.40]],
    columns=[
        'Cookie Type', 'Portion Eggs', 'Portion Butter', 'Portion Sugar',
        'Portion Flour'
    ])

plot_data = []
k = 18  # exclusive upper bound of the neighbour counts tested below
arr = []

# For every neighbour count n, run a manual leave-one-out pass over the
# rows and count how many held-out rows are classified correctly.
for n in range(1, k):
    knn = KNearestNeighborsClassifier(df, 'Cookie Type', n)
    correct_observations = 0
    print('Testing k = ' + str(n))
    for i in range(len(knn.df.to_array())):
        # First entry of row i; presumably the 'Cookie Type' label, since
        # that is the first column -- confirm against the df definition.
        correct = knn.df.to_array()[i][0]
        # Feature values of row i, without the label column.
        observation = {
            column: knn.df.ordered_dict[column][i]
            for column in knn.df.columns if column != 'Cookie Type'
        }
        # Training data with row i removed.
        copy = knn.df.to_array()
        del copy[i]
        df1 = DataFrame.from_array(copy, columns=knn.df.columns)
        # Use the neighbour count under test (n). Passing the loop bound k
        # here made every iteration evaluate the same 18-neighbour model.
        knn2 = KNearestNeighborsClassifier(df1, 'Cookie Type', n)

        if knn2.fit(observation) == correct:
            correct_observations += 1