示例#1
0
文件: lab.py 项目: waque/CD
def exercise2():
    """Run the random-forest experiments of exercise 2.

    Reads ``breast_cancer.csv``, splits it into train/test partitions with
    the project helpers and prints the statistics returned by
    ``classifier_statistics`` (minus the ``'predicted'`` entry) for:

    * 2.a -- a forest with default parameters;
    * 2.b -- forests with 10, 20, ..., 200 trees;
    * 2.c -- forests with a maximum depth of 5 through 19.
    """
    dataset = pd.read_csv('breast_cancer.csv')

    X, y = split_dataset_transformed(dataset, 'Class', ['?'])
    X_train, X_test, y_train, y_test = split_train_test(X, y)
    partitions = (X_train, X_test, y_train, y_test)

    def report(forest):
        # Evaluate one forest on the shared split and print its statistics.
        stats = classifier_statistics(forest, *partitions)
        print_dict(stats, ['predicted'])

    # 2.a
    report(RandomForestClassifier())

    # 2.b
    for n_trees in np.arange(10, 201, step=10):
        print('Experimenting with {} number of trees'.format(n_trees))
        report(RandomForestClassifier(n_estimators=n_trees))

    # 2.c
    for max_depth in np.arange(5, 20):
        print('Experimenting with {} depth'.format(max_depth))
        report(RandomForestClassifier(max_depth=max_depth))
示例#2
0
文件: lab.py 项目: waque/CD
def exercise3():
    """Compare train-set and test-set statistics on the credit data.

    Reads ``credit.csv``, splits it once, and then for every parameter value
    prints the statistics twice: first with the training partition passed in
    place of the test partition ("Train data"), then with the real test
    partition ("Test data"). A fresh classifier is built for each of the two
    evaluations.

    Two sweeps are run:

    * decision trees with ``min_samples_split`` from 2 through 10;
    * random forests with 10, 20, ..., 200 trees.
    """
    credit = pd.read_csv('credit.csv')

    X, y = split_dataset_transformed(credit, 'class')
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    def report_train_and_test(build_classifier):
        # Print the train-set block, the test-set block and two blank lines.
        print('Train data')
        train_stats = classifier_statistics(
            build_classifier(), X_train, X_train, y_train, y_train)
        print_dict(train_stats, ['predicted'])

        print('Test data')
        test_stats = classifier_statistics(
            build_classifier(), X_train, X_test, y_train, y_test)
        print_dict(test_stats, ['predicted'])

        print()
        print()

    for samples in np.arange(2, 11):
        print('Experimenting with {} number of instances to split'.format(
            samples))
        # The factory is called immediately, so it sees the current value.
        report_train_and_test(
            lambda: DecisionTreeClassifier(min_samples_split=samples))

    for trees in np.arange(10, 201, step=10):
        print('Experimenting with {} number of trees'.format(trees))
        report_train_and_test(
            lambda: RandomForestClassifier(n_estimators=trees))
示例#3
0
def naive_bayes():
    """Evaluate Gaussian naive Bayes on every entry of the global ``data``.

    For each entry, element ``[0]`` is split on the ``'consensus'`` column
    and divided into train/test partitions; the resulting statistics are
    handed to ``export_file`` together with element ``[1]`` of the entry.
    The same ``GaussianNB`` instance is reused for all entries.
    """
    model = GaussianNB()
    for entry in data:
        features, target = split_dataset_transformed(entry[0], 'consensus')
        X_train, X_test, y_train, y_test = split_train_test(features, target)
        stats = classifier_statistics(model, X_train, X_test, y_train, y_test)
        export_file(stats, entry[1], 'Naive bayes', "")
示例#4
0
文件: full_analysis.py 项目: waque/CD
def compare_baseline_clf():
    """Evaluate the baseline classifiers and record their headline measures.

    Three naive Bayes variants, k-nearest-neighbours with k in {3, 5, 10}
    and random forests with {25, 50, 100} trees are evaluated on the
    module-level ``X_train``/``X_test``/``y_train``/``y_test``. For each one
    the confusion matrix is passed to ``score`` and the accuracy,
    sensibility and specificity are collected as one row each
    (``Classifier``, ``Measure``, ``Value``).

    The rows are written to ``plot_data/initial_results.csv`` and drawn as a
    grouped bar plot; the figure is then cleared without being saved.
    """
    CLASSIFIER = 'Classifier'
    neighbors = [3, 5, 10]
    estimators = [25, 50, 100]

    classifiers = {
        'Naive Bayes': [
            (GaussianNB(), 'Gaussian'),
            (MultinomialNB(), 'Multinomial'),
            (BernoulliNB(), 'Bernoulli'),
        ],
        'K Nearest Neighbors': [
            (KNeighborsClassifier(n_neighbors=k),
             'K Nearest Neighbors {}'.format(k))
            for k in neighbors
        ],
        'Random Forest': [
            (RandomForestClassifier(n_estimators=n),
             'Random Forest {}'.format(n))
            for n in estimators
        ],
    }

    # (label written to the 'Measure' column, key in the statistics dict)
    reported = (
        ('Accuracy', 'accuracy'),
        ('Sensibility', 'sensibility'),
        ('Specificity', 'specificity'),
    )

    measures_dict = {}
    for family in classifiers.values():
        for clf, label in family:
            res = classifier_statistics(clf, X_train, X_test, y_train, y_test)
            score(res['confusion_matrix'])

            for measure_name, stats_key in reported:
                # Rows are keyed 0, 1, 2, ... in insertion order.
                measures_dict[len(measures_dict)] = {
                    CLASSIFIER: label,
                    'Measure': measure_name,
                    'Value': res[stats_key],
                }

    measures = pd.DataFrame.from_dict(measures_dict, "index")
    measures.to_csv('plot_data/initial_results.csv')

    plt.figure(figsize=(22, 6))
    sns.barplot(x=CLASSIFIER, y='Value', hue='Measure', data=measures)
    # plt.savefig('images/initial_results.pdf')
    plt.clf()
示例#5
0
def knn():
    """Evaluate k-nearest-neighbours on every entry of the global ``data``.

    For each entry and each k in (2, 3, 10), element ``[0]`` is split on the
    ``'consensus'`` column and divided into train/test partitions (the split
    is redone for every k), and the resulting statistics are handed to
    ``export_file`` with element ``[1]`` of the entry and a ``k=<value>``
    tag.
    """
    neighbor_counts = (2, 3, 10)
    for entry in data:
        for k in neighbor_counts:
            features, target = split_dataset_transformed(entry[0], 'consensus')
            X_train, X_test, y_train, y_test = split_train_test(
                features, target)
            model = KNeighborsClassifier(n_neighbors=k)
            stats = classifier_statistics(
                model, X_train, X_test, y_train, y_test)
            export_file(stats, entry[1], 'KNN', 'k={}'.format(k))
示例#6
0
cost1 = 10
cost2 = 500
conf_matrix = results['confusion_matrix']
fp = conf_matrix[0][1]
fn = conf_matrix[1][0]

total_cost = cost1*fp + cost2*fn

print('Total cost aachived: {}'.format(total_cost))
"""

"""
probably something is wrong here, the results seem to be too good
"""


# Evaluate a 10-nearest-neighbours classifier using the ``*_res`` training
# set (presumably the resampled/balanced one -- confirm against the code
# that defines it) and the regular test set.
clf = KNeighborsClassifier(n_neighbors=10)
results = classifier_statistics(
    clf, X_train_res, X_test, y_train_res, y_test)

print_dict(results, excluded_keys=['predicted'])

# Weighted misclassification cost: entry [0][1] of the confusion matrix is
# weighted by cost1 and entry [1][0] by cost2. The names treat them as false
# positives and false negatives; the matrix orientation is not shown here.
cost1, cost2 = 10, 500
conf_matrix = results['confusion_matrix']
fp, fn = conf_matrix[0][1], conf_matrix[1][0]

total_cost = cost1 * fp + cost2 * fn

print('Total cost aachived: {}'.format(total_cost))
示例#7
0
def naive_bayes_balenced():
    """Evaluate Gaussian naive Bayes using the ``*_res`` training sets.

    For every entry of the global ``data``, ``balance_dataset`` is called on
    element ``[0]`` with the ``'consensus'`` column. The classifier is given
    the ``X_train_res``/``y_train_res`` outputs as training data and the
    regular test partition for evaluation; both label arrays are flattened
    with ``ravel()``. The statistics are handed to ``export_file`` together
    with element ``[1]`` of the entry.
    """
    model = GaussianNB()
    for entry in data:
        # The plain training partition (positions 0 and 2) is not needed.
        _, X_test, _, y_test, X_train_res, y_train_res = balance_dataset(
            entry[0], 'consensus')
        stats = classifier_statistics(
            model, X_train_res, X_test, y_train_res.ravel(), y_test.ravel())
        export_file(stats, entry[1], 'Naive bayes balenced', "")
示例#8
0
def knn():
    """Evaluate k-nearest-neighbours on every entry of the global ``data``.

    For each entry and each k in (2, 3, 10), element ``[0]`` is split on the
    ``'consensus'`` column and divided into train/test partitions (the split
    is redone for every k), and the resulting statistics are handed to
    ``export_file`` with element ``[1]`` of the entry and a ``k=<value>``
    tag.
    """
    neighbor_counts = (2, 3, 10)
    for entry in data:
        for k in neighbor_counts:
            features, target = split_dataset_transformed(entry[0], 'consensus')
            X_train, X_test, y_train, y_test = split_train_test(
                features, target)
            model = KNeighborsClassifier(n_neighbors=k)
            stats = classifier_statistics(
                model, X_train, X_test, y_train, y_test)
            export_file(stats, entry[1], 'KNN', 'k={}'.format(k))

def naive_bayes():
    """Evaluate Gaussian naive Bayes on every entry of the global ``data``.

    For each entry, element ``[0]`` is split on the ``'consensus'`` column
    and divided into train/test partitions; the resulting statistics are
    handed to ``export_file`` together with element ``[1]`` of the entry.
    The same ``GaussianNB`` instance is reused for all entries.
    """
    model = GaussianNB()
    for entry in data:
        features, target = split_dataset_transformed(entry[0], 'consensus')
        X_train, X_test, y_train, y_test = split_train_test(features, target)
        stats = classifier_statistics(model, X_train, X_test, y_train, y_test)
        export_file(stats, entry[1], 'Naive bayes', "")


def naive_bayes_balenced():
    """Evaluate Gaussian naive Bayes using the ``*_res`` training sets.

    For every entry of the global ``data``, ``balance_dataset`` is called on
    element ``[0]`` with the ``'consensus'`` column. The classifier is given
    the ``X_train_res``/``y_train_res`` outputs as training data and the
    regular test partition for evaluation; both label arrays are flattened
    with ``ravel()``. The statistics are handed to ``export_file`` together
    with element ``[1]`` of the entry.
    """
    model = GaussianNB()
    for entry in data:
        # The plain training partition (positions 0 and 2) is not needed.
        _, X_test, _, y_test, X_train_res, y_train_res = balance_dataset(
            entry[0], 'consensus')
        stats = classifier_statistics(
            model, X_train_res, X_test, y_train_res.ravel(), y_test.ravel())
        export_file(stats, entry[1], 'Naive bayes balenced', "")

# Ad-hoc run: a shallow 800-tree random forest on the first dataset.
# NOTE(review): the X_train_res / y_train_res outputs of balance_dataset are
# unpacked but not used here -- the forest is given the plain X_train/y_train.
# Confirm whether that is intended.
clf_RF = RandomForestClassifier(n_estimators=800, max_depth=4)
(X_train, X_test, y_train, y_test,
 X_train_res, y_train_res) = balance_dataset(data[0][0], 'consensus')
res = classifier_statistics(
    clf_RF, X_train, X_test, y_train, y_test)
pprint(res)
示例#9
0
文件: lab.py 项目: waque/CD
        print_dict(clf_stats, ['predicted'])

        print()
        print()


# Compare a default random forest on the diabetes data with and without
# discretizing the features first.
diabetes = pd.read_csv('diabetes.csv')
X, y = split_dataset_transformed(diabetes, 'class')
X_train, X_test, y_train, y_test = split_train_test(X, y)

# 3.a

# One-hot encoded 10-bin discretization, fitted on the full feature matrix
# (before the train/test split) and converted to a dense array.
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X).toarray()
# NOTE(review): this is a second, separate call to split_train_test; whether
# it yields the same partition as the call above depends on that helper.
(X_binned_train, X_binned_test,
 y_binned_train, y_binned_test) = split_train_test(X_binned, y)

print('With discretization')

clf1 = RandomForestClassifier()
clf_stats = classifier_statistics(
    clf1, X_binned_train, X_binned_test, y_binned_train, y_binned_test)
print_dict(clf_stats, ['predicted'])

print('Without discretization')

clf2 = RandomForestClassifier()
clf_stats = classifier_statistics(
    clf2, X_train, X_test, y_train, y_test)
print_dict(clf_stats, ['predicted'])
示例#10
0
文件: knn.py 项目: waque/CD
# Train and evaluate a 10-nearest-neighbours classifier on the APS data.

# Drop every row that contains a missing value, in both partitions.
aps_train = aps_train.dropna()

aps_test = aps_test.dropna()

X_train, y_train = split_dataset(aps_train, 'class')
# Fix: the test features/labels must come from aps_test. They were previously
# built from aps_train, so the classifier was scored on its own training data
# and the cleaned aps_test frame was never used.
X_test, y_test = split_dataset(aps_test, 'class')

# Encode the string labels as integers: 'pos' -> 1, 'neg' -> 0.
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})

# Cast the features to float64 before handing them to the classifier.
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

neigh = KNeighborsClassifier(n_neighbors=10)

res = classifier_statistics(neigh, X_train, X_test, y_train, y_test)

pprint(res)
"""
k=2
accuracy: 0.936
sensibility: 0.5

k=3 
accuracy: 0.946
sensibility: 0.658

k=10
accuracy: 0.892
sensibility: 0.224