/
CapstoneMaster.py
90 lines (87 loc) · 3.15 KB
/
CapstoneMaster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""re-write exploring DC
"""
import os
import pandas as pd
import numpy as np
import urllib
u=urllib.urlopen('http://archive.ics.uci.edu/ml/datasets/Drug+consumption+%28quantified%29')
text=u.read()
import re
matches=re.findall(r"\d+\.\s\w+", text)
del(matches[0:3])
del(matches[12])
for i in range(0, len(matches)):
matches[i]=re.sub(r"\d+\.\s","",matches[i])
"""#local import
os.chdir('C:\Users\Alissa\Documents\Springboard\Capstone1\data\DrugConsumption') #cd and import data
data=pd.read_csv('drug_consumption.txt',header=None,names=matches) #also header=None"""
#web import
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00373/drug_consumption.data"
data = pd.read_csv(url,names=matches)
#print(data.head())
# Recode drug-use classes to a binary "used in the last month" flag.
# CL0..CL3 span "never used" through "used in last year" -> 0;
# CL4..CL6 span "used in last month/week/day" -> 1.
data = data.replace({'CL0': 0, 'CL1': 0, 'CL2': 0, 'CL3': 0,
                     'CL4': 1, 'CL5': 1, 'CL6': 1})
# Columns 1-13 are respondent attributes; columns 14-32 are the drugs.
attribute = data.iloc[:, :13]
drugs = data.iloc[:, 13:]
print(attribute.head())
print(drugs.head())
coormat = data.corr()
#To try: XG Boost or boosting - tree tech.
#To try: XG Boost or boosting - tree tech.
#"""
# Compare Algorithms
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
#rename data
X=attribute
Y=drugs.iloc[:,:17]
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
count=0
#n=0
#val=[]
#best2=[]
for name in Y.columns:
y = Y[name]
print(name)
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring) #ValueError: bad input shape (1696, 19)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# for i in xrange(0,5):
# val[i]=results[6*n].mean()+ results[6*n].std()
# best2=val.sort_values().head(2)
# # boxplot algorithm comparison
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(results[count]) #list call by row - not working
# ax.set_xticklabels(names)
# plt.show()
# count=count+1
#"""