# -*- coding: utf-8 -*-
"""
Created on Sun Jul 24 11:31:24 2016
@author: ahmed
"""
"""
Numpy Arrays
"""
import numpy as np
#generating a random array
X=np.random.random((3,5)) # a 3 X 5 array
print(X)
print("X type is")
print(type(X))
print("X shape is 3 rows and 5 columns")
print(X.shape)
#Accessing elements
#Get a single element
print("Some X single element")
print(X[0,0])
print(X[0,1])
print(X[0,2])
print(X[0,3])
print(X[0,4])
#Get a row
print(X[0])
print(X[1])
print(X[2])
#Get a column
print(X[:,0])
print(X[:,1])
print(X[:,2])
print(X[:,3])
print(X[:,4])
print(X)
# Transposing X
print(X.T)
# Turning a row vector into a column vector
y=np.linspace(0,12,5)
print(y)
# make into a column vector
print(y[:,np.newaxis])
print(y[:,])
# Getting the shape or reshaping an array: many examples
print(X.shape)
print(X.reshape(5,3))
print(X.reshape(15,1))
print(X.reshape(1,15))
# Indexing by an array of integers (fancy indexing)
indices=np.array([3,1,0])
print(indices)
X[:,indices]
"""
Scipy Sparse Matrices
"""
from scipy import sparse
#create a random array with a lot of zeros
X=np.random.random((10,5))
print(X)
#set the majority of elements to zero
X[X<0.7]=0
print(X)
#turn X into a csr Compressed Sparse row matrix
X_csr=sparse.csr_matrix(X)
print(X_csr)
#Convert the sparse matrix to a dense array
print(X_csr.toarray())
#Create an empty LIL matrix and add some items
X_lil=sparse.lil_matrix((5,5))
for i,j in np.random.randint(0,5,(15,2)):
X_lil[i,j]=i+j
print(X_lil)
print(X_lil.toarray())
print(X_lil.tocsr())
"""
Matplotlib
"""
import matplotlib.pyplot as plt
# plotting a line
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))
# scatter-plot points
x = np.random.normal(size=500)
y = np.random.normal(size=500)
plt.scatter(x, y)
# showing images
x = np.linspace(1, 12, 100)
y = x[:, np.newaxis]
im = y * np.sin(x) * np.cos(y)
print(im.shape)
# imshow - note that origin is at the top-left by default!
plt.imshow(im)
# Contour plot - note that origin here is at the bottom-left by default!
plt.contour(im)
# 3D plotting
from mpl_toolkits.mplot3d import Axes3D
ax = plt.axes(projection='3d')
xgrid, ygrid = np.meshgrid(x, y.ravel())
ax.plot_surface(xgrid, ygrid, im, cmap=plt.cm.jet, cstride=2, rstride=2, linewidth=0)
# %load http://matplotlib.org/mpl_examples/pylab_examples/ellipse_collection.py
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.collections import EllipseCollection
x = np.arange(10)
y = np.arange(15)
X, Y = np.meshgrid(x, y)
XY = np.hstack((X.ravel()[:,np.newaxis], Y.ravel()[:,np.newaxis]))
ww = X/10.0
hh = Y/15.0
aa = X*9
fig, ax = plt.subplots()
ec = EllipseCollection(ww, hh, aa, units='x', offsets=XY,transOffset=ax.transData)
ec.set_array((X+Y).ravel())
ax.add_collection(ec)
ax.autoscale_view()
ax.set_xlabel('X')
ax.set_ylabel('y')
cbar = plt.colorbar(ec)
cbar.set_label('X+Y')
plt.show()
"""
More examples with matplotlib
"""
"""
Supervised learning: CLASSIFICATION
"""
import matplotlib.pyplot as plt
import numpy as np
"""
To visualize how ML algorithms work, it is helpful to study 2D and 1D data,
i.e. data with only 1 or 2 features.
The first example uses synthetic data generated by the make_blobs function
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(centers=2,random_state=0)
print(type(X))
print(X.shape)
print(type(y))
print(y.shape)
print(X[:5,:])
print(y[:5])
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("First feature")
plt.ylabel("second feature")
# the train_test_split function from the cross_validation module does that
# for us, by randomly holding out 25% of the data for testing
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
#Every algorithm is an Estimator object, a logistic regression
from sklearn.linear_model import LogisticRegression
# Method
# First we instantiate the estimator object
classifier=LogisticRegression()
print(X_train.shape)
print(y_train.shape)
# Second we call the fit function with the training data
classifier.fit(X_train, y_train)
# Third we call the predict function with the testing data
prediction=classifier.predict(X_test)
"""
Fourth, we compare the predictions with the true labels.
We can evaluate the classifier quantitatively by measuring what fraction
of the predictions is correct: this is called accuracy
"""
print(prediction)
print(y_test)
np.mean(prediction == y_test)
print("mean(prediction == y_test)")
print(np.mean(prediction == y_test))
"""
There is also a direct method in scikit-learn, the score function, which
computes the accuracy directly from the test data
"""
classifier.score(X_test,y_test)
print("classifier.score(X_test,y_test)")
print(classifier.score(X_test,y_test))
classifier.score(X_train,y_train)
print("classifier.score(X_train,y_train)")
print(classifier.score(X_train,y_train))
# plotting results if possible
from figures import plot_2d_separator
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("First feature")
plt.ylabel("Second feature")
plot_2d_separator(classifier, X)
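# Note: plot_2d_separator comes from the tutorial's local "figures" helper module.
# If that module is not on the path, a minimal sketch of the same idea (an
# assumption, not the tutorial's exact figure) is to evaluate the fitted
# classifier on a mesh grid and draw the resulting decision boundary:
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contour(xx, yy, Z, levels=[0.5], colors='black')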
# Finally, the estimated parameters are stored in attributes ending with an underscore
print(classifier.coef_)
print(classifier.intercept_)
print(classifier.classes_)
"""
Another classifier: K Nearest Neighbors, popular and easy.
One of the simplest strategies:
given a new, unknown observation, look up in your reference database
which samples have the closest features and assign the predominant class
"""
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)
#knn=KNeighborsClassifier(n_neighbors=3)
#knn=KNeighborsClassifier(n_neighbors=10)
#knn=KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train,y_train)
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("first feature")
plt.ylabel("second feature")
plot_2d_separator(knn,X)
knn.score(X_test,y_test)
"""
Application on the iris dataset
we change the number of n_neighbors in the estimator
"""
from sklearn.datasets import load_iris
iris=load_iris()
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target)
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))
print(knn.predict(X_test))
"""
Now we move on to another important subject: REGRESSION.
In regression, we try to predict a continuous output variable.
"""
import matplotlib.pyplot as plt
import numpy as np
x=np.linspace(-3,3,100)
print(x)
y=np.sin(4*x)+x+np.random.uniform(size=len(x))
plt.plot(x,y,'o')
# Linear regression
# to apply a scikit-learn model, we need to make X a 2D array
print(x.shape)
X=x[:,np.newaxis]
print(X.shape)
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y)
#then we can build our regression model
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)
y_pred_train=regressor.predict(X_train)
plt.plot(X_train,y_train,'o',label="data")
plt.plot(X_train,y_pred_train,'o',label='prediction')
plt.legend(loc='best')
# let's try the test set
y_pred_test=regressor.predict(X_test)
plt.plot(X_test,y_test,'o',label="data")
plt.plot(X_test,y_pred_test,'o',label='prediction')
plt.legend(loc='best')
#Quantitative evaluation of the score method
regressor.score(X_test,y_test)
"""
Another exercise:
We compare KNeighborsRegressor and LinearRegression on the Boston
housing dataset
"""
from sklearn.datasets import load_boston
boston=load_boston()
X_train,X_test,y_train,y_test=train_test_split(boston.data,boston.target,random_state=42)
print(boston.DESCR)
print(boston.keys())
# A more compact way to write the same code
lr=LinearRegression().fit(X_train,y_train)
print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))
from sklearn.neighbors import KNeighborsRegressor
knn=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))
#######################################################
#######################################################
#######################################################
#######################################################
#######################################################
# UNSUPERVISED LEARNING METHOD
"""
Unsupervised Learning:
- dimensionality reduction
- manifold learning
- feature extraction
- find a new representation of the input data without any additional input
Another important application is rescaling the data to have zero mean and
unit variance which is a very helpful preprocessing step for many machine
learning models
"""
import matplotlib.pyplot as plt
import numpy as np
# RESCALING application
# The iris dataset is not centered: non-zero mean and the std is different
# for each component
from sklearn.datasets import load_iris
iris=load_iris()
X,y = iris.data, iris.target
print(X.shape)
print(y.shape)
print("mean: %s " %X.mean(axis=0))
print("standard deviation: %s " %X.std(axis=0))
# to use preprocessing method we import the estimator: StandardScaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
# As this is an unsupervised model, we pass only X and not y;
# fit estimates the mean and the standard deviation
scaler.fit(X)
# we don't call predict but transform for rescaling
X_scaled=scaler.transform(X)
print(X_scaled.shape)
print("New mean of scaled data")
print("mean: %s " %X_scaled.mean(axis=0))
print("New std for scaled data")
print("standard deviation: %s " %X_scaled.std(axis=0))
"""
Principal Component Analysis
PCA is an unsupervised transformation. It is a technique to reduce the
dimensionality of the data by creating a linear projection: we find new features
to represent the data that are linear combinations of the old ones (a rotation).
Method: PCA looks for the directions of maximum variance, and then only the few
components that explain most of the variance in the data are kept.
Note that the PCA directions are orthogonal
"""
# An example
rnd=np.random.RandomState(42)
X_blob=np.dot(rnd.normal(size=(100,2)),rnd.normal(size=(2,2)))+rnd.normal(size=2)
plt.scatter(X_blob[:,0],X_blob[:,1])
plt.xlabel("feature 1")
plt.ylabel("feature 2")
"""
# Another example but with another code
rnd = np.random.RandomState(5)
X_ = rnd.normal(size=(300, 2))
X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
y = X_[:, 0] > 0
plt.scatter(X_blob[:, 0], X_blob[:, 1], c=y, linewidths=0, s=30)
plt.xlabel("feature 1")
plt.ylabel("feature 2")
# end of the other manner to do the same things
"""
from sklearn.decomposition import PCA
pca=PCA()
# We fit the PCA model with our data. As PCA is an unsupervised algorithm
# there is no output y
pca.fit(X_blob)
# we transform the data and project it on the principal components
X_pca=pca.transform(X_blob)
plt.scatter(X_pca[:,0],X_pca[:,1])
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
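# A quick check of the claim that a few components explain most of the variance:
# the fitted PCA object stores the fraction of variance explained by each component.
print(pca.explained_variance_ratio_)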
"""
Dimensionality Reduction for Visualization with PCA
Now we study an example with 64 features i.e. dimensions
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
digits = datasets.load_digits(n_class=6)
n_digits = 500
X = digits.data[:n_digits]
y = digits.target[:n_digits]
n_samples, n_features = X.shape
n_neighbors = 30
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 1e5:
                # don't show points that are too close
                # set a high threshold to basically turn this off
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)
n_img_per_row = 10
img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
for i in range(n_img_per_row):
    ix = 10 * i + 1
    for j in range(n_img_per_row):
        iy = 10 * j + 1
        img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset')
print("Computing PCA projection")
pca = decomposition.PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
plot_embedding(X_pca, "Principal Components projection of the digits")
plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.show()
"""
MANIFOLD LEARNING
PCA has one weakness: it cannot detect non-linear features. Manifold learning
algorithms have been developed to overcome this deficiency.
As a canonical dataset for manifold learning, we use the S-curve.
"""
from sklearn.datasets import make_s_curve
X,y=make_s_curve(n_samples=1000)
from mpl_toolkits.mplot3d import Axes3D
ax=plt.axes(projection='3d')
ax.scatter3D(X[:,0],X[:,1],X[:,2],c=y)
ax.view_init(10,-60)
# this is a 2D dataset embedded in 3D, but it is embedded in such a way that
#PCA can't discover the underlying data orientation.
from sklearn import decomposition
X_pca=decomposition.PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:,0],X_pca[:,1],c=y)
#Manifold learning algorithms, however, available in the sklearn.manifold
#submodule, are able to recover the underlying 2-dimensional manifold:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=15, n_components=2)
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y)
"""
Exercise: Compare the results of Isomap and PCA on a 5-class subset of the
digits dataset (load_digits(5))
Bonus: Also compare to TSNE, another popular manifold learning technique.
"""
from sklearn.datasets import load_digits
digits=load_digits(5)
X=digits.data
isomap=Isomap(n_neighbors=15,n_components=2)
X_trans=isomap.fit_transform(X)
print(X_trans.shape)
plt.scatter(X_trans[:,0],X_trans[:,1],c=digits.target)
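# For the comparison part of the exercise, the PCA projection of the same 5-class
# subset (a minimal sketch; the tutorial's reference solution may differ):
from sklearn.decomposition import PCA
X_pca5 = PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca5[:, 0], X_pca5[:, 1], c=digits.target)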
# Another method
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=digits.target)
print(X_tsne.shape)
"""
Clustering with unsupervised learning method
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(random_state=42)
X.shape
plt.scatter(X[:,0],X[:,1])
"""
There are 3 groups in the data. We want to recover them using clustering.
Even if the groups are obvious in the data, it is hard to find them when
the data is located in high-dimensional space.
We will use one of the simplest clustering algorithms, K-means.
"""
from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=3,random_state=42)
labels=kmeans.fit_predict(X)
print(all(labels==kmeans.labels_))
plt.scatter(X[:,0],X[:,1],c=labels)
"""
We need a better measure of clustering accuracy, so we compare our result
to the ground truth that we got when generating the blobs
"""
from sklearn.metrics import confusion_matrix,accuracy_score
print(accuracy_score(y,labels))
print(confusion_matrix(y,labels))
np.mean(y==labels)
# We need a score that is invariant to permutations of the labels:
# attention, the adjusted Rand index is an important clustering metric for exactly this reason
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y,labels)
"""
Clustering comes with assumptions: a clustering algorithm finds clusters by
making assumptions about which samples should be grouped together. Each algorithm
makes different assumptions, and the quality and interpretability of your results
will depend on whether the assumptions are satisfied for your goal.
For K-means clustering, the model assumes that all clusters have equal, spherical
variance: there are cluster centers, with circles around those centers that
capture the variance of each cluster's points.
VERY IMPORTANT: if we want to make the K-means algorithm fail, we can generate
non-isotropic clusters.
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(random_state=170,n_samples=600)
rng=np.random.RandomState(74)
transformation=rng.normal(size=(2,2))
X=np.dot(X,transformation)
y_pred=KMeans(n_clusters=3).fit_predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred)
kmeans.cluster_centers_
# After this failed example for kmeans we go to an exercise
"""
Digits clustering
"""
from sklearn.datasets import load_digits
digits=load_digits()
# This is the solution from solutions/08B_digits_clustering.py
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(digits.data)
print(kmeans.cluster_centers_.shape)
#------------------------------------------------------------
# visualize the cluster centers
fig = plt.figure(figsize=(8, 3))
for i in range(10):
    ax = fig.add_subplot(2, 5, 1 + i)
    ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)),
              cmap=plt.cm.binary)
from sklearn.manifold import Isomap
X_iso = Isomap(n_neighbors=10).fit_transform(digits.data)
#-----------------------------------------------------------
# visualize the projected data
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters)
ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target)
# End of the solution code
plt.imshow(digits.images[0])
plt.figure()
plt.imshow(digits.images[0],interpolation='nearest')
plt.matshow(digits.images[0])
adjusted_rand_score(digits.target,clusters)
"""
Now we start applying the ML paradigms and algorithms to real data.
Case Study number 1 - Supervised Classification of Handwritten Digits
First of all: a good way to start a data problem is to visualize the data using
one of the dimensionality reduction techniques. One starts with the most
straightforward one, Principal Component Analysis (PCA).
IDEA OF PCA:
PCA seeks orthogonal linear combinations of the features which show the
greatest variance and, as such, can help give us a good idea of the structure
of the data.
We will use RandomizedPCA because it is faster for large N.
"""
from sklearn.datasets import load_digits
digits=load_digits()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
from sklearn.decomposition import RandomizedPCA
pca=RandomizedPCA(n_components=2,random_state=1999)
proj=pca.fit_transform(digits.data)
plt.scatter(proj[:,0],proj[:,1],c=digits.target)
plt.colorbar()
"""
A weakness of PCA is that it produces a linear dimensionality reduction:
this may miss some interesting relationships in the data.
For non-linear mappings, we can use the methods in the manifold module. For the
moment, we will use Isomap (short for Isometric Mapping), which is based on
graph theory.
"""
from sklearn.manifold import Isomap
iso=Isomap(n_neighbors=5,n_components=2)
proj=iso.fit_transform(digits.data)
plt.scatter(proj[:,0],proj[:,1],c=digits.target)
plt.colorbar()
# these visualizations show us that there is hope: even a simple classifier
#should be able to adequately identify the members of the various classes.
"""
Now we continue with the basic idea of finding the simplest method or algorithm
that can make sense of the data before moving on to more complex methods.
A good baseline method is Gaussian Naive Bayes:
It is a generative classifier which fits an axis-aligned multi-dimensional
Gaussian distribution to each training label, and uses this to quickly give
a rough classification. It is generally not sufficiently accurate for
real-world data, but can perform surprisingly well.
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split
#split the data into training and validation sets
X_train, X_test, y_train, y_test=train_test_split(digits.data,digits.target)
#train the model
clf=GaussianNB()
clf.fit(X_train,y_train)
#use the model to predict the labels of the test data
predicted=clf.predict(X_test)
expected=y_test
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test.reshape(-1, 8, 8)[i], cmap=plt.cm.binary,
              interpolation='nearest')
    # label the image with the target value
    if predicted[i] == expected[i]:
        ax.text(0, 7, str(predicted[i]), color='green')
    else:
        ax.text(0, 7, str(predicted[i]), color='red')
#Quantitative analysis of the error
matches = (predicted == expected)
print(matches.sum())
print(len(matches))
matches.sum() / float(len(matches))
print(clf.score(X_test, y_test))
from sklearn import metrics
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
plt.matshow(metrics.confusion_matrix(expected, predicted))
#plt.matshow(metrics.confusion_matrix(expected, predicted),map="gray")
"""
Let's start now with a difficult example which is
Unsupervised Preprocessing and an example from Image Processing
"""
import matplotlib.pyplot as plt
# Using PCA to plot Datasets
"""
PCA is a useful preprocessing technique for both visualizing data in 2 or 3
dimensions, and for improving the performance of downstream algorithms such as
classifiers. We will see more details about using PCA as part of a ML pipeline
in the next section, but here we explain the intuition behind what PCA does and
why it is useful for certain tasks.
The goal of PCA is to find the dimensions of maximum variation in the data and
project onto them. This is helpful for data that is stretched in a particular
dimension. Here is a 2D example:
"""
import numpy as np
random_state=np.random.RandomState(1999)
X=random_state.randn(500,2)
red_idx=np.where(X[:,0]<0)[0]
blue_idx=np.where(X[:,0]>=0)[0]
#stretching
s_matrix=np.array([[1,0],[0,20]])
#Rotation
r_angle=33
r_rad=np.pi*r_angle/180
r_matrix=np.array([[np.cos(r_rad), -np.sin(r_rad)],[np.sin(r_rad), np.cos(r_rad)]])
X=np.dot(X,s_matrix).dot(r_matrix)
plt.scatter(X[red_idx,0],X[red_idx,1],color="darkred")
plt.scatter(X[blue_idx,0],X[blue_idx,1],color="steelblue")
plt.axis('off')
plt.title("Skewed Data")
# We use PCA method now
from sklearn.decomposition import PCA
pca=PCA()
X_t=pca.fit_transform(X)
plt.scatter(X_t[red_idx,0],X_t[red_idx,1],color="darkred")
plt.scatter(X_t[blue_idx,0],X_t[blue_idx,1],color="steelblue")
plt.axis('off')
plt.title("PCA Corrected Data")
"""
Note that we can use PCA to visualize complex data in low dimensions in order
to see how "close" and "far" different datapoints are in a 2D space.
There are many different ways to do this visualization, and some common
algorithms are found in sklearn.manifold. PCA is one of the simplest and most
common methods for quickly visualizing a dataset.
"""
"""
Now we'll take a look at unsupervised learning on a facial recognition example.
This uses a dataset available within scikit-learn consisting of a subset of
the Labeled Faces in the Wild data. Note that this is a relatively large
download (~200MB) so it may take a while to execute.
"""
from sklearn import datasets
# The dataset will be downloaded from internet 200 MB
lfw_people=datasets.fetch_lfw_people(min_faces_per_person=70,resize=0.4,
data_home='datasets')
lfw_people.data.shape
# Visualization of the faces
# Let's visualize these faces to see what we're working with:
fig=plt.figure(figsize=(8,6))
#plot several images
for i in range(15):
    ax=fig.add_subplot(3,5,i+1,xticks=[],yticks=[])
    ax.imshow(lfw_people.images[i],cmap=plt.cm.bone)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(lfw_people.data,
lfw_people.target, random_state=0)
print(X_train.shape, X_test.shape)
from sklearn import decomposition
pca = decomposition.RandomizedPCA(n_components=150, whiten=True)
pca.fit(X_train)
plt.imshow(pca.mean_.reshape((50, 37)), cmap=plt.cm.bone)
print(pca.components_.shape)
fig = plt.figure(figsize=(16, 6))
for i in range(30):
    ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[])
    ax.imshow(pca.components_[i].reshape((50, 37)), cmap=plt.cm.bone)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
print(X_test_pca.shape)
"""
"""
import numpy as np
plt.figure(figsize=(10, 2))
unique_targets = np.unique(lfw_people.target)
counts = [(lfw_people.target == i).sum() for i in unique_targets]
plt.xticks(unique_targets, lfw_people.target_names[unique_targets])
locs, labels = plt.xticks()
plt.setp(labels, rotation=45, size=14)
_ = plt.bar(unique_targets, counts)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
lfw_people.data, lfw_people.target, random_state=0)
print(X_train.shape, X_test.shape)
from sklearn import decomposition
pca = decomposition.RandomizedPCA(n_components=150, whiten=True,
random_state=1999)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
print(X_test_pca.shape)
from sklearn import svm
clf = svm.SVC(C=5., gamma=0.001)
clf.fit(X_train_pca, y_train)
fig = plt.figure(figsize=(8, 6))
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test[i].reshape((50, 37)), cmap=plt.cm.bone)
    y_pred = clf.predict(X_test_pca[i:i + 1])[0]  # predict expects a 2D array
    color = 'black' if y_pred == y_test[i] else 'red'
    ax.set_title(lfw_people.target_names[y_pred], fontsize='small', color=color)
print(clf.score(X_test_pca, y_test))
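# A hedged follow-up (not from the tutorial): per-class precision/recall of the
# face classifier, using the person names stored in the dataset.
from sklearn import metrics
y_pred_all = clf.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred_all,
                                    target_names=lfw_people.target_names))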
"""
NOW we start the next 3 hours of the scikit-learn course:
Machine Learning with Scikit-Learn,
"SciPy 2015 Tutorial, Andreas Mueller & Kyle Kastner, Part I & II"
"""
"""
Let's start with the Cross validation techniques:
"""
# Add this later to the cross-validation and grid-search part
# (imports added so the fragment is self-contained; X, y and param_grid are
# placeholders to be defined when this is actually used)
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
clf = GridSearchCV(SVR(), param_grid=param_grid)
cross_val_score(clf, X, y)
"""
Now we go in depth into the linear models.
Linear models for classification:
All linear models for classification learn a coefficient vector coef_ and
an offset intercept_, and make predictions using a linear combination of the
features. The prediction works like in regression, except that a threshold
at zero is applied.
The difference between the various linear models lies in the regularization of
coef_ and intercept_ and in the loss function.
For linear classification, the 2 most common models are the linear SVM,
implemented in LinearSVC, and LogisticRegression.
Regularization:
In the presence of many features, a linear classifier can overfit, so it is
necessary to regularize. Large C values give a lightly regularized model,
while small C values give a strongly regularized model.
We can observe two kinds of behavior:
- With strong regularization: importance is given to the majority of the points;
it is enough if most of the points are classified correctly.
- With weak regularization: importance is given to each individual data point.
"""
# An illustration using a linear SVM with different values of C:
from figures import plot_linear_svc_regularization
plot_linear_svc_regularization()
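# If the "figures" helper is not available, here is a minimal sketch of the same
# idea (assumed data and C values, not the tutorial's exact figure): compare a
# small and a large C for LinearSVC on simple 2D blobs.
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
X_c, y_c = make_blobs(centers=2, random_state=0)
for C in (0.01, 100):
    svc = LinearSVC(C=C).fit(X_c, y_c)
    print("C=%g  training accuracy: %.3f" % (C, svc.score(X_c, y_c)))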
# Similarly for the Ridge/Lasso distinction: we can set the penalty parameter
# to l1 to enforce sparsity of the coefficients.
# Exercise: use LogisticRegression to classify digits, and
# grid-search for the C parameter.
from sklearn.linear_model import LogisticRegression
params={'C' : [0.001,0.01,0.1,1,10,100]}
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
digits=load_digits()
X_train,X_test,y_train,y_test=train_test_split(digits.data,digits.target)
grid=GridSearchCV(LogisticRegression(),param_grid=params,n_jobs=-1)
grid.fit(X_train,y_train)
grid.score(X_test,y_test)
"""
Linear models for regression:
Linear models are useful when little data is available or for very large feature
spaces, as in text classification.
They form a good case study for regularization.
The coefficients are stored in coef_,
the intercept is stored in intercept_.
The most standard linear model is ordinary least squares regression, often
simply called linear regression. This model does not put any additional
restrictions on coef_, so when the number of features is large it becomes
ill-posed and the model overfits.
"""
#Now we will generate a simple simulation and see the behavior of the model
import numpy as np
import matplotlib.pyplot as plt
rng=np.random.RandomState(4)
X=rng.normal(size=(1000,50))
beta=rng.normal(size=50)
y=np.dot(X,beta)+4*rng.normal(size=1000)
from sklearn.utils import shuffle
X,y=shuffle(X,y)
from sklearn import linear_model, cross_validation
from sklearn.learning_curve import learning_curve
def plot_learning_curve(est, X, y):
    training_set_size, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(0.1, 1, 30))
    estimator_name = est.__class__.__name__
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
                    label="training scores " + estimator_name)
    # also plot the test (cross-validation) scores, in the same color
    plt.plot(training_set_size, test_scores.mean(axis=1), '-',
             label="test scores " + estimator_name, c=line[0].get_color())
    plt.xlabel("training set size")
    plt.legend(loc="best")
    #plt.ylim(-1,1)
plot_learning_curve(linear_model.LinearRegression(),X,y)
"""
We see two important things:
Ordinary linear regression is not well defined if the number of training samples
is less than the number of features.
In the presence of noise, this model overfits: we then need to regularize.
"""
"""
The Ridge estimator is a simple regularization (called the l2 penalty) of
ordinary least squares regression (OLS).
The Ridge estimator is less expensive to compute than unregularized OLS.
"""
plot_learning_curve(linear_model.LinearRegression(),X,y)
plot_learning_curve(linear_model.Ridge(alpha=20),X,y)
plot_learning_curve(linear_model.RidgeCV(), X, y)
"""
The Lasso estimator is useful for imposing sparsity on the coefficients.
It is used if we believe that many of the features are not relevant; this is
done via the l1 penalty.
"""
#Let us create such a situation with a new simulation where only 10 out
#of the 50 features are relevant:
beta[10:] = 0
y = np.dot(X, beta) + 4*rng.normal(size=1000)
plot_learning_curve(linear_model.Ridge(), X, y)
plot_learning_curve(linear_model.Lasso(), X, y)
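# A quick check of the sparsity induced by the l1 penalty (a sketch, assuming the
# simulated X, y above): count the non-zero Lasso coefficients.
lasso = linear_model.Lasso().fit(X, y)
print("non-zero coefficients: %d out of %d"
      % (np.sum(lasso.coef_ != 0), len(lasso.coef_)))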
"""
I will return to the linear models another day.
Now let's move on to Support Vector Machines:
For classification problems: SVC
For regression problems: SVR
Linear SVM and Kernel SVM (linear,poly,rbf)
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import rbf_kernel
line=np.linspace(-3,3,100)[:,np.newaxis]
kernel_value=rbf_kernel(line,[[0]],gamma=1)
plt.plot(line,kernel_value)
# the idea here is to vary the value of C and gamma and see the change
from figures import plot_svm_interactive
plot_svm_interactive()
# Exercise without solution
from sklearn import datasets
digits = datasets.load_digits()
X, y = digits.data, digits.target
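# A hedged sketch of one possible solution to the exercise (grid values are
# illustrative, not the tutorial's): grid-search C and gamma for an RBF-kernel
# SVC on the digits data.
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.score(X_test, y_test))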
"""
Estimators in Depth: Trees and Forests:
Here we will explore a class of algorithms based on decision trees.
Decision trees are very intuitive: they encode a series of if/else choices,
quite similar to how a person might make a decision. However, which questions
to ask and how to proceed for each answer is entirely learned from the data.
"""
"""
Decision tree Regression:
A decision tree is a simple binary classification tree that is similar to nearest
neighbor classification.
"""
from figures import make_dataset
x,y=make_dataset()
X=x.reshape(-1,1)
from sklearn.tree import DecisionTreeRegressor
reg=DecisionTreeRegressor(max_depth=5)
reg.fit(X,y)
X_fit=np.linspace(-3,3,1000).reshape((-1,1))
y_fit_1=reg.predict(X_fit)
plt.plot(X_fit.ravel(),y_fit_1,color='blue',label='prediction')
plt.plot(X.ravel(),y,'.k',label='training data')
plt.legend(loc='best')
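# A small sketch (not from the tutorial) showing how max_depth controls the
# step-wise fit of the regression tree on the same data:
for depth in (2, 5, 10):
    y_depth = DecisionTreeRegressor(max_depth=depth).fit(X, y).predict(X_fit)
    plt.plot(X_fit.ravel(), y_depth, label='max_depth=%d' % depth)
plt.legend(loc='best')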
"""
Decision Tree Classification:
"""
from sklearn.datasets import make_blobs
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from figures import plot_2d_separator
X,y=make_blobs(centers=[[0,0],[1,1]],random_state=61526,n_samples=100)
X_train,X_test,y_train,y_test=train_test_split(X,y)
clf=DecisionTreeClassifier(max_depth=5)
clf.fit(X_train,y_train)
plot_2d_separator(clf,X,fill=True)