# -*- coding: utf-8 -*-
"""
Created on Sun Jul 24 11:31:24 2016
@author: ahmed
"""
"""
Numpy Arrays
"""
import numpy as np
#generating a random array
X=np.random.random((3,5)) # a 3 X 5 array
print(X)
print("X type is")
print(type(X))
print("X shape is 3 rows and 5 columns")
print(X.shape)
#Accessing elements
#Get a single element
print("Some X single element")
print(X[0,0])
print(X[0,1])
print(X[0,2])
print(X[0,3])
print(X[0,4])
#Get a row
print(X[0])
print(X[1])
print(X[2])
#Get a column
print(X[:,0])
print(X[:,1])
print(X[:,2])
print(X[:,3])
print(X[:,4])
print(X)
# Transposing X
print(X.T)
# Turning a row vector into a column vector
y=np.linspace(0,12,5)
print(y)
# make into a column vector
print(y[:,np.newaxis])
print(y[:,])
# Getting the shape or reshaping an array: many examples
print(X.shape)
print(X.reshape(5,3))
print(X.reshape(15,1))
print(X.reshape(1,15))
# Indexing by an array of integers (fancy indexing)
indices=np.array([3,1,0])
print(indices)
X[:,indices]
"""
Scipy Sparse Matrices
"""
from scipy import sparse
#create a random array with a lot of zeros
X=np.random.random((10,5))
print(X)
#set the majority of elements to zero
X[X<0.7]=0
print(X)
#turn X into a csr Compressed Sparse row matrix
X_csr=sparse.csr_matrix(X)
print(X_csr)
#Convert the sparse matrix to a dense array
print(X_csr.toarray())
#Create an empty LIL matrix and add some items
X_lil=sparse.lil_matrix((5,5))
for i,j in np.random.randint(0,5,(15,2)):
X_lil[i,j]=i+j
print(X_lil)
print(X_lil.toarray())
print(X_lil.tocsr())
"""
Matplotlib
"""
import matplotlib.pyplot as plt
# plotting a line
x = np.linspace(0, 10, 100)
plt.plot(x, np.sin(x))
# scatter-plot points
x = np.random.normal(size=500)
y = np.random.normal(size=500)
plt.scatter(x, y)
# showing images
x = np.linspace(1, 12, 100)
y = x[:, np.newaxis]
im = y * np.sin(x) * np.cos(y)
print(im.shape)
# imshow - note that origin is at the top-left by default!
plt.imshow(im)
# Contour plot - note that origin here is at the bottom-left by default!
plt.contour(im)
# 3D plotting
from mpl_toolkits.mplot3d import Axes3D
ax = plt.axes(projection='3d')
xgrid, ygrid = np.meshgrid(x, y.ravel())
ax.plot_surface(xgrid, ygrid, im, cmap=plt.cm.jet, cstride=2, rstride=2, linewidth=0)
# %load http://matplotlib.org/mpl_examples/pylab_examples/ellipse_collection.py
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.collections import EllipseCollection
x = np.arange(10)
y = np.arange(15)
X, Y = np.meshgrid(x, y)
XY = np.hstack((X.ravel()[:,np.newaxis], Y.ravel()[:,np.newaxis]))
ww = X/10.0
hh = Y/15.0
aa = X*9
fig, ax = plt.subplots()
ec = EllipseCollection(ww, hh, aa, units='x', offsets=XY,transOffset=ax.transData)
ec.set_array((X+Y).ravel())
ax.add_collection(ec)
ax.autoscale_view()
ax.set_xlabel('X')
ax.set_ylabel('y')
cbar = plt.colorbar(ec)
cbar.set_label('X+Y')
plt.show()
"""
More examples with matplotlib
"""
"""
Supervised learning: CLASSIFICATION
"""
import matplotlib.pyplot as plt
import numpy as np
"""
To visualize how ML algorithms work, it is helpful to study 2D and 1D data,
i.e. data with only 1 or 2 features.
The first example uses synthetic data generated by the make_blobs function
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(centers=2,random_state=0)
print(type(X))
print(X.shape)
print(type(y))
print(y.shape)
print(X[:5,:])
print(y[:5])
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("First feature")
plt.ylabel("second feature")
# the train_test_split function from the cross_validation module does that
# for us, by randomly holding out 25% of the data for testing
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=0)
#Every algorithm is an Estimator object, a logistic regression
from sklearn.linear_model import LogisticRegression
# Method
# First we instantiate the estimator object
classifier=LogisticRegression()
print(X_train.shape)
print(y_train.shape)
# Second we call the fit function with the training data
classifier.fit(X_train, y_train)
# Third we call the predict function with the testing data
prediction=classifier.predict(X_test)
"""
Fourth, we compare the predictions with the true labels.
We can evaluate the classifier quantitatively by measuring what fraction
of the predictions is correct: this is called accuracy
"""
print(prediction)
print(y_test)
np.mean(prediction == y_test)
print("mean(prediction == y_test)")
print(np.mean(prediction == y_test))
"""
There is also a direct method in scikit-learn, the score function, which
computes the accuracy directly from the test data
"""
classifier.score(X_test,y_test)
print("classifier.score(X_test,y_test)")
print(classifier.score(X_test,y_test))
classifier.score(X_train,y_train)
print("classifier.score(X_train,y_train)")
print(classifier.score(X_train,y_train))
# plotting results if possible
from figures import plot_2d_separator
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("First feature")
plt.ylabel("Second feature")
plot_2d_separator(classifier, X)
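# Note: plot_2d_separator comes from the tutorial's local "figures" helper module.
# If that module is not on the path, a minimal sketch of the same idea (an
# assumption, not the tutorial's exact figure) is to evaluate the fitted
# classifier on a mesh grid and draw the resulting decision boundary:
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contour(xx, yy, Z, levels=[0.5], colors='black')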
# Finally, the estimated parameters are stored in attributes ending with an underscore
print(classifier.coef_)
print(classifier.intercept_)
print(classifier.classes_)
"""
Another classifier: K Nearest Neighbors, popular and easy.
One of the simplest strategies:
given a new, unknown observation, look up in your reference database
which samples have the closest features and assign the predominant class
"""
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=1)
#knn=KNeighborsClassifier(n_neighbors=3)
#knn=KNeighborsClassifier(n_neighbors=10)
#knn=KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train,y_train)
plt.scatter(X[:,0],X[:,1],c=y,s=40)
plt.xlabel("first feature")
plt.ylabel("second feature")
plot_2d_separator(knn,X)
knn.score(X_test,y_test)
"""
Application on the iris dataset
we change the number of n_neighbors in the estimator
"""
from sklearn.datasets import load_iris
iris=load_iris()
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target)
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))
print(knn.predict(X_test))
"""
Now we move on to another important subject: REGRESSION.
In regression, we try to predict a continuous output variable.
"""
import matplotlib.pyplot as plt
import numpy as np
x=np.linspace(-3,3,100)
print(x)
y=np.sin(4*x)+x+np.random.uniform(size=len(x))
plt.plot(x,y,'o')
# Linear regression
# to apply a scikit-learn model, we need to make X a 2D array
print(x.shape)
X=x[:,np.newaxis]
print(X.shape)
from sklearn.cross_validation import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y)
#then we can build our regression model
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train,y_train)
y_pred_train=regressor.predict(X_train)
plt.plot(X_train,y_train,'o',label="data")
plt.plot(X_train,y_pred_train,'o',label='prediction')
plt.legend(loc='best')
# let's try the test set
y_pred_test=regressor.predict(X_test)
plt.plot(X_test,y_test,'o',label="data")
plt.plot(X_test,y_pred_test,'o',label='prediction')
plt.legend(loc='best')
#Quantitative evaluation of the score method
regressor.score(X_test,y_test)
"""
Another exercise:
We compare KNeighborsRegressor and LinearRegression on the Boston
housing dataset
"""
from sklearn.datasets import load_boston
boston=load_boston()
X_train,X_test,y_train,y_test=train_test_split(boston.data,boston.target,random_state=42)
print(boston.DESCR)
print(boston.keys())
# A more compact way to write the same code
lr=LinearRegression().fit(X_train,y_train)
print(lr.score(X_train,y_train))
print(lr.score(X_test,y_test))
from sklearn.neighbors import KNeighborsRegressor
knn=KNeighborsRegressor(n_neighbors=3).fit(X_train,y_train)
print(knn.score(X_train,y_train))
print(knn.score(X_test,y_test))
#######################################################
#######################################################
#######################################################
#######################################################
#######################################################
# UNSUPERVISED LEARNING METHOD
"""
Unsupervised Learning:
- dimensionality reduction
- manifold learning
- feature extraction
- find a new representation of the input data without any additional input
Another important application is rescaling the data to have zero mean and
unit variance which is a very helpful preprocessing step for many machine
learning models
"""
import matplotlib.pyplot as plt
import numpy as np
# RESCALING application
# The iris dataset is not centered: non-zero mean and the std is different
# for each component
from sklearn.datasets import load_iris
iris=load_iris()
X,y = iris.data, iris.target
print(X.shape)
print(y.shape)
print("mean: %s " %X.mean(axis=0))
print("standard deviation: %s " %X.std(axis=0))
# to use preprocessing method we import the estimator: StandardScaler
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
# As this is an unsupervised model, we pass only X and not y;
# fit estimates the mean and the standard deviation
scaler.fit(X)
# we don't call predict but transform for rescaling
X_scaled=scaler.transform(X)
print(X_scaled.shape)
print("New mean of scaled data")
print("mean: %s " %X_scaled.mean(axis=0))
print("New std for scaled data")
print("standard deviation: %s " %X_scaled.std(axis=0))
"""
Principal Component Analysis
PCA is an unsupervised transformation. It is a technique to reduce the
dimensionality of the data by creating a linear projection: we find new features
to represent the data that are linear combinations of the old ones (a rotation).
Method: PCA looks for the directions of maximum variance, and then only the few
components that explain most of the variance in the data are kept.
Note that the PCA directions are orthogonal
"""
# An example
rnd=np.random.RandomState(42)
X_blob=np.dot(rnd.normal(size=(100,2)),rnd.normal(size=(2,2)))+rnd.normal(size=2)
plt.scatter(X_blob[:,0],X_blob[:,1])
plt.xlabel("feature 1")
plt.ylabel("feature 2")
"""
# Another example but with another code
rnd = np.random.RandomState(5)
X_ = rnd.normal(size=(300, 2))
X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
y = X_[:, 0] > 0
plt.scatter(X_blob[:, 0], X_blob[:, 1], c=y, linewidths=0, s=30)
plt.xlabel("feature 1")
plt.ylabel("feature 2")
# end of the other manner to do the same things
"""
from sklearn.decomposition import PCA
pca=PCA()
# We fit the PCA model with our data. As PCA is an unsupervised algorithm
# there is no output y
pca.fit(X_blob)
# we transform the data and project it on the principal components
X_pca=pca.transform(X_blob)
plt.scatter(X_pca[:,0],X_pca[:,1])
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
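# A quick check of the claim that a few components explain most of the variance:
# the fitted PCA object stores the fraction of variance explained by each component.
print(pca.explained_variance_ratio_)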
"""
Dimensionality Reduction for Visualization with PCA
Now we study an example with 64 features i.e. dimensions
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn import (manifold, datasets, decomposition, ensemble, lda,
                     random_projection)
digits = datasets.load_digits(n_class=6)
n_digits = 500
X = digits.data[:n_digits]
y = digits.target[:n_digits]
n_samples, n_features = X.shape
n_neighbors = 30
def plot_embedding(X, title=None):
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    plt.figure()
    ax = plt.subplot(111)
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
                 color=plt.cm.Set1(y[i] / 10.),
                 fontdict={'weight': 'bold', 'size': 9})
    if hasattr(offsetbox, 'AnnotationBbox'):
        # only print thumbnails with matplotlib > 1.0
        shown_images = np.array([[1., 1.]])  # just something big
        for i in range(X.shape[0]):
            dist = np.sum((X[i] - shown_images) ** 2, 1)
            if np.min(dist) < 1e5:
                # don't show points that are too close
                # set a high threshold to basically turn this off
                continue
            shown_images = np.r_[shown_images, [X[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
                X[i])
            ax.add_artist(imagebox)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)
n_img_per_row = 10
img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
for i in range(n_img_per_row):
    ix = 10 * i + 1
    for j in range(n_img_per_row):
        iy = 10 * j + 1
        img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset')
print("Computing PCA projection")
pca = decomposition.PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
plot_embedding(X_pca, "Principal Components projection of the digits")
plt.matshow(pca.components_[0, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.matshow(pca.components_[1, :].reshape(8, 8), cmap="gray")
plt.axis('off')
plt.show()
"""
MANIFOLD LEARNING
PCA has one weakness: it cannot detect non-linear features. Manifold learning
algorithms have been developed to overcome this deficiency.
As a canonical dataset for manifold learning, we use the S-curve.
"""
from sklearn.datasets import make_s_curve
X,y=make_s_curve(n_samples=1000)
from mpl_toolkits.mplot3d import Axes3D
ax=plt.axes(projection='3d')
ax.scatter3D(X[:,0],X[:,1],X[:,2],c=y)
ax.view_init(10,-60)
# this is a 2D dataset embedded in 3D, but it is embedded in such a way that
#PCA can't discover the underlying data orientation.
from sklearn import decomposition
X_pca=decomposition.PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:,0],X_pca[:,1],c=y)
#Manifold learning algorithms, however, available in the sklearn.manifold
#submodule, are able to recover the underlying 2-dimensional manifold:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=15, n_components=2)
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y)
"""
Exercise: Compare the results of Isomap and PCA on a 5-class subset of the
digits dataset (load_digits(5))
Bonus: Also compare to TSNE, another popular manifold learning technique.
"""
from sklearn.datasets import load_digits
digits=load_digits(5)
X=digits.data
isomap=Isomap(n_neighbors=15,n_components=2)
X_trans=isomap.fit_transform(X)
print(X_trans.shape)
plt.scatter(X_trans[:,0],X_trans[:,1],c=digits.target)
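# For the comparison part of the exercise, the PCA projection of the same 5-class
# subset (a minimal sketch; the tutorial's reference solution may differ):
from sklearn.decomposition import PCA
X_pca5 = PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca5[:, 0], X_pca5[:, 1], c=digits.target)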
# Another method
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=digits.target)
print(X_tsne.shape)
"""
Clustering with unsupervised learning method
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(random_state=42)
X.shape
plt.scatter(X[:,0],X[:,1])
"""
There are 3 groups in the data. We want to recover them using clustering.
Even if the groups are obvious in the data, it is hard to find them when
the data is located in high-dimensional space.
We will use one of the simplest clustering algorithms, K-means.
"""
from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=3,random_state=42)
labels=kmeans.fit_predict(X)
print(all(labels==kmeans.labels_))
plt.scatter(X[:,0],X[:,1],c=labels)
"""
We need a better measure of clustering accuracy, so we compare our result
to the ground truth that we got when generating the blobs
"""
from sklearn.metrics import confusion_matrix,accuracy_score
print(accuracy_score(y,labels))
print(confusion_matrix(y,labels))
np.mean(y==labels)
# We need a score that is invariant to permutations of the labels:
# attention, the adjusted Rand index is an important clustering metric for exactly this reason
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y,labels)
"""
Clustering comes with assumptions: a clustering algorithm finds clusters by
making assumptions about which samples should be grouped together. Each algorithm
makes different assumptions, and the quality and interpretability of your results
will depend on whether the assumptions are satisfied for your goal.
For K-means clustering, the model assumes that all clusters have equal, spherical
variance: there are cluster centers, with circles around those centers that
capture the variance of each cluster's points.
VERY IMPORTANT: if we want to make the K-means algorithm fail, we can generate
non-isotropic clusters.
"""
from sklearn.datasets import make_blobs
X,y=make_blobs(random_state=170,n_samples=600)
rng=np.random.RandomState(74)
transformation=rng.normal(size=(2,2))
X=np.dot(X,transformation)
y_pred=KMeans(n_clusters=3).fit_predict(X)
plt.scatter(X[:,0],X[:,1],c=y_pred)
kmeans.cluster_centers_
# After this failed example for kmeans we go to an exercise
"""
Digits clustering
"""
from sklearn.datasets import load_digits
digits=load_digits()
# This is the solution from solutions/08B_digits_clustering.py
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
clusters = kmeans.fit_predict(digits.data)
print(kmeans.cluster_centers_.shape)
#------------------------------------------------------------
# visualize the cluster centers
fig = plt.figure(figsize=(8, 3))
for i in range(10):
    ax = fig.add_subplot(2, 5, 1 + i)
    ax.imshow(kmeans.cluster_centers_[i].reshape((8, 8)),
              cmap=plt.cm.binary)
from sklearn.manifold import Isomap
X_iso = Isomap(n_neighbors=10).fit_transform(digits.data)
#-----------------------------------------------------------
# visualize the projected data
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].scatter(X_iso[:, 0], X_iso[:, 1], c=clusters)
ax[1].scatter(X_iso[:, 0], X_iso[:, 1], c=digits.target)
# End of the solution code
plt.imshow(digits.images[0])
plt.figure()
plt.imshow(digits.images[0],interpolation='nearest')
plt.matshow(digits.images[0])
adjusted_rand_score(digits.target,clusters)
"""
Now we start applying the ML paradigms and algorithms to real data.
Case Study number 1 - Supervised Classification of Handwritten Digits
First of all: a good way to start a data problem is to visualize the data using
one of the dimensionality reduction techniques. One starts with the most
straightforward one, Principal Component Analysis (PCA).
IDEA OF PCA:
PCA seeks orthogonal linear combinations of the features which show the
greatest variance and, as such, can help give us a good idea of the structure
of the data.
We will use RandomizedPCA because it is faster for large N.
"""
from sklearn.datasets import load_digits
digits=load_digits()
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
from sklearn.decomposition import RandomizedPCA
pca=RandomizedPCA(n_components=2,random_state=1999)
proj=pca.fit_transform(digits.data)
plt.scatter(proj[:,0],proj[:,1],c=digits.target)
plt.colorbar()
"""
A weakness of PCA is that it produces a linear dimensionality reduction:
this may miss some interesting relationships in the data.
For non-linear mappings, we can use the methods in the manifold module. For the
moment, we will use Isomap (short for Isometric Mapping), which is based on
graph theory.
"""
from sklearn.manifold import Isomap
iso=Isomap(n_neighbors=5,n_components=2)
proj=iso.fit_transform(digits.data)
plt.scatter(proj[:,0],proj[:,1],c=digits.target)
plt.colorbar()
# these visualizations show us that there is hope: even a simple classifier
#should be able to adequately identify the members of the various classes.
"""
Now we continue with the basic idea of finding the simplest method or algorithm
that can make sense of the data before moving on to more complex methods.
A good baseline method is Gaussian Naive Bayes:
It is a generative classifier which fits an axis-aligned multi-dimensional
Gaussian distribution to each training label, and uses this to quickly give
a rough classification. It is generally not sufficiently accurate for
real-world data, but can perform surprisingly well.
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split
#split the data into training and validation sets
X_train, X_test, y_train, y_test=train_test_split(digits.data,digits.target)
#train the model
clf=GaussianNB()
clf.fit(X_train,y_train)
#use the model to predict the labels of the test data
predicted=clf.predict(X_test)
expected=y_test
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test.reshape(-1, 8, 8)[i], cmap=plt.cm.binary,
              interpolation='nearest')
    # label the image with the target value
    if predicted[i] == expected[i]:
        ax.text(0, 7, str(predicted[i]), color='green')
    else:
        ax.text(0, 7, str(predicted[i]), color='red')
#Quantitative analysis of the error
matches = (predicted == expected)
print(matches.sum())
print(len(matches))
matches.sum() / float(len(matches))
print(clf.score(X_test, y_test))
from sklearn import metrics
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
plt.matshow(metrics.confusion_matrix(expected, predicted))
#plt.matshow(metrics.confusion_matrix(expected, predicted),map="gray")
"""
Let's start now with a difficult example which is
Unsupervised Preprocessing and an example from Image Processing
"""
import matplotlib.pyplot as plt
# Using PCA to plot Datasets
"""
PCA is a useful preprocessing technique for both visualizing data in 2 or 3
dimensions, and for improving the performance of downstream algorithms such as
classifiers. We will see more details about using PCA as part of a ML pipeline
in the next section, but here we explain the intuition behind what PCA does and
why it is useful for certain tasks.
The goal of PCA is to find the dimensions of maximum variation in the data and
project onto them. This is helpful for data that is stretched in a particular
dimension. Here is a 2D example:
"""
import numpy as np
random_state=np.random.RandomState(1999)
X=random_state.randn(500,2)
red_idx=np.where(X[:,0]<0)[0]
blue_idx=np.where(X[:,0]>=0)[0]
#stretching
s_matrix=np.array([[1,0],[0,20]])
#Rotation
r_angle=33
r_rad=np.pi*r_angle/180
r_matrix=np.array([[np.cos(r_rad), -np.sin(r_rad)],[np.sin(r_rad), np.cos(r_rad)]])
X=np.dot(X,s_matrix).dot(r_matrix)
plt.scatter(X[red_idx,0],X[red_idx,1],color="darkred")
plt.scatter(X[blue_idx,0],X[blue_idx,1],color="steelblue")
plt.axis('off')
plt.title("Skewed Data")
# We use PCA method now
from sklearn.decomposition import PCA
pca=PCA()
X_t=pca.fit_transform(X)
plt.scatter(X_t[red_idx,0],X_t[red_idx,1],color="darkred")
plt.scatter(X_t[blue_idx,0],X_t[blue_idx,1],color="steelblue")
plt.axis('off')
plt.title("PCA Corrected Data")
"""
Note that we can use PCA to visualize complex data in low dimensions in order
to see how "close" and "far" different datapoints are in a 2D space.
There are many different ways to do this visualization, and some common
algorithms are found in sklearn.manifold. PCA is one of the simplest and most
common methods for quickly visualizing a dataset.
"""
"""
Now we'll take a look at unsupervised learning on a facial recognition example.
This uses a dataset available within scikit-learn consisting of a subset of
the Labeled Faces in the Wild data. Note that this is a relatively large
download (~200MB) so it may take a while to execute.
"""
from sklearn import datasets
# The dataset will be downloaded from internet 200 MB
lfw_people=datasets.fetch_lfw_people(min_faces_per_person=70,resize=0.4,
data_home='datasets')
lfw_people.data.shape
# Visualization of the faces
# Let's visualize these faces to see what we're working with:
fig=plt.figure(figsize=(8,6))
#plot several images
for i in range(15):
    ax=fig.add_subplot(3,5,i+1,xticks=[],yticks=[])
    ax.imshow(lfw_people.images[i],cmap=plt.cm.bone)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(lfw_people.data,
lfw_people.target, random_state=0)
print(X_train.shape, X_test.shape)
from sklearn import decomposition
pca = decomposition.RandomizedPCA(n_components=150, whiten=True)
pca.fit(X_train)
plt.imshow(pca.mean_.reshape((50, 37)), cmap=plt.cm.bone)
print(pca.components_.shape)
fig = plt.figure(figsize=(16, 6))
for i in range(30):
    ax = fig.add_subplot(3, 10, i + 1, xticks=[], yticks=[])
    ax.imshow(pca.components_[i].reshape((50, 37)), cmap=plt.cm.bone)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
print(X_test_pca.shape)
"""
"""
import numpy as np
plt.figure(figsize=(10, 2))
unique_targets = np.unique(lfw_people.target)
counts = [(lfw_people.target == i).sum() for i in unique_targets]
plt.xticks(unique_targets, lfw_people.target_names[unique_targets])
locs, labels = plt.xticks()
plt.setp(labels, rotation=45, size=14)
_ = plt.bar(unique_targets, counts)
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
lfw_people.data, lfw_people.target, random_state=0)
print(X_train.shape, X_test.shape)
from sklearn import decomposition
pca = decomposition.RandomizedPCA(n_components=150, whiten=True,
random_state=1999)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print(X_train_pca.shape)
print(X_test_pca.shape)
from sklearn import svm
clf = svm.SVC(C=5., gamma=0.001)
clf.fit(X_train_pca, y_train)
fig = plt.figure(figsize=(8, 6))
for i in range(15):
    ax = fig.add_subplot(3, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(X_test[i].reshape((50, 37)), cmap=plt.cm.bone)
    y_pred = clf.predict(X_test_pca[i:i + 1])[0]  # predict expects a 2D array
    color = 'black' if y_pred == y_test[i] else 'red'
    ax.set_title(lfw_people.target_names[y_pred], fontsize='small', color=color)
print(clf.score(X_test_pca, y_test))
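# A hedged follow-up (not from the tutorial): per-class precision/recall of the
# face classifier, using the person names stored in the dataset.
from sklearn import metrics
y_pred_all = clf.predict(X_test_pca)
print(metrics.classification_report(y_test, y_pred_all,
                                    target_names=lfw_people.target_names))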
"""
NOW we start the next 3 hours of the scikit-learn course:
Machine Learning with Scikit-Learn,
"SciPy 2015 Tutorial, Andreas Mueller & Kyle Kastner, Part I & II"
"""
"""
Let's start with the Cross validation techniques:
"""
# Add this later to the cross-validation and grid-search part
# (imports added so the fragment is self-contained; X, y and param_grid are
# placeholders to be defined when this is actually used)
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
clf = GridSearchCV(SVR(), param_grid=param_grid)
cross_val_score(clf, X, y)
"""
Now we go in depth into the linear models.
Linear models for classification:
All linear models for classification learn a coefficient vector coef_ and
an offset intercept_, and make predictions using a linear combination of the
features. The prediction works like in regression, except that a threshold
at zero is applied.
The difference between the various linear models lies in the regularization of
coef_ and intercept_ and in the loss function.
For linear classification, the 2 most common models are the linear SVM,
implemented in LinearSVC, and LogisticRegression.
Regularization:
In the presence of many features, a linear classifier can overfit, so it is
necessary to regularize. Large C values give a lightly regularized model,
while small C values give a strongly regularized model.
We can observe two kinds of behavior:
- With strong regularization: importance is given to the majority of the points;
it is enough if most of the points are classified correctly.
- With weak regularization: importance is given to each individual data point.
"""
# An illustration using a linear SVM with different values of C:
from figures import plot_linear_svc_regularization
plot_linear_svc_regularization()
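# If the "figures" helper is not available, here is a minimal sketch of the same
# idea (assumed data and C values, not the tutorial's exact figure): compare a
# small and a large C for LinearSVC on simple 2D blobs.
from sklearn.svm import LinearSVC
from sklearn.datasets import make_blobs
X_c, y_c = make_blobs(centers=2, random_state=0)
for C in (0.01, 100):
    svc = LinearSVC(C=C).fit(X_c, y_c)
    print("C=%g  training accuracy: %.3f" % (C, svc.score(X_c, y_c)))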
# Similarly for the Ridge/Lasso distinction: we can set the penalty parameter
# to l1 to enforce sparsity of the coefficients.
# Exercise: use LogisticRegression to classify digits, and
# grid-search for the C parameter.
from sklearn.linear_model import LogisticRegression
params={'C' : [0.001,0.01,0.1,1,10,100]}
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
digits=load_digits()
X_train,X_test,y_train,y_test=train_test_split(digits.data,digits.target)
grid=GridSearchCV(LogisticRegression(),param_grid=params,n_jobs=-1)
grid.fit(X_train,y_train)
grid.score(X_test,y_test)
"""
Linear models for regression:
Linear models are useful when little data is available or for very large feature
spaces, as in text classification.
They form a good case study for regularization.
The coefficients are stored in coef_,
the intercept is stored in intercept_.
The most standard linear model is ordinary least squares regression, often
simply called linear regression. This model does not put any additional
restrictions on coef_, so when the number of features is large it becomes
ill-posed and the model overfits.
"""
#Now we will generate a simple simulation and see the behavior of the model
import numpy as np
import matplotlib.pyplot as plt
rng=np.random.RandomState(4)
X=rng.normal(size=(1000,50))
beta=rng.normal(size=50)
y=np.dot(X,beta)+4*rng.normal(size=1000)
from sklearn.utils import shuffle
X,y=shuffle(X,y)
from sklearn import linear_model, cross_validation
from sklearn.learning_curve import learning_curve
def plot_learning_curve(est, X, y):
    training_set_size, train_scores, test_scores = learning_curve(
        est, X, y, train_sizes=np.linspace(0.1, 1, 30))
    estimator_name = est.__class__.__name__
    line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
                    label="training scores " + estimator_name)
    # also plot the test (cross-validation) scores, in the same color
    plt.plot(training_set_size, test_scores.mean(axis=1), '-',
             label="test scores " + estimator_name, c=line[0].get_color())
    plt.xlabel("training set size")
    plt.legend(loc="best")
    #plt.ylim(-1,1)
plot_learning_curve(linear_model.LinearRegression(),X,y)
"""
We see two important things:
Ordinary linear regression is not well defined if the number of training samples
is less than the number of features.
In the presence of noise, this model overfits: we then need to regularize.
"""
"""
The Ridge estimator is a simple regularization (called the l2 penalty) of
ordinary least squares regression (OLS).
The Ridge estimator is less expensive to compute than unregularized OLS.
"""
plot_learning_curve(linear_model.LinearRegression(),X,y)
plot_learning_curve(linear_model.Ridge(alpha=20),X,y)
plot_learning_curve(linear_model.RidgeCV(), X, y)
"""
The Lasso estimator is useful for imposing sparsity on the coefficients.
It is used if we believe that many of the features are not relevant; this is
done via the l1 penalty.
"""
#Let us create such a situation with a new simulation where only 10 out
#of the 50 features are relevant:
beta[10:] = 0
y = np.dot(X, beta) + 4*rng.normal(size=1000)
plot_learning_curve(linear_model.Ridge(), X, y)
plot_learning_curve(linear_model.Lasso(), X, y)
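# A quick check of the sparsity induced by the l1 penalty (a sketch, assuming the
# simulated X, y above): count the non-zero Lasso coefficients.
lasso = linear_model.Lasso().fit(X, y)
print("non-zero coefficients: %d out of %d"
      % (np.sum(lasso.coef_ != 0), len(lasso.coef_)))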
"""
I will return to the linear models another day.
Now let's move on to Support Vector Machines:
For classification problems: SVC
For regression problems: SVR
Linear SVM and Kernel SVM (linear,poly,rbf)
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import rbf_kernel
line=np.linspace(-3,3,100)[:,np.newaxis]
kernel_value=rbf_kernel(line,[[0]],gamma=1)
plt.plot(line,kernel_value)
# the idea here is to vary the value of C and gamma and see the change
from figures import plot_svm_interactive
plot_svm_interactive()
# Exercise without solution
from sklearn import datasets
digits = datasets.load_digits()
X, y = digits.data, digits.target
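# A hedged sketch of one possible solution to the exercise (grid values are
# illustrative, not the tutorial's): grid-search C and gamma for an RBF-kernel
# SVC on the digits data.
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.score(X_test, y_test))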
"""
Estimators in Depth: Trees and Forests:
Here we will explore a class of algorithms based on decision trees.
Decision trees are very intuitive: they encode a series of if/else choices,
quite similar to how a person might make a decision. However, which questions
to ask and how to proceed for each answer is entirely learned from the data.
"""
"""
Decision tree Regression:
A decision tree is a simple binary classification tree that is similar to nearest
neighbor classification.
"""
from figures import make_dataset
x,y=make_dataset()
X=x.reshape(-1,1)
from sklearn.tree import DecisionTreeRegressor
reg=DecisionTreeRegressor(max_depth=5)
reg.fit(X,y)
X_fit=np.linspace(-3,3,1000).reshape((-1,1))
y_fit_1=reg.predict(X_fit)
plt.plot(X_fit.ravel(),y_fit_1,color='blue',label='prediction')
plt.plot(X.ravel(),y,'.k',label='training data')
plt.legend(loc='best')
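# A small sketch (not from the tutorial) showing how max_depth controls the
# step-wise fit of the regression tree on the same data:
for depth in (2, 5, 10):
    y_depth = DecisionTreeRegressor(max_depth=depth).fit(X, y).predict(X_fit)
    plt.plot(X_fit.ravel(), y_depth, label='max_depth=%d' % depth)
plt.legend(loc='best')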
"""
Decision Tree Classification:
"""
from sklearn.datasets import make_blobs
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from figures import plot_2d_separator
X,y=make_blobs(centers=[[0,0],[1,1]],random_state=61526,n_samples=100)
X_train,X_test,y_train,y_test=train_test_split(X,y)
clf=DecisionTreeClassifier(max_depth=5)
clf.fit(X_train,y_train)
plot_2d_separator(clf,X,fill=True)