def load_lfw_people(data_home=None, funneled=True, resize=0.5, min_faces_per_person=10, color=False, slice_=(slice(70, 195), slice(78, 172))): """Loader for the Labeled Faces in the Wild (LFW) people dataset This dataset is a collection of JPEG pictures of famous people collected on the internet, all details are available on the official website: http://vis-www.cs.umass.edu/lfw/ Each picture is centered on a single face. Each pixel of each channel (color in RGB) is encoded by a float in range 0.0 - 1.0. The task is called Face Recognition (or Identification): given the picture of a face, find the name of the person given a training set (gallery). Parameters ---------- data_home: optional, default: None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. funneled: boolean, optional, default: True Download and use the funneled variant of the dataset. resize: float, optional, default 0.5 Ratio used to resize the each face picture. min_faces_per_person=10: int, optional, default 10 The extracted dataset will only retain pictures of people that have at least `min_faces_per_person` different pictures. color: boolean, optional, default False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than than the shape with color = False. slice_: optional Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background """ lfw_home, data_folder_path = check_fetch_lfw(data_home=data_home, funneled=funneled) logging.info('Loading LFW people faces from %s', lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(cachedir=lfw_home, mmap_mode='c', verbose=0) load_func = m.cache(_load_lfw_people)
def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Loader for the Labeled Faces in the Wild (LFW) pairs dataset This dataset is a collection of JPEG pictures of famous people collected on the internet, all details are available on the official website: http://vis-www.cs.umass.edu/lfw/ Each picture is centered on a single face. Each pixel of each channel (color in RGB) is encoded by a float in range 0.0 - 1.0. The task is called Face Verification: given a pair of two pictures, a binary classifier must predict whether the two images are from the same person. In the official `README.txt`_ this task is described as the "Restricted" task. As I am not sure as to implement the "Unrestricted" variant correctly, I left it as unsupported for now. .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt Parameters ---------- subset: optional, default: 'train' Select the dataset to load: 'train' for the development training set, 'test' for the development test set, and '10_folds' for the official evaluation set that is meant to be used with a 10-folds cross validation. data_home: optional, default: None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. funneled: boolean, optional, default: True Download and use the funneled variant of the dataset. resize: float, optional, default 0.5 Ratio used to resize the each face picture. color: boolean, optional, default False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than than the shape with color = False. slice_: optional Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background download_if_missing: optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. """ lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) logging.info('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(cachedir=lfw_home, mmap_mode='c', verbose=0) load_func = m.cache(_fetch_lfw_pairs) # select the right metadata file according to the requested subset label_filenames = { 'train': 'pairsDevTrain.txt', 'test': 'pairsDevTest.txt', '10_folds': 'pairs.txt', } if subset not in label_filenames: raise ValueError("subset='%s' is invalid: should be one of %r" % ( subset, list(sorted(label_filenames.keys())))) index_file_path = join(lfw_home, label_filenames[subset]) # load and memoize the pairs as np arrays pairs, target, target_names = load_func( index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_) # pack the results as a Bunch instance return Bunch(data=pairs, target=target, target_names=target_names, DESCR="'%s' segment of the LFW pairs dataset" % subset)
def fetch_lfw_people(data_home=None, funneled=True, resize=0.5, min_faces_per_person=None, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Loader for the Labeled Faces in the Wild (LFW) people dataset This dataset is a collection of JPEG pictures of famous people collected on the internet, all details are available on the official website: http://vis-www.cs.umass.edu/lfw/ Each picture is centered on a single face. Each pixel of each channel (color in RGB) is encoded by a float in range 0.0 - 1.0. The task is called Face Recognition (or Identification): given the picture of a face, find the name of the person given a training set (gallery). Parameters ---------- data_home: optional, default: None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. funneled: boolean, optional, default: True Download and use the funneled variant of the dataset. resize: float, optional, default 0.5 Ratio used to resize the each face picture. min_faces_per_person: int, optional, default None The extracted dataset will only retain pictures of people that have at least `min_faces_per_person` different pictures. color: boolean, optional, default False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than than the shape with color = False. slice_: optional Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background download_if_missing: optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. """ lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) logging.info('Loading LFW people faces from %s', lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(cachedir=lfw_home, mmap_mode='c', verbose=0) load_func = m.cache(_fetch_lfw_people) # load and memoize the pairs as np arrays faces, target, target_names = load_func( data_folder_path, resize=resize, min_faces_per_person=min_faces_per_person, color=color, slice_=slice_) # pack the results as a Bunch instance return Bunch(data=faces, target=target, target_names=target_names, DESCR="LFW faces dataset")
def fetch_lfw_pairs(subset='train', data_home=None, funneled=True, resize=0.5, color=False, slice_=(slice(70, 195), slice(78, 172)), download_if_missing=True): """Loader for the Labeled Faces in the Wild (LFW) pairs dataset This dataset is a collection of JPEG pictures of famous people collected on the internet, all details are available on the official website: http://vis-www.cs.umass.edu/lfw/ Each picture is centered on a single face. Each pixel of each channel (color in RGB) is encoded by a float in range 0.0 - 1.0. The task is called Face Verification: given a pair of two pictures, a binary classifier must predict whether the two images are from the same person. In the official `README.txt`_ this task is described as the "Restricted" task. As I am not sure as to implement the "Unrestricted" variant correctly, I left it as unsupported for now. .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt Parameters ---------- subset: optional, default: 'train' Select the dataset to load: 'train' for the development training set, 'test' for the development test set, and '10_folds' for the official evaluation set that is meant to be used with a 10-folds cross validation. data_home: optional, default: None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/scikit_learn_data' subfolders. funneled: boolean, optional, default: True Download and use the funneled variant of the dataset. resize: float, optional, default 0.5 Ratio used to resize the each face picture. color: boolean, optional, default False Keep the 3 RGB channels instead of averaging them to a single gray level channel. If color is True the shape of the data has one more dimension than than the shape with color = False. slice_: optional Provide a custom 2D slice (height, width) to extract the 'interesting' part of the jpeg files and avoid use statistical correlation from the background download_if_missing: optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. """ lfw_home, data_folder_path = check_fetch_lfw( data_home=data_home, funneled=funneled, download_if_missing=download_if_missing) logging.info('Loading %s LFW pairs from %s', subset, lfw_home) # wrap the loader in a memoizing function that will return memmaped data # arrays for optimal memory usage m = Memory(cachedir=lfw_home, mmap_mode='c', verbose=0) load_func = m.cache(_fetch_lfw_pairs) # select the right metadata file according to the requested subset label_filenames = { 'train': 'pairsDevTrain.txt', 'test': 'pairsDevTest.txt', '10_folds': 'pairs.txt', } if subset not in label_filenames: raise ValueError("subset='%s' is invalid: should be one of %r" % (subset, list(sorted(label_filenames.keys())))) index_file_path = join(lfw_home, label_filenames[subset]) # load and memoize the pairs as np arrays pairs, target, target_names = load_func(index_file_path, data_folder_path, resize=resize, color=color, slice_=slice_) # pack the results as a Bunch instance return Bunch(data=pairs, target=target, target_names=target_names, DESCR="'%s' segment of the LFW pairs dataset" % subset)
(redirects_url, redirects_filename), (page_links_url, page_links_filename), ] for url, filename in resources: if not os.path.exists(filename): import urllib print "Downloading data from '%s', please wait..." % url opener = urllib.urlopen(url) open(filename, 'wb').write(opener.read()) print ################################################################################ # Loading the redirect files memory = Memory(cachedir=".") def index(redirects, index_map, k): """Find the index of an article name after redirect resolution""" k = redirects.get(k, k) return index_map.setdefault(k, len(index_map)) DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/") SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1) def short_name(nt_uri): """Remove the < and > URI markers and the common URI prefix""" return nt_uri[SHORTNAME_SLICE]
X = np.random.randn(n_samples, size**2) for x in X: # smooth data x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel() X -= X.mean(axis=0) X /= X.std(axis=0) y = np.dot(X, coef.ravel()) noise = np.random.randn(y.shape[0]) noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2) y += noise_coef * noise # add noise ############################################################################### # Compute the coefs of a Bayesian Ridge with GridSearch cv = KFold(len(y), 2) # cross-validation generator for model selection ridge = BayesianRidge() mem = Memory(cachedir='.', verbose=1) # Ward agglomeration followed by BayesianRidge A = grid_to_graph(n_x=size, n_y=size) ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem, n_components=1) clf = Pipeline([('ward', ward), ('ridge', ridge)]) # Select the optimal number of parcels with grid search clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1) clf.fit(X, y, cv=cv) # set the best parameters coef_ = clf.best_estimator.steps[-1][1].coef_ coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_) coef_agglomeration_ = coef_.reshape(size, size) # Anova univariate feature selection followed by BayesianRidge f_regression = mem.cache(feature_selection.f_regression) # caching function
X = np.random.randn(n_samples, size**2) for x in X: # smooth data x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel() X -= X.mean(axis=0) X /= X.std(axis=0) y = np.dot(X, coef.ravel()) noise = np.random.randn(y.shape[0]) noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2) y += noise_coef * noise # add noise ############################################################################### # Compute the coefs of a Bayesian Ridge with GridSearch cv = KFold(len(y), 2) # cross-validation generator for model selection ridge = BayesianRidge() mem = Memory(cachedir='.', verbose=1) # Ward agglomeration followed by BayesianRidge A = grid_to_graph(n_x=size, n_y=size) ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem, n_components=1) clf = Pipeline([('ward', ward), ('ridge', ridge)]) parameters = {'ward__n_clusters': [10, 20, 30]} # Select the optimal number of parcels with grid search clf = GridSearchCV(clf, parameters, n_jobs=1) clf.fit(X, y, cv=cv) # set the best parameters coef_ = clf.best_estimator.steps[-1][1].coef_ coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_) coef_agglomeration_ = coef_.reshape(size, size)