Example #1
def tryLinearDiscriminantAnalysis(goFast):
  from sklearn.datasets import dump_svmlight_file, load_svmlight_file
  if goFast:
    training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
  else:
    training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
    validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
    testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

  from sklearn.lda import LDA
  from sklearn.metrics import accuracy_score
  from sklearn.grid_search import ParameterGrid
  from sklearn.decomposition import RandomizedPCA

  rpcaDataGrid = [{"n_components": [10,45,70,100],
                    "iterated_power": [2, 3, 4],
                    "whiten": [True]}]

  for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
    rpcaOperator = RandomizedPCA(**rpca_parameter_set)
    rpcaOperator.fit(training_data,training_labels)
    new_training_data = rpcaOperator.transform(training_data)
    new_validation_data = rpcaOperator.transform(validation_data)
    ldaOperator = LDA()
    ldaOperator.fit(new_training_data,training_labels)
    print "Score = " + str(accuracy_score(validation_labels,ldaOperator.predict(new_validation_data)))
Example #2
def test_randomized_pca_check_list():
    """Test that the projection by RandomizedPCA on list data is correct"""
    X = [[1.0, 0.0], [0.0, 1.0]]
    X_transformed = RandomizedPCA(n_components=1, random_state=0).fit(X).transform(X)
    assert_equal(X_transformed.shape, (2, 1))
    assert_almost_equal(X_transformed.mean(), 0.00, 2)
    assert_almost_equal(X_transformed.std(), 0.71, 2)
Example #3
def make_pca_datapoints(terms_map, stopwords, clusters):
	new_terms_map = {}
	raw_data = []
	target = []
	for line in open(tweets_file):
		tokens = line.split()
		terms = [terms_map[int(term)] for term in tokens[3].split(',') if terms_map[int(term)] not in stopwords]
		for term in terms:
			if not term in new_terms_map:
				new_terms_map[term] = len(new_terms_map)
		new_term_ids = [new_terms_map[term] for term in terms]
		tags = [terms_map[int(term)] for term in tokens[4].split(',')]
		raw_data.append(new_term_ids)
		target.append(tags)
	data = lil_matrix( (len(raw_data), len(new_terms_map)) )
	count = 0
	for cur_vector in raw_data:
		for point in cur_vector:
			data[(count, point)] += 1
		count += 1
	pca = RandomizedPCA (n_components=100)
	transformed_data = pca.fit_transform(data) 
	
	xs = []
	ys = []
	count = 0
	for datum in transformed_data:
		for tag in target[count]:
			if (len(tag) > 1) and tag[1:] in clusters:
				xs.append(datum)
				ys.append(clusters[tag[1:]])
		count += 1

	del transformed_data
	return xs, ys	
Example #4
def build_classifier(train_data_x_in, train_data_y, classifier_in="svc_basic"):
    print "Attempting to build classifier."
    train_data_x = train_data_x_in
    transformer = ""
    # classifier = grid_search.GridSearchCV(svm.SVC(), parameters).fit(train_data_x, train_data_y)
    if classifier_in == "svc_basic":
        classifier = svm.SVC()
        print "Selection was basic svm.SVC."
    elif classifier_in == "svc_extensive":
        classifier = svm.SVC(kernel="linear", C=0.025, gamma=0.01)
        print "Selection was extensive svm.SVC, with linear kernel, C==0.025 and gamma==0.01."
    elif classifier_in == "kneighbors_basic":
        transformer = RandomizedPCA(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was KNeighbors basic, using RandomizedPCA to transform data first. n_components==2000."
    elif classifier_in == "bagging_basic":
        classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
        print "Selection was Bagging basic, with max_samples==0.5 and max_features==0.5."
    elif classifier_in == "spectral_basic":
        transformer = SpectralEmbedding(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was Spectral basic, using svm.SVC with Spectral data fitting. n_components==2000."
    # default to SVC in case of any sort of parsing error.
    else:
        print "Error in selecting classifier class. Reverting to SVC."
        classifier = svm.SVC()
    classifier.fit(train_data_x, train_data_y)
    print "Doing classifier estimation."
    return classifier, train_data_x, transformer
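
The snippet above relies on imports that are not shown. A hedged driver sketch, spelling out the assumed imports and using made-up toy data (note the snippet itself uses Python 2 print statements):

# Assumed context for the snippet above; none of these imports appear in the original.
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import RandomizedPCA   # older scikit-learn only

# Toy data: 40 samples, 30 features, binary labels.
train_x = np.random.rand(40, 30)
train_y = np.random.randint(0, 2, size=40)
clf, transformed_x, transformer = build_classifier(train_x, train_y, classifier_in="svc_basic")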
Example #5
    def _prepare_pca(self, data, max_n_components):
        """ Helper Function """
        from sklearn.decomposition import RandomizedPCA

        # sklearn < 0.11 does not support random_state argument
        kwargs = {'n_components': max_n_components, 'whiten': False}

        aspec = inspect.getargspec(RandomizedPCA.__init__)
        if 'random_state' not in aspec.args:
            warnings.warn('RandomizedPCA does not support random_state '
                          'argument. Use scikit-learn to version 0.11 '
                          'or newer to get reproducible results.')
        else:
            kwargs['random_state'] = 0

        pca = RandomizedPCA(**kwargs)
        pca_data = pca.fit_transform(data.T)

        if self._explained_var > 1.0:
            if self.n_components is not None:  # normal n case
                self._comp_idx = np.arange(self.n_components)
                to_ica = pca_data[:, self._comp_idx]
            else:  # None case
                to_ica = pca_data
                self.n_components = pca_data.shape[1]
                self._comp_idx = np.arange(self.n_components)
        else:  # float case
            expl_var = pca.explained_variance_ratio_
            self._comp_idx = (np.where(expl_var.cumsum() <
                                      self._explained_var)[0])
            to_ica = pca_data[:, self._comp_idx]
            self.n_components = len(self._comp_idx)

        return to_ica, pca
Example #6
def test_explained_variance():
    """Check that PCA output has unit-variance"""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=42).fit(X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_,
                              np.var(X_rpca, axis=0))

    # Compare with RandomizedPCA using sparse data
    X = csr_matrix(X)
    rpca = assert_warns(DeprecationWarning, rpca.fit, X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)
Example #7
def rpca(numpy_file='../data/Paintings/two_class/Paintings_train.csv'):
    """ Performs randomized PCA on given numpy file.

    Given a numpy file of n-rows and n-cols, where the last column is
    the label and rest are features,n-rows are the samples.

    :type numpy_file: string
    :param numpy_file: The file name of numpy file to be analyzed.
    """
    import numpy as np
    import matplotlib.pyplot as pl
    import pandas as pd
    from sklearn.decomposition import RandomizedPCA

    all_data = np.loadtxt(numpy_file,delimiter=',')
    data = all_data[:,:-1]
    y = all_data[:,-1]
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],\
                    "label":np.where(y==1, "realism", "abstract")})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label']==label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.title('Randomized PCA analysis')
    pl.show()
Example #8
def SVM(X_train, y_train, X_test):
    print("SVM with PCA of rbf, writening all on, no normalize")
    preprocessing.normalize(X_train, 'max')
    preprocessing.normalize(X_test, 'max')
    #preprocessing.robust_scale(X, axis=1, with_centering = True) #bad
    X_train = equalize_hist(X_train)
    X_test = equalize_hist(X_test)
    '''X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO)'''

    n_components = 147

    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    classifier13.fit(X_train_pca, y_train)
    return list(classifier13.predict(X_test_pca))
Example #9
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required = True, help = "Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    rects, img = detect(image)

    cropped = []

    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y1 + (y2 - y1), x1:x1 + (x2 - x1)]
        crop_img = cv2.resize(crop_img, (100,100), interpolation = cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        new_img = crop_img.reshape(crop_img.shape[0] * crop_img.shape[1], 3)
        cropped.append(new_img.flatten())

    # reduce feature size
    cropped_pca = []
    pca = RandomizedPCA(n_components=100)
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now)
    clf   = SVC(probability=True)
    train = cropped_pca[:7]
    test  = cropped_pca[7:13]
    # clf.fit([[0,0],[1,1]], [1, 2])
    clf.fit(train, [1,2,2,1,2,1,1])

    for item in test:
        print clf.predict_proba(item)
        print clf.predict(item)

    cv2.waitKey(0)
Example #10
def SVM(X_data, y_data):

	X_data = equalize_hist(X_data) 
	preprocessing.normalize(X_data, 'max')
	preprocessing.scale(X_data, axis=1)
	# preprocessing.normalize(X_data, 'max')
	# X_data = equalize_hist(X_data) 

	# divide our data set into a training set and a test set
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_data, y_data, test_size=TRAIN_TEST_SPLIT_RATIO)

	n_components = 120

	print("Extracting the top %d eigenfaces from %d faces"
		% (n_components, X_train.shape[0]))
	pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)

	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
	'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
	classifier = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
	classifier.fit(X_train_pca, y_train)



	print("====== PCA 150 ========")
	print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
	print('TEST SCORE', classifier.score(X_test_pca, y_test))
Example #11
    def detect(self, imageURLs, params):

        array = []
        for param in params:
            img = self.img_to_matrix(param['imageURL'])
            data = self.flatten_image(img)
            array.append(data)
        array = np.array(array)

        pca = RandomizedPCA(n_components=5)
        n_data = pca.fit_transform(array)

        clf = joblib.load('src/resource/models/model.pkl')
        result = clf.predict(n_data).tolist()

        for param, r in zip(params, result):
            raw_img = urllib2.urlopen(param['imageURL']).read()
            if r == 1:
                cntr = len([i for i in os.listdir("test/images/rain/") if 'rain' in i]) + 1
                path = "static/images/rain_" + str(cntr) + '.jpg'
                f = open(path, 'wb')
                f.write(raw_img)
                f.close()
                # create the event information
                when = {'type': 'timestamp', 'time':param['time']}
                where = { "type": "Point", "coordinates": [param['longitude'], param['latitude']]}
                what = {'topic': {'value':u'雨'}, 'tweet': param['value']}
                who = [{"type": "url", "value": param['imageURL']},
                       {"value": "evwh <*****@*****.**>", "type": "author"}]
                event = {'observation':{'what': what, 'when': when, 'where': where, 'who': who}}
                self.connection['event']['TwitterImageRainSensor'].insert(event)
Example #12
def test_sparse_randomized_pca_inverse():
    """Test that RandomizedPCA is inversible on sparse data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(X)

    # same check that we can find the original data from the transformed signal
    # (since the data is almost of rank n_components)
    pca = RandomizedPCA(n_components=2, random_state=0)
    assert_warns(DeprecationWarning, pca.fit, X)
    Y = pca.transform(X)

    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.todense(), Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = assert_warns(DeprecationWarning, RandomizedPCA(n_components=2,
                       whiten=True, random_state=0).fit, X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X.todense() - Y_inverse)
                          / np.abs(X).mean()).max()
    # XXX: this does not seam to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
Example #13
def pca_data(test_x, train_x, params):
    print 'pcaing data ...'
    components = int(params['components'])
    pca = RandomizedPCA(components, whiten=True).fit(train_x)
    pca_train_x = pca.transform(train_x)
    pca_test_x  = pca.transform(test_x)
    return pca_test_x, pca_train_x
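
A minimal usage sketch for the helper above; the arrays and the params dict are invented, and RandomizedPCA is assumed to be imported from an older scikit-learn:

import numpy as np
# Hypothetical inputs: 100 training rows and 20 test rows with 64 features each.
train_x = np.random.rand(100, 64)
test_x = np.random.rand(20, 64)
pca_test_x, pca_train_x = pca_data(test_x, train_x, {'components': '16'})
# pca_train_x has shape (100, 16); pca_test_x has shape (20, 16).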
Example #14
def compute_pca(reception_stats,n_components=5):
    reception_mean = reception_stats.mean(axis=0)
    pca = RandomizedPCA(n_components-1)
    pca.fit(reception_stats)
    pca_components = np.vstack([reception_mean,pca.components_])

    return pca,pca_components
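
A short, hypothetical call for the helper above (the stats matrix is made up; RandomizedPCA is assumed to be imported elsewhere):

import numpy as np
# 80 players x 16 weekly reception totals of invented data.
reception_stats = np.random.rand(80, 16)
pca, pca_components = compute_pca(reception_stats, n_components=5)
# pca_components stacks the mean row on top of the n_components-1 principal components.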
Example #15
def getPrincipleComponents(xtr, xte, n_components=50):
    train = np.array(xtr)
    test = np.array(xte)
    pca = RandomizedPCA(n_components=n_components).fit(train)
    xtrain = pca.transform(train)
    xtest = pca.transform(test)
    return xtrain, xtest
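
A hedged usage sketch with invented data (again assuming RandomizedPCA is importable):

import numpy as np
# Project 200 training and 50 test vectors with 300 features onto 50 components.
xtr = np.random.rand(200, 300)
xte = np.random.rand(50, 300)
xtrain, xtest = getPrincipleComponents(xtr, xte, n_components=50)
# xtrain.shape == (200, 50), xtest.shape == (50, 50)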
Example #16
def do_pca(corr_matrix: _nested_ndarray, num_dim: int,
    min_var_explanation: float =0.7) -> _nested_ndarray:
    '''
    This method performs PCA on a self-correlation matrix, reducing the number of columns to `num_dim`.
    If such analysis does not sufficiently explain the underlying variance in the data, an exception is
    thrown.
    
    Args:

    * `corr_matrix` - a square matrix of correlations
    * `num_dim` - the number of dimensions to which the data should be reduced
    * `min_var_explanation` - the minimum fraction of the underlying data variance that should be explained

    Returns:

    > A matrix of the PCA result on `corr_matrix`.
    '''

    num_dim = int(num_dim)
    pca = PCA(n_components=num_dim, random_state=0)
    pca_result = pca.fit_transform(corr_matrix)
    var_ratio = pca.explained_variance_ratio_
    if sum(var_ratio) < min_var_explanation:
        raise PcaAccuracyException(
            'PCA doesn\'t explain enough of the variance in the data')

    return pca_result
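
A minimal sketch of how the function above might be called; the correlation matrix is synthetic, and PCA plus PcaAccuracyException are assumed to be defined or imported elsewhere in the same module:

import numpy as np

# Build a nearly rank-2 self-correlation matrix so that two components
# comfortably exceed the 0.7 variance threshold.
rng = np.random.RandomState(0)
latent = rng.rand(6, 2)                   # 6 items driven by 2 latent factors
observations = latent @ rng.rand(2, 40)   # 6 x 40 observation matrix
corr_matrix = np.corrcoef(observations)   # 6 x 6 self-correlation matrix
reduced = do_pca(corr_matrix, num_dim=2)  # result has shape (6, 2)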
Example #17
File: odr.py Project: caoym/odr
    def fit(self):

        wordids_map = NameToIndex()
        labs_map = NameToIndex()

        wordscount = self._word_cluster.get_words_count()
        print "start compute_tfidf ..."
        # compute the bag-of-words model for the documents
        docs = self._word_cluster.get_samples()
        count =0
        bow = []
        labs = []

        for k,v in docs.iteritems():
            vec = numpy.zeros(wordscount).tolist()
            for i in v:
                vec[wordids_map.map(i)]+=1
            bow.append(vec)
            labs.append(labs_map.map(k[0]))

        labs = numpy.array(labs)

        tfidf = TfidfTransformer(smooth_idf=True, sublinear_tf=True,use_idf=True)
        datas = numpy.array(tfidf.fit_transform(bow).toarray())

        print "compute_tfidf done"
        pca = RandomizedPCA(n_components=20, whiten=True).fit(datas)
        svc = train_svc(numpy.array(labs_map.names), labs, pca.transform(datas))

        self._tfidf = tfidf
        self._svc = svc
        self._labs_map = labs_map
        self._wordids_map = wordids_map
        self._pca = pca
Example #18
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
Example #19
def do_nbnn(train_folder, test_folder):
    train = load_patches(args.train_folder)
    test = load_patches(args.test_folder)
    if options.relu:
        get_logger().info("Applying RELU")
        for class_data in train:
            class_data.patches = class_data.patches.clip(min=0)
        for class_data in test:
            class_data.patches = class_data.patches.clip(min=0)
    if options.scale:
        get_logger().info("Applying standardization")
        scaler = StandardScaler(copy=False)
        scaler.fit(np.vstack([t.patches for t in train]))
        for class_data in train:
            class_data.patches = scaler.transform(class_data.patches)
        for class_data in test:
            class_data.patches = scaler.transform(class_data.patches)
    if options.pca:
        get_logger().info("Calculating PCA")
        pca = RandomizedPCA(n_components=options.pca)
        pca.fit(np.vstack([t.patches for t in train]))
        #for class_data in train:
            #get_logger().info("Fitting class " + class_data.name)
            #pca.partial_fit(class_data.patches)
        get_logger().info("Keeping " + str(pca.explained_variance_ratio_.sum()) + " variance (" + str(options.pca) +
             ") components\nApplying PCA")
        for class_data in train:
            class_data.patches = pca.transform(class_data.patches)
        for class_data in test:
            class_data.patches = pca.transform(class_data.patches)
    nbnn(train, test, NN_Engine())
Example #20
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in enumerate(sorted(set(labels),
                                                   key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_train == False], y[is_train == False]
    test_X = pca.transform(test_X)
    print pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted'])
Example #21
def test_explained_variance():
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 1)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, axis=0),
                              decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features-2,
                                     random_state=rng)[0]

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 5)
Example #22
def reduce_features(features, var_explained=0.9, n_components=0, verbose=False):
	"""
	Performs feature reduction using PCA. Automatically selects the number of components
	needed to explain at least var_explained of the variance.
	:param features: Features.
	:param var_explained: Minimal fraction of variance to explain.
	:param n_components: Number of components (0 = estimate automatically).
	:param verbose: Verbosity.
	:return: Reduced feature set.
	"""
	if n_components == 0:
		# Run full PCA to estimate nr. components for explaining given
		# percentage of variance.
		estimator = RandomizedPCA()
		estimator.fit_transform(features)
		variance = 0.0
		for i in range(len(estimator.explained_variance_ratio_)):
			variance += estimator.explained_variance_ratio_[i]
			if variance > var_explained:
				n_components = i + 1
				if verbose:
					print('{} % of variance explained using {} components'.format(var_explained, n_components))
				break
	# Re-run PCA with only estimated nr. components
	estimator = RandomizedPCA(n_components=n_components)
	features = estimator.fit_transform(features)
	return features
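
A usage sketch with invented features, letting the function estimate the number of components that explains 90% of the variance (RandomizedPCA assumed importable):

import numpy as np
# 150 samples x 40 features of made-up data.
features = np.random.rand(150, 40)
reduced = reduce_features(features, var_explained=0.9, verbose=True)
print(reduced.shape)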
Example #23
def SVM(X, y):

	X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO)
	print(len(X_train))

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
	n_components = 150
	pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)


	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	X_train_pca = equalize_hist(X_train_pca)
	preprocessing.scale(X_train_pca * 1.0, axis=1)
	X_test_pca = equalize_hist(X_test_pca)
	preprocessing.scale(X_test_pca * 1.0, axis=1)

    # classifier = svm.SVC(kernel='poly', degree = 3)
    # classifier.fit(X_train, y_train)
    # # print("======",3,"========")
    # print('TRAIN SCORE', classifier.score(X_train, y_train))
    # print('TEST SCORE', classifier.score(X_test, y_test))


	param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
	classifier2 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
	classifier2.fit(X_train_pca, y_train)
	# print("======",3,"========")
	print('TRAIN SCORE', classifier2.score(X_train_pca, y_train))
	print('TEST SCORE', classifier2.score(X_test_pca, y_test))
Example #24
def dimentionality_reduction(train_x , test_x):
	print "Dimentionality reduction to 10D on training and test data...."
	pca = RandomizedPCA(n_components=10)
	train_x = pca.fit_transform(train_x)
	test_x = pca.transform(test_x)
	print "Done."
	return train_x , test_x
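
A hypothetical call for the helper above (the snippet is Python 2 because of its print statements; the arrays are invented):

import numpy as np
# Compress 784-dimensional vectors (e.g. flattened 28x28 images) to 10 dimensions.
train_x = np.random.rand(500, 784)
test_x = np.random.rand(100, 784)
train_x, test_x = dimentionality_reduction(train_x, test_x)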
Example #25
File: PCA.py Project: himl/boson
def pca_estimator(data, targets, estimator, components_number=DEFAULT_COMPONENTS_NUMBER,
                  folds_number=DEFAULT_FOLDS_NUMBER):

    kf = KFold(len(targets), n_folds=folds_number)

    # 'scores' is numpy array. An index is a number of a fold. A value is a percent of right
    # predicted samples from a test.
    scores = np.zeros(folds_number)

    start = time()

    index = 0
    for train, test in kf:
        x_train, x_test, y_train, y_test = data[train], data[test], targets[train], targets[test]

        pca = RandomizedPCA(n_components=components_number, whiten=True).fit(x_train)
        x_train_pca = pca.transform(x_train)
        x_test_pca = pca.transform(x_test)

        clf = estimator.fit(x_train_pca, y_train)
        scores[index] = clf.score(x_test_pca, y_test)
        index += 1
        # print("Iteration %d from %d has done! Score: %f" % (index, folds_number,
        #                                                     scores[index - 1]))
    finish = time()

    return scores.mean(), scores.std() * 2, (finish - start)
Example #26
def scatter(data, labels=None, title=None, name=None):
    """2d PCA scatter plot with optional class info

    Return the pca model to be able to introspect the components or transform
    new data with the same model.
    """
    data = atleast2d_or_csr(data)

    if data.shape[1] == 2:
        # No need for a PCA:
        data_2d = data
    else:
        pca = RandomizedPCA(n_components=2)
        data_2d = pca.fit_transform(data)

    for i, c, m in zip(np.unique(labels), cycle(COLORS), cycle(MARKERS)):
        plt.scatter(data_2d[labels == i, 0], data_2d[labels == i, 1],
                    c=c, marker=m, label=i, alpha=0.5)

    plt.legend(loc='best')
    if title is None:
        title = "2D PCA scatter plot"
        if name is not None:
            title += " for " + name
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(title)

    return pca
Example #27
def LogisticRegressionPCA(X, y):

	# divide our data set into a training set and a test set
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    									X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

	# get randomized PCA model
	num_components = 147
	print("Extracting the top %d eigenfaces from %d faces"
          % (num_components, X_train.shape[0]))
	pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train)

    # use the PCA model on our training set and test set.
	print("Projecting the input data on the eigenfaces orthonormal basis")
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)
	print("done ")

	h = .02  # step size in the mesh

	logistic_regression = linear_model.LogisticRegression(C=1e5)

	# we create an instance of Neighbours Classifier and fit the data.
	logistic_regression.fit(X, y)

	# print the performance of logistic regression 
	print("====== Logistic Regression with PCA ========")
	print('TRAIN SCORE', logistic_regression.score(X_train, y_train))
	print('TEST SCORE', logistic_regression.score(X_test, y_test))
Example #28
def calc_hog(fpaths, save=False):
    '''
    Compute histogram of gradients (HOG). Saves in batches to prevent memory issues.
    Input:
        fpaths : files on which HOG will be computed
        save : if true, output is saved to disk
    '''

    hogs = np.empty((len(fpaths), 15876))

    for i, fpath in enumerate(fpaths):
        img = imread(os.path.join(imgdir, fpath))
        if len(img.shape)==3:
            img = rgb2gray(img)
        # rescale so all feature vectors are the same length
        img_resize = resize(img, (128, 128))
        img_hog = hog(img_resize)

        hogs[i, :] = img_hog

    hogs_sc = scale(hogs)
    n_components = 15
    pca = RandomizedPCA(n_components=n_components)
    hogs_decomp = pca.fit_transform(hogs_sc)

    df = pd.DataFrame(hogs_decomp, index=[os.path.split(i)[1] for i in fpaths])
    df.index.name='fpath'
    df.columns = ['feat_hog_%2.2u' % i for i in range(1, n_components+1)]
    if save: df.to_csv('hog.csv')
    
    return df
Example #29
    def pca_linear_initialization(self, data):
        """
        We initialize the map just by using the first two eigenvalues and eigenvectors.
        Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each direction.

        X = UsigmaWT
        XTX = Wsigma^2WT
        T = XW = Usigma

        // Transformed by the eigenvector matrix W; can also be computed by multiplying the PC matrix by the eigenvalues
        // Further, we can get lower ranks by using just a few of the eigenvectors

        T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors

        (*) Note that 'X' is the covariance matrix of original data

        :param data: data to use for the initialization
        :returns: initialized matrix with same dimension as input data
        """
        cols = self.mapsize[1]
        coord = None
        pca_components = None

        if np.min(self.mapsize) > 1:
            coord = np.zeros((self.nnodes, 2))
            pca_components = 2

            for i in range(0, self.nnodes):
                coord[i, 0] = int(i / cols)  # x
                coord[i, 1] = int(i % cols)  # y

        elif np.min(self.mapsize) == 1:
            coord = np.zeros((self.nnodes, 1))
            pca_components = 1

            for i in range(0, self.nnodes):
                coord[i, 0] = int(i % cols)  # y

        mx = np.max(coord, axis=0)
        mn = np.min(coord, axis=0)
        coord = (coord - mn)/(mx-mn)
        coord = (coord - .5)*2
        me = np.mean(data, 0)
        data = (data - me)
        tmp_matrix = np.tile(me, (self.nnodes, 1))

        pca = RandomizedPCA(n_components=pca_components)  # Randomized PCA is scalable
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T/norms)*eigval).T

        for j in range(self.nnodes):
            for i in range(eigvec.shape[0]):
                tmp_matrix[j, :] = tmp_matrix[j, :] + coord[j, i]*eigvec[i, :]

        self.matrix = np.around(tmp_matrix, decimals=6)
        self.initialized = True
Example #30
def rpca(train_X, test_X, n):
	start_time = time.time()
	pca = RandomizedPCA(n_components=n)
	pca.fit(train_X.toarray())
	train_X_pca = pca.transform(train_X.toarray())
	test_X_pca = pca.transform(test_X.toarray())
	print("--- %s seconds ---" % (time.time() - start_time))
	return pca, train_X_pca, test_X_pca
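
A usage sketch; the helper calls .toarray(), so it expects SciPy sparse input, and it also needs import time and the RandomizedPCA import, neither of which is shown:

import numpy as np
from scipy.sparse import csr_matrix

# Invented sparse-style data: 200 training and 50 test rows with 1000 features.
train_X = csr_matrix(np.random.rand(200, 1000))
test_X = csr_matrix(np.random.rand(50, 1000))
pca, train_X_pca, test_X_pca = rpca(train_X, test_X, n=20)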
Example #31
def SVM(X, y):
    print("SVM with PCA of rbf, writening all on, no normalize")
    preprocessing.normalize(X, 'max')
    #preprocessing.robust_scale(X, axis=1, with_centering = True) #bad
    X = equalize_hist(X)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    n_components = 120

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier13.fit(X_train_pca, y_train)

    print("====== PCA 120 ========")
    print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
    print('TEST SCORE', classifier13.score(X_test_pca, y_test))

    n_components = 130

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier13.fit(X_train_pca, y_train)

    print("====== PCA 130 ========")
    print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
    print('TEST SCORE', classifier13.score(X_test_pca, y_test))

    n_components = 147

    print("Extracting the top %d eigenfaces from %d faces" %
          (n_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                                param_grid)
    classifier13.fit(X_train_pca, y_train)

    print("====== PCA 147 ========")
    print('TRAIN SCORE', classifier13.score(X_train_pca, y_train))
    print('TEST SCORE', classifier13.score(X_test_pca, y_test))
    '''
Example #32
def test_SVM(face_profile_data, face_profile_name_index, face_dim, face_profile_names):
    """
    Testing: Build the SVM classification model using the face_profile_data matrix (numOfFace X numOfPixel) and the face_profile_name_index array; face_dim is a tuple (h, w) with the dimensions of each image. Returns the SVM classification model.
    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        The face image data, one flattened image per row

    face_profile_name_index : ndarray
        The name corresponding to each face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension (h, w) that the face data is reshaped to

    face_profile_names: ndarray
        The names corresponding to the face profiles
    Returns
    -------
    clf : sklearn.svm.SVC object
        The trained SVM classification model

    pca : sklearn decomposition object
        The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data

    """
    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 150 # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce; it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test) 

    # Train a SVM classification model

    print("\nFitting the classifier to the training set")
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    # Train_pca Test Error Rate:  0.0670016750419
    # Train_pca Test Recognition Rate:  0.932998324958



    # clf = SVC(kernel='linear', C=1)
    # 2452  samples from  38  people are loaded
    # Extracting the top 150 eigenfaces from 1839 faces
    # Extracting the top 150 eigenfaces from 1790 faces
    # Train_pca Test Error Rate:  0.0904522613065
    # Train_pca Test Recognition Rate:  0.909547738693

    # clf = SVC(kernel='poly')
    # Train_pca Test Error Rate:  0.201005025126
    # Train_pca Test Recognition Rate:  0.798994974874

    # clf = SVC(kernel='sigmoid')
    # Train_pca Test Error Rate:  0.985318107667
    # Train_pca Test Recognition Rate:  0.0146818923328
    

    # clf = SVC(kernel='rbf').fit(X_train, y_train)
    # Train_pca Test Error Rate:  0.0619765494137
    # Train_pca Test Recognition Rate:  0.938023450586



    # Best Estimator found using Radial Basis Function Kernal:
    clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
    # Train_pca with Alex Test Error Rate:  0.088424437299
    # Train_pca with Alex Test Recognition Rate:  0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %0.8f second per sample on average" % ((time() - t0)/y_pred.shape[0]*1.0))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print ("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print ("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    ###############################################################################
    # Testing

    # X_test_pic1 = X_test[0]
    # X_test_pic1_for_display = np.reshape(X_test_pic1, face_dim)

    # t0 = time()
    # pic1_pred_name = predict(clf, pca, X_test_pic1, face_profile_names)
    # print("\nPrediction took %0.3fs" % (time() - t0))
    # print "\nPredicated result for picture_1 name: ", pic1_pred_name
    # for i in range(1,3): print ("\n")

    # Display the picture
    # plt.figure(1)
    # plt.title(pic1_pred_name)
    # plt.subplot(111)
    # plt.imshow(X_test_pic1_for_display)
    # plt.show()


    ###############################################################################
    # Qualitative evaluation of the predictions using matplotlib
    # import matplotlib.pyplot as plt

    # def plot_gallery(images, titles, face_dim, n_row=3, n_col=4):
    #     """Helper function to plot a gallery of portraits"""
    #     plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    #     plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    #     for i in range(n_row * n_col):
    #         plt.subplot(n_row, n_col, i + 1)
    #         plt.imshow(images[i].reshape(face_dim), cmap=plt.cm.gray)
    #         plt.title(titles[i], size=12)
    #         plt.xticks(())
    #         plt.yticks(())


    # # plot the result of the prediction on a portion of the test set

    # def title(y_pred, y_test, face_profile_names, i):
    #     pred_name = face_profile_names[y_pred[i]].rsplit(' ', 1)[-1]
    #     true_name = face_profile_names[y_test[i]].rsplit(' ', 1)[-1]
    #     return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

    # prediction_titles = [title(y_pred, y_test, face_profile_names, i)
    #                      for i in range(y_pred.shape[0])]

    # plot_gallery(X_test, prediction_titles, face_dim)

    # # plot the gallery of the most significative eigenfaces

    # eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
    # plot_gallery(eigenfaces, eigenface_titles, face_dim)

    # plt.show()


    return clf, pca
Example #33
def build_SVC(face_profile_data, face_profile_name_index, face_dim):
    """
    Build the SVM classification model using the face_profile_data matrix (numOfFace X numOfPixel) and the face_profile_name_index array; face_dim is a tuple (h, w) with the dimensions of each image. Returns the SVM classification model.
    Parameters
    ----------
    face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image)
        The face image data, one flattened image per row

    face_profile_name_index : ndarray
        The name corresponding to each face profile is encoded in its index

    face_dim : tuple (int, int)
        The dimension (h, w) that the face data is reshaped to

    Returns
    -------
    clf : sklearn.svm.SVC object
        The trained SVM classification model

    pca : sklearn decomposition object
        The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data

    """

    X = face_profile_data
    y = face_profile_name_index

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 10 # maximum number of components to keep

    print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))

    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, face_dim[0], face_dim[1]))

    # This portion of the code is used if the data is scarce; it uses the number
    # of inputs as the number of features
    # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train)
    # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1]))

    print("\nProjecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test) 

    # Train a SVM classification model

    print("\nFitting the classifier to the training set")
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)

    # Best Estimator found using Radial Basis Function Kernal:
    clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
    # Train_pca with Alex Test Error Rate:  0.088424437299
    # Train_pca with Alex Test Recognition Rate:  0.911575562701

    clf = clf.fit(X_train_pca, y_train)
    # print("\nBest estimator found by grid search:")
    # print(clf.best_estimator_)

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set
    print("\nPredicting people's names on the test set")
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print("\nPrediction took %s per sample on average" % ((time() - t0)/y_pred.shape[0]*1.0))

    # print "predicated names: ", y_pred
    # print "actual names: ", y_test
    error_rate = errorRate(y_pred, y_test)
    print ("\nTest Error Rate: %0.4f %%" % (error_rate * 100))
    print ("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100))

    return clf, pca
Example #34
 mapping_targets = np.hstack(np.array(mapping_targets))

 first_half = np.hstack(Data[0:no_mappings/2,:,:,:])
 first_half = np.vstack(first_half)
 second_half = np.hstack(Data[no_mappings/2:no_mappings,:,:,:])
 second_half = np.vstack(second_half)
 Data = np.vstack([first_half,second_half])
 # for true targets uncomment next line
 targets = np.hstack([targets,targets])

 #for random targets uncomment next line
 #targets = np.random.randint(1,no_locations+1,no_mappings*no_locations*no_thwacks)


 lda = LDA(n_components=14)
 pca = RandomizedPCA(n_components = 125)
 classifier =  KNeighborsClassifier(8)
 proj = pca.fit_transform(Data)
 proj = lda.fit_transform(proj,targets)
 proj1 = pca.fit_transform(Data)
 proj1 = lda.fit_transform(proj1,mapping_targets)
 print(file)
 plt.clf()
 plt.scatter(proj[0:proj.shape[0]/2,0],proj[0:proj.shape[0]/2,1],c=targets[0:targets.shape[0]/2])
 plt.title(file.rsplit('_')[0]+'_'+file.rsplit('_')[1]+" Before "+file.rsplit('_')[2]+" injection")
 plt.colorbar()
 plt.ylabel("LD1")
 plt.xlabel("LD2")
 plt.savefig(file.rsplit('_')[0]+'_'+file.rsplit('_')[1]+" Before "+file.rsplit('_')[2]+file[-11:-4]+" injection.svg")
 plt.show()
 plt.clf()
Example #35
            color='w',
            zorder=10)
plt.title('Kmeans clustering on Pima dataset after ICA\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
file_name = 'Plots/Kmeans Pima after ICA ' + str(n_components) + '.png'
fig.savefig(file_name)
plt.close()

##############################################################################
# Visualize the results on RP-reduced data

reduced_data = RandomizedPCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="random", n_clusters=n_components, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
print(x_min, x_max, y_min, y_max)
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
fig = plt.figure()
plt.clf()
Example #36
    def pca_linear_initialization(self, data):
        """
        We initialize the map just by using the first two eigenvalues and eigenvectors.
        Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each direction.

        X = UsigmaWT
        XTX = Wsigma^2WT
        T = XW = Usigma

        // Transformed by the eigenvector matrix W; can also be computed by multiplying the PC matrix by the eigenvalues
        // Further, we can get lower ranks by using just a few of the eigenvectors

        T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors

        (*) Note that 'X' is the covariance matrix of original data

        :param data: data to use for the initialization
        :returns: initialized matrix with same dimension as input data
        """
        cols = self.mapsize[1]
        coord = None
        pca_components = None

        if np.min(self.mapsize) > 1:
            coord = np.zeros((self.nnodes, 2))
            pca_components = 2

            for i in range(0, self.nnodes):
                coord[i, 0] = int(i / cols)  # x
                coord[i, 1] = int(i % cols)  # y

        elif np.min(self.mapsize) == 1:
            coord = np.zeros((self.nnodes, 1))
            pca_components = 1

            for i in range(0, self.nnodes):
                coord[i, 0] = int(i % cols)  # y

        mx = np.max(coord, axis=0)
        mn = np.min(coord, axis=0)
        coord = (coord - mn) / (mx - mn)
        coord = (coord - .5) * 2
        me = np.mean(data, 0)
        data = (data - me)
        tmp_matrix = np.tile(me, (self.nnodes, 1))

        pca = RandomizedPCA(
            n_components=pca_components)  # Randomized PCA is scalable
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T / norms) * eigval).T

        for j in range(self.nnodes):
            for i in range(eigvec.shape[0]):
                tmp_matrix[
                    j, :] = tmp_matrix[j, :] + coord[j, i] * eigvec[i, :]

        self.matrix = np.around(tmp_matrix, decimals=6)
        self.initialized = True
Example #37
def lininit(self):
    #X = UsigmaWT
    #XTX = Wsigma^2WT
    #T = XW = Usigma #Transformed by the eigenvector matrix W; can also be computed by
    #multiplying the PC matrix by the eigenvalues
    #Further, we can get lower ranks by using just a few of the eigenvectors
    #T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors
    # This is how we initialize the map, just by using the first two eigenvalues and eigenvectors
    # Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each
    # direction of the SOM map
    # it should be noted that here, X is the covariance matrix of original data

    msize =  getattr(self, 'mapsize')
    rows = msize[0]
    cols = msize[1]
    nnodes = getattr(self, 'nnodes')

    if np.min(msize)>1:
        # set coordinates of the nodes in the grid (row, col)
        coord = np.zeros((nnodes, 2))
        for i in range(0, nnodes):
            coord[i,0] = int(i / cols) #x
            coord[i,1] = int(i % cols) #y
        mx = np.max(coord, axis = 0)
        mn = np.min(coord, axis = 0)
        # normalize the coordinates between [-1,1]
        coord = (coord - mn) / (mx - mn)
        coord = (coord - .5) * 2
        # for each column, shift data around its mean
        data = getattr(self, 'data')
        me = np.mean(data, 0)
        data = (data - me)
        # initialize codebook as a matrix dim * nnodes
        codebook = np.tile(me, (nnodes, 1))

        # pca
        pca = RandomizedPCA(n_components=2) #Randomized PCA is scalable
        #pca = PCA(n_components=2)
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        # compute the norms of the eigenvectors, normalize and multiply by eigenvalue
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T / norms) * eigval).T

        # add the normalized eigenvector
        for j in range(nnodes):
            for i in range(eigvec.shape[0]):
                codebook[j,:] += coord[j,i] * eigvec[i,:]
        return np.around(codebook, decimals = 6)

    elif np.min(msize) == 1:
        coord = np.arange(nnodes)[:, np.newaxis]
        mx = np.max(coord, axis = 0)
        mn = np.min(coord, axis = 0)

        # normalize the coordinates between [-1,1]
        coord = (coord - mn) / (mx - mn)
        coord = (coord - .5) * 2
        # for each column, shift data around its mean
        data = getattr(self, 'data')
        me = np.mean(data, 0)
        data = (data - me)
        # initialize codebook as a matrix dim * nnodes
        codebook = np.tile(me, (nnodes,1))

        # pca
        pca = RandomizedPCA(n_components=1) #Randomized PCA is scalable
        pca.fit(data)
        eigvec = pca.components_
        eigval = pca.explained_variance_
        # compute the norms of the eigenvectors, normalize and multiply by eigenvalue
        norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec))
        eigvec = ((eigvec.T/norms)*eigval).T; eigvec.shape

        # add the normalized eigenvector
        for j in range(nnodes):
            for i in range(eigvec.shape[0]):
                codebook[j,:] += coord[j,i]*eigvec[i,:]
        return np.around(codebook, decimals = 6)
"""

from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA as RandomizedPCA
import matplotlib.pyplot as plt

faces = fetch_lfw_people(min_faces_per_person=60)

print(faces.target_names)
print(faces.images.shape)
n_samples, h, w = faces.images.shape
print(n_samples)

n_components = 150
pca = RandomizedPCA(
    n_components=n_components,
    svd_solver='randomized')  ##Randomized PCA for the first 150 components
pca.fit(faces.data)

print(pca.components_)  ##These are the first 150 Principal Components

pcacomponents25 = pca.components_[0:25]  ##First 25 Principal Components

eigenfaces = pca.components_.reshape(
    (n_components, h, w))  ##Eigenfaces for 150 PCs

eigenfaces25 = pcacomponents25.reshape(
    (25, h, w))  ##Eigenfaces for first 25 PCs

## Plotting EigenFaces for First 25 PCs
fig, axes = plt.subplots(3,
Example #39
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

from sklearn.datasets import fetch_lfw_people

faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)

from sklearn.decomposition import RandomizedPCA

pca = RandomizedPCA(150)
pca.fit(faces.data)

fig, axes = plt.subplots(3,
                         8,
                         figsize=(9, 4),
                         subplot_kw={
                             'xticks': [],
                             'yticks': []
                         },
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')
plt.show()

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
Example #40
    img = img_to_matrix(image)
    img = flatten_image(img)
    data.append(img)
    
data = np.array(data)

is_train = np.random.uniform(0, 1, len(data)) <= 0.7
y = np.where(np.array(labels)==query, 1, 0)

train_x, train_y = data[is_train], y[is_train]
test_x, test_y = data[is_train==False], y[is_train==False]

#add input to specify number of components to determine
UniqueImageComponents = int(raw_input("How many unique features are needed to distinguish between your image types? Choose a number between 2 and 6. "))

pca = RandomizedPCA(n_components=UniqueImageComponents)
X = pca.fit_transform(data)


make_plot(pd)
pl.show()

train_x = pca.fit_transform(train_x)
test_x = pca.transform(test_x)
print "Training and Test sets are created."
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
print "Running your machine learning model on the test set."
knn.fit(train_x, train_y)
result = knn.predict(test_x)
Example #41
def find_and(n_components, plot):
    print "Extracting the top %d eigenfaces from %d faces" % (n_components,
                                                              X_train.shape[0])
    t0 = time()
    pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
    print "done in %0.3fs" % (time() - t0)

    eigenfaces = pca.components_.reshape((n_components, h, w))

    # print 'components', pca.explained_variance_ratio_, pca.components_,
    print 'components1', pca.explained_variance_ratio_[0]
    print 'components1', pca.explained_variance_ratio_[1]

    print "Projecting the input data on the eigenfaces orthonormal basis"
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print "done in %0.3fs" % (time() - t0)

    ###############################################################################
    # Train a SVM classification model

    print "Fitting the classifier to the training set"
    t0 = time()
    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
    }
    # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    print "done in %0.3fs" % (time() - t0)
    print "Best estimator found by grid search:"
    print clf.best_estimator_

    ###############################################################################
    # Quantitative evaluation of the model quality on the test set

    print "Predicting the people names on the testing set"
    t0 = time()
    y_pred = clf.predict(X_test_pca)
    print "done in %0.3fs" % (time() - t0)

    print classification_report(y_test, y_pred, target_names=target_names)
    print confusion_matrix(y_test, y_pred, labels=range(n_classes))

    prediction_titles = [
        title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0])
    ]

    if plot == True:
        plot_gallery(X_test, prediction_titles, h, w)

        # plot the gallery of the most significant eigenfaces

        eigenface_titles = [
            "eigenface %d" % i for i in range(eigenfaces.shape[0])
        ]
        plot_gallery(eigenfaces, eigenface_titles, h, w)

        pl.show()
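# Hedged usage sketch (not in the original): the helper assumes that X_train,
# X_test, y_train, y_test, h, w, target_names and n_classes already exist as
# globals, so a call would simply be:
# find_and(n_components=150, plot=True)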
Exemplo n.º 42
0
random_pca_data_50 = normalization('gene_IndividualsArr.pkl',
                                   'top10Genes_Indiv.pkl')
random_pca_data_25 = normalization('gene_IndividualsArr.pkl',
                                   'top10Genes_Indiv.pkl')
random_pca_data_10 = normalization('gene_IndividualsArr.pkl',
                                   'top10Genes_Indiv.pkl')
pca_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl')
sparse_pca_data = normalization('gene_IndividualsArr.pkl',
                                'top10Genes_Indiv.pkl')
kernel_pca_data = normalization('gene_IndividualsArr.pkl',
                                'top10Genes_Indiv.pkl')
fast_ica_data = normalization('gene_IndividualsArr.pkl',
                              'top10Genes_Indiv.pkl')
nmf_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl')

random_pca_50 = RandomizedPCA(n_components=50)
random_pca_model_50 = random_pca_50.fit(random_pca_data_50)
random_X_new_50 = random_pca_50.fit_transform(X)
print 'random_pca_50 explained', random_pca_50.explained_variance_ratio_
print 'random_pca_50 explained sum', sum(
    random_pca_50.explained_variance_ratio_)
joblib.dump(random_pca_model_50, 'random_pca_model_50.pkl')
joblib.dump(random_pca_50.explained_variance_ratio_,
            'random_pca_50.explained_variance_ratio_.pkl')
joblib.dump(random_X_new_50, 'random_X_new_50.pkl')

random_pca_25 = RandomizedPCA(n_components=25)
random_pca_model_25 = random_pca_25.fit(random_pca_data_25)
random_X_new_25 = random_pca_25.fit_transform(X)
print 'random_pca_25 explained', random_pca_25.explained_variance_ratio_
print 'random_pca_25 explained sum', sum(
    random_pca_25.explained_variance_ratio_)
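# The snippet is cut off here; a hedged continuation mirroring the
# 50-component block above:
joblib.dump(random_pca_model_25, 'random_pca_model_25.pkl')
joblib.dump(random_pca_25.explained_variance_ratio_,
            'random_pca_25.explained_variance_ratio_.pkl')
joblib.dump(random_X_new_25, 'random_X_new_25.pkl')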
Exemplo n.º 43
0
print "n_features: %d" % n_features
print "n_classes: %d" % n_classes


###############################################################################
# Split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 250

print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.reshape((n_components, h, w))

print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)


###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
Exemplo n.º 44
0
print "n_features: %d" % n_features
print "n_classes: %d" % n_classes


###############################################################################
# Split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

###############################################################################
# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
t0 = time()
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
print "done in %0.3fs" % (time() - t0)

eigenfaces = pca.components_.reshape((n_components, h, w))

print "pca component variance ", pca.explained_variance_[:2]

print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)


###############################################################################
# Train a SVM classification model
Exemplo n.º 45
0
# Encode the dependent variable
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
Y_Results = np.array(['0=Block', '1=NB-No_Block', '2=NB-Wait', '3=No_Block'])
print(Y_Results)
print(Y[:10])

# Part B:  Run Random Component Analysis (RCA) algorithm

# Scale the independent variables
sc = StandardScaler()
X = sc.fit_transform(X)

# Apply RCA to the independent variables
rca = RCA(random_state=1)
X_new = rca.fit_transform(X)
var = rca.explained_variance_ratio_
print(pd.DataFrame(var[:10]))

# Part C:  Use dimensionally reduced dataset to cluster

# Using the elbow method to find the optimal number of clusters
wcss = []
for i in range(1, 15):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 1)
    kmeans.fit(X_new)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 15), wcss)
plt.title('Finding the Best K:  The Elbow Method')
plt.xlabel('Number of clusters')
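# The snippet is cut off at this point; a minimal, hedged completion of the
# elbow plot plus the final clustering step (choosing 4 clusters here is an
# illustrative assumption, not taken from the original):
plt.ylabel('WCSS')
plt.show()
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=1)
cluster_labels = kmeans.fit_predict(X_new)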
"""
Use PCA to reconstruct some of the MNIST test digits.
"""

# My libraries
import mnist_loader

# Third-party libraries
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import RandomizedPCA

# Training
training_data, test_inputs, actual_test_results = mnist_loader.load_data_nn()
pca = RandomizedPCA(n_components=30)
nn_images = [x for (x, y) in training_data]
pca_images = np.concatenate(nn_images, axis=1).transpose()
pca_r = pca.fit(pca_images)

# Try PCA on first ten test images
test_images = np.array(test_inputs[:10]).reshape((10, 784))
test_outputs = pca_r.inverse_transform(pca_r.transform(test_images))

# Plot the first ten test images and the corresponding outputs
fig = plt.figure()
ax = fig.add_subplot(111)
images_in = [test_inputs[j].reshape(-1, 28) for j in range(10)]
images_out = [test_outputs[j].reshape(-1, 28) for j in range(10)]
image_in = np.concatenate(images_in, axis=1)
image_out = np.concatenate(images_out, axis=1)
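# The plotting call is cut off in this snippet; a minimal, hedged sketch of how
# the concatenated input/output strips might be displayed one above the other:
ax.imshow(np.concatenate([image_in, image_out], axis=0), cmap='gray')
plt.axis('off')
plt.show()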
plot_digits(filtered)
show()

#Example Eigenfaces

#get our face data
from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)

#we will use RandomizedPCA since this is a large dataset
#we will reduce from near 3000 to 150 components

from sklearn.decomposition import PCA as RandomizedPCA
pca = RandomizedPCA(150)
pca.fit(faces.data)

fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')

show()
#let's check the cumulative variance
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
#150 components turn out to capture around 90% of the variance
#let's compare the reconstruction to the full data
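# The comparison itself is cut off here; a hedged sketch of what "compare to
# the full data" could look like: project onto the 150 components, reconstruct,
# and plot originals against reconstructions (the layout is an assumption).
components = pca.transform(faces.data)
projected = pca.inverse_transform(components)
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={'xticks': [], 'yticks': []})
for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r')
    ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r')
ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')
plt.show()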
## Do the tensor factorization
np.random.seed(seed)
M, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=10)
M.normalize_sort(1)
# zero out the small factors
for n in range(1,2):
    zeroIdx = np.where(M.U[n] < zeroThr)
    M.U[n][zeroIdx] = 0
klp = KLProjection.KLProjection(M.U, M.R)
ptfFeat = klp.projectSlice(X, 0)
ptfMatrix = khatrirao.khatrirao(M.U[1], M.U[2])
dbOutput = getDBEntry("CP-APR", ptfMatrix)

## now we want to do PCA and NMF as well
flatX = sptenmat.sptenmat(X, [0]).tocsrmat()  # matricize along the first mode
pcaModel = RandomizedPCA(n_components=R)
pcaModel.fit(flatX[train, :])
pcaFeat = pcaModel.transform(flatX)
pcaBasis = pcaModel.components_
dbOutput = np.vstack((dbOutput, getDBEntry("PCA", pcaBasis)))

nmfModel = nimfa.mf(flatX[train,:], method="nmf", max_iter=outerIter, rank=R)
nmfResult = nimfa.mf_run(nmfModel)
nmfFeat = nmfTransform(R, nmfResult, flatX)
## get the basis to be stored off
nmfBasis = nmfResult.coef().transpose()
nmfBasis = preprocessing.normalize(nmfBasis, norm="l1", axis=0)
nmfBasis = nmfBasis.toarray()
zeroIdx = np.where(nmfBasis < zeroThr*zeroThr)
nmfBasis[zeroIdx] = 0
dbOutput = np.vstack((dbOutput, getDBEntry("NMF", nmfBasis)))
Exemplo n.º 49
0
print(faces.target_names)
print(faces.images.shape)


fig, ax = plt.subplots(3, 5)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[],
            xlabel=faces.target_names[faces.target[i]])


from sklearn.svm import SVC
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import make_pipeline

pca = RandomizedPCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)


from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=42)


from sklearn.grid_search import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

get_ipython().run_line_magic("time", " grid.fit(Xtrain, ytrain)")
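# Hedged follow-up sketch (not part of the original cell): report the chosen
# hyper-parameters and score the best estimator on the held-out split.
print(grid.best_params_)
best_model = grid.best_estimator_
yfit = best_model.predict(Xtest)

from sklearn.metrics import classification_report
print(classification_report(ytest, yfit, target_names=faces.target_names))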
Exemplo n.º 50
0
 def fit_deprecated(X):
     global Y
     rpca = RandomizedPCA(random_state=0)
     Y = rpca.fit_transform(X)
Exemplo n.º 51
0
                cells_per_block=self.cells_per_block,
            )
            result.append(features)
        return np.array(result)


MODELS = {
    'linearsvc': (
        LinearSVC(),
        {
            'C': [0.01, 0.1, 1.0]
        },
    ),
    'linearsvc-pca': (
        Pipeline([
            ('pca', RandomizedPCA(n_components=100, whiten=True)),
            ('clf', LinearSVC(C=1.0)),
        ]),
        {
            'pca__n_components': [10, 30, 100],
            'clf__C': [0.01, 0.1, 1.0]
        },
    ),
    'linearsvc-hog': (
        Pipeline([
            ('hog',
             HOGFeatures(
                 orientations=8,
                 pixels_per_cell=(4, 4),
                 cells_per_block=(3, 3),
             )),
Exemplo n.º 52
0
plot_digits(noisy)

pca = PCA(.5).fit(noisy)
pca.n_components_

components = pca.transform(noisy)
filtered = pca.inverse_transform(components)
plot_digits(filtered)

from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)

from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(150)
pca.fit(faces.data)

# print eigenfaces
fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw = {'xticks' : [], 'yticks' : []},
                         gridspec_kw = dict(hspace=.1, wspace=.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62,47), cmap='bone')


plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

pca = RandomizedPCA(150).fit(faces.data)
Exemplo n.º 53
0
test_files_count = data['test_files_count']
validationOriginalImage = data['validationOriginalImage']
#print valid_files
print train_data.shape
print valid_data.shape
# record time used for training
start = time.clock()
for i in range(0, len(superpixels)):
    print np.max(superpixels[i][0])
# Preprocessing normalize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)

# Preprocessing: RandomizedPCA
pca = RandomizedPCA(n_components=15)
pca.fit(train_data)
#train_data = pca.transform(train_data)
print train_data.shape

# set classifier and fit data
clf = chooseClassification('RF')
clf = clf.fit(train_data, train_labels.ravel())
#scores = cross_val_score(clf, train_data, train_label)
#scores.mean()

# benchmark using validation data
valid_data = scaler.transform(valid_data)
#valid_data = pca.transform(valid_data)
#print clf.predict_proba(valid_data[0])
#wait = input("PRESS ENTER TO CONTINUE.")
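# Hedged completion sketch (not part of the original snippet): score the fitted
# classifier on the validation split; valid_labels is assumed to exist
# alongside valid_data, mirroring train_labels above.
from sklearn.metrics import accuracy_score
valid_pred = clf.predict(valid_data)
print accuracy_score(valid_labels.ravel(), valid_pred)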
Exemplo n.º 54
0
import input_data_svm
datasets = input_data_svm.read_data_sets()

X = np.vstack((datasets.train_set.inputs(), datasets.validation_set.inputs()))
y = np.hstack((datasets.train_set.targets(), datasets.validation_set.targets()))

# X = datasets.train_set.inputs()
# y = datasets.train_set.targets()


X = X[:]
y = y[:]

# Reduce the dimensionality of the dataset
print("Applying PCA to reduce dimensions")
pca = RandomizedPCA(n_components=PCA_COMPONENTS, whiten=True).fit(X)
# eigenfaces = pca.components_.reshape((PCA_COMPONENTS, h, w))
X = pca.transform(X)
print("Finished PCA preprocessing")

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

##############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.
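# Hedged sketch of the logarithmic grid described in the comment above; the
# ranges and the GridSearchCV/SVC calls are assumptions, not code recovered
# from the original file.
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
param_grid = dict(gamma=gamma_range, C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X, y)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))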
Exemplo n.º 55
0
import numpy as np

from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result using PCA
pca = RandomizedPCA(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
fig = pl.figure(figsize=(9, 8))

ax = pl.subplot(221)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
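# The figure is cut off here; a hedged sketch of the companion panel that
# plots the PCA-reduced view of the tree-embedded data:
ax.set_title("Original Data (2d)")
ax = pl.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50)
ax.set_title("PCA reduction (2d) of transformed data")
pl.show()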
Exemplo n.º 56
0
    discretizer3=discretizer(3),
    discretizer10=discretizer(10),
    kmeans3=clusterizer(MiniBatchKMeans(3)),
    kmeans10=clusterizer(MiniBatchKMeans(10)),
    kmeans_gap=clusterizer(FitClusterer(min_clusters=3)),
    ward3=clusterizer(Ward(3)),
    ward10=clusterizer(Ward(10)),
    meanshift=clusterizer(MeanShift()),
    # spectral3=clusterizer(SpectralClustering(3)),  # FIXME
    # spectral10=clusterizer(SpectralClustering(10)), # FIXME
    affinity_prop=clusterizer(AffinityPropagation()),
    dbscan=clusterizer(DBSCAN()),
)

BINARY_TO_NUMERICAL = dict(identity=identity, )

BINARY_TO_CATEGORICAL = dict(identity=identity, )

CATEGORICAL_TO_NUMERICAL = dict(
    noop=identity,
    binarize=binary_transformer(),
    pca1=binary_transformer(RandomizedPCA(1)),
    # ica1=binary_transformer(FastICA(1)),  # FIXME
    median_ordinal_pred=discrete_ordinal_predictor("median"),
    mean_ordinal_pred=discrete_ordinal_predictor("mean"),
    max_ordinal_pred=discrete_ordinal_predictor("max"),
    min_ordinal_pred=discrete_ordinal_predictor("min"),
)

CATEGORICAL_TO_CATEGORICAL = dict(identity=identity, )
Exemplo n.º 57
0
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est",
                           KNeighborsClassifier(n_neighbors=5,
                                                weights="uniform"))]),
        FunctionTransformer(lambda X: X)), RandomizedPCA(iterated_power=1),
    RandomForestClassifier(n_estimators=500))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features_scaled, labels, test_size=0.25, random_state=42)
##features_train_pca = pca.transform(features_train)
##features_test_pca = pca.transform(features_test)

#from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.grid_search import GridSearchCV

estimator_svm = [('reduce_dim', RandomizedPCA()),
                 ('clf_svm',
                  SVC(kernel='rbf', class_weight='balanced', gamma=0.1,
                      C=1000))]
estimator_tree = [('reduce_dim', RandomizedPCA()),
                  ('clf_tree',
                   DecisionTreeClassifier(criterion='entropy',
                                          max_features='sqrt',
                                          splitter='best'))]
#estimator_knn = [('reduce_dim', PCA()), ('clf_knn', KNeighborsClassifier())]
#estimator_rf = [('reduce_dim', PCA()), ('clf_rf', RandomForestClassifier())]
#estimator_ab = [('reduce_dim', PCA()), ('clf_ab', AdaBoostClassifier())]

#param_svm = {
#             'kernel': ['linear', 'poly', 'rbf'],
#             'C': [10, 100, 1000, 10000],
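# Hedged sketch (not in the original): the step lists above are already in the
# form Pipeline expects, so they could be wrapped and fitted like this.
from sklearn.pipeline import Pipeline
pipe_svm = Pipeline(estimator_svm)
pipe_svm.fit(features_train, labels_train)
print(pipe_svm.score(features_test, labels_test))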
Exemplo n.º 59
0
# Create an array X with the flattened images
# and an array y with the ID of the person in each image
X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
names = []

# Populate training array with flattened images from subfolders of train_faces and names
c = 0
for x, folder in enumerate(folders):
    train_faces = glob.glob(folder + '/*')
    for i, face in enumerate(train_faces):
        X[c, :] = prepare_image(face)
        names.append(ID_from_filename(face))
        c = c + 1

# perform principal component analysis on the images
pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
X_pca = pca.transform(X)

#'''
while (True):
    _, frame = cap.read()
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Detect faces in the image
    faces = faceCascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    for (x, y, w, h) in faces:
        if len(faces) == 1:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
            gray = gray[y:y + h, x:x + w]
            s = cv2.resize(gray, (92, 112))
            Snap(s)
Exemplo n.º 60
0
def doPCA(data, dimensions=2):
    from sklearn.decomposition import RandomizedPCA
    model = RandomizedPCA(n_components=dimensions)
    model.fit(data)
    return model
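# Hedged usage sketch (the variable name data is a stand-in for whatever
# feature matrix the caller has):
pca = doPCA(data)
print(pca.explained_variance_ratio_)
first_pc = pca.components_[0]
second_pc = pca.components_[1]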