Exemplo n.º 1
def build_classifier(train_data_x_in, train_data_y, classifier_in="svc_basic"):
    """Build and fit the classifier selected by ``classifier_in``.

    Args:
        train_data_x_in: Training feature matrix.
        train_data_y: Training targets handed to ``classifier.fit``.
        classifier_in: Selector string. Recognised values are
            "svc_basic", "svc_extensive", "kneighbors_basic",
            "bagging_basic" and "spectral_basic". Any other value falls
            back to a default ``svm.SVC``.

    Returns:
        Tuple ``(classifier, train_data_x, transformer)``: the fitted
        classifier, the (possibly transformed) training matrix it was
        fitted on, and the fitted transformer, or ``""`` when the selected
        option applies no transformation.
    """
    print("Attempting to build classifier.")
    train_data_x = train_data_x_in
    # The empty string is kept as the "no transformer" marker so callers
    # that compare against "" keep working.
    transformer = ""
    if classifier_in == "svc_basic":
        classifier = svm.SVC()
        print("Selection was basic svm.SVC.")
    elif classifier_in == "svc_extensive":
        classifier = svm.SVC(kernel="linear", C=0.025, gamma=0.01)
        print("Selection was extensive svm.SVC, with linear kernel, C==0.025 and gamma==0.01.")
    elif classifier_in == "kneighbors_basic":
        transformer = RandomizedPCA(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print("Selection was KNeighbors basic, using RandomizedPCA to transform data first. n_components==2000.")
    elif classifier_in == "bagging_basic":
        classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
        print("Selection was Bagging basic, with max_samples==0.5 and max_features==0.5.")
    elif classifier_in == "spectral_basic":
        transformer = SpectralEmbedding(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        # NOTE(review): the log message mentions svm.SVC but a
        # KNeighborsClassifier is built here -- confirm which is intended.
        classifier = KNeighborsClassifier()
        print("Selection was Spectral basic, using svm.SVC with Spectral data fitting. n_components==2000.")
    else:
        # Default to SVC in case of any sort of parsing error. Without this
        # branch an unknown selector left `classifier` undefined and the
        # spectral option was silently replaced by SVC.
        print("Error in selecting classifier class. Reverting to SVC.")
        classifier = svm.SVC()
    classifier.fit(train_data_x, train_data_y)
    print("Doing classifier estimation.")
    return classifier, train_data_x, transformer
Exemplo n.º 2
def reduce_features(features, var_explained=0.9, n_components=0, verbose=False):
    """Reduce ``features`` with PCA.

    When ``n_components`` is 0 the number of components is chosen
    automatically as the smallest count whose cumulative explained
    variance ratio exceeds ``var_explained``.

    :param features: Features.
    :param var_explained: Minimal variance explained.
    :param n_components: Nr. of components (0 selects it automatically).
    :param verbose: Verbosity.
    :return: Reduced feature set.
    """
    if n_components == 0:
        # Run full PCA to estimate nr. components for explaining given
        # percentage of variance. The estimator has to be fitted before
        # explained_variance_ratio_ exists.
        estimator = RandomizedPCA()
        estimator.fit(features)
        ratios = estimator.explained_variance_ratio_
        variance = 0.0
        for i, ratio in enumerate(ratios):
            variance += ratio
            if variance > var_explained:
                n_components = i + 1
                if verbose:
                    print('{} % of variance explained using {} components'.
                          format(var_explained, n_components))
                # Stop at the first count that is sufficient; continuing
                # would always end up selecting every component.
                break
        else:
            # Threshold never exceeded: keep every available component.
            n_components = len(ratios)
    # Re-run PCA with only estimated nr. components
    estimator = RandomizedPCA(n_components=n_components)
    features = estimator.fit_transform(features)
    return features
Exemplo n.º 3
def test_feature_union_weights():
    """Check FeatureUnion output when ``transformer_weights`` is given."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    reducer = RandomizedPCA(n_components=2, random_state=0)
    selector = SelectKBest(k=1)

    # Path 1: fit() followed by transform().
    union = FeatureUnion([("pca", reducer), ("select", selector)],
                         transformer_weights={"pca": 10})
    union.fit(X, y)
    via_fit_then_transform = union.transform(X)

    # Path 2: a single fit_transform() call.
    union = FeatureUnion([("pca", reducer), ("select", selector)],
                         transformer_weights={"pca": 10})
    via_fit_transform = union.fit_transform(X, y)

    # Path 3: a member transformer that does not provide fit_transform.
    union = FeatureUnion(
        [("mock", TransfT()), ("pca", reducer), ("select", selector)],
        transformer_weights={"mock": 10})
    with_mock = union.fit_transform(X, y)

    # Expected values are recomputed from the stand-alone transformers; the
    # PCA columns must come out scaled by the weight of 10.
    assert_array_almost_equal(via_fit_then_transform[:, :-1],
                              10 * reducer.fit_transform(X))
    assert_array_equal(via_fit_then_transform[:, -1],
                       selector.fit_transform(X, y).ravel())
    assert_array_almost_equal(via_fit_transform[:, :-1],
                              10 * reducer.fit_transform(X))
    assert_array_equal(via_fit_transform[:, -1],
                       selector.fit_transform(X, y).ravel())
    assert_equal(with_mock.shape, (X.shape[0], 7))
def pca(imageData=[]):
    """Plot a 2-D PCA projection of the images and fit a KNN on 5 components.

    Args:
        imageData: Sequence/array with one flattened image per row. The
            default is never mutated; it is converted with ``np.asarray``.

    Returns:
        Always 0.
    """
    imageData = np.asarray(imageData)
    labels = ["shoe", "shirt"]
    is_train = np.random.uniform(0, 1, len(imageData)) <= 0.7
    # NOTE(review): `y` is derived from the two-element `labels` list, so it
    # only lines up with `imageData` when exactly two images are passed --
    # presumably a per-image label list was intended; confirm with callers.
    y = np.where(np.array(labels) == "shirt", 1, 0)

    # Targets come from `y`; the original assigned the images themselves.
    train_x, train_y = imageData[is_train], y[is_train]
    test_x, test_y = imageData[~is_train], y[~is_train]

    pca_2d = RandomizedPCA(n_components=2)
    X = pca_2d.fit_transform(imageData)
    df = pd.DataFrame({
        "x": X[:, 0],
        "y": X[:, 1],
        # y == 1 encodes "shirt" (see above); the original mapping was inverted.
        "label": np.where(y == 1, "shirt", "shoe"),
    })
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)

    # Fit the reduction on the training split only and reuse it for the test
    # split.
    pca2 = RandomizedPCA(n_components=5)
    train_x = pca2.fit_transform(train_x)
    test_x = pca2.transform(test_x)

    print(train_x[:5])
    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    return 0
Exemplo n.º 5
def reduce_features(features, var_explained=0.9, n_components=0, verbose=False):
    """Reduce ``features`` with PCA.

    When ``n_components`` is 0 the number of components is chosen
    automatically as the smallest count whose cumulative explained
    variance ratio exceeds ``var_explained``.

    :param features: Features.
    :param var_explained: Minimal variance explained.
    :param n_components: Nr. of components (0 selects it automatically).
    :param verbose: Verbosity.
    :return: Reduced feature set.
    """
    if n_components == 0:
        # Run full PCA to estimate nr. components for explaining given
        # percentage of variance. The estimator has to be fitted before
        # explained_variance_ratio_ exists.
        estimator = RandomizedPCA()
        estimator.fit(features)
        ratios = estimator.explained_variance_ratio_
        variance = 0.0
        for i, ratio in enumerate(ratios):
            variance += ratio
            if variance > var_explained:
                n_components = i + 1
                if verbose:
                    print('{} % of variance explained using {} components'.format(var_explained, n_components))
                # Stop at the first sufficient count; without the break the
                # loop always ended on the maximum number of components.
                break
        else:
            # Threshold never exceeded: keep every available component.
            n_components = len(ratios)
    # Re-run PCA with only estimated nr. components
    estimator = RandomizedPCA(n_components=n_components)
    features = estimator.fit_transform(features)
    return features
Exemplo n.º 6
def test_feature_union_weights():
    """Check FeatureUnion output when ``transformer_weights`` is given."""
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    reducer = RandomizedPCA(n_components=2, random_state=0)
    selector = SelectKBest(k=1)

    # Path 1: fit() followed by transform().
    union = FeatureUnion([("pca", reducer), ("select", selector)],
                         transformer_weights={"pca": 10})
    union.fit(X, y)
    via_fit_then_transform = union.transform(X)

    # Path 2: a single fit_transform() call.
    union = FeatureUnion([("pca", reducer), ("select", selector)],
                         transformer_weights={"pca": 10})
    via_fit_transform = union.fit_transform(X, y)

    # Path 3: a member transformer that does not provide fit_transform.
    union = FeatureUnion(
        [("mock", TransfT()), ("pca", reducer), ("select", selector)],
        transformer_weights={"mock": 10})
    with_mock = union.fit_transform(X, y)

    # Expected values are recomputed from the stand-alone transformers; the
    # PCA columns must come out scaled by the weight of 10.
    assert_array_almost_equal(via_fit_then_transform[:, :-1],
                              10 * reducer.fit_transform(X))
    assert_array_equal(via_fit_then_transform[:, -1],
                       selector.fit_transform(X, y).ravel())
    assert_array_almost_equal(via_fit_transform[:, :-1],
                              10 * reducer.fit_transform(X))
    assert_array_equal(via_fit_transform[:, -1],
                       selector.fit_transform(X, y).ravel())
    assert_equal(with_mock.shape, (X.shape[0], 7))
Exemplo n.º 7
def pcaAndPlot(X, x_to_centroids, centroids, no_dims = 2):
    """Scatter-plot the samples and their centroids in a shared PCA space.

    Args:
        X: Sample matrix, one sample per row.
        x_to_centroids: Per-sample values used as the colour of each point.
        centroids: Centroid matrix with the same feature columns as ``X``.
        no_dims: Number of PCA components to compute; the first two are
            plotted.
    """
    pca = RandomizedPCA(n_components=no_dims)
    x_trans = pca.fit_transform(X)
    # The builtin int replaces the np.int alias, which NumPy has removed.
    x_sizes = np.full(x_trans.shape[0], 30, dtype=int)
    plt.scatter(x_trans[:, 0], x_trans[:, 1], s=x_sizes, c=x_to_centroids)
    # Project the centroids with the model fitted on X. Re-fitting on the
    # centroids would place them in a different coordinate system than the
    # samples they are drawn over.
    centroids_trans = pca.transform(centroids)
    centroids_col = np.arange(centroids.shape[0])
    centroids_sizes = np.full(centroids.shape[0], 70, dtype=int)
    plt.scatter(centroids_trans[:, 0], centroids_trans[:, 1], s=centroids_sizes, c=centroids_col)
Exemplo n.º 8
def principal_component_analysis(x):
    """Return component indices ordered by decreasing explained variance.

    Args:
        x: 2-D data matrix; ``x.shape[1]`` components are computed.

    Returns:
        Index array that sorts the fitted components' ``explained_variance_``
        in descending order.
    """
    cols = np.shape(x)[1]

    # Obtain the Principal Components, which are ordered by eigenvalues.
    # The model has to be fitted before explained_variance_ is available.
    principal_components = RandomizedPCA(n_components=cols)
    principal_components.fit(x)
    eigenvalues = principal_components.explained_variance_

    # Largest eigenvalues first.
    feature_order = np.argsort(eigenvalues)[::-1][:cols]
    return feature_order
Exemplo n.º 9
def read_data_sets():
	"""Load the labeled and public-test image sets and package them.

	Reads ``labeled_images.mat`` and ``public_test_images.mat`` with
	``sio.loadmat``, flattens the images, splits off a validation set,
	one-hot encodes the labels, normalizes and PCA-reduces the train and
	validation images, and returns the populated ``data_sets`` object with
	``train_set``, ``validation_set`` and ``test_set`` attributes.
	"""
	# NOTE(review): the class body is missing in this listing (an empty
	# class body is a syntax error); presumably it was ``pass`` -- confirm.
	class DataSets(object):

	start = time.time()
	data_sets = DataSets()

	# Load the training data
	mat_contents = sio.loadmat('labeled_images.mat')
	train_labels = mat_contents['tr_labels']
	train_identities = mat_contents['tr_identity']
	train_images = mat_contents['tr_images']

	# Load the test data
	mat_contents = sio.loadmat('public_test_images.mat')
	test_images = mat_contents['public_test_images']
	test_set_length = len(test_images[0][0])

	# Flatten images
	test_images = flattenImages(test_images)
	train_images = flattenImages(train_images)

	# Split train into validation set of size ~ test_set_length
	# NOTE(review): the argument list of this call is truncated in this
	# listing and must be restored from the original source.
	train_images, train_labels, validation_images, validation_labels = splitSet(

	# Convert labels to one hot vectors
	train_labels = convertToOneHot(train_labels, NUM_CLASSES)
	validation_labels = convertToOneHot(validation_labels, NUM_CLASSES)

	# Normalize the images: subtract the mean and divide by
	# sqrt(variance + 0.01), using each set's own statistics.
	sd = np.sqrt(np.var(train_images) + 0.01)
	train_images = (train_images - np.mean(train_images)) / sd
	sd = np.sqrt(np.var(validation_images) + 0.01)
	validation_images = (validation_images - np.mean(validation_images)) / sd

	# NOTE(review): the validation images are reduced with a PCA re-fitted on
	# the validation set itself, so their 15 components are not the ones
	# learned from the training images; ``pca.transform`` looks intended.
	pca = RandomizedPCA(n_components=15)
	train_images = pca.fit_transform(train_images)
	validation_images = pca.fit_transform(validation_images)

	# Setup the matrixes into an accessible data set class
	# NOTE(review): test_images are neither normalized nor PCA-reduced above,
	# unlike the train/validation images -- confirm this is intentional.
	data_sets.train_set = DataSet(train_images, train_labels)
	data_sets.validation_set = DataSet(validation_images, validation_labels)
	data_sets.test_set = DataSet(test_images, np.zeros((len(test_images), NUM_CLASSES)))

	print('Finished setting up data! Took {} seconds'.format(time.time() - start))

	return data_sets
Exemplo n.º 10
def get_features_from_images_PCA(img_dir,data_set):
    """Extract PCA features from every image in a directory and save them.

    Takes in a directory and gets all the images from it, extracts the
    pixel values, flattens each matrix into an array and performs principal
    component analysis to get a representative subset of features from the
    pixel values of the images.

    Args:
        img_dir: Directory prefix; file names are appended to it directly,
            so it is expected to end with a path separator.
        data_set: Prefix of the output file
            ``<data_set>_extracted_features.txt``.

    The output file holds one line per image: the file name followed by its
    space-separated feature values.
    """
    print("\nExtracting features from given images...")
    img_names = os.listdir(img_dir)
    images = [img_dir + name for name in img_names]
    print("\nConverting images to vectors")
    data = [flatten_image(img_to_matrix(image)) for image in images]
    print("Converting image data to numpy array")
    data = np.array(data)
    print("Finished Conversion")
    print("\nPerforming PCA to get reqd features")
    # Fit a single model on all images. Fitting a fresh PCA per chunk of 100
    # put every chunk in a different basis and dropped the images beyond the
    # last full chunk.
    pca = RandomizedPCA(n_components=14)
    features = pca.fit_transform(data)
    print("Writing feature data to file")
    with open(data_set + "_extracted_features.txt", "w") as f:
        for name, row in zip(img_names, features):
            f.write(" ".join([str(name)] + [str(value) for value in row]) + "\n")
    print("Write completed")
Exemplo n.º 11
def pcaAndPlot(X, x_to_centroids, centroids, no_dims=2):
    """Scatter-plot the samples and their centroids in a shared PCA space.

    Args:
        X: Sample matrix, one sample per row.
        x_to_centroids: Per-sample values used as the colour of each point.
        centroids: Centroid matrix with the same feature columns as ``X``.
        no_dims: Number of PCA components to compute; the first two are
            plotted.
    """
    pca = RandomizedPCA(n_components=no_dims)
    x_trans = pca.fit_transform(X)
    # The builtin int replaces the np.int alias, which NumPy has removed.
    x_sizes = np.full(x_trans.shape[0], 30, dtype=int)
    plt.scatter(x_trans[:, 0], x_trans[:, 1], s=x_sizes, c=x_to_centroids)
    # Project the centroids with the model fitted on X. Re-fitting on the
    # centroids would place them in a different coordinate system than the
    # samples they are drawn over.
    centroids_trans = pca.transform(centroids)
    centroids_col = np.arange(centroids.shape[0])
    centroids_sizes = np.full(centroids.shape[0], 70, dtype=int)
    plt.scatter(centroids_trans[:, 0],
                centroids_trans[:, 1],
                s=centroids_sizes,
                c=centroids_col)
Exemplo n.º 12
def main():
    """Detect regions in an image, PCA-reduce the crops and classify them.

    The image path is read from the ``-i/--image`` command line argument.
    Every rectangle returned by ``detect`` is cropped, resized to 100x100,
    displayed and flattened; the flattened crops are reduced with PCA and
    fed to an SVC whose training labels are hard-coded for the first seven
    crops. Predictions for crops 8 to 13 are printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required = True, help = "Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    rects, img = detect(image)

    cropped = []
    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y2, x1:x2]
        crop_img = cv2.resize(crop_img, (100,100), interpolation = cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        # One flat feature vector per crop. Previously the reshaped crop was
        # discarded, leaving `cropped` empty for the PCA below.
        cropped.append(crop_img.reshape(-1))

    # reduce feature size
    pca = RandomizedPCA(n_components=100)
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now)
    clf   = SVC(probability=True)
    train = cropped_pca[:7]
    test  = cropped_pca[7:13]
    clf.fit(train, [1,2,2,1,2,1,1])

    for item in test:
        # Wrap the sample so the estimator receives a 2-D (1, n_features) input.
        print(clf.predict_proba([item]))
        print(clf.predict([item]))

Exemplo n.º 13
    def _prepare_pca(self, data, max_n_components):
        """Run RandomizedPCA on ``data.T`` and pick the components to keep.

        Args:
            data: Array whose transpose is passed to the PCA.
            max_n_components: ``n_components`` used for the PCA.

        Returns:
            Tuple ``(to_ica, pca)``: the selected columns of the PCA output
            and the fitted PCA object. ``self._comp_idx`` and
            ``self.n_components`` are updated as a side effect.
        """
        from sklearn.decomposition import RandomizedPCA

        kwargs = {'n_components': max_n_components, 'whiten': False}

        # sklearn < 0.11 does not support random_state argument; only pass it
        # when the constructor accepts it.
        getargspec = (getattr(inspect, 'getfullargspec', None) or
                      inspect.getargspec)
        aspec = getargspec(RandomizedPCA.__init__)
        if 'random_state' not in aspec.args:
            warnings.warn('RandomizedPCA does not support random_state '
                          'argument. Use scikit-learn to version 0.11 '
                          'or newer to get reproducible results.')
        else:
            kwargs['random_state'] = 0

        pca = RandomizedPCA(**kwargs)
        pca_data = pca.fit_transform(data.T)

        if self._explained_var > 1.0:
            if self.n_components is not None:  # normal n case
                self._comp_idx = np.arange(self.n_components)
                to_ica = pca_data[:, self._comp_idx]
            else:  # None case
                to_ica = pca_data
                self.n_components = pca_data.shape[1]
                self._comp_idx = np.arange(self.n_components)
        else:  # float case
            # Keep the components whose cumulative explained variance ratio
            # stays below the requested fraction.
            expl_var = pca.explained_variance_ratio_
            self._comp_idx = np.where(expl_var.cumsum() <
                                      self._explained_var)[0]
            to_ica = pca_data[:, self._comp_idx]
            self.n_components = len(self._comp_idx)

        return to_ica, pca
Exemplo n.º 14
    def detect(self, imageURLs, params):
        """Classify the image behind each entry of ``params`` and build events.

        Each ``param`` is read through its 'imageURL', 'time', 'longitude',
        'latitude' and 'value' keys. ``imageURLs`` is not used in the visible
        code.

        NOTE(review): this listing is truncated -- the flattened image is
        never added to ``array``, the opened file is never written or
        closed, and ``event`` is built but not used. Restore the missing
        lines from the original source before relying on this method.
        """

        array = []
        for param in params:
            img = self.img_to_matrix(param['imageURL'])
            data = self.flatten_image(img)
        array = np.array(array)

        # NOTE(review): a new PCA is fitted on the incoming images while the
        # classifier is loaded pre-trained from disk; presumably the PCA used
        # at training time should be reused here -- confirm.
        pca = RandomizedPCA(n_components=5)
        n_data = pca.fit_transform(array)

        clf = joblib.load('src/resource/models/model.pkl')
        result = clf.predict(n_data).tolist()

        for param, r in zip(params, result):
            raw_img = urllib2.urlopen(param['imageURL']).read()
            if r == 1:
                # Number the new file after the existing 'rain' images.
                cntr = len([i for i in os.listdir("test/images/rain/") if 'rain' in i]) + 1
                path = "static/images/rain_" + str(cntr) + '.jpg'
                f = open(path, 'wb')
                # Build the event information
                when = {'type': 'timestamp', 'time':param['time']}
                where = { "type": "Point", "coordinates": [param['longitude'], param['latitude']]}
                what = {'topic': {'value':u'雨'}, 'tweet': param['value']}
                who = [{"type": "url", "value": param['imageURL']},
                       {"value": "evwh <*****@*****.**>", "type": "author"}]
                event = {'observation':{'what': what, 'when': when, 'where': where, 'who': who}}
Exemplo n.º 15
def rpca(numpy_file='../data/Paintings/two_class/Paintings_train.csv'):
    """ Performs randomized PCA on given numpy file.

    Given a numpy file of n-rows and n-cols, where the last column is
    the label and rest are features,n-rows are the samples.

    The first two principal components are scatter-plotted, coloured by
    class: rows labelled 1 are shown as "realism", all others as
    "abstract".

    :type numpy_file: string
    :param numpy_file: The file name of numpy file to be analyzed.
    """
    import numpy as np
    import matplotlib.pyplot as pl
    import pandas as pd
    from sklearn.decomposition import RandomizedPCA

    all_data = np.loadtxt(numpy_file, delimiter=',')
    data = all_data[:, :-1]
    y = all_data[:, -1]
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, "realism", "abstract")})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.title('Randomized PCA analysis')
Exemplo n.º 16
def main():
    """Train a PCA + one-vs-rest linear SVM on ``images/`` and print a crosstab.

    The class of every image is the part of its file name before the first
    underscore. Roughly 70% of the images, chosen at random, are used for
    training; the remainder is used to print an actual-vs-predicted table.
    """
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    # Map each label to an integer id, numbered in order of first appearance.
    label2ids = {v: i for i, v in enumerate(sorted(set(labels),
                                                   key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    # One flattened vector per image; previously nothing was collected.
    data = np.array([flatten_image(img_to_matrix(image_file))
                     for image_file in images])

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model (reuse the PCA fitted on the training split)
    test_X, test_y = data[~is_train], y[~is_train]
    test_X = pca.transform(test_X)
    print(pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted']))
Exemplo n.º 17
def rpca(numpy_file='../data/Paintings/two_class/Paintings_train.csv'):
    """ Performs randomized PCA on given numpy file.

    Given a numpy file of n-rows and n-cols, where the last column is
    the label and rest are features,n-rows are the samples.

    The first two principal components are scatter-plotted, coloured by
    class: rows labelled 1 are shown as "realism", all others as
    "abstract".

    :type numpy_file: string
    :param numpy_file: The file name of numpy file to be analyzed.
    """
    import numpy as np
    import matplotlib.pyplot as pl
    import pandas as pd
    from sklearn.decomposition import RandomizedPCA

    all_data = np.loadtxt(numpy_file, delimiter=',')
    data = all_data[:, :-1]
    y = all_data[:, -1]
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, "realism", "abstract")})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.title('Randomized PCA analysis')
def make_pca_datapoints(terms_map, stopwords, clusters):
	"""Build a term-count matrix from the tweets file and PCA-reduce it.

	Reads the module-level ``tweets_file`` line by line, re-indexes the
	non-stopword terms found in column 3 of each line, counts them into a
	sparse matrix and projects that matrix onto 100 components. Returns the
	``xs`` and ``ys`` lists.

	NOTE(review): this listing is truncated -- nothing is ever appended to
	``raw_data``, ``target``, ``xs`` or ``ys``, and the ``if`` inside the
	final loop has no body. Restore the missing lines from the original
	source.
	"""
	new_terms_map = {}
	raw_data = []
	target = []
	for line in open(tweets_file):
		tokens = line.split()
		# Column 3 holds comma-separated term ids; drop stopwords.
		terms = [terms_map[int(term)] for term in tokens[3].split(',') if terms_map[int(term)] not in stopwords]
		# Assign consecutive new ids to terms in order of first appearance.
		for term in terms:
			if not term in new_terms_map:
				new_terms_map[term] = len(new_terms_map)
		new_term_ids = [new_terms_map[term] for term in terms]
		# NOTE(review): the next line is indented with spaces while the rest of
		# the function uses tabs.
                tags = [terms_map[int(term)] for term in tokens[4].split(',')]
	# One row per entry of raw_data, one column per re-indexed term.
	data = lil_matrix( (len(raw_data), len(new_terms_map)) )
	count = 0
	for cur_vector in raw_data:
		for point in cur_vector:
			data[(count, point)] += 1
		count += 1
	pca = RandomizedPCA (n_components=100)
	transformed_data = pca.fit_transform(data) 
	xs = []
	ys = []
	count = 0
	for datum in transformed_data:
		for tag in target[count]:
			# Tags are matched against `clusters` without their first character.
			if (len(tag) > 1) and tag[1:] in clusters:
		count += 1

	del transformed_data
	return xs, ys	
Exemplo n.º 19
def do_pca(corr_matrix: _nested_ndarray, num_dim: int,
    min_var_explanation: float = 0.7) -> _nested_ndarray:
    """
    This method performs PCA on a self-correlation matrix, reducing the number of columns to `num_dim`.
    If such analysis does not sufficiently explain the underlying variance in the data, an exception is
    raised.

    **Arguments**

    * `corr_matrix` - a square matrix of correlations
    * `num_dim` - the number of dimensions to which the data should be reduced
    * `min_var_explanation` - the minimum fraction of the underlying data variance that should be explained

    **Returns**

    > A matrix of the PCA result on `corr_matrix`.

    **Raises**

    > `PcaAccuracyException` when the retained components explain less than `min_var_explanation`.
    """
    num_dim = int(num_dim)
    # Fixed random_state keeps the decomposition reproducible between runs.
    pca = PCA(n_components=num_dim, random_state=0)
    pca_result = pca.fit_transform(corr_matrix)
    var_ratio = pca.explained_variance_ratio_
    if sum(var_ratio) < min_var_explanation:
        raise PcaAccuracyException(
            'PCA doesn\'t explain enough of the variance in the data')

    return pca_result
Exemplo n.º 20
def calc_hog(fpaths, save=False):
    """Compute PCA-reduced histogram of gradients (HOG) features.

    Each file is read from the module-level ``imgdir``, converted to
    grayscale when it has three dimensions, resized to 128x128 and passed to
    ``hog``. The stacked HOG vectors are scaled and reduced to 15 principal
    components.

    Args:
        fpaths : files on which HOG will be computed
        save : if true, output is saved to disk as ``hog.csv``

    Returns:
        DataFrame indexed by file base name with columns ``feat_hog_01`` to
        ``feat_hog_15``.
    """
    # 15876 is assumed to be the length of the vector hog() returns for a
    # 128x128 image with its default parameters -- TODO confirm if the hog
    # parameters ever change.
    hogs = np.empty((len(fpaths), 15876))

    for i, fpath in enumerate(fpaths):
        img = imread(os.path.join(imgdir, fpath))
        if len(img.shape) == 3:
            img = rgb2gray(img)
        # rescale so all feature vectors are the same length
        img_resize = resize(img, (128, 128))
        hogs[i, :] = hog(img_resize)

    hogs_sc = scale(hogs)
    n_components = 15
    pca = RandomizedPCA(n_components=n_components)
    hogs_decomp = pca.fit_transform(hogs_sc)

    df = pd.DataFrame(hogs_decomp, index=[os.path.split(i)[1] for i in fpaths])
    df.columns = ['feat_hog_%2.2u' % i for i in range(1, n_components+1)]
    if save:
        df.to_csv('hog.csv')
    return df
Exemplo n.º 21
    def Q4():
        """Find, per principal component, the face with the largest squared-cosine.

        Projects the Olivetti faces onto 10 principal components, computes for
        each sample the share of its squared projection carried by every
        component, and saves the index of the sample with the largest share
        for each component via ``utils.PATH.SAVE_RESULT``.
        """
        data = datasets.fetch_olivetti_faces(shuffle=True, random_state=0)
        X = data.data

        n = X.shape[0]
        n_components = 10

        model = RandomizedPCA(n_components=n_components)
        Z = model.fit_transform(X)
        # The projection is deliberately not re-centred per sample.
        Z_c = Z  #- Z.mean(axis=1).reshape((n, 1))  !!!! ERROR IN COURSERA
        Z_c = Z_c * Z_c
        Z_tot = Z_c.sum(axis=1).reshape((n, 1))

        Cos = Z_c / Z_tot

        # Collect the winning sample index per component. The result list was
        # never filled before, so an empty answer was saved.
        # NOTE(review): assumes the indices (not the images) are the expected
        # payload, as the name `i_s` suggests -- confirm.
        i_s = [np.argmax(Cos[:, j]) for j in range(n_components)]

        utils.PATH.SAVE_RESULT((3, 2), (1, 4), i_s)

Exemplo n.º 22
def scatter(data, labels=None, title=None, name=None):
    """2d PCA scatter plot with optional class info

    Return the pca model to be able to introspect the components or transform
    new data with the same model. ``None`` is returned when the data is
    already two-dimensional and no PCA was needed.

    Args:
        data: Samples to plot, one per row.
        labels: Per-sample class labels; one colour/marker per unique value.
        title: Plot title; a default is generated when omitted.
        name: Optional data set name appended to the default title.
    """
    data = atleast2d_or_csr(data)

    pca = None
    if data.shape[1] == 2:
        # No need for a PCA:
        data_2d = data
    else:
        pca = RandomizedPCA(n_components=2)
        data_2d = pca.fit_transform(data)

    for i, c, m in zip(np.unique(labels), cycle(COLORS), cycle(MARKERS)):
        plt.scatter(data_2d[labels == i, 0], data_2d[labels == i, 1],
                    c=c, marker=m, label=i, alpha=0.5)

    if title is None:
        title = "2D PCA scatter plot"
        if name is not None:
            title += " for " + name
    # The title was computed but never applied to the figure.
    plt.title(title)
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')

    return pca
def main():
    """Project the data of one protein onto 100 principal components.

    The protein name is taken from ``sys.argv[1]``. The accumulated explained
    variance, the full projection and its first two columns are written
    below ``output/<protein>/pca/``.

    NOTE(review): this listing mixes tab- and space-indented lines and uses
    ``RMSD`` without defining it, so lines appear to be missing. The
    tab-indented section computes ``models``, ``keep`` and ``n_neigh``,
    none of which is used by the PCA that follows.
    """
    protein = sys.argv[1]

    X = load_file(protein)
	scores = np.loadtxt("../LSDMap/{protein}.scores.txt".format(**locals()))

	# Keep only the last RMSD.shape[0] scores when the lengths differ.
	if scores.shape[0] != RMSD.shape[0]:
		scores = scores[-RMSD.shape[0]:]
		print("selecting last N")

	models = select_N_models(RMSD[1:,1:], scores, 10000)
	keep = np.r_[0, models + 1]
	n_neigh = np.min(np.sum(RMSD < 6, axis=0)[models + 1])
	RMSD = RMSD[keep,:][:,keep]
    #models = np.arange(N)
    #np.savetxt("output/{protein}/pca/kept.txt".format(**locals()), models)
    #np.save("output/{protein}/pca/RMSD.npy".format(**locals()), RMSD[0,:])

    # copy=False is passed to the PCA constructor here.
    pca = RandomizedPCA(n_components=100, copy=False)
    proj = pca.fit_transform(X)
    acc_var = calcAccumVar(pca.explained_variance_ratio_)

    np.savetxt("output/{protein}/pca/acc_var.txt".format(**locals()), acc_var)
    np.save("output/%s/pca/proj.npy" % protein, proj)
    np.save("output/{protein}/pca/proj2D.npy".format(**locals()), proj[:, :2])
Exemplo n.º 24
    def pca(self, y):
        """Project a column subset of ``y`` onto its first principal component.

        NOTE(review): both ``np.sort(`` calls are truncated in this listing
        and the ``else:`` that presumably guarded the final return is
        missing; restore them from the original source.
        """

        # select a random subset of Y dimensions (possibly gives robustness as well as speed)
        rand_dims = np.sort(
        y_dim_subset = y.take(rand_dims, 1)

        pca = RandomizedPCA(n_components=1)  # compute for all components

        # optional: select a subset of exs (not so important if PCA is fast)
        if self.tree_params['sub_sample_exs_pca']:
            rand_exs = np.sort(
            # Fit on the sub-sampled rows only, then project every row.
            pca.fit(y_dim_subset.take(rand_exs, 0))
            return pca.transform(y_dim_subset)

            # perform PCA
            return pca.fit_transform(y_dim_subset)
Exemplo n.º 25
def dimentionality_reduction(train_x, test_x):
    """Project the training and test data onto 10 principal components.

    The PCA is fitted on ``train_x`` only and then applied to ``test_x``.

    Returns:
        Tuple ``(train_x, test_x)`` holding the reduced matrices.
    """
    print("Dimentionality reduction to 10D on training and test data....")
    reducer = RandomizedPCA(n_components=10)
    reduced_train = reducer.fit_transform(train_x)
    reduced_test = reducer.transform(test_x)
    print("Done.")
    return reduced_train, reduced_test
Exemplo n.º 26
def randomized_pca(train_data_images, train_data_split_images,
                   test_data_images, IMG_SIZE):
    """Reduce flattened training and test images to 50 PCA components.

    Args:
        train_data_images: Training images, each passed to ``img_to_matrix``.
        train_data_split_images: Cross-fold split images; they are converted
            like the others but are not part of the returned value.
        test_data_images: Test images.
        IMG_SIZE: Size argument forwarded to ``img_to_matrix``.

    Returns:
        Tuple ``(train_features, test_features)``. The PCA is fitted on the
        training images and applied to the test images.
    """
    def to_vectors(images):
        # One flattened pixel vector per image.
        return [flatten_image(img_to_matrix(image, IMG_SIZE))
                for image in images]

    # Previously the converted images were discarded, so the PCA ran on
    # empty lists.
    train_data = to_vectors(train_data_images)
    train_data_split_crossfold = to_vectors(train_data_split_images)
    test_data = to_vectors(test_data_images)

    pca = RandomizedPCA(50)
    return (pca.fit_transform(train_data), pca.transform(test_data))
Exemplo n.º 27
def dimentionality_reduction(train_x, test_x):
    """Project the training and test data onto 10 principal components.

    The PCA is fitted on ``train_x`` only and then applied to ``test_x``.

    Returns:
        Tuple ``(train_x, test_x)`` holding the reduced matrices.
    """
    print("Dimentionality reduction to 10D on training and test data....")
    reducer = RandomizedPCA(n_components=10)
    reduced_train = reducer.fit_transform(train_x)
    reduced_test = reducer.transform(test_x)
    print("Done.")
    return reduced_train, reduced_test
def pca_knn():
    """Evaluate a 5-NN classifier on PCA-reduced data for 5, 6 and 7 components.

    Data comes from ``getSplitData`` and is scaled with ``getScaledData``.
    For each component count the parameters and the test accuracy are
    printed.
    """
    Xtrain, ytrain, Xtest, ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)
    for n in range(5, 8):
        pca = RandomizedPCA(n_components=n)
        pca_Xtrain = pca.fit_transform(Xtrain, ytrain)
        # Project the test set with the PCA learned on the training set;
        # re-fitting on the test set gave it different axes than the ones the
        # classifier was trained on.
        pca_Xtest = pca.transform(Xtest)
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(pca_Xtrain, ytrain)
        yPredict = neigh.predict(pca_Xtest)
        print("parameter: n_components =  %s" % n)
        print("parameter: n_neighbors = 5")
        print("pca_knn classification accuracy:  %s" % accuracy_score(ytest, yPredict))
Exemplo n.º 29
    def run_pca(self, features):
        """Run a principal component analysis on the training data.

        Args:
            features: Feature matrix to reduce.

        Returns:
            The features projected onto 5 principal components.
        """
        pca = RandomizedPCA(n_components=5)
        features_pca = pca.fit_transform(features)

        return features_pca
Exemplo n.º 30
def main():
    """Train a PCA + KNN image classifier and classify one test image.

    The test image path is taken from ``sys.argv[1]``; training images are
    collected by walking ``DATA_DIR``.

    NOTE(review): this listing is truncated -- the ``else:`` before the
    usage error, the ``try:`` matching ``except IOError:``, the body of
    that handler, and every statement that fills ``rawlabels``, ``labels``
    and ``data`` are missing. Restore them from the original source.
    """

    #get the file path from the command prompt
    if len(sys.argv) > 1:
        TEST_FILE = sys.argv[1]
        print("error: lease specify a file path")

    print("TRAINING STARTED!")

    print("pulling images from files...")
    #Store image paths and labels
    images = []
    rawlabels = []
    for subdir, dirs, files in os.walk(DATA_DIR):
        for file in files:
            # Skip files whose second path component is "test".
            if (subdir.split('/')[1]) != "test":
                images.append(os.path.join(subdir, file))

    print("converting images to arrays...")
    #Create a massive data array
    data = []
    labels = []
    counter = 0
    for imagePath in images:
        #print imagePath
        img = []
            img = imgToArray(imagePath)
        except IOError:
        counter += 1
    data = np.array(data)

    print("reducing arrays using randomizedPCA...")
    #randomizedPCA on training set
    #this reduces the huge amount of data points
    pca = RandomizedPCA(n_components=4)
    data = pca.fit_transform(data)

    #generate a 2D plot that shows the groupings

    print("using K-closest neighbors to classify data...")
    #fit the KNeighbors classifier
    knn = KNeighborsClassifier()
    knn.fit(data, labels)

    print("TESTING STARTED!")
    #test the image
    print "The test image, " + TEST_FILE + " is a:"
    # The fitted pca is handed to string_to_img together with the file path.
    test = string_to_img(TEST_FILE, pca)
    print classify_image(test, knn)
Exemplo n.º 31
def rca1_decompose(dataset, n):
    """Reduce all features of ``dataset`` to ``n`` PCA components and re-split.

    The PCA is fitted on ``dataset.all.features``; the first
    ``dataset.training_size`` rows form the training part and the remaining
    rows the testing part of the returned ``DataSet``.

    NOTE(review): both ``Data(`` calls are truncated in this listing (their
    remaining arguments and closing parenthesis are missing); restore them
    from the original source.
    """
    rca = RandomizedPCA(n_components=n)
    reduced_features = rca.fit_transform(dataset.all.features)
    training_size = dataset.training_size
    training = Data(reduced_features[:training_size, :],
    testing = Data(reduced_features[training_size:, :],
    return DataSet(training, testing)
def pca_knn():
    """Evaluate a 5-NN classifier on PCA-reduced data for 5, 6 and 7 components.

    Data comes from ``getSplitData`` and is scaled with ``getScaledData``.
    For each component count the parameters and the test accuracy are
    printed.
    """
    Xtrain, ytrain, Xtest, ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)
    for n in range(5, 8):
        pca = RandomizedPCA(n_components=n)
        pca_Xtrain = pca.fit_transform(Xtrain, ytrain)
        # Project the test set with the PCA learned on the training set;
        # re-fitting on the test set gave it different axes than the ones the
        # classifier was trained on.
        pca_Xtest = pca.transform(Xtest)
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(pca_Xtrain, ytrain)

        yPredict = neigh.predict(pca_Xtest)

        print("parameter: n_components =  %s" % n)
        print("parameter: n_neighbors = 5")
        print("pca_knn classification accuracy:  %s" % accuracy_score(
            ytest, yPredict))
Exemplo n.º 33
	def fit(self, X, y=None, c=None):
		"""Fit the model using X as training data.

		Parameters
		----------
		X : array, shape (n_samples, n_features) or (n_samples, n_samples)
			If the metric is 'precomputed' X must be a square distance
			matrix. Otherwise it contains a sample per row.
		y : ignored
		c : optional
			Forwarded unchanged to ``self._tsne``.

		Raises
		------
		ValueError
			If ``early_exaggeration`` < 1, ``n_iter`` < 200, ``init='pca'`` is
			combined with ``metric='precomputed'``, a precomputed ``X`` is
			not square, or ``init`` is not 'pca' or 'random'.
		"""
		X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
		random_state = check_random_state(self.random_state)

		if self.early_exaggeration < 1.0:
			raise ValueError("early_exaggeration must be at least 1, but is "
							 "%f" % self.early_exaggeration)

		if self.n_iter < 200:
			raise ValueError("n_iter should be at least 200")

		if self.metric == "precomputed":
			if self.init == 'pca':
				raise ValueError("The parameter init=\"pca\" cannot be used "
								 "with metric=\"precomputed\".")
			if X.shape[0] != X.shape[1]:
				raise ValueError("X should be a square distance matrix")
			distances = X
		else:
			if self.verbose:
				print("[t-SNE] Computing pairwise distances...")

			# Squared distances are requested for the euclidean metric only.
			if self.metric == "euclidean":
				distances = pairwise_distances(X, metric=self.metric, squared=True)
			else:
				distances = pairwise_distances(X, metric=self.metric)

		# Degrees of freedom of the Student's t-distribution. The suggestion
		# alpha = n_components - 1 comes from "Learning a Parametric Embedding
		# by Preserving Local Structure" Laurens van der Maaten, 2009.
		alpha = max(self.n_components - 1.0, 1)
		n_samples = X.shape[0]
		self.training_data_ = X

		P = _joint_probabilities(distances, self.perplexity, self.verbose)
		# Keep an independent copy of the joint probabilities on the instance.
		self.P = deepcopy(P)
		if self.init == 'pca':
			pca = RandomizedPCA(n_components=self.n_components,
								random_state=random_state)
			X_embedded = pca.fit_transform(X)
		elif self.init == 'random':
			X_embedded = None
		else:
			raise ValueError("Unsupported initialization scheme: %s"
							 % self.init)

		self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
									 X_embedded=X_embedded, c=c)
Exemplo n.º 34
    def trainset(data, labels):
        """Fit a PCA -> StandardScaler -> KNN chain on the given samples.

        ``data`` is first flattened to one row per sample.

        Returns:
            Tuple ``(pca, std, knn)`` of the fitted objects.
        """
        flat = np.reshape(data, (data.shape[0], -1))

        pca = RandomizedPCA(n_components=10)
        reduced = pca.fit_transform(flat)

        std = StandardScaler()
        scaled = std.fit_transform(reduced)

        knn = KNeighborsClassifier()
        knn.fit(scaled, labels)

        return pca, std, knn
Exemplo n.º 35
def visualize():
    """Scatter-plot ``data`` on its first two randomized principal components.

    Reads ``data`` and ``labels`` from the enclosing scope. Each distinct
    label is drawn in its own colour; because the labels are zipped with a
    two-colour palette, at most two label groups are plotted.
    """
    projected = RandomizedPCA(n_components=2).fit_transform(data)
    frame = pd.DataFrame({"x": projected[:, 0], "y": projected[:, 1], "label": labels})
    palette = ["red", "yellow"]
    for group, shade in zip(frame['label'].unique(), palette):
        members = frame[frame['label'] == group]
        pl.scatter(members['x'], members['y'], c=shade, label=group)
Exemplo n.º 36
def visualize():
    """Draw a 2-D randomized-PCA scatter of ``data``, coloured by label.

    ``data`` and ``labels`` are taken from the enclosing scope. Only as many
    label groups as there are palette entries (two) are drawn, because the
    unique labels are paired with the colours through ``zip``.
    """
    reducer = RandomizedPCA(n_components=2)
    coords = reducer.fit_transform(data)
    table = pd.DataFrame({"x": coords[:, 0], "y": coords[:, 1], "label": labels})
    for name, colour in zip(table['label'].unique(), ["red", "yellow"]):
        selected = table[table['label'] == name]
        pl.scatter(selected['x'], selected['y'], c=colour, label=name)
Exemplo n.º 37
def HSV_PCA(image_paths, hue_bins = 180, sat_bins = 256, val_bins = 256):
  """Project log-scaled HSV histograms onto three principal components each.

  ``HSV_hists`` is called with the image paths and bin counts; entries 0, 1
  and 2 of its result are used as the hue, saturation and value histograms.
  The logarithm of each is reduced to three randomized principal components.

  Returns a DataFrame with an ``image_paths`` column followed by the columns
  ``HuePC1..3``, ``SatPC1..3`` and ``ValPC1..3``.
  """
  hsv_hists = HSV_hists(image_paths, hue_bins, sat_bins, val_bins)

  # A single estimator is re-fitted for every channel in turn.
  # NOTE(review): np.log is applied to the raw histograms -- confirm that
  # HSV_hists never yields empty (zero) bins.
  pca = RandomizedPCA(n_components=3)
  channel_projections = [
    pca.fit_transform(np.log(hsv_hists[channel])) for channel in range(3)
  ]

  hsv_df = pd.DataFrame(data = np.hstack(channel_projections))
  hsv_df.columns = [
    prefix + str(i)
    for prefix in ("HuePC", "SatPC", "ValPC")
    for i in range(1, 4)
  ]

  paths_df = pd.DataFrame({'image_paths': image_paths})
  return pd.concat([paths_df, hsv_df], axis = 1)
def get_pca(data, num_components=2):
    """Perform a PCA transformation.

    data: Values to transform
    num_components: Number of dimension of the data

    Returns a ``(transformed, explained_variance_ratio)`` pair: the projected
    data and the fitted estimator's ``explained_variance_ratio_``.
    """
    pca = RandomizedPCA(n_components=num_components, whiten=False)
    data = pca.fit_transform(data)
    return data, pca.explained_variance_ratio_
Exemplo n.º 39
def get_input_pca(imgs, labels, pca=None):
    """Flatten ``imgs`` along axis 2 and project the rows with a PCA estimator.

    Axis 2 of ``imgs`` is rolled to the front and everything behind it is
    flattened, giving one row per index of that axis. When ``pca`` is falsy a
    new ``RandomizedPCA`` keeping all components is created. The estimator is
    always (re-)fitted here, because ``fit_transform`` is called even on a
    supplied one.

    Returns ``(projected_rows, flattened_labels, pca)``.
    """
    rolled = np.rollaxis(imgs, 2)
    rows = np.reshape(rolled, (rolled.shape[0], -1))

    pca = pca or RandomizedPCA(n_components=None, copy=False, iterated_power=3, whiten=False)

    projected = pca.fit_transform(rows)
    flat_labels = np.ravel(labels)
    return projected, flat_labels, pca
Exemplo n.º 40
def RPCA(model_data, components=None, transform_data=None):
    t0 = time()
    rpca = RandomizedPCA(n_components=components)
    if transform_data == None:
        projection = rpca.fit_transform(model_data)
        projection = rpca.transform(transform_data)
    print "Randomized PCA Explained Variance: ", rpca.explained_variance_ratio_
    print "Randomized PCA Time: %0.3f" % (time() - t0)
    return projection
 def PlotPCA(self):
     """Project ``self.fmri_train`` onto one randomized principal component.

     Prints the shape of the training data, of the fitted components and of
     the projected vector, then writes the projection to
     'fmri_train_240samples_1components.out' through ``io.mmwrite``.
     """
     pca = RandomizedPCA(n_components=1)
     print(shape(self.fmri_train))
     trainingVector = pca.fit_transform(self.fmri_train)
     # components_ only exists once the estimator has been fitted, so it is
     # inspected after fit_transform rather than before.
     print(shape(pca.components_))
     print(shape(trainingVector))
     io.mmwrite('fmri_train_240samples_1components.out', trainingVector, field='real', precision=25)
Exemplo n.º 42
def RPCA(model_data, components = None, transform_data = None):
    t0 = time()
    rpca = RandomizedPCA(n_components=components)
    if transform_data == None:
        projection = rpca.fit_transform(model_data)
        projection = rpca.transform(transform_data)
    print "Randomized PCA Explained Variance: ", rpca.explained_variance_ratio_
    print "Randomized PCA Time: %0.3f" % (time() - t0)
    return projection
Exemplo n.º 43
def pca_LG(train, test):
    """Predict test labels with logistic regression on 500 PCA components.

    ``split_data`` supplies the train/test features and labels. The
    projection is fitted on the training features only and then applied to
    the test features. The predictions are passed through ``format_y``
    before being returned.
    """
    x_train, y_train, x_test, _y_test = split_data(train, test)

    reducer = RandomizedPCA(n_components=500)
    train_proj = reducer.fit_transform(x_train)
    test_proj = reducer.transform(x_test)

    model = LogisticRegression()
    model.fit(train_proj, y_train)
    return format_y(model.predict(test_proj))
Exemplo n.º 44
def pca_knn(train, test):
    """Predict test labels with k-nearest neighbours on 2 PCA components.

    ``split_data`` supplies the train/test features and labels. The
    projection is fitted on the training features only and reused for the
    test features. The predictions are passed through ``format_y`` before
    being returned.
    """
    x_train, y_train, x_test, _y_test = split_data(train, test)

    reducer = RandomizedPCA(n_components=2)
    train_proj = reducer.fit_transform(x_train)
    test_proj = reducer.transform(x_test)

    neighbours = KNeighborsClassifier()
    neighbours.fit(train_proj, y_train)
    return format_y(neighbours.predict(test_proj))
Exemplo n.º 45
    def test_do_pca(self):
        """Check ``do_pca`` against zeros and against a reference ``PCA``.

        Every entry of ``do_pca(self.dup_data, 3)`` must be almost 0, and
        ``do_pca(self.data, 2)`` must match, element by element, the output
        of ``PCA(n_components=2).fit_transform(self.data)``.
        """
        dup_projection = do_pca(self.dup_data, 3).reshape(-1, 1)
        for row in dup_projection:
            self.assertAlmostEqual(row[0], 0.)

        actual_flat = do_pca(self.data, 2).reshape(1, -1)[0]
        reference = PCA(n_components = 2)
        expected_flat = reference.fit_transform(self.data).reshape(1, -1)[0]

        for wanted, got in zip(expected_flat, actual_flat):
            self.assertAlmostEqual(wanted, got)
Exemplo n.º 46
    def maybeReduceDimensionality(self, img_data):
        """Dimensional reduction of 3D image matrix (numpy array).

        ``img_data`` is first subsampled by keeping every
        ``self.n_slices``-th entry. ``self.reduction`` then selects the
        reduction:

        * ``None`` -- the subsampled data is returned unchanged.
        * ``"H"``  -- the data is flattened, min-max scaled and turned into
          a histogram that is returned as a single row.
        * ``"P"``  -- each slice is normalized and passed through a
          ``RandomizedPCA``.

        NOTE(review): two calls in this block are cut off mid-expression (see
        the inline notes), so it does not parse as it stands.
        """
        # Step-slicing keeps every n_slices-th entry along the first axis,
        # i.e. data[i, :, :] for i = 0, n_slices, 2 * n_slices, ...
        img_data = img_data[::self.n_slices]

        if self.reduction is None:
            """No Reduction"""
            return img_data

        elif self.reduction == "H":
            from sklearn import preprocessing

            # Flat iterator over all values, converted to float first.
            img_data = np.asarray(img_data, dtype=float).flat

            min_max_scaler = preprocessing.MinMaxScaler()
            scaled_data = min_max_scaler.fit_transform(img_data)

            # NOTE(review): the np.histogram call below is truncated -- its
            # remaining arguments and closing parenthesis are missing.
            hist = np.histogram(scaled_data,

            # Returned as one row of length hist.shape[0].
            return hist.reshape(1, hist.shape[0])

        elif self.reduction == "P":
            """Slice-wise (randomized) Principal Component Analysis"""
            from sklearn.preprocessing import normalize
            from sklearn.decomposition import RandomizedPCA

            proj_data = []
            for img_slice in img_data:
                norm_data = normalize(img_slice)

                # NOTE(review): shaped_data is not used again in the visible code.
                shaped_data = np.reshape(norm_data, norm_data.size)
                # shaped_data.shape

                # NOTE(review): the RandomizedPCA constructor call below is
                # truncated -- its arguments and closing parenthesis are missing.
                rpca = RandomizedPCA(
                proj_slice = rpca.fit_transform(norm_data)
                # plt.imshow(proj_data)

                # feat_data = rpca.inverse_transform(proj_data)
                # plt.imshow(feat_data)
                # plt.imshow(norm_data)


            # NOTE(review): nothing visible here appends proj_slice to
            # proj_data -- confirm whether an append was lost.
            return proj_data
Exemplo n.º 47
def pca(data, ncomp=100, whiten=False):
    """Optionally PCA-whiten ``data`` and report timings.

    data: feature matrix to process.
    ncomp: number of components kept when whitening.
    whiten: when truthy, ``data`` is projected onto ``ncomp`` whitened
        randomized principal components; otherwise it is returned as is.

    The timing messages read ``pt0`` and ``pt2``, which are not assigned in
    this function and must therefore exist in the enclosing scope.

    Returns the whitened projection, or ``data`` itself when ``whiten`` is
    falsy.
    """
    pt4 = time.time()
    print('import and normalization took time {0}'.format(pt4 - pt0))
    if whiten:
        # Named `whitener` so the estimator does not shadow this function.
        whitener = RandomizedPCA(n_components=ncomp, whiten=True)
        X = whitener.fit_transform(data)
    else:
        # No whitening requested: hand the data back untouched instead of
        # overwriting the whitened result.
        X = data
    pt5 = time.time()
    print('array cast and pca whitening took time {0}'.format(pt5 - pt2))
    print('total time taken {0}'.format(pt5-pt0))
    return X
Exemplo n.º 48
def plot_for_2d(data , y):
	"""Scatter-plot ``data`` in two randomized-PCA dimensions.

	Points where ``y == 1`` are labelled "Sphere", all others "cube". Each
	label group is drawn in its own colour from a two-entry palette.
	Progress messages are printed before the projection, before plotting
	and at the end.
	"""
	print("Reducing dimension to 2D for visualization....")
	coords = RandomizedPCA(n_components=2).fit_transform(data)
	shape_names = np.where(y==1, "Sphere", "cube")
	frame = pd.DataFrame({"x": coords[:, 0], "y": coords[:, 1], "label": shape_names})
	palette = ["red", "yellow"]
	print("Displaying plot....")
	for group, shade in zip(frame['label'].unique(), palette):
		members = frame[frame['label'] == group]
		pl.scatter(members['x'], members['y'], c=shade, label=group)
	print("Done.")
Exemplo n.º 49
def reduce_dimensions(data, n, random_state=None):
    """Reduces the input data's dimension to 'n'.

    Args:
        data: An M x N matrix, where M is the number of samples and N is the
            number of features. The dimensions will be reduced from N to n.
        n: The new number of dimensions.
        random_state: Forwarded unchanged to ``RandomizedPCA``.

    Returns:
        An M x n reduced dimension matrix.
    """
    pca = RandomizedPCA(n_components=n, random_state=random_state)
    return pca.fit_transform(data)
Exemplo n.º 50
def pcaPic(data, label):
    """Fit a whitened 100-component PCA and a grid-searched RBF SVM on it.

    data: training samples; projected onto 100 whitened randomized
        principal components.
    label: training targets for the classifier.

    Returns ``(pca, clf)``: the fitted PCA estimator and the fitted
    ``GridSearchCV`` over the ``C`` / ``gamma`` grid.
    """
    n_components = 100
    print("train pca!!")
    # fit_transform already fits the estimator, so a separate .fit(data)
    # beforehand would only repeat the decomposition.
    pca = RandomizedPCA(n_components=n_components, whiten=True)
    X_train_pca = pca.fit_transform(data)
    y_train = label
    print("Fitting the classifier to the training set")
    param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
                  'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
    clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
    clf = clf.fit(X_train_pca, y_train)
    return pca, clf
Exemplo n.º 51
def compute_pca(raw_data):
    """Run randomized PCA over the first field of every ``raw_data`` triple.

    ``raw_data`` is iterated as 3-tuples; the first element of each becomes
    one row of the data matrix. The matrix is projected onto
    ``N_COMPONENTS`` components with a fixed ``random_state`` of 0.

    Returns ``(raw_data, data, pca, X)``: the untouched input, the assembled
    matrix, the fitted estimator and the projection.
    """
    # NOTE(review): the message below is still printed, but no shuffling
    # happens in this function.
    print('shuffling data...')
    print('pulling out data to run PCA...')
    data = np.array([features for (features, _label, _extra) in raw_data])

    print('finding principal components...')
    reducer = RandomizedPCA(n_components=N_COMPONENTS, random_state=0)
    projection = reducer.fit_transform(data)

    return raw_data, data, reducer, projection
Exemplo n.º 52
def preprocess_data():
    """Build 5x5 patch samples from the PCA-reduced hyperspectral cube.

    Loads the 'Hypercube' and 'Ground_Truth' arrays from the two .mat files,
    reduces the cube's last axis to 10 whitened randomized principal
    components, and cuts a 5x5 window around every pixel that lies at least
    ``window_pad`` pixels away from each border. Each patch is paired with
    the ground-truth value of its centre pixel.

    Returns ``(dataset_matrix, label_vector, dataset_matrix_r,
    label_vector_r)``: all patches and labels, followed by a randomly
    permuted subset restricted to labels > 0 whose labels are shifted down
    by one.
    """
    datasets = sio.loadmat('../multi_data/Hyper_01_Urban.mat')
    hypercube = datasets['Hypercube']

    datasets = sio.loadmat('../multi_data/Hyper_01_Urban_GroundTruth.mat')
    ground_truth = datasets['Ground_Truth']

    del datasets

    # Flatten the two leading axes so every pixel becomes one sample row.
    hypercube_1D = np.reshape(hypercube, (-1, hypercube.shape[2]))
    rpca = RandomizedPCA(n_components=10, whiten=True)
    hypercube_1D_reduced = rpca.fit_transform(hypercube_1D)
    hypercube_reduced = np.reshape(
        hypercube_1D_reduced, (hypercube.shape[0], hypercube.shape[1], -1))

    print(rpca.explained_variance_ratio_.sum())

    window_sz = 5
    window_pad = 2
    n_rows = hypercube_reduced.shape[0]
    n_cols = hypercube_reduced.shape[1]

    # The allocation is kept as in the original; it is larger than the number
    # of interior pixels, so trailing rows stay zero (they carry label 0 and
    # are dropped from the *_r outputs below).
    dataset_matrix_size = ((n_rows - window_pad) * (n_cols - window_pad),
                           window_sz, window_sz, hypercube_reduced.shape[2])
    dataset_matrix = np.zeros(dataset_matrix_size)
    label_vector = np.zeros((dataset_matrix.shape[0], ))

    data_index = 0
    # Only visit pixels whose full window fits inside the cube; border pixels
    # would yield truncated or empty slices.
    for r in range(window_pad, n_rows - window_pad):
        for c in range(window_pad, n_cols - window_pad):
            patch = hypercube_reduced[r - window_pad:r + window_pad + 1,
                                      c - window_pad:c + window_pad + 1]
            dataset_matrix[data_index, :, :, :] = patch
            label_vector[data_index] = ground_truth[r, c]

            data_index = data_index + 1

    # Keep only samples with a positive label.
    labelled = label_vector > 0
    dataset_matrix_r = dataset_matrix[labelled, :, :, :]
    label_vector_r = label_vector[labelled]

    rand_perm = np.random.permutation(label_vector_r.shape[0])
    dataset_matrix_r = dataset_matrix_r[rand_perm, :, :, :]
    label_vector_r = label_vector_r[rand_perm]

    # Shift the kept labels down by one.
    label_vector_r = label_vector_r - 1.0

    return dataset_matrix, label_vector, dataset_matrix_r, label_vector_r
Exemplo n.º 53
def c_random_pca():
    """Scatter ``iris.data`` on two randomized principal components.

    ``iris`` and ``plt`` come from the enclosing scope; points are coloured
    by ``iris.target`` and drawn without edge colours.
    """
    projected = RandomizedPCA(n_components=2).fit_transform(iris.data)
    first_pc = projected[:, 0]
    second_pc = projected[:, 1]
    plt.scatter(first_pc, second_pc, c=iris.target, edgecolors="none")

    # Reference notes carried over from the original snippet:
    # - Percentage of variance explained by each of the selected components.
    #   If all components are stored, the sum of explained variances is
    #   equal to 1.0.
    # - Principal axes in feature space, representing the directions of
    #   maximum variance in the data.
Exemplo n.º 54
def pca(all_corr, pc_start, pc_end):
    """Project ``all_corr`` onto principal components ``pc_start..pc_end-1``.

    all_corr: data matrix handed to ``RandomizedPCA.fit_transform``.
    pc_start: index of the first component to keep (inclusive).
    pc_end: index one past the last component to keep.

    ``masky``, ``maskx`` and ``pushmask`` are read from the enclosing scope:
    each kept component is written into the ``pushmask`` positions of a
    NaN-filled row of length ``masky * maskx``, which is then reshaped to
    ``(masky, maskx)``.

    Returns ``(eigenmaps_img, all_corr_pca, variances)`` for the selected
    components.
    """
    pca_components = pc_end - pc_start
    # Fit every component up to pc_end and select the requested window
    # afterwards. Fitting only pc_end - pc_start components and then indexing
    # columns pc_start..pc_end-1 goes out of range whenever pc_start > 0.
    model = RandomizedPCA(n_components=pc_end, whiten=False)
    print('reducing dimensions to ' + str(pca_components) + ' PCA components')
    pc_idx = list(range(pc_start, pc_end))
    pca_xform = model.fit_transform(all_corr)
    all_corr_pca = pca_xform[:, pc_idx]  # do not whiten PCA-space data
    eig = model.components_[pc_idx]
    variances = model.explained_variance_ratio_[pc_idx]
    # Positions outside pushmask stay NaN.
    eigenmaps = np.zeros([pca_components, masky * maskx])
    eigenmaps[:] = np.nan
    eigenmaps[:, pushmask] = eig
    eigenmaps_img = eigenmaps.reshape(pca_components, masky, maskx)
    return eigenmaps_img, all_corr_pca, variances
Exemplo n.º 55
def test_non_square_infomax():
    """Test non-square infomax.

    Two synthetic sources are mixed into six observed channels, with and
    without additive noise. The mixture is reduced to two whitened
    components with ``RandomizedPCA`` and unmixed with extended infomax.
    The test checks that the mixing model holds and, in the noise-free case,
    that both sources are recovered up to order and sign.
    """
    from sklearn.decomposition import RandomizedPCA

    rng = np.random.RandomState(0)

    n_samples = 200
    # Generate two sources:
    t = np.linspace(0, 100, n_samples)
    s1 = np.sin(t)
    s2 = np.ceil(np.sin(np.pi * t))
    s = np.c_[s1, s2].T
    s1, s2 = s

    # Mixing matrix
    n_observed = 6
    mixing = rng.randn(n_observed, 2)
    for add_noise in (False, True):
        m = np.dot(mixing, s)

        if add_noise:
            m += 0.1 * rng.randn(n_observed, n_samples)

        pca = RandomizedPCA(n_components=2, whiten=True, random_state=rng)
        m = m.T
        m = pca.fit_transform(m)
        # we need extended since input signals are sub-gaussian
        unmixing_ = infomax(m, random_state=rng, extended=True)
        s_ = np.dot(unmixing_, m.T)
        # Check that the mixing model described in the docstring holds:
        mixing_ = linalg.pinv(unmixing_.T)

        assert_almost_equal(m, s_.T.dot(mixing_))

        s1_, s2_ = s_
        # Check to see if the sources have been estimated
        # in the wrong order
        if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
            s2_, s1_ = s_
        # Normalise the sign of each estimate against its true source.
        s1_ *= np.sign(np.dot(s1_, s1))
        s2_ *= np.sign(np.dot(s2_, s2))

        # Check that we have estimated the original sources
        if not add_noise:
            assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
            assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
def get_pca_data_batch(imgs, hight=resize_hight, width=resize_width):
    """Resize, grey-scale and flatten ``imgs``, then reduce them with PCA.

    Every image is resized to ``(hight, width)``, converted with
    ``COLOR_BGR2GRAY`` and flattened to a single row. The rows are projected
    onto 200 whitened randomized principal components, and that projection
    is returned.
    """
    newsize = (hight, width)

    def _as_row(img):
        # NOTE(review): if lib_cv2 is OpenCV, resize() documents its size
        # argument as (width, height) -- confirm the intended ordering.
        resized = lib_cv2.resize(img, newsize)
        grey = lib_cv2.cvtColor(resized, lib_cv2.COLOR_BGR2GRAY)
        return grey.ravel()

    rows = [_as_row(img) for img in imgs]
    return RandomizedPCA(n_components=200, whiten=True).fit_transform(rows)