Example #1
File: preproc.py Project: 121onto/noaa
def compute_pca(data_path=os.path.join(BASE_DIR, 'data/memmap/'),
                  out_path=os.path.join(BASE_DIR, 'data/'),
                  batch_size=500, image_size=3*300*300):

    ipca = IncrementalPCA(n_components=3, batch_size=batch_size)

    path = os.path.join(data_path, 'tn_x.dat')
    train = np.memmap(path, dtype=theano.config.floatX, mode='r+', shape=(4044,image_size))
    n_samples, _ = train.shape

    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = train[batch,:]
        X = np.reshape(X, (X.shape[0], 3, int(image_size/3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    path = os.path.join(data_path, 'v_x.dat')
    valid = np.memmap(path, dtype=theano.config.floatX, mode='r+', shape=(500,image_size))
    n_samples, _ = valid.shape


    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = valid[batch,:]
        X = np.reshape(X, (X.shape[0], 3, int(image_size/3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    eigenvalues, eigenvectors = np.linalg.eig(ipca.get_covariance())
    eigenvalues.astype('float32').dump(os.path.join(out_path, 'eigenvalues.dat'))
    eigenvectors.astype('float32').dump(os.path.join(out_path, 'eigenvectors.dat'))
 def reduceDataset(self,nr=3,method='PCA'):
     '''It reduces the dimensionality of a given dataset using different techniques provided by Sklearn library
      Methods available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     #dataset=self.dataset[Model.in_columns]
     #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
     #PCA
     if method=='PCA':
         sklearn_pca = sklearnPCA(n_components=nr)
         reduced = sklearn_pca.fit_transform(dataset)
     #Factor Analysis
     elif method=='FactorAnalysis':
         fa=FactorAnalysis(n_components=nr)
         reduced=fa.fit_transform(dataset)
     #kernel pca with rbf kernel
     elif method=='KPCArbf':
         kpca=KernelPCA(nr,kernel='rbf')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with poly kernel
     elif method=='KPCApoly':
         kpca=KernelPCA(nr,kernel='poly')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with cosine kernel
     elif method=='KPCAcosine':
         kpca=KernelPCA(nr,kernel='cosine')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with sigmoid kernel
     elif method=='KPCAsigmoid':
         kpca=KernelPCA(nr,kernel='sigmoid')
         reduced=kpca.fit_transform(dataset)
     #Incremental PCA
     elif method=='IPCA':
         ipca=IncrementalPCA(nr)
         reduced=ipca.fit_transform(dataset)
     #Fast ICA
     elif method=='FastICAParallel':
         fip=FastICA(nr,algorithm='parallel')
         reduced=fip.fit_transform(dataset)
     elif method=='FastICADeflation':
         fid=FastICA(nr,algorithm='deflation')
         reduced=fid.fit_transform(dataset)
     elif method == 'All':
         self.dimensionalityReduction(nr=nr)
         return self
     
     self.ModelInputs.update({method:reduced})
     self.datasetsAvailable.append(method)
     return self
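For orientation, a minimal standalone sketch of the same dispatch pattern written directly against scikit-learn (this is not the class above; the function name and data below are illustrative only):

import numpy as np
from sklearn.decomposition import PCA as sklearnPCA, KernelPCA, FastICA, FactorAnalysis, IncrementalPCA

def reduce_dataset(dataset, nr=3, method='PCA'):
    # map a few of the method names used above onto scikit-learn estimators
    reducers = {
        'PCA': sklearnPCA(n_components=nr),
        'FactorAnalysis': FactorAnalysis(n_components=nr),
        'KPCArbf': KernelPCA(nr, kernel='rbf'),
        'IPCA': IncrementalPCA(nr),
        'FastICAParallel': FastICA(nr, algorithm='parallel'),
    }
    return reducers[method].fit_transform(dataset)

X = np.random.rand(100, 8)
print(reduce_dataset(X, nr=3, method='IPCA').shape)  # (100, 3)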
Example #3
def ipca(mov, components = 50, batch =1000):
    # vectorize the images
    num_frames, h, w = mov.shape
    frame_size = h * w
    frame_samples = np.reshape(mov, (num_frames, frame_size)).T
    
    # run IPCA to approximate the SVD
    
    ipca_f = IncrementalPCA(n_components=components, batch_size=batch)
    ipca_f.fit(frame_samples)
    
    # construct the reduced version of the movie vectors using only the 
    # principal component projection
    
    proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples))
        
    # get the temporal principal components (pixel time series) and 
    # associated singular values
    
    eigenseries = ipca_f.components_.T

    # the rows of eigenseries are approximately orthogonal
    # so we can approximately obtain eigenframes by multiplying the 
    # projected frame matrix by this transpose on the right
    
    eigenframes = np.dot(proj_frame_vectors, eigenseries)

    return eigenseries, eigenframes, proj_frame_vectors        
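A quick usage sketch for the function above with a synthetic movie array; the shapes are arbitrary, and the function plus its numpy/scikit-learn imports are assumed to be in scope:

mov = np.random.rand(200, 16, 16).astype(np.float32)       # (frames, height, width)
eigenseries, eigenframes, proj = ipca(mov, components=10, batch=50)
print(eigenseries.shape, eigenframes.shape, proj.shape)     # (200, 10) (256, 10) (256, 200)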
Example #4
def get_pca_array(list_chunks, topology):
    """
    Takes a list of mdtraj.Trajectory objects and featurize them to backbone -
    Alpha Carbons pairwise distances. Perform 2 component Incremental
    PCA on the featurized trajectory.

    Parameters
    ----------
    list_chunks: list of mdTraj.Trajectory objects
    topology: str
            Name of the Topology file

    Returns
    -------
    Y: np.array shape(frames, features)

    """
    pca = IncrementalPCA(n_components=2)
    top = md.load_prmtop(topology)
    ca_backbone = top.select("name CA")
    pairs = top.select_pairs(ca_backbone, ca_backbone)
    pair_distances = []
    for chunk in list_chunks:
        X = md.compute_distances(chunk, pairs)
        pair_distances.append(X)
    distance_array = np.concatenate(pair_distances)
    print("No. of data points: %d" % distance_array.shape[0])
    print("No. of features (pairwise distances): %d" % distance_array.shape[1])
    Y = pca.fit_transform(distance_array)
    return Y
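A hypothetical call of the function above; the trajectory and topology file names are placeholders, and mdtraj's iterload is only used here to produce a list of Trajectory chunks:

# import mdtraj as md
# chunks = list(md.iterload('traj.nc', top='system.prmtop', chunk=1000))
# Y = get_pca_array(chunks, 'system.prmtop')   # Y has shape (n_frames, 2)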
def ipca():
	train_features, test_features = gf.get_tfidf()
	vectorizer = gf.get_tfidf()
	n_components = 250
	ipca = IncrementalPCA(n_components=n_components, batch_size=1250)
	start_time = time.time()
	print 'start ipca on train'
	X_ipca = ipca.fit_transform(train_features)
	runtime = time.time() - start_time
	print '-----'
	print '%.2f seconds to ipca on train' % runtime
	print '-----'
	train_features = None
	
	print 'ipca train done'
	np.savetxt('train_features.csv', X_ipca, fmt='%.8e', delimiter=",")
	X_ipca = None
	print 'ipca train file done'
	test_features = gf.get_tfidf(vectorizer, False)
	Y_ipca = ipca.fit_transform(test_features)
	test_features, vectorizer = None, None
	print 'ipca test done'
	np.savetxt('test_features.csv', Y_ipca, fmt='%.8e', delimiter=",")
	svd_test_features = None
	print 'ipca test file done'
 def dimensionalityReduction(self,nr=5):
     '''It applies all the dimensionality reduction techniques available in this class:
     Techniques available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     sklearn_pca = sklearnPCA(n_components=nr)
     p_components = sklearn_pca.fit_transform(dataset)
     fa=FactorAnalysis(n_components=nr)
     factors=fa.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='rbf')
     rbf=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='poly')
     poly=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='cosine')
     cosine=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='sigmoid')
     sigmoid=kpca.fit_transform(dataset)
     ipca=IncrementalPCA(nr)
     i_components=ipca.fit_transform(dataset)
     fip=FastICA(nr,algorithm='parallel')
     fid=FastICA(nr,algorithm='deflation')
     ficaP=fip.fit_transform(dataset)
     ficaD=fid.fit_transform(dataset)
     '''isomap=Isomap(n_components=nr).fit_transform(dataset)
     try:
         lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
     except ValueError:
         lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
     try:
         
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
     except ValueError:
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset) 
     try:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
     except ValueError:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
     values=[p_components,factors,rbf,poly,cosine,sigmoid,i_components,ficaD,ficaP]#,isomap,lle1,lle2,lle3]
     keys=['PCA','FactorAnalysis','KPCArbf','KPCApoly','KPCAcosine','KPCAsigmoid','IPCA','FastICADeflation','FastICAParallel']#,'Isomap','LLE','LLEmodified','LLEltsa']
     self.ModelInputs.update(dict(zip(keys, values)))
     [self.datasetsAvailable.append(key) for key in keys ]
     
     #debug
     #dataset=pd.DataFrame(self.ModelInputs['Dataset'])
     #dataset['Output']=self.ModelOutput
     #self.debug['Dimensionalityreduction']=dataset
     ###
     return self
def get_pca(file_dir, s, t, i):
    from sklearn.decomposition import IncrementalPCA

    ipca = IncrementalPCA(n_components=48)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + "/pca" + str(counter) + "_code.npy")
        ipca.partial_fit(features_file[:, 0:4096])
    return ipca
Example #8
def test_incremental_pca_num_features_change():
    """Test that changing n_components will raise an error."""
    rng = np.random.RandomState(1999)
    n_samples = 100
    X = rng.randn(n_samples, 20)
    X2 = rng.randn(n_samples, 50)
    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)
    assert_raises(ValueError, ipca.partial_fit, X2)
Example #9
def train_pca(file_dir, s, t, i):
    from sklearn.decomposition import IncrementalPCA
    global timer_pca
    timer_pca = Timer()	
    timer_pca.tic()
    ipca = IncrementalPCA(n_components=pca_dimensions)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        ipca.partial_fit(features_file[:, 0:4096])
        timer_pca.toc()
    return ipca
Example #10
def create_pool_pca_from_files(file_dir, dir_output, s, t, i):
    from sklearn.decomposition import IncrementalPCA
    ipca = IncrementalPCA(n_components=number_dim_pca)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        ipca.partial_fit(features_file[:, 0:4096])
    for counter in range(s, t, i):
        out_file = dir_output + 'pca_red_' + str(counter) + '_code.npy'
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        features_red = ipca.transform(features_file[:, 0:4096])
        np.save(out_file, np.append(features_red, features_file[:, 4096:], axis=1))
def ipca(data, labels, new_dimension):
    print "start incremental pca..."

    if hasattr(data, "todense"):
        data = np.array(data.todense())

    start = time.time()
    pca = IncrementalPCA(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
def PCA_Train(data, result_fold, n_components=128):
    print_info("PCA training (n_components=%d)..." % n_components)

    pca = IncrementalPCA(n_components=n_components)
    pca.fit(data)

    joblib.dump(pca, result_fold + "pca_model.m")

    print_info("PCA done.")

    return pca
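Loading the persisted model back for later use is the natural counterpart; a hedged sketch (the path simply mirrors the dump call above):

# import joblib
# pca = joblib.load(result_fold + "pca_model.m")
# reduced = pca.transform(data)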
def train_pca_model(collection_name, feature_name, n_components, iterations=100, batch_size=20):
    collection = collection_from_name(collection_name)
    model = IncrementalPCA(n_components=n_components)

    partial_unpickle_data = partial(unpickle_data, feature_name=feature_name)

    for _ in range(iterations):
        feature = map(partial_unpickle_data, collection.aggregate([{'$sample': {'size': batch_size}}]))
        feature = np.hstack(feature).T

        model.partial_fit(feature)

    return model
Example #14
def test_incremental_pca_inverse():
    """Test that the projection of data can be inverted."""
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    Y_inverse = ipca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)
def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=10, random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=3, random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
Example #16
def generate_pca_compression(X, n_components=16, batch_size=100):
    """
    Compresses the data using sklearn PCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep
    :param batch_size: Batch size for incrimental PCA

    :return: X_prime (the compressed representation), pca
    """

    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    pca.fit(X)

    return pca.transform(X), pca
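A quick usage sketch with random data; the shapes are arbitrary, and the function above together with its IncrementalPCA import is assumed to be in scope:

import numpy as np

X = np.random.rand(500, 64)
X_prime, pca = generate_pca_compression(X, n_components=16, batch_size=100)
print(X_prime.shape)                         # (500, 16)
print(pca.explained_variance_ratio_.sum())   # fraction of variance retained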
Example #17
File: pca.py Project: DaMSL/ddc
 def __init__(self, components):
   PCAnalyzer.__init__(self)
   if isinstance(components, int):
     self.n_components = components
   self.pca = IncrementalPCA(n_components=components, batch_size=500)
   self.num_seen = 0
   self.type = 'incremental'
def test_n_components_none():
    # Ensures that n_components == None is handled correctly
    rng = np.random.RandomState(1999)
    for n_samples, n_features in [(50, 10), (10, 50)]:
        X = rng.rand(n_samples, n_features)
        ipca = IncrementalPCA(n_components=None)

        # First partial_fit call, ipca.n_components_ is inferred from
        # min(X.shape)
        ipca.partial_fit(X)
        assert ipca.n_components_ == min(X.shape)

        # Second partial_fit call, ipca.n_components_ is inferred from
        # ipca.components_ computed from the first partial_fit call
        ipca.partial_fit(X)
        assert ipca.n_components_ == ipca.components_.shape[0]
Example #19
def test_incremental_pca_partial_fit():
    """Test that fit and partial_fit get equivalent results."""
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    batch_size = 10
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
    pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    # Add one to make sure endpoint is included
    batch_itr = np.arange(0, n + 1, batch_size)
    for i, j in zip(batch_itr[:-1], batch_itr[1:]):
        pipca.partial_fit(X[i:j, :])
    assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
Example #20
File: mypca.py Project: avg14/galaxyzoo
class MyPCA:

	def __init__(self, filename=None):
		if not filename:
			self.model = IncrementalPCA(NUM_COMP)
		else:
			with open(filename, 'r') as f:
				self.model = pickle.load(f)

	def train(self, X):
		self.model.partial_fit(X)

	def transform(self, X):
		return self.model.transform(X)	

	def dump(self, filename):
		with open(filename, 'w') as f:
			pickle.dump(self.model, f)
Example #21
def test_whitening():
    """Test that PCA and IncrementalPCA transforms match to sign flip."""
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
Example #22
def run_pca(n_components, n_sites, order_dict, sim_mat):

    output_file = open('pca_100000_100', 'w')

    ipca = IncrementalPCA(n_components=n_components, batch_size=8000)
    sim_mat_ipca = ipca.fit_transform(sim_mat)
    var_sim_ipca = ipca.explained_variance_ratio_

    output_file.write(",".join(str(x) for x in var_sim_ipca) + '\n')

    for siteid in order_dict:
        stringa = ' '.join(
            [siteid,
             str(sim_mat_ipca[order_dict[siteid], 0]),
             str(sim_mat_ipca[order_dict[siteid], 1]),
             str(sim_mat_ipca[order_dict[siteid], 2]),
             str(sim_mat_ipca[order_dict[siteid], 3]),
             str(sim_mat_ipca[order_dict[siteid], 4]),
             str(sim_mat_ipca[order_dict[siteid], 5]),
             str(sim_mat_ipca[order_dict[siteid], 6])
             ])
        output_file.write(stringa + '\n')

    n_bins = 1000.
    binned = np.empty((n_sites, 5)).astype(np.int32)
    for k in range(5):
        delta = (sim_mat_ipca[:, k].max() - sim_mat_ipca[:, k].min()) / n_bins
        min_k = sim_mat_ipca[:, k].min()
        for i in range(n_sites):
            binned[i, k] = int((sim_mat_ipca[i, k] - min_k) / delta)

    f = open('pc_100000_100.csv', 'w')
    for siteid in order_dict:
        stringa = ' '.join(
            [siteid,
             str(binned[order_dict[siteid], 0]),
             str(binned[order_dict[siteid], 1]),
             str(binned[order_dict[siteid], 2]),
             str(binned[order_dict[siteid], 3]),
             str(binned[order_dict[siteid], 4])
             ])
        f.write(stringa + '\n')
    f.close()
Example #23
def reduce_data(features, out_dir, dim=10, first_column=True):
    array = np.load(features)
    subarray = array
    if not first_column:
        subarray = array[:, 1:]

    ipca = IncrementalPCA(n_components=dim, copy=False, batch_size=500000)
    new_array = ipca.fit_transform(subarray)
    # when it cannot fit into memory do it incrementally like below
    # new_array_1 = tsvd.fit_transform(subarray[:1500000, :])
    # new_array_2 = tsvd.fit_transform(subarray[1500000:3400000, :])
    # new_array_3 = tsvd.fit_transform(subarray[3400000:, :])
    # new_array = np.vstack([new_array_1, new_array_2, new_array_3])
    if not first_column:
        new_array = np.c_[array[:, 0], new_array]

    assert new_array.shape[0] == array.shape[0]
    np.save(os.path.join(out_dir, os.path.basename(features) + "_pca"), new_array)
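When the array truly cannot fit in memory, a safer incremental pattern than the commented-out per-chunk fit_transform (which would fit a separate model per chunk) is to partial_fit over chunks and then transform over chunks with the same fitted model; a minimal sketch, with an arbitrary chunk size:

import numpy as np
from sklearn.decomposition import IncrementalPCA

def reduce_in_chunks(X, dim=10, chunk=500000):
    ipca = IncrementalPCA(n_components=dim)
    for start in range(0, X.shape[0], chunk):           # first pass: learn the components
        ipca.partial_fit(X[start:start + chunk])         # each chunk needs at least `dim` rows
    parts = [ipca.transform(X[start:start + chunk])      # second pass: project the data
             for start in range(0, X.shape[0], chunk)]
    return np.vstack(parts)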
 def PCA_train(self):
     pcafun = None
     if self.pca == None:            
         (a,b) = self.descriptors.shape
         self.pca = IncrementalPCA(n_components = int(b*self.pca_ratio))
         pcafun = self.pca.fit
     else:
         pcafun = self.pca.partial_fit
     pcafun(self.descriptors)
     self.PCA_common()
Example #25
	def ipca(self, X, n_components=100):
		from sklearn.decomposition import IncrementalPCA
		# trials = h5py.File(self.path + "/trials.hdf5", 'r')
		# scaled_meg = trials['scaled_meg'] # it's ok, the dataset is not fetched to memory yet
		# scaled_meeg = trials['scaled_meeg']

		n1 = X.shape[0] # how many rows we have in the dataset
		chunk_size = 1000 # how many rows we feed to IPCA at a time, the divisor of n
		ipca = IncrementalPCA(n_components=n_components)

		for i in range(0, n1//chunk_size):
			print("{} to {} out of {}.".format(i*chunk_size,(i+1)*chunk_size,n1))
			print(X[i*chunk_size : (i+1)*chunk_size].shape)
			ipca.partial_fit(X[i*chunk_size : (i+1)*chunk_size])

		x = ipca.transform(X)
		print(x.shape)
		# n_comp = sum(i > 10.0e-05 for i in ipca.explained_variance_ratio_)
		# print(n_comp)
		return x
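Note that the loop above only feeds the first n1 // chunk_size full chunks to partial_fit, silently dropping any remainder rows from the fit. A small self-contained sketch using scikit-learn's gen_batches, which also yields the final partial chunk, would be:

import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.utils import gen_batches

X = np.random.rand(2500, 40)
ipca = IncrementalPCA(n_components=10)
for batch in gen_batches(X.shape[0], 1000, min_batch_size=10):  # keeps the last slice >= 10 rows
    ipca.partial_fit(X[batch])
x = ipca.transform(X)
print(x.shape)  # (2500, 10)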
Example #26
class PCASK(AbstractFeature):
    def __init__(self, n_components):
        AbstractFeature.__init__(self)
        self.n_components = n_components
        #for key in options:
            #setattr(self,key,options[key])

    def compute(self,X,y):
        if X.ndim == 3:
            X = X.reshape((X.shape[0],X.shape[1]*X.shape[2]))
        self.ipca = IncrementalPCA(n_components=self.n_components, batch_size=None)
        return self.ipca.fit_transform(X)


    def extract(self,X):
        if X.ndim == 2:
            X = X.reshape((X.shape[0]*X.shape[1]))
        return list(self.ipca.transform([X])[0])

    def __repr__(self):
        return "PCASK"
Example #27
File: gwpca.py Project: Acellera/htmd
    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto `ndim` dimensions

        Parameters
        ----------
        ndim : int
            The number of dimensions we want to project the data on.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data

        Example
        -------
        >>> gw = GWPCA(data)
        >>> dataproj = gw.project(5)
        """
        from sklearn.decomposition import IncrementalPCA
        from htmd.progress.progress import ProgressBar
        from htmd.metricdata import MetricData

        pca = IncrementalPCA(n_components=ndim, batch_size=10000)
        p = ProgressBar(len(self.data.dat))
        for d in self.data.dat:
            pca.partial_fit(d * self.weights)
            p.progress()
        p.stop()

        projdata = self.data.copy()
        p = ProgressBar(len(self.data.dat))
        for i, d in enumerate(self.data.dat):
            projdata.dat[i] = pca.transform(d * self.weights)
            p.progress()
        p.stop()

        # projdataconc = pca.fit_transform(self.weighedconcat)
        # projdata.dat = projdata.deconcatenate(projdataconc)
        return projdata
Example #28
    def IPCA(self, components = 50, batch =1000):
        '''
        Iterative Principal Component analysis, see sklearn.decomposition.incremental_pca
        Parameters:
        ------------
        components (default 50) = number of principal components to return
        batch (default 1000)  = number of pixels to load into memory simultaneously in IPCA. A larger batch requires more memory but leads to a better fit
        Returns
        -------
        eigenseries: principal components (pixel time series) and associated singular values
        eigenframes: eigenframes are obtained by multiplying the projected frame matrix by the projected movie (whitened frames?)
        proj_frame_vectors:the reduced version of the movie vectors using only the principal component projection
        '''
        # vectorize the images
        num_frames, h, w = np.shape(self);
        frame_size = h * w;
        frame_samples = np.reshape(self, (num_frames, frame_size)).T

        # run IPCA to approximate the SVD
        ipca_f = IncrementalPCA(n_components=components, batch_size=batch)
        ipca_f.fit(frame_samples)

        # construct the reduced version of the movie vectors using only the
        # principal component projection

        proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples))

        # get the temporal principal components (pixel time series) and
        # associated singular values

        eigenseries = ipca_f.components_.T

        # the rows of eigenseries are approximately orthogonal
        # so we can approximately obtain eigenframes by multiplying the
        # projected frame matrix by this transpose on the right

        eigenframes = np.dot(proj_frame_vectors, eigenseries)

        return eigenseries, eigenframes, proj_frame_vectors
def PCA(source, num_components, chunk_size):
    image_path = sorted(listdir(source), key=lambda x: (int(x.split('_')[0]), x.split('_')[1]))
    size, images = 0, []
    n_chunks = len(image_path)//chunk_size
    pca = IncrementalPCA(n_components=num_components, batch_size=chunk_size)

    # First pass: incrementally fit the PCA model on each full chunk, then on the remainder.
    for i in range(n_chunks):
        print('Chunk:', i, '\tIndex:', i * chunk_size + size)
        while size < chunk_size:
            images.append(imread(source+image_path[i * chunk_size + size]).flatten())
            size += 1
        pca.partial_fit(np.asarray(images))
        size, images = 0, []
    while n_chunks * chunk_size + size < len(image_path):
        images.append(imread(source+image_path[n_chunks * chunk_size + size]).flatten())
        size += 1
    if images:
        pca.partial_fit(np.asarray(images))
    size, images = 0, []

    # Second pass: transform every chunk with the fitted model and stack the results.
    xTransformed = None
    for i in range(n_chunks + 1):
        limit = chunk_size if i < n_chunks else len(image_path) - n_chunks * chunk_size
        while size < limit:
            images.append(imread(source+image_path[i * chunk_size + size]).flatten())
            size += 1
        if images:
            transformed = pca.transform(np.asarray(images))
            xTransformed = transformed if xTransformed is None else np.vstack((xTransformed, transformed))
        size, images = 0, []

    print("\nTransformed matrix shape:", xTransformed.shape)
    return xTransformed
if __name__ == "__main__":
    source = './train/right'
    new_size = '32x32'
    pool = Pool()
    start = time.time()
    pool.map(imageResize, zip(itertools.repeat(source), listdir(source), itertools.repeat(new_size)))
    print("Resized Images in {0} seconds".format(time.time() - start))
def performPCA(source, num_components, chunk_size):
  image_paths = sorted(listdir(source), key=lambda x: (int(x.split('_')[0]), x.split('_')[1]))
  size, images = 0, []
  n_chunks = len(image_paths)//chunk_size
  pca = IncrementalPCA(n_components=num_components, batch_size=chunk_size)

  # Read in all images and do a partial fit on the PCA model.
  for i in range(n_chunks):
    print 'Chunk:', i, 'Index:', i * chunk_size + size
    while size < chunk_size:
      images.append(imread(source+image_paths[i * chunk_size + size]).flatten())
      size += 1

    pca.partial_fit(np.asarray(images))
    size, images = 0, []

    if i == n_chunks - 1:
      i += 1
      while i * chunk_size + size < len(image_paths):
        images.append(imread(source+image_paths[i * chunk_size + size]).flatten())
        size += 1
      pca.partial_fit(np.asarray(images))

  # Only works with Python 3
  #print("\nExplained variance ratios: {0}".format(pca.explained_variance_ratio_))
  #print("Sum of variance captured by components: {0}\n".format(sum(pca.explained_variance_ratio_)))

  xTransformed = None

  # Read in all images again and transform them using the PCA model.
  for i in range(n_chunks):
    while size < chunk_size:
      images.append(imread(source+image_paths[i * chunk_size + size]).flatten())
      size += 1
    print 'Chunk:', i, 'index:', i * chunk_size + size
    transformed = pca.transform(np.asarray(images))
    if xTransformed is None:
      xTransformed = transformed
    else:
      xTransformed = np.vstack((xTransformed, transformed))
    size, images = 0, []

    if i == n_chunks - 1:
      i += 1
      while i * chunk_size + size < len(image_paths):
        images.append(imread(source+image_paths[i * chunk_size + size]).flatten())
        size += 1
      transformed = pca.transform(np.asarray(images))
      xTransformed = np.vstack((xTransformed, transformed))

  print "\nTransformed matrix shape:", xTransformed.shape
  return xTransformed
# from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from pprint import pprint
import numpy as np
import pickle
import os
from tqdm import tqdm

diri = '/home/dpappas/bioasq_all/bert_elmo_embeds/'
filename = '/home/dpappas/bioasq_all/pca_elmo_transformer.sav'
mat, m = None, 0

if (not os.path.exists(filename)):
    transformer = IncrementalPCA(n_components=50)
    for f in tqdm(os.listdir(diri), ascii=True):
        m += 1
        fpath = os.path.join(diri, f)
        d = pickle.load(open(fpath, 'rb'))
        #
        if (mat is None):
            mat = np.concatenate(d['title_sent_elmo_embeds'] +
                                 d['abs_sent_elmo_embeds'],
                                 axis=0)
        else:
            mat = np.concatenate([mat] + d['title_sent_elmo_embeds'] +
                                 d['abs_sent_elmo_embeds'],
                                 axis=0)
        if (mat.shape[0] > 1000):
            transformer.partial_fit(mat)
            mat = None
    pickle.dump(transformer, open(filename, 'wb'))
Example #32
File: lazy.py Project: chrinide/hyperspy
    def decomposition(self,
                      output_dimension,
                      normalize_poissonian_noise=False,
                      algorithm='PCA',
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        output_dimension : int
            the number of significant components to keep
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA
            from scikit-learn is run.
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        bounds : {tuple, bool}
            The (min, max) values of the data to normalize before learning.
            If tuple (min, max), those values will be used for normalization.
            If True, extremes will be looked up (expensive), default.
            If False, no normalization is done (learning may be very slow).
            If normalize_poissonian_noise is True, this cannot be True.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                whiten=False


        """
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks

        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        else:
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                if bounds is True:
                    bounds = False
                    # warnings.warn?
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(self.axes_manager.navigation_shape[::-1],
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(self.axes_manager.signal_shape[::-1],
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(data.sum(axis=range(ndim)),
                                    data.sum(axis=range(ndim, ndim + sdim)))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # normalize the data for learning algs:
            if bounds:
                if bounds is True:
                    _min, _max = da.compute(self.data.min(), self.data.max())
                else:
                    _min, _max = bounds
                self.data = (self.data - _min) / (_max - _min)

            # LEARN
            this_data = []
            try:
                for chunk in progressbar(self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask),
                                         total=nblocks,
                                         leave=True,
                                         desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform
                    post = lambda a: np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []
                    post = lambda a: obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project
                    post = lambda a: np.concatenate(a, axis=1).T

                _map = map(
                    lambda thing: method(thing),
                    self._block_iterator(flat_signal=True,
                                         get=get,
                                         signal_mask=signal_mask,
                                         navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(_map,
                                             total=nblocks,
                                             desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            try:
                loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                   (output_dimension, ),
                                                   nav_chunks).reshape(
                                                       (-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio
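A hypothetical invocation of this method on a lazily loaded signal; the file name and the follow-up plotting call are assumptions rather than something taken from the source above:

# import hyperspy.api as hs
# s = hs.load('spectrum_image.hspy', lazy=True)
# s.decomposition(output_dimension=16, algorithm='PCA')
# s.plot_explained_variance_ratio()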
def main(_):
    z_size = 2
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train = x_train.reshape(-1, 28**2)
    y_train = y_train
    x_test = x_test.reshape(-1, 28**2)
    x_train_normed, mu_train = normalize(x_train)
    x_test_normed, mu_test = normalize(x_test)
    batch_size = 4096

    # numpy pure pca
    #####################################################################
    # for PCA it is important to have 0 mean otherwise it does not work #
    #####################################################################
    u, s, v = np.linalg.svd(x_train_normed, full_matrices=False)
    z_pca_train = (x_train_normed @ v.T)[:, :z_size]
    z_pca_test = (x_test_normed @ v.T)[:, :z_size]

    r_pca_train = denormalize(z_pca_train @ v[:z_size, :],
                              mu_train)  # reconstruction
    r_pca_test = denormalize(z_pca_test @ v[:z_size, :],
                             mu_test)  # reconstruction
    err_train = np.sum(
        (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size
    err_test = np.sum(
        (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size
    print('PCA train reconstruction error with 2 PCs: ' +
          str(round(err_train, 3)))
    print('PCA test reconstruction error with 2 PCs: ' +
          str(round(err_test, 3)))

    for i in range(z_size):
        plt.imshow(v.reshape(-1, 28, 28)[i], cmap="gray")
        plt.show()

    visualize_data(x_train, y_train, r_pca_train, z_pca_train, 'train_pca')
    visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test_pca')

    # # scikit-learn pca
    pca = PCA(n_components=z_size)
    z_pca_train = pca.fit_transform(x_train)
    z_pca_test = pca.transform(x_test)
    r_pca_train = pca.inverse_transform(z_pca_train)
    r_pca_test = pca.inverse_transform(z_pca_test)
    err_train = np.sum(
        (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size
    err_test = np.sum(
        (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size
    print('scikit-learn PCA train reconstruction error with 2 PCs: ' +
          str(round(err_train, 3)))
    print('scikit-learn PCA test reconstruction error with 2 PCs: ' +
          str(round(err_test, 3)))

    for i in range(z_size):
        plt.imshow(pca.components_.reshape(-1, 28, 28)[i], cmap="gray")
        plt.show()

    visualize_data(x_train, y_train, r_pca_train, z_pca_train,
                   'train_sklearn_pca')
    visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test_sklearn_pca')

    # scikit-learn incremental pca
    pca = IncrementalPCA(n_components=z_size, batch_size=100)
    z_pca_train = pca.fit_transform(x_train)
    z_pca_test = pca.transform(x_test)
    r_pca_train = pca.inverse_transform(z_pca_train)
    r_pca_test = pca.inverse_transform(z_pca_test)
    err_train = np.sum(
        (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size
    err_test = np.sum(
        (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size
    print(
        'scikit-learn incremental PCA train reconstruction error with 2 PCs: '
        + str(round(err_train, 3)))
    print(
        'scikit-learn incremental PCA test reconstruction error with 2 PCs: ' +
        str(round(err_test, 3)))

    for i in range(z_size):
        plt.imshow(pca.components_.reshape(-1, 28, 28)[i], cmap="gray")
        plt.show()

    visualize_data(x_train, y_train, r_pca_train, z_pca_train, 'train')
    visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test')

    # keras pca using autoencoder
    m = Sequential()
    m.add(
        Dense(z_size,
              activation='linear',
              input_shape=(784, ),
              name='bottleneck'))
    m.add(Dense(784, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    print(m.summary())
    tensorboard = TensorBoard(log_dir='logs/ae_pca', histogram_freq=5)
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=10,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_pca')
    K.clear_session()

    # keras autoencoder with tanh, not centered, but normalized to [-1, 1]
    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(784, )))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(z_size, activation='linear', name='bottleneck'))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(784, activation='tanh', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_tanh_no_mean', histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_tanh_no_mean')
    K.clear_session()

    # keras autoencoder, centered
    x_train_normed, mu_train = normalize(x_train)
    x_test_normed, mu_test = normalize(x_test)

    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(784, )))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(z_size, activation='linear', name='bottleneck'))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(784, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae', histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae')
    K.clear_session()

    # keras autoencoder, not centered, but normalized to [-1, 1]
    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(784, )))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(z_size, activation='linear', name='bottleneck'))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(784, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_no_mean', histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_no_mean')
    K.clear_session()

    # keras autoencoder, not centered, but normalized to [-1, 1]
    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    regul_const = 10e-9
    m = Sequential()
    m.add(
        Dense(512,
              activation='elu',
              input_shape=(784, ),
              activity_regularizer=l1(regul_const)))
    m.add(Dense(128, activation='elu', activity_regularizer=l1(regul_const)))
    m.add(
        Dense(z_size,
              activation='linear',
              name='bottleneck',
              activity_regularizer=l1(regul_const)))
    m.add(Dense(128, activation='elu', activity_regularizer=l1(regul_const)))
    m.add(Dense(512, activation='elu', activity_regularizer=l1(regul_const)))
    m.add(
        Dense(784,
              activation='linear',
              name='decoder',
              activity_regularizer=l1(regul_const)))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg', histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_no_mean_reg')
    K.clear_session()

    # keras autoencoder, regularizing only latent space
    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    regul_const = 10e-6
    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(784, )))
    m.add(Dense(128, activation='elu'))
    m.add(
        Dense(z_size,
              activation='linear',
              name='bottleneck',
              activity_regularizer=l1(regul_const)))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(784, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg_lat_e6',
                              histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_no_mean_reg_lat_e6')
    K.clear_session()

    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    regul_const = 10e-7
    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(784, )))
    m.add(Dense(128, activation='elu'))
    m.add(
        Dense(z_size,
              activation='linear',
              name='bottleneck',
              activity_regularizer=l1(regul_const)))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(784, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg_lat_e7',
                              histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_no_mean_reg_lat_e7')
    K.clear_session()

    # trying on cifar 100
    z_size = 2
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()
    x_train = x_train.reshape(-1, 32**2)
    y_train = y_train
    x_test = x_test.reshape(-1, 32**2)
    x_train_normed, mu_train = normalize(x_train, use_mean=False)
    x_test_normed, mu_test = normalize(x_test, use_mean=False)

    regul_const = 10e-7
    m = Sequential()
    m.add(Dense(512, activation='elu', input_shape=(32**2, )))
    m.add(Dense(128, activation='elu'))
    m.add(
        Dense(z_size,
              activation='linear',
              name='bottleneck',
              activity_regularizer=l1(regul_const)))
    m.add(Dense(128, activation='elu'))
    m.add(Dense(512, activation='elu'))
    m.add(Dense(32**2, activation='linear', name='decoder'))
    m.compile(loss='mean_squared_error', optimizer=Adam())
    tensorboard = TensorBoard(log_dir='logs/ae_cifar_100', histogram_freq=5)
    print(m.summary())
    history = m.fit(x_train_normed,
                    x_train_normed,
                    batch_size=batch_size,
                    epochs=50,
                    verbose=1,
                    validation_data=(x_test_normed, x_test_normed),
                    callbacks=[tensorboard])
    eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed,
                      y_train, y_test, history, 'ae_cifar_100', (32, 32))
    K.clear_session()

    print('done')
Example #34
    dt_features = StandardScaler().fit_transform(dt_features)

    x_train, x_test, y_train, y_test = train_test_split(dt_features,
                                                        dt_target,
                                                        test_size=0.3,
                                                        random_state=42)

    print(x_train.shape)
    print(y_train.shape)

    # default n_components = min(n_samples, n_features)
    pca = PCA(n_components=3)
    pca.fit(x_train)

    ipca = IncrementalPCA(n_components=3, batch_size=10)
    ipca.fit(x_train)

    plt.plot(range(len(pca.explained_variance_)),
             pca.explained_variance_ratio_)
    #plt.show()

    logistic = LogisticRegression(solver='lbfgs')

    dt_train = pca.transform(x_train)
    dt_test = pca.transform(x_test)
    logistic.fit(dt_train, y_train)
    print("SCORE PCA", logistic.score(dt_test, y_test))

    dt_train = ipca.transform(x_train)
    dt_test = ipca.transform(x_test)
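The snippet is cut off at this point; by symmetry with the PCA branch above, the remaining evaluation step would presumably look like the following (an assumption, not part of the original listing):

    logistic.fit(dt_train, y_train)
    print("SCORE IPCA", logistic.score(dt_test, y_test))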
Example #35
    def renyi_select(self,X,represent_points,do_pca=False):
        """
        Takes in data and number of prototype vectors and returns the indices of the prototype vectors.
        The prototype vectors are selected based on maximization of quadratic renyi entropy, which can be 
        written in terms of log sum exp which is a tightly bounded by max operator. Now for rbf kernel,
        the max_{ij}(-\|x_i-x_j\|^2) is equivalent to min_{ij}(\|x_i-x_j\|^2).
        Parameters
        ----------
        X: np.ndarray
            shape = n_samples, n_features
        represent_points: int
            number of prototype vectors to return
        do_pca: boolean
            whether to perform incremental pca for dimensionality reduction before selecting prototype vectors
            
        Returns
        -------
        sv: list
            list of the prototype vector indices from the data array given by X
        """
#        do_pca = self.do_pca_in_selection
        N= X.shape[0]    
        capacity=represent_points
        selectionset=set([])
        set_full=set(list(range(N)))
        np.random.seed(1)
        if(len(selectionset)==0):
            selectionset = np.random.permutation(N)
            sv = list(selectionset)[0:capacity]        
        else:
            extrainputs = represent_points - len(selectionset)
            leftindices =list(set_full.difference(selectionset))
            info = np.random.permutation(len(leftindices))
            info = info[1:extrainputs]
            sv = selectionset.append(leftindices[info])
    
        if(do_pca == True):
            if(X.shape[1]>50): #takes more time
                n_components = 50
                ipca = IncrementalPCA(n_components=n_components, batch_size=np.min([128,X.shape[0]]))
                X = ipca.fit_transform(X)
            
        svX = X[sv,:]
        
        min_info = np.zeros((capacity,2))

        KsV = pairwise_distances(svX,svX)**2 #this is fast
        
        KsV[KsV==0] = np.inf
        min_info[:,1] = np.min(KsV,axis=1)
        min_info[:,0] = np.arange(capacity)
        minimum = np.min(min_info[:,1])
        counter = 0
        
        for i in range(N):
        #    find for which data the value is minimum
            replace = np.argmin(min_info[:,1])
            ids = int(min_info[min_info[:,0]==replace,0])
            #Subtract from totalcrit once for row 
            tempminimum = minimum - min_info[ids,1] 
            #Try to evaluate kernel function 
            
            tempsvX = np.zeros(svX.shape)
            tempsvX[:] = svX[:]
            inputX = X[i,:]
            tempsvX[replace,:] = inputX 
            tempK = pairwise_distances(tempsvX,np.reshape(inputX,(1,X.shape[1])))**2 #this is fast
            tempK[tempK==0] = np.inf
            distance_eval = np.min(tempK)
            tempminimum = tempminimum + distance_eval 
            if (minimum < tempminimum):
                minimum = tempminimum
                min_info[ids,1] = distance_eval
                svX[:] = tempsvX[:]
                sv[ids] = i
                counter +=1
        return sv
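    # A hedged usage sketch (assumed, not from the original project); `model` is a
    # hypothetical instance of the class this method belongs to:
    #
    #   X = np.random.RandomState(0).randn(500, 20)
    #   sv = model.renyi_select(X, represent_points=10, do_pca=False)
    #   prototypes = X[sv, :]   # 10 rows spread out under squared Euclidean distance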
예제 #36
0
	datos_test.append((test_caras[i],1))
	datos_test.append((test_no_caras[i],0))

# Store the test set used later to obtain lambda
f = open("datos_test.dat", "wb")
pickle.dump(datos_test, f, 2)
f.close()

print("PCA projection and clustering (K-means)...")
# Note: input images are square
# Sub-region window (square); integer division keeps l_sr usable as an index
l_sr = len(l_imagenes_caras[0]) // 16
# Number of clusters for k-means
clusters = 60
# PCA
ipca = IncrementalPCA(n_components=8)
# k-means
kmeans = MiniBatchKMeans(n_clusters=clusters, random_state=1)

# Auxiliary lists
l_aux = []
l_pos = []
# Build the PCA model (faces)
for img_cara in l_imagenes_caras:

	# PCA preprocessing
	pos = 0
	for i in range(0, len(img_cara), l_sr):
		if i + l_sr <= len(img_cara):
			subregion = img_cara[i:i+l_sr]
			l_aux.append(subregion)
예제 #37
0
    """
    for color, label, class_name in zip(colors, labels, class_names):
        plt.scatter(X[y == label, 0], X[y == label, 1], color=color, label=class_name)
    plt.title(title)
    plt.legend(loc='best')


# Visualization before the transform: only the first two dimensions are shown
plt.figure(1)
plot_func('origin data')

# KernelPCA is a nonlinear dimensionality reduction; LDA can only do class-aware reduction
# ICA is usually not used for dimensionality reduction but for separating superimposed signals
models_list = [('LDA', LinearDiscriminantAnalysis(n_components=2)), ('PCA', PCA(n_components=2, random_state=0)),
               ('PCARand', PCA(n_components=2, random_state=0, svd_solver='randomized')),
               ('IncrementalPCA', IncrementalPCA(n_components=2, batch_size=10, whiten=True)), ('FactorAnalysis', FactorAnalysis(n_components=2, max_iter=500)),
               ('FastICA', FastICA(n_components=2, random_state=0)), ('KernelPCA', KernelPCA(n_components=2, random_state=0, kernel='rbf')),
               ('SparsePCA', SparsePCA(n_components=2, random_state=0, verbose=True)),
               ('MiniBatchSparsePCA', MiniBatchSparsePCA(n_components=2, verbose=True, batch_size=10, random_state=0)),
               ('DictionaryLearning', DictionaryLearning(n_components=2, verbose=True, random_state=0)),
               ('MiniBatchDictionaryLearning', MiniBatchDictionaryLearning(n_components=2, batch_size=5, random_state=0, alpha=0.1))]

model = namedtuple('models', ['mod_name', 'mod_ins'])

for i in range(len(models_list)):
    mod = model(*models_list[i])
    if mod.mod_name == 'LDA':
        mod.mod_ins.fit(X, y)
        X_new = mod.mod_ins.transform(X)
    else:
        X_new = mod.mod_ins.fit_transform(X)
예제 #38
0
def use_incremental_pca() -> Pipeline:
    pipe = Pipeline([('cv', CountVectorizer()),
                     ('ipca', IncrementalPCA(n_components=2, batch_size=4))])
    return pipe
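# A minimal usage sketch (assumed, not part of the original snippet): fitting the
# pipeline above on a tiny corpus. IncrementalPCA accepts the sparse matrix produced
# by CountVectorizer and densifies it one batch at a time.
docs = ["the cat sat", "the dog sat", "a cat and a dog", "dogs chase cats",
        "cats sleep a lot", "dogs bark loudly", "the cat and the dog", "a quiet cat"]
pipe = use_incremental_pca()
X_docs_2d = pipe.fit_transform(docs)   # shape: (8, 2)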
예제 #39
0
파일: _simple.py 프로젝트: mbuttner/scanpy
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: int = N_PCS,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'auto',
    random_state: int = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape ``n_obs`` × ``n_vars``.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If ``False``, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows handling sparse input efficiently.
        Passing ``None`` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        ``'arpack'``
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)

        ``'randomized'``
          for the randomized algorithm due to Halko (2009).

        ``'auto'`` (the default)
          chooses automatically depending on the size of the problem.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        ``.var['highly_variable']``.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If ``True``, perform an incremental PCA on segments of ``chunk_size``.
        The incremental PCA automatically zero centers and ignores settings of
        ``random_state`` and ``svd_solver``. If ``False``, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if ``chunked=True`` was passed.

    Returns
    -------

    X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray`
        If `data` is array-like and ``return_info=False`` was passed,
        this function only returns `X_pca`…
    adata : :class:`~anndata.AnnData`
        …otherwise if ``copy=True`` it returns or else adds fields to ``adata``:

        ``.obsm['X_pca']``
             PCA representation of data.

        ``.varm['PCs']``
             The principal components containing the loadings.

        ``.uns['pca']['variance_ratio']``
             Ratio of explained variance.

        ``.uns['pca']['variance']``
             Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """
    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.` This will likely '
            'become the Scanpy default in the future.')

    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys():
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.filter_genes_dispersion` first.')
    if use_highly_variable is None:
        use_highly_variable = 'highly_variable' in adata.var.keys()
    adata_comp = adata[:, adata.var['highly_variable']] if use_highly_variable else adata

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        if zero_center is None:
            zero_center = not issparse(adata_comp.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata_comp.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata_comp.X.toarray()  # Copying the whole adata_comp.X here, could cause memory problems
            else:
                X = adata_comp.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical definition\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata_comp.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][
                adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
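# A hedged usage sketch (not part of the original file): exercising the chunked,
# IncrementalPCA-backed path of the function above. Assumes scanpy exposes it as
# sc.pp.pca and that the bundled pbmc3k demo dataset is available.
import scanpy as sc

adata = sc.datasets.pbmc3k()
sc.pp.pca(adata, n_comps=30, chunked=True, chunk_size=1000)
print(adata.obsm['X_pca'].shape)   # (n_obs, 30)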
예제 #40
0
parser.add_argument('--npca',
                    default=-1,
                    type=int,
                    help='number of points used to calculate PCA')
args = parser.parse_args()

assert os.path.isfile('train_features.npz')
logging.info('Loading features file')
train_features = np.load('train_features.npz')
img_features = train_features['img_features']
tag_features = train_features['tag_features']

N_PCA = img_features.shape[0] if args.npca == -1 else args.npca
logging.info('Training: PCA of image features, N_PCA = %d', N_PCA)
start = time.time()
pca = IncrementalPCA(n_components=500, batch_size=512)
pca.fit(img_features[:N_PCA, :])
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)

logging.info('Apply PCA to image features')
start = time.time()
X = pca.transform(img_features)
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)

logging.info('Training: fit CCA')
start = time.time()
W_img, W_tag = cca.fit(X, tag_features, numCC=args.numCC, useGPU=args.gpu)
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)
예제 #41
0
    def btnConvert_click(self):
        msgBox = QMessageBox()

        # OutFile
        OutFile = ui.txtOutFile.text()
        if not len(OutFile):
            msgBox.setText("Please enter out file!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # InFile
        InFile = ui.txtInFile.text()
        if not len(InFile):
            msgBox.setText("Please enter input file!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if not os.path.isfile(InFile):
            msgBox.setText("Input file not found!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if ui.rbScale.isChecked() and not ui.rbALScale.isChecked():
            msgBox.setText(
                "Subject Level Normalization is only available for Subject Level Analysis!"
            )
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        InData = io.loadmat(InFile)
        OutData = dict()
        OutData["imgShape"] = InData["imgShape"]

        if not len(ui.txtData.currentText()):
            msgBox.setText("Please enter Data variable name!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        try:
            X = InData[ui.txtData.currentText()]

            if ui.cbScale.isChecked() and (not ui.rbScale.isChecked()):
                X = preprocessing.scale(X)
                print("Whole of data is scaled X~N(0,1).")
        except:
            print("Cannot load data")
            return

        try:
            NumFea = np.int32(ui.txtNumFea.text())
        except:
            msgBox.setText("Number of features is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False
        if NumFea < 1:
            msgBox.setText("Number of features must be greater than zero!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if NumFea > np.shape(X)[1]:
            msgBox.setText("Number of features is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        # Batch
        try:
            Batch = np.int32(ui.txtBatch.text())
        except:
            msgBox.setText("Size of batch is wrong!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        if Batch == 0:
            Batch = None

        # Subject
        if not len(ui.txtSubject.currentText()):
            msgBox.setText("Please enter Subject variable name!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False

        try:
            Subject = InData[ui.txtSubject.currentText()]
            OutData[ui.txtOSubject.text()] = Subject
        except:
            print("Cannot load Subject ID")
            return

        # Label
        if not len(ui.txtLabel.currentText()):
            msgBox.setText("Please enter Label variable name!")
            msgBox.setIcon(QMessageBox.Critical)
            msgBox.setStandardButtons(QMessageBox.Ok)
            msgBox.exec_()
            return False
        OutData[ui.txtOLabel.text()] = InData[ui.txtLabel.currentText()]

        # Task
        if ui.cbTask.isChecked():
            if not len(ui.txtTask.currentText()):
                msgBox.setText("Please enter Task variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOTask.text()] = InData[ui.txtTask.currentText()]

        # Run
        if ui.cbRun.isChecked():
            if not len(ui.txtRun.currentText()):
                msgBox.setText("Please enter Run variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtORun.text()] = InData[ui.txtRun.currentText()]

        # Counter
        if ui.cbCounter.isChecked():
            if not len(ui.txtCounter.currentText()):
                msgBox.setText("Please enter Counter variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOCounter.text()] = InData[
                ui.txtCounter.currentText()]

        # Matrix Label
        if ui.cbmLabel.isChecked():
            if not len(ui.txtmLabel.currentText()):
                msgBox.setText("Please enter Matrix Label variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOmLabel.text()] = InData[ui.txtmLabel.currentText()]

        # Design
        if ui.cbDM.isChecked():
            if not len(ui.txtDM.currentText()):
                msgBox.setText("Please enter Design Matrix variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtODM.text()] = InData[ui.txtDM.currentText()]

        # Coordinate
        if ui.cbCol.isChecked():
            if not len(ui.txtCol.currentText()):
                msgBox.setText("Please enter Coordinator variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOCol.text()] = InData[ui.txtCol.currentText()]

        # Condition
        if ui.cbCond.isChecked():
            if not len(ui.txtCond.currentText()):
                msgBox.setText("Please enter Condition variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOCond.text()] = InData[ui.txtCond.currentText()]

        # Number of Scan
        if ui.cbNScan.isChecked():
            if not len(ui.txtScan.currentText()):
                msgBox.setText("Please enter Number of Scan variable name!")
                msgBox.setIcon(QMessageBox.Critical)
                msgBox.setStandardButtons(QMessageBox.Ok)
                msgBox.exec_()
                return False
            OutData[ui.txtOScan.text()] = InData[ui.txtScan.currentText()]

        Models = dict()
        Models["Name"] = "IPCA"

        if ui.rbALScale.isChecked():
            print("Partition data to subject level ...")
            SubjectUniq = np.unique(Subject)
            X_Sub = list()
            for subj in SubjectUniq:
                if ui.cbScale.isChecked() and ui.rbScale.isChecked():
                    X_Sub.append(
                        preprocessing.scale(
                            X[np.where(Subject == subj)[1], :]))
                    print("Data in subject level is scaled, X_" + str(subj) +
                          "~N(0,1).")
                else:
                    X_Sub.append(X[np.where(Subject == subj)[1], :])
                print("Subject ", subj, " is extracted from data.")

            print("Running IPCA in subject level ...")
            X_Sub_PCA = list()
            lenPCA = len(X_Sub)

            for xsubindx, xsub in enumerate(X_Sub):
                model = IncrementalPCA(n_components=NumFea, batch_size=Batch)
                model.fit(xsub)
                X_Sub_PCA.append(model.transform(xsub))
                Models["Model" + str(xsubindx + 1)] = str(
                    model.get_params(deep=True))
                print("IPCA: ", xsubindx + 1, " of ", lenPCA, " is done.")

            print("Data integration ... ")
            X_new = None
            for xsubindx, xsub in enumerate(X_Sub_PCA):
                X_new = np.concatenate(
                    (X_new, xsub)) if X_new is not None else xsub
                print("Integration: ", xsubindx + 1, " of ", lenPCA,
                      " is done.")
            OutData[ui.txtOData.text()] = X_new
        else:
            print("Running IPCA ...")
            model = IncrementalPCA(n_components=NumFea, batch_size=Batch)
            OutData[ui.txtOData.text()] = model.fit_transform(X)
            Models["Model"] = str(model.get_params(deep=True))

        OutData["ModelParameter"] = Models

        print("Saving ...")
        io.savemat(ui.txtOutFile.text(), mdict=OutData)
        print("DONE.")
        msgBox.setText("Incremental PCA is done.")
        msgBox.setIcon(QMessageBox.Information)
        msgBox.setStandardButtons(QMessageBox.Ok)
        msgBox.exec_()
예제 #42
0
clf = LDA()
clf.fit(data, label.ravel())

train_predict = clf.predict(data)
train_error = 1 - np.mean(train_predict == label)
print("Training data error: %f" % train_error)

test_predict = clf.predict(test_data)
test_error = 1 - np.mean(test_predict == test_label)
print("Test data error: %f" % test_error)

# ##Question 2

# In[4]:

pca = IncrementalPCA(n_components=49)
pca.fit(data, label)
U = pca.transform(data)
clf_lda = LDA()
train_pca = U
clf_lda.fit(train_pca, label.ravel())

train_pca_predict = clf_lda.predict(train_pca)
train_pca_error = 1 - np.mean(train_pca_predict == label)
print("Training data error after PCA: %f" % train_pca_error)

U = pca.transform(test_data)
test_pca = U
test_pca_predict = clf_lda.predict(test_pca)
test_pca_error = 1 - np.mean(test_pca_predict == test_label)
print("Test data error after PCA: %f" % test_pca_error)
예제 #43
0
파일: _pca.py 프로젝트: pedrofale/scanpy
def pca(
    data: Union[AnnData, np.ndarray, spmatrix],
    n_comps: Optional[int] = None,
    zero_center: Optional[bool] = True,
    svd_solver: str = 'arpack',
    random_state: AnyRandom = 0,
    return_info: bool = False,
    use_highly_variable: Optional[bool] = None,
    dtype: str = 'float32',
    copy: bool = False,
    chunked: bool = False,
    chunk_size: Optional[int] = None,
) -> Union[AnnData, np.ndarray, spmatrix]:
    """\
    Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition.
    Uses the implementation of *scikit-learn* [Pedregosa11]_.

    .. versionchanged:: 1.5.0

        In previous versions, computing a PCA on a sparse matrix would make a dense copy of
        the array for mean centering.
        As of scanpy 1.5.0, mean centering is implicit.
        While results are extremely similar, they are not exactly the same.
        If you would like to reproduce the old results, pass a dense array.

    Parameters
    ----------
    data
        The (annotated) data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    n_comps
        Number of principal components to compute. Defaults to 50, or 1 - minimum
        dimension size of selected representation.
    zero_center
        If `True`, compute standard PCA from covariance matrix.
        If `False`, omit zero-centering variables
        (uses :class:`~sklearn.decomposition.TruncatedSVD`),
        which allows handling sparse input efficiently.
        Passing `None` decides automatically based on sparseness of the data.
    svd_solver
        SVD solver to use:

        `'arpack'` (the default)
          for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`)
        `'randomized'`
          for the randomized algorithm due to Halko (2009).
        `'auto'`
          chooses automatically depending on the size of the problem.
        `'lobpcg'`
          An alternative SciPy solver.

        .. versionchanged:: 1.4.5
           Default value changed from `'auto'` to `'arpack'`.

        Efficient computation of the principal components of a sparse matrix
        currently only works with the `'arpack'` or `'lobpcg'` solvers.

    random_state
        Change to use different initial states for the optimization.
    return_info
        Only relevant when not passing an :class:`~anndata.AnnData`:
        see “**Returns**”.
    use_highly_variable
        Whether to use highly variable genes only, stored in
        `.var['highly_variable']`.
        By default uses them if they have been determined beforehand.
    dtype
        Numpy data type string to which to convert the result.
    copy
        If an :class:`~anndata.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.
    chunked
        If `True`, perform an incremental PCA on segments of `chunk_size`.
        The incremental PCA automatically zero centers and ignores settings of
        `random_state` and `svd_solver`. If `False`, perform a full PCA.
    chunk_size
        Number of observations to include in each chunk.
        Required if `chunked=True` was passed.

    Returns
    -------
    X_pca : :class:`~scipy.sparse.spmatrix`, :class:`~numpy.ndarray`
        If `data` is array-like and `return_info=False` was passed,
        this function only returns `X_pca`…
    adata : anndata.AnnData
        …otherwise if `copy=True` it returns or else adds fields to `adata`:

        `.obsm['X_pca']`
             PCA representation of data.
        `.varm['PCs']`
             The principal components containing the loadings.
        `.uns['pca']['variance_ratio']`
             Ratio of explained variance.
        `.uns['pca']['variance']`
             Explained variance, equivalent to the eigenvalues of the
             covariance matrix.
    """
    logg_start = logg.info('computing PCA')

    # chunked calculation is not randomized, anyways
    if svd_solver in {'auto', 'randomized'} and not chunked:
        logg.info(
            'Note that scikit-learn\'s randomized PCA might not be exactly '
            'reproducible across different computational platforms. For exact '
            'reproducibility, choose `svd_solver=\'arpack\'.`'
        )
    data_is_AnnData = isinstance(data, AnnData)
    if data_is_AnnData:
        adata = data.copy() if copy else data
    else:
        adata = AnnData(data, dtype=data.dtype)

    if use_highly_variable is True and 'highly_variable' not in adata.var.keys():
        raise ValueError(
            'Did not find adata.var[\'highly_variable\']. '
            'Either your data already only consists of highly-variable genes '
            'or consider running `pp.highly_variable_genes` first.'
        )
    if use_highly_variable is None:
        use_highly_variable = True if 'highly_variable' in adata.var.keys() else False
    if use_highly_variable:
        logg.info('    on highly variable genes')
    adata_comp = (
        adata[:, adata.var['highly_variable']] if use_highly_variable else adata
    )

    if n_comps is None:
        min_dim = min(adata_comp.n_vars, adata_comp.n_obs)
        if settings.N_PCS >= min_dim:
            n_comps = min_dim - 1
        else:
            n_comps = settings.N_PCS

    logg.info(f'    with n_comps={n_comps}')

    random_state = check_random_state(random_state)

    X = adata_comp.X

    if chunked:
        if not zero_center or random_state or svd_solver != 'arpack':
            logg.debug('Ignoring zero_center, random_state, svd_solver')

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((X.shape[0], n_comps), X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata_comp.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    elif (not issparse(X) or svd_solver == "randomized") and zero_center:
        from sklearn.decomposition import PCA

        if issparse(X) and svd_solver == "randomized":
            # This  is for backwards compat. Better behaviour would be to either error or use arpack.
            logg.warning(
                "svd_solver 'randomized' does not work with sparse input. Densifying the array. "
                "This may take a very large amount of memory."
            )
            X = X.toarray()
        pca_ = PCA(
            n_components=n_comps, svd_solver=svd_solver, random_state=random_state
        )
        X_pca = pca_.fit_transform(X)
    elif issparse(X) and zero_center:
        from sklearn.decomposition import PCA

        if svd_solver == "auto":
            svd_solver = "arpack"
        if svd_solver not in {'lobpcg', 'arpack'}:
            raise ValueError(
                f'svd_solver: {svd_solver} cannot be used with sparse input.\n'
                'Use "arpack" (the default) or "lobpcg" instead.'
            )

        output = _pca_with_sparse(
            X, n_comps, solver=svd_solver, random_state=random_state
        )
        # this is just a wrapper for the results
        X_pca = output['X_pca']
        pca_ = PCA(n_components=n_comps, svd_solver=svd_solver)
        pca_.components_ = output['components']
        pca_.explained_variance_ = output['variance']
        pca_.explained_variance_ratio_ = output['variance_ratio']
    elif not zero_center:
        from sklearn.decomposition import TruncatedSVD

        logg.debug(
            '    without zero-centering: \n'
            '    the explained variance does not correspond to the exact statistical definition\n'
            '    the first component, e.g., might be heavily influenced by different means\n'
            '    the following components often resemble the exact PCA very closely'
        )
        pca_ = TruncatedSVD(
            n_components=n_comps, random_state=random_state, algorithm=svd_solver
        )
        X_pca = pca_.fit_transform(X)
    else:
        raise Exception("This shouldn't happen. Please open a bug report.")

    if X_pca.dtype.descr != np.dtype(dtype).descr:
        X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        adata.uns['pca'] = {}
        adata.uns['pca']['params'] = {
            'zero_center': zero_center,
            'use_highly_variable': use_highly_variable,
        }
        if use_highly_variable:
            adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps))
            adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T
        else:
            adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.info('    finished', time=logg_start)
        logg.debug(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)'
        )
        return adata if copy else None
    else:
        logg.info('    finished', time=logg_start)
        if return_info:
            return (
                X_pca,
                pca_.components_,
                pca_.explained_variance_ratio_,
                pca_.explained_variance_,
            )
        else:
            return X_pca
예제 #44
0
class IPCA(object):
    def __init__(self,
                 n_components=None,
                 whiten=False,
                 copy=True,
                 batch_size=None):
        """
        :param n_components:   default为None ,int 或None, 想要保留的分量数,None 时,
        min(n_samples, n_features)
        :param whiten:   bool型,可选项, 默认为False, 当true(默认情况下为false)时,components_ 向量除以
        n_samples*components_以确保具有单位组件级方差的不相关输出。
        :param copy: 默认为True,  False时,x 将被覆盖,将节约能存,但存在不安全
        :param batch_size: default None, 批量样本数,   只在fit 中使用,设为None,系统自动设成5*n_features,
        以保持经度与内存开销的平衡
        """
        self.model = IncrementalPCA(n_components=n_components,
                                    whiten=whiten,
                                    copy=copy,
                                    batch_size=batch_size)

    def fit(self, x, y=None):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):  # get the estimator's parameters
        return self.model.get_params(deep=deep)

    def set_params(self, **params):  # set the estimator's parameters
        self.model.set_params(**params)

    def inverse_transform(self, x):  # the inverse operation of fit_transform
        return self.model.inverse_transform(X=x)

    def get_precision(self):  # compute the precision matrix from the generative model
        return self.model.get_precision()

    def get_covariance(self):  # get the covariance from the generative model
        return self.model.get_covariance()

    def partial_fit(self, x, y=None, check_input=True):  # incremental training
        self.model.partial_fit(X=x, y=y, check_input=check_input)

    def get_attributes(self):
        component = self.model.components_
        explained_variance = self.model.explained_variance_
        explained_variance_ratio = self.model.explained_variance_ratio_
        singular_values = self.model.singular_values_
        means = self.model.mean_  # mean of each feature
        var = self.model.var_  # variance of each feature
        noise_variance = self.model.noise_variance_  # estimated noise covariance
        n_component = self.model.n_components_
        n_samples_seen = self.model.n_samples_seen_
        return component, explained_variance, explained_variance_ratio, singular_values, means, var, noise_variance, \
               n_component, n_samples_seen
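# A minimal usage sketch (assumed, not from the original module) for the wrapper
# class above: two partial_fit calls followed by a transform.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 10)

ipca_demo = IPCA(n_components=3, batch_size=50)
ipca_demo.partial_fit(X_demo[:100])   # first incremental batch
ipca_demo.partial_fit(X_demo[100:])   # second incremental batch
X_low = ipca_demo.transform(X_demo)   # reduced data, shape (200, 3)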
예제 #45
0
from scipy import stats
import timeit
from ConfusionMatrix import confusionMatrixAlgo
from sklearn.externals import joblib
from sklearn.svm import NuSVC
from sklearn import grid_search
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.decomposition import IncrementalPCA

pca = IncrementalPCA(n_components = 500)
def get_feature_space(images):
    features = []
    count = 1
    buffer = []
    for image in images:
        des = hog(image)
        features.extend(des.reshape(-1,36).tolist())
        if count % 100 == 0:
            print(str(count) + " out of " + str(len(images)))
        count += 1
    return features
def to_BOW_features(features,codebook):
    BOW = [codebook.predict(feature) for feature in features]
    hist = [np.histogram(bag, bins = codebook.n_clusters)[0]for bag in BOW]
    return hist
예제 #46
0
    img = Image.open(i)

    img = img.resize((int(480 / 6), int(360 / 6)), Image.BICUBIC)

    img = img_to_matrix(img)
    img = flatten_img(img)

    dataset.append(img)

dataset = np.array(dataset)
print(dataset.shape)
print("Dataset make done.")

n = dataset.shape[0]
batch_size = 180
ipca = IncrementalPCA(n_components=100)

for i in range(n // batch_size):
    r_dataset = ipca.partial_fit(dataset[i * batch_size:(i + 1) * batch_size])

r_dataset = ipca.transform(dataset)
print(r_dataset.shape)
print("PCA done.")

# K-means clustering
import shutil
n_clusters_10 = 10
kmeans_10 = KMeans(n_clusters=n_clusters_10, random_state=5).fit(r_dataset)
labels_10 = kmeans_10.labels_
print("K-means clustering done.")
예제 #47
0
def pca(data,
        n_comps=None,
        zero_center=True,
        svd_solver='auto',
        random_state=0,
        return_info=False,
        dtype='float32',
        copy=False,
        chunked=False,
        chunk_size=None):
    """Principal component analysis [Pedregosa11]_.

    Computes PCA coordinates, loadings and variance decomposition. Uses the
    implementation of *scikit-learn* [Pedregosa11]_.

    Parameters
    ----------
    data : :class:`~scanpy.api.AnnData`, `np.ndarray`, `sp.sparse`
        The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    n_comps : `int`, optional (default: 50)
        Number of principal components to compute.
    zero_center : `bool` or `None`, optional (default: `True`)
        If `True`, compute standard PCA from covariance matrix. If `False`, omit
        zero-centering variables (uses *TruncatedSVD* from scikit-learn), which
        allows handling sparse input efficiently.
    svd_solver : `str`, optional (default: 'auto')
        SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy
        (scipy.sparse.linalg.svds), or 'randomized' for the randomized algorithm
        due to Halko (2009). "auto" chooses automatically depending on the size
        of the problem.
    random_state : `int`, optional (default: 0)
        Change to use different initial states for the optimization.
    return_info : `bool`, optional (default: `False`)
        Only relevant when not passing an :class:`~scanpy.api.AnnData`: see
        "Returns".
    dtype : `str` (default: 'float32')
        Numpy data type string to which to convert the result.
    copy : `bool`, optional (default: `False`)
        If an :class:`~scanpy.api.AnnData` is passed, determines whether a copy
        is returned. Is ignored otherwise.

    Returns
    -------
    If `data` is array-like and `return_info == False`, only returns `X_pca`,\
    otherwise returns or adds to `adata`:
    X_pca : `.obsm`
         PCA representation of data.
    PCs : `.varm`
         The principal components containing the loadings.
    variance_ratio : `.uns['pca']`
         Ratio of explained variance.
    variance : `.uns['pca']`
         Explained variance, equivalent to the eigenvalues of the covariance matrix.
    """

    if n_comps is None: n_comps = N_PCS

    if isinstance(data, AnnData):
        data_is_AnnData = True
        adata = data.copy() if copy else data
    else:
        data_is_AnnData = False
        adata = AnnData(data)

    logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4)

    if adata.n_vars < n_comps:
        n_comps = adata.n_vars - 1
        logg.msg('reducing number of computed PCs to',
                 n_comps,
                 'as dim of data is only',
                 adata.n_vars,
                 v=4)

    if chunked:
        if not zero_center or random_state or svd_solver != 'auto':
            logg.msg('Ignoring zero_center, random_state, svd_solver', v=4)

        from sklearn.decomposition import IncrementalPCA

        X_pca = np.zeros((adata.X.shape[0], n_comps), adata.X.dtype)

        pca_ = IncrementalPCA(n_components=n_comps)

        for chunk, _, _ in adata.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            pca_.partial_fit(chunk)

        for chunk, start, end in adata.chunked_X(chunk_size):
            chunk = chunk.toarray() if issparse(chunk) else chunk
            X_pca[start:end] = pca_.transform(chunk)
    else:
        zero_center = zero_center if zero_center is not None else not issparse(adata.X)
        if zero_center:
            from sklearn.decomposition import PCA
            if issparse(adata.X):
                logg.msg(
                    '    as `zero_center=True`, '
                    'sparse input is densified and may '
                    'lead to huge memory consumption',
                    v=4)
                X = adata.X.toarray()  # Copying the whole adata.X here, could cause memory problems
            else:
                X = adata.X
            pca_ = PCA(n_components=n_comps,
                       svd_solver=svd_solver,
                       random_state=random_state)
        else:
            from sklearn.decomposition import TruncatedSVD
            logg.msg(
                '    without zero-centering: \n'
                '    the explained variance does not correspond to the exact statistical definition\n'
                '    the first component, e.g., might be heavily influenced by different means\n'
                '    the following components often resemble the exact PCA very closely',
                v=4)
            pca_ = TruncatedSVD(n_components=n_comps,
                                random_state=random_state)
            X = adata.X
        X_pca = pca_.fit_transform(X)

    if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype)

    if data_is_AnnData:
        adata.obsm['X_pca'] = X_pca
        adata.varm['PCs'] = pca_.components_.T
        adata.uns['pca'] = {}
        adata.uns['pca']['variance'] = pca_.explained_variance_
        adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_
        logg.msg('    finished', t=True, end=' ', v=4)
        logg.msg(
            'and added\n'
            '    \'X_pca\', the PCA coordinates (adata.obs)\n'
            '    \'PC1\', \'PC2\', ..., the loadings (adata.var)\n'
            '    \'pca_variance\', the variance / eigenvalues (adata.uns)\n'
            '    \'pca_variance_ratio\', the variance ratio (adata.uns)',
            v=4)
        return adata if copy else None
    else:
        if return_info:
            return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_
        else:
            return X_pca
예제 #48
0
    async def do_run_async(self):
        # Principal Component Analysis (PCA) is by far the most popular dimensionality reduction
        # algorithm. First it identifies the hyperplane that lies closest to the data, and then
        # it projects the data onto it.

        # How PCA works: for the number of dimensions you want to reduce a dataset to, it identifies
        # the axis onto which the projection of the dataset has the maximum variance (equivalently,
        # by Pythagoras' theorem, the axis that minimizes the mean squared distance between the
        # original dataset and its projection onto that axis).
        # It starts with a first axis, then finds a second axis orthogonal to the first that captures
        # the largest amount of the remaining variance, then a third axis orthogonal to the first two,
        # and so on - as many axes as the number of dimensions the dataset is reduced to.
        # The vectors that define these axes are called the Principal Components.
        # Once you have identified all the principal components, you can reduce the dimensionality
        # of the dataset down to d dimensions by projecting it onto the hyperplane
        # defined by the first d principal components.

        training_set = super().load_train_images()

        # Training set needs to be reshaped from 3D (60000,28,28) to 2D (60000, 784) for the classifier to be able to
        # use in training phase
        training_set_tr = training_set.reshape((60000, 784))
        training_labels = super().load_train_labels()

        X = training_set_tr[:1000, :]  # First 1000 instances

        # There is a standard matrix factorization technique called Singular Value Decomposition (SVD)
        # that can decompose the training set matrix X into the dot product of three matrices U
        # · Σ · VT, where VT contains all the principal components that we are looking for.

        X_centered = X - X.mean(axis=0)
        U, s, V = np.linalg.svd(X_centered)

        # The principal components vectors are then the columns of the transpose of V matrix
        C1 = V.T[:, 0]   # Shape (784,)
        C2 = V.T[:, 1]   # Shape (784,)

        # To project the training set onto the hyperplane, you can simply compute the dot
        # product of the training set matrix X by the matrix Wd, defined as the matrix containing the first d principal components
        # (i.e., the matrix composed of the first d columns of VT)

        W2 = V.T[:, :2]
        X2D = X_centered.dot(W2)

        # Same using Scikit-Learn
        pca = PCA(n_components=2)
        X2D = pca.fit_transform(X)  # X2D should match the projection computed above (up to sign flips of the components)

        # Instead of arbitrarily choosing the number of dimensions to reduce down to, it is
        # generally preferable to choose the number of dimensions that add up to a sufficiently
        # large portion of the variance (e.g., 95%). Unless, of course, you are reducing dimensionality
        # for data visualization—in that case you will generally want to reduce the
        # dimensionality down to 2 or 3.

        pca = PCA(n_components=0.95)
        X_reduced = pca.fit_transform(X)  # The 1000 instances should now have 129 features instead of the original 784
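        # A hedged aside (not in the original notebook): the variance threshold can also
        # be turned into an explicit number of dimensions from the cumulative
        # explained-variance curve.
        pca_full = PCA()
        pca_full.fit(X)
        cumsum = np.cumsum(pca_full.explained_variance_ratio_)
        d = np.argmax(cumsum >= 0.95) + 1   # smallest d reaching 95% variance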

        # One problem with the preceding implementation of PCA is that it requires the whole
        # training set to fit in memory in order for the SVD algorithm to run. Fortunately,
        # Incremental PCA (IPCA) algorithms have been developed: you can split the training
        # set into mini-batches and feed an IPCA algorithm one mini-batch at a time. This is
        # useful for large training sets, and also to apply PCA online (i.e., on the fly, as new
        # instances arrive).

        X = training_set_tr

        n_batches = 100  # 100 batches of 600 instances
        inc_pca = IncrementalPCA(n_components=129)
        for X_batch in np.array_split(X, n_batches):
            inc_pca.partial_fit(X_batch)

        X_mnist_reduced = inc_pca.transform(X)

        # Measure the difference in the time required to train a K-Neighbors Classifier (known to be slow)
        # on the original and reduced MNIST dataset...The difference should be huge!

        start_time = time.time()
        clf = KNeighborsClassifier()
        clf.fit(X, training_labels)
        elapsed = time.time() - start_time

        print(f"Training a K-Neighbors Classifier on the original MNIST dataset took {elapsed} seconds.")

        start_time = time.time()
        clf.fit(X_mnist_reduced, training_labels)
        elapsed = time.time() - start_time

        print(f"Training a K-Neighbors Classifier on the reduced MNIST dataset took {elapsed} seconds.")
예제 #49
0
class AnnStream:
    def __init__(self, data, k: int, n_cluster: int, reduction_method: str,
                 dims: int, loadings: np.ndarray, use_for_pca: np.ndarray,
                 mu: np.ndarray, sigma: np.ndarray, ann_metric: str,
                 ann_efc: int, ann_ef: int, ann_m: int, nthreads: int,
                 ann_parallel: bool, rand_state: int, do_kmeans_fit: bool,
                 disable_scaling: bool, ann_idx):
        self.data = data
        self.k = k
        if self.k >= self.data.shape[0]:
            self.k = self.data.shape[0] - 1
        self.nClusters = max(n_cluster, 2)
        self.dims = dims
        self.loadings = loadings
        if self.dims is None and self.loadings is None:
            raise ValueError(
                "ERROR: Provide either value for atleast one: 'dims' or 'loadings'"
            )
        self.annMetric = ann_metric
        self.annEfc = ann_efc
        self.annEf = ann_ef
        self.annM = ann_m
        self.nthreads = nthreads
        if ann_parallel:
            self.annThreads = self.nthreads
        else:
            self.annThreads = 1
        self.randState = rand_state
        self.batchSize = self._handle_batch_size()
        self.method = reduction_method
        self.nCells, self.nFeats = self.data.shape
        self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells)
        disable_reduction = False
        if self.dims < 1:
            disable_reduction = True
        with threadpool_limits(limits=self.nthreads):
            if self.method == 'pca':
                self.mu, self.sigma = mu, sigma
                if self.loadings is None or len(self.loadings) == 0:
                    if len(use_for_pca) != self.nCells:
                        raise ValueError(
                            "ERROR: `use_for_pca` does not have sample length as nCells"
                        )
                    if disable_reduction is False:
                        self._fit_pca(disable_scaling, use_for_pca)
                else:
                    # Even though dims may already have been adjusted to match the loadings before
                    # AnnStream was called, it can still be overwritten by _handle_batch_size, hence it is hard-set here.
                    self.dims = self.loadings.shape[1]
                    # it is okay for dimensions to be larger than batch size here because we will not fit the PCA
                if disable_scaling:
                    if disable_reduction:
                        self.reducer = lambda x: x
                    else:
                        self.reducer = lambda x: x.dot(self.loadings)
                else:
                    if disable_reduction:
                        self.reducer = lambda x: self.transform_z(x)
                    else:
                        self.reducer = lambda x: self.transform_z(x).dot(
                            self.loadings)
            elif self.method == 'lsi':
                if self.loadings is None or len(self.loadings) == 0:
                    if disable_reduction is False:
                        self._fit_lsi()
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            elif self.method == 'custom':
                if self.loadings is None or len(self.loadings) == 0:
                    logger.warning(
                        "No loadings provided for manual dimension reduction")
                else:
                    self.dims = self.loadings.shape[1]
                if disable_reduction:
                    self.reducer = lambda x: x
                else:
                    self.reducer = lambda x: x.dot(self.loadings)
            else:
                raise ValueError(
                    f"ERROR: Unknown reduction method: {self.method}")
            if ann_idx is None:
                self.annIdx = self._fit_ann()
            else:
                self.annIdx = ann_idx
                self.annIdx.set_ef(self.annEf)
                self.annIdx.set_num_threads(1)
            self.kmeans = self._fit_kmeans(do_kmeans_fit)

    def _handle_batch_size(self):
        if self.dims > self.data.shape[0]:
            self.dims = self.data.shape[0]
        batch_size = self.data.chunksize[0]  # Assuming all chunks are the same size
        if self.dims >= batch_size:
            self.dims = batch_size - 1  # -1 because we will do PCA +1
            logger.info(
                f"Number of PCA/LSI components reduced to batch size of {batch_size}"
            )
        if self.nClusters > batch_size:
            self.nClusters = batch_size
            logger.info(
                f"Cluster number reduced to batch size of {batch_size}")
        return batch_size

    def iter_blocks(self, msg: str = '') -> np.ndarray:
        for i in tqdm(self.data.blocks, desc=msg,
                      total=self.data.numblocks[0]):
            yield controlled_compute(i, self.nthreads)

    def transform_z(self, a: np.ndarray) -> np.ndarray:
        return (a - self.mu) / self.sigma

    def transform_ann(self,
                      a: np.ndarray,
                      k: int = None,
                      self_indices: np.ndarray = None) -> tuple:
        if k is None:
            k = self.k
        # Adding +1 to k because first neighbour will be the query itself
        if self_indices is None:
            i, d = self.annIdx.knn_query(a, k=k)
            return i, d
        else:
            i, d = self.annIdx.knn_query(a, k=k + 1)
            return fix_knn_query(i, d, self_indices)

    def _fit_pca(self, disable_scaling, use_for_pca) -> None:
        from sklearn.decomposition import IncrementalPCA
        # We fit one more PC than specified and then ignore the last one.
        self._pca = IncrementalPCA(n_components=self.dims + 1,
                                   batch_size=self.batchSize)
        do_sample_subset = False if use_for_pca.sum() == self.nCells else True
        s, e = 0, 0
        # We store the first block of values here. If we are left with fewer than dims+1 cells to fit,
        # those cells can be added to end_reservoir for fitting; if there are no such cells, end_reservoir
        # is fitted by itself after the rest of the cells. The first batch itself may have fewer than
        # dims+1 cells; in that case we keep adding cells to the carry_over pile until it is big enough.
        end_reservoir = []
        # carry_over stores cells that cannot yet be added to end_reservoir or used to fit the PCA directly.
        carry_over = []
        for i in self.iter_blocks(msg='Fitting PCA'):
            if do_sample_subset:
                e = s + i.shape[0]
                i = i[use_for_pca[s:e]]
                s = e
            if disable_scaling is False:
                i = self.transform_z(i)
            if len(carry_over) > 0:
                i = np.vstack((carry_over, i))
                carry_over = []
            if len(i) < (self.dims + 1):
                carry_over = i
                continue
            if len(end_reservoir) == 0:
                end_reservoir = i
                continue
            try:
                self._pca.partial_fit(i, check_input=False)
            except LinAlgError:
                # TODO: add a retry counter so that memory consumption doesn't escalate
                carry_over = i
        if len(carry_over) > 0:
            i = np.vstack((end_reservoir, carry_over))
        else:
            i = end_reservoir
        try:
            self._pca.partial_fit(i, check_input=False)
        except LinAlgError:
            logger.warning(
                f"{i.shape[0]} samples were not used in PCA fitting due to LinAlgError")
        self.loadings = self._pca.components_[:-1, :].T

    def _fit_lsi(self) -> None:
        from gensim.models import LsiModel
        from gensim.matutils import Dense2Corpus

        self._lsiModel = LsiModel(
            Dense2Corpus(
                controlled_compute(self.data.blocks[0], self.nthreads).T),
            num_topics=self.dims,
            chunksize=self.data.chunksize[0],
            id2word={x: x
                     for x in range(self.data.shape[1])},
            extra_samples=0)
        for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")):
            if n == 0:
                continue
            self._lsiModel.add_documents(Dense2Corpus(i.T))
        self.loadings = self._lsiModel.get_topics().T

    def _fit_ann(self):
        import hnswlib

        dims = self.dims
        if dims < 1:
            dims = self.data.shape[1]
        ann_idx = hnswlib.Index(space=self.annMetric, dim=dims)
        ann_idx.init_index(max_elements=self.nCells,
                           ef_construction=self.annEfc,
                           M=self.annM,
                           random_seed=self.randState)
        ann_idx.set_ef(self.annEf)
        ann_idx.set_num_threads(self.annThreads)
        for i in self.iter_blocks(msg='Fitting ANN'):
            ann_idx.add_items(self.reducer(i))
        return ann_idx

    def _fit_kmeans(self, do_ann_fit):
        from sklearn.cluster import MiniBatchKMeans

        if do_ann_fit is False:
            return None
        kmeans = MiniBatchKMeans(n_clusters=self.nClusters,
                                 random_state=self.randState,
                                 batch_size=self.batchSize)
        with threadpool_limits(limits=self.nthreads):
            for i in self.iter_blocks(msg='Fitting kmeans'):
                kmeans.partial_fit(self.reducer(i))
        temp = []
        for i in self.iter_blocks(msg='Estimating seed partitions'):
            temp.extend(kmeans.predict(self.reducer(i)))
        self.clusterLabels = np.array(temp)
        return kmeans
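
The carry-over logic in `_fit_pca` above exists because `IncrementalPCA.partial_fit` needs at least `n_components` samples per call. A minimal standalone sketch of the same batching pattern follows; the `fit_ipca_in_blocks` helper and the block sizes are made up for illustration:

import numpy as np
from sklearn.decomposition import IncrementalPCA

def fit_ipca_in_blocks(blocks, n_components):
    # partial_fit needs at least n_components samples per call, so blocks that
    # are too small are buffered in carry_over and merged into the next block.
    ipca = IncrementalPCA(n_components=n_components)
    carry_over = None
    for block in blocks:
        if carry_over is not None:
            block = np.vstack((carry_over, block))
            carry_over = None
        if block.shape[0] < n_components:
            carry_over = block
            continue
        ipca.partial_fit(block)
    # Any final leftover is fitted only if it is large enough; the class above
    # instead merges it with a reserved first block (end_reservoir).
    if carry_over is not None and carry_over.shape[0] >= n_components:
        ipca.partial_fit(carry_over)
    return ipca

rng = np.random.default_rng(0)
blocks = (rng.normal(size=(n, 20)) for n in (500, 3, 500))
print(fit_ipca_in_blocks(blocks, n_components=10).components_.shape)  # (10, 20)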
Example #50
0
# Avoid "Mean of empty slice." in sklearn:
batch_size = max(n_components + 1, args.batch_size)
print("batch_size = {}".format(batch_size))

# Create data loader
minibatchlist = DataLoader.createTestMinibatchList(len(images_path),
                                                   batch_size)
# Training = False -> outputs only the current observation, not a tuple
data_loader = DataLoader(minibatchlist,
                         images_path,
                         n_workers=4,
                         is_training=False)

print("Fitting PCA with n_components={}".format(n_components))
ipca = IncrementalPCA(n_components=n_components)

pbar = tqdm(total=len(data_loader))
for obs_var in data_loader:
    ipca.partial_fit(toNumpyMatrix(obs_var))
    pbar.update(1)
pbar.close()
# Save PCA transformation
with open(log_folder + "/pca.pkl", "wb") as f:
    pkl.dump(ipca, f)

print("Transforming observations to states")
predictions = []
for obs_var in data_loader:
    predictions.append(ipca.transform(toNumpyMatrix(obs_var)))
predictions = np.concatenate(predictions, axis=0)
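
As a hedged follow-up, the pickled transform saved above could later be reloaded and reused; this assumes the same `log_folder`, `toNumpyMatrix` and `data_loader` defined in this example:

with open(log_folder + "/pca.pkl", "rb") as f:
    ipca_reloaded = pkl.load(f)

# Apply the reloaded IncrementalPCA to fresh observations.
states = np.concatenate(
    [ipca_reloaded.transform(toNumpyMatrix(obs_var)) for obs_var in data_loader],
    axis=0)
print(states.shape)  # (n_observations, n_components)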
Example #51
0
#Making the scree plot - plotting the cumulative variance against the number of components
# fig = plt.figure(figsize = (12,9))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components', fontsize=10)
plt.ylabel('cumulative explained variance', fontsize=10)
plt.title('PCA Cumulative Explained Variance', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
img_file = results_path.joinpath('PCA_Cumulative_Explained_Variance.png')
plt.savefig(img_file)
plt.show()

# Looks like approx. 50 components are enough to describe 90% of the variance in the dataset
# We'll choose 50 components for our modeling
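#The cut-off can also be computed directly from the scree-plot estimator above:
#   n_90 = int(np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.90)) + 1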
#Using incremental PCA for efficiency - saves a lot of time on larger datasets
pca_final = IncrementalPCA(n_components=16)
df_train_pca = pca_final.fit_transform(X_train_rus)
print("df_train_pca.shape")
print(df_train_pca.shape)

#Creating correlation matrix for the principal components - I expect little to no correlation
corrmat = np.corrcoef(df_train_pca.transpose())
plt.figure(figsize=(16, 16))
sns.set(font_scale=.8)
sns.heatmap(corrmat,
            vmin=df_corr.values.min(),
            vmax=1,
            fmt='.1f',
            square=True,
            cmap="Blues",
            linewidths=0.1,
Example #52
0
           y="Second Vector",
           hue="Label",
           data=X_train_scatter,
           fit_reg=False)
ax = plt.gca()
ax.set_title("Separation of Observations Using Original Feature Set")

# In[ ]:

# Incremental PCA
from sklearn.decomposition import IncrementalPCA

n_components = 784
batch_size = None

incrementalPCA = IncrementalPCA(n_components=n_components,
                                batch_size=batch_size)

X_train_incrementalPCA = incrementalPCA.fit_transform(X_train)
X_train_incrementalPCA = pd.DataFrame(data=X_train_incrementalPCA,
                                      index=train_index)

X_validation_incrementalPCA = incrementalPCA.transform(X_validation)
X_validation_incrementalPCA = pd.DataFrame(data=X_validation_incrementalPCA,
                                           index=validation_index)

scatterPlot(X_train_incrementalPCA, y_train, "Incremental PCA")

# In[ ]:

# Sparse PCA
from sklearn.decomposition import SparsePCA
Example #53
0
class PCANet(object):
    def __init__(self, image_shape, filter_shape_l1, step_shape_l1,
                 n_l1_output, filter_shape_l2, step_shape_l2, n_l2_output,
                 filter_shape_pooling, step_shape_pooling):
        """
        Parameters
        ----------
        image_shape: int or sequence of ints
            Input image shape.
        filter_shape_l1: int or sequence of ints
            The shape of the kernel in the first convolution layer.
            If the value is int, a filter of the square shape is applied.
            If you want to apply a filter of a different aspect ratio, just
            pass a tuple of shape (height, width).
        step_shape_l1: int or sequence of ints
            The shape of kernel step in the first convolution layer.
            If the value is int, a step of the square shape is applied.
            If you want to apply a step of a different aspect ratio, just
            pass a tuple of shape (height, width).
        n_l1_output:
            L1 in the original paper. The number of outputs obtained
            from a set of input images.
        filter_shape_l2: int or sequence of ints
            The shape of the kernel in the second convolution layer.
            If the value is int, a filter of the square shape is applied.
            If you want to apply a filter of a different aspect ratio, just
            pass a tuple of shape (height, width).
        step_shape_l2: int or sequence of ints
            The shape of kernel step in the second convolution layer.
            If the value is int, a step of the square shape is applied.
            If you want to apply a step of a different aspect ratio, just
            pass a tuple of shape (height, width).
        n_l2_output:
            L2 in the original paper. The number of outputs obtained
            from each L1 output.
        filter_shape_pooling: int or sequence of ints
            The shape of the filter in the pooling layer.
        step_shape_pooling: int or sequence of ints
            The shape of the filter step in the pooling layer.
        """

        self.image_shape = to_tuple_if_int(image_shape)

        self.filter_shape_l1 = to_tuple_if_int(filter_shape_l1)
        self.step_shape_l1 = to_tuple_if_int(step_shape_l1)
        self.n_l1_output = n_l1_output

        self.conv1 = Conv2d(1, 6, 3)

        self.filter_shape_l2 = to_tuple_if_int(filter_shape_l2)
        self.step_shape_l2 = to_tuple_if_int(step_shape_l2)
        self.n_l2_output = n_l2_output

        self.filter_shape_pooling = to_tuple_if_int(filter_shape_pooling)
        self.step_shape_pooling = to_tuple_if_int(step_shape_pooling)
        self.n_bins = None  # TODO make n_bins specifiable

        self.pca_l1 = IncrementalPCA(n_l1_output)
        self.pca_l2 = IncrementalPCA(n_l2_output)

    def histogram(self, binary_images):
        """
        Separate a given image into blocks and calculate a histogram
        in each block.

        Suppose data in a block is in range [0, 3] and the actual
        values are

        ::

            [0 0 1]
            [2 2 2]
            [2 3 3]

        | If default bins ``[-0.5 0.5 1.5 2.5 3.5]`` applied,
          the histogram will be ``[2 1 4 2]``.
        | If ``n_bins`` is specified, the range of the data is divided equally.

        | For example, if the data is in range ``[0, 3]`` and ``n_bins = 2``,
        | bins will be ``[-0.5 1.5 3.5]`` and the histogram will be ``[3 6]``.
        """

        k = pow(2, self.n_l2_output)
        if self.n_bins is None:
            self.n_bins = k + 1
        bins = xp.linspace(-0.5, k - 0.5, self.n_bins)

        def bhist(image):
            # calculate Bhist(T) in the original paper
            ps = Patches(image, self.filter_shape_pooling,
                         self.step_shape_pooling).patches

            H = [xp.histogram(p.flatten(), bins)[0] for p in ps]
            return xp.concatenate(H)

        return xp.vstack([bhist(image) for image in binary_images])

    def process_input(self, images):
        assert (np.ndim(images) >= 3)
        assert (images.shape[1:3] == self.image_shape)
        if np.ndim(images) == 3:
            # forcibly convert to multi-channel images
            images = atleast_4d(images)
        images = to_channels_first(images)
        return images

    def fit(self, images):
        """
        Train PCANet

        Parameters
        ----------
        images: np.ndarray
            | Color / grayscale images of shape
            | (n_images, height, width, n_channels) or
            | (n_images, height, width)
        """
        images = self.process_input(images)
        # images.shape == (n_images, n_channels, y, x)

        for image in images:
            X = []
            for channel in image:
                patches = image_to_patch_vectors(channel, self.filter_shape_l1,
                                                 self.step_shape_l1)
                X.append(patches)
            patches = np.hstack(X)
            # patches.shape == (n_patches, n_channels * patch vector length)
            self.pca_l1.partial_fit(patches)

        filters_l1 = components_to_filters(
            self.pca_l1.components_,
            n_channels=images.shape[1],
            filter_shape=self.filter_shape_l1,
        )

        images = torch.Tensor(images)
        # Detach so the PCA below sees plain arrays rather than autograd-tracked tensors.
        images = F.relu(self.conv1(images)).detach()
        # images.shape == (n_images, L1, y, x)
        images = images.reshape(-1, *images.shape[2:4])
        for image in images:
            patches = image_to_patch_vectors(image, self.filter_shape_l2,
                                             self.step_shape_l2)
            self.pca_l2.partial_fit(patches)

        return self

    def transform(self, images):
        """
        Parameters
        ----------
        images: np.ndarray
            | Color / grayscale images of shape
            | (n_images, height, width, n_channels) or
            | (n_images, height, width)

        Returns
        -------
        X: np.ndarray
            A set of feature vectors of shape (n_images, n_features)
            where :code:`n_features` is determined by the hyperparameters
        """
        images = self.process_input(images)
        # images.shape == (n_images, n_channels, y, x)

        filters_l1 = components_to_filters(
            self.pca_l1.components_,
            n_channels=images.shape[1],
            filter_shape=self.filter_shape_l1,
        )

        filters_l2 = components_to_filters(self.pca_l2.components_,
                                           n_channels=1,
                                           filter_shape=self.filter_shape_l2)

        images = Conv2d(images, filters_l1, stride=self.step_shape_l1).data

        images = xp.swapaxes(images, 0, 1)

        # L1.shape == (L1, n_images, y, x)
        # iterate over each L1 output

        X = []
        for maps in images:
            n_images, h, w = maps.shape
            maps = Conv2d(
                maps.reshape(n_images, 1, h, w),  # 1 channel images
                filters_l2,
                stride=self.step_shape_l2).data

            # maps.shape == (n_images, L2, y, x) right here
            maps = binarize(maps)
            maps = binary_to_decimal(maps)
            # maps.shape == (n_images, y, x)
            x = self.histogram(maps)

            # x is a set of feature vectors.
            # The shape of x is (n_images, vector length)
            X.append(x)

        # concatenate over L1
        X = xp.hstack(X)

        if gpu_enabled():
            X = X.to('cpu')

        X = X.astype(np.float64)

        # The shape of X is (n_images, L1 * vector length)
        return X

    def validate_structure(self):
        """
        Check that the filter visits all pixels of input images without
        dropping any information.

        Raises
        ------
        ValueError:
            if the network structure does not satisfy the above constraint.
        """
        def is_valid_(input_shape, filter_shape, step_shape):
            ys, xs = steps(input_shape, filter_shape, step_shape)
            fh, fw = filter_shape
            h, w = input_shape
            if ys[-1] + fh != h or xs[-1] + fw != w:
                raise ValueError("Invalid network structure.")
            return output_shape(ys, xs)

        output_shape_l1 = is_valid_(self.image_shape, self.filter_shape_l1,
                                    self.step_shape_l1)
        output_shape_l2 = is_valid_(output_shape_l1, self.filter_shape_l2,
                                    self.step_shape_l2)
        is_valid_(output_shape_l2, self.filter_shape_pooling,
                  self.step_shape_pooling)
Example #54
0
def fit_embedding(dataset,
                  embed_dir,
                  standardize_features=True,
                  pca_n_components=None,
                  umap_n_components=2,
                  umap_init='random',
                  umap_n_neighbors=100,
                  umap_min_dist=0.0,
                  umap_metric='euclidean',
                  low_memory=False,
                  save_transform=True,
                  seed=None,
                  verbose=True):
    """
	train_set: a feature matrix e.g. of (N, F) dimensions, used to define the embedding
	After some experimentation with chemical features of 1024 of 50k-.5M compounds,
	a reasonable embedding first reduces by PCA to 20 dimensions and then UMAP to 2 dimensions.
	UMAP parameters of 100 neighbors and min_dist of 0.0 seem to work well too. init='random'
	can help UMAP from getting stuck.
	return:
		saves embedding data to
			../intermediate_data/embeddings/tag/embedding_info.tsv
			../intermediate_data/embeddings/tag/pca_reducer.joblib
			../intermediate_data/embeddings/tag/umap_reducer.joblib
	"""
    if not os.path.exists(embed_dir):
        os.mkdir(embed_dir)
    else:
        print("WARNING: embed_dir already exists: {}".format(embed_dir))

    random_state = np.random.RandomState(seed=seed)

    begin_time = time.time()

    if standardize_features:
        if verbose:
            print(
                "Standardizing dataset so each feature has zero-mean and unit variance."
            )
        standardizer = StandardScaler(copy=False)
        standardizer.fit(dataset)
        dataset = standardizer.transform(dataset)

    if pca_n_components is None:
        pca_n_components = dataset.shape[1]
        if verbose:
            print("Setting PCA n_componets to full rank of dataset: {}".format(
                pca_n_components))

    if verbose:
        print("Reducing the dimension by PCA from {} to {} dimensions".format(
            dataset.shape[1], pca_n_components))
    pca_reducer = IncrementalPCA(n_components=pca_n_components,
                                 batch_size=1000,
                                 copy=False)
    pca_reducer.fit(dataset)
    pca_embedding = pca_reducer.transform(dataset)

    if verbose:
        print("Reducing the dimension by UMAP to {} dimensions".format(
            umap_n_components))
    umap_reducer = umap.UMAP(
        n_components=umap_n_components,
        metric=umap_metric,
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        init=umap_init,
        low_memory=low_memory,
        #random_state=random_state,
        verbose=True)
    umap_embedding = umap_reducer.fit_transform(pca_embedding)
    umap_embedding = pd.DataFrame(
        data=umap_embedding,
        columns=["UMAP_" + str(i + 1) for i in range(umap_n_components)])
    end_time = time.time()

    if verbose:
        print("created embedding {0} runtime: {1:.2f}s".format(
            embed_dir, end_time - begin_time))
        print("saving embedding to {}".format(embed_dir))
    with open("{}/model_info.tsv".format(embed_dir), 'w') as f:
        f.write("key\tvalue\n")
        f.write("seed\t{}\n".format(seed))
        f.write("input_dim\t{}\n".format(dataset.shape))
        f.write("standardize_features\t{}\n".format(standardize_features))
        f.write("pca_n_component\t{}\n".format(pca_n_components))
        f.write("umap_n_component\t{}\n".format(umap_n_components))
        f.write("umap_metric\t{}\n".format(umap_metric))
        f.write("umap_n_neighbors\t{}\n".format(umap_n_neighbors))
        f.write("umap_min_dist\t{}\n".format(umap_min_dist))
        f.write("umap_init\t{}\n".format(umap_init))

    if save_transform:
        if verbose:
            print("Saving transform to {}.".format(embed_dir))
        if pca_n_components is not None:
            joblib.dump(value=pca_reducer,
                        filename="{}/pca_reducer.joblib".format(embed_dir))
        if standardize_features:
            joblib.dump(value=standardizer,
                        filename="{}/standardizer.joblib".format(embed_dir))
        joblib.dump(value=umap_reducer,
                    filename="{}/umap_reducer.joblib".format(embed_dir))

    pa.parquet.write_table(table=pa.Table.from_pandas(umap_embedding),
                           where="{}/umap_embedding.parquet".format(embed_dir))
    return umap_embedding
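
A minimal usage sketch based on the docstring's recommended settings (PCA to 20 dimensions, then UMAP to 2 with 100 neighbors and min_dist=0.0); the feature matrix and output directory below are placeholders:

import numpy as np

features = np.random.rand(10000, 1024).astype(np.float32)  # placeholder feature matrix

umap_embedding = fit_embedding(
    dataset=features,
    embed_dir="example_embedding",   # placeholder output directory
    pca_n_components=20,             # PCA down to 20 dimensions first
    umap_n_components=2,             # then UMAP down to 2 dimensions
    umap_n_neighbors=100,
    umap_min_dist=0.0,
    umap_init='random',              # per the docstring, helps UMAP avoid getting stuck
    seed=42)
print(umap_embedding.columns.tolist())  # ['UMAP_1', 'UMAP_2']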
Example #55
0
app = Flask(__name__)


@app.route("/")
def index():
    return '<img src="static/grafico.png"/>'


if __name__ == '__main__':
    iris = load_iris()
    X = iris.data
    y = iris.target

    n_components = 2
    ipca = IncrementalPCA(n_components=n_components, batch_size=10)
    X_ipca = ipca.fit_transform(X)

    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)

    colors = ['navy', 'turquoise', 'darkorange']

    for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
        plt.figure(figsize=(8, 8))
        for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
            plt.scatter(X_transformed[y == i, 0],
                        X_transformed[y == i, 1],
                        color=color,
                        lw=2,
                        label=target_name)
Example #56
0
        checkpoint_base = os.path.join(base, str(i))
        checkpoint_filenames = sorted(os.listdir(checkpoint_base))
        for epoch in range(2, epoch_max, epoch_step):
            filename = checkpoint_filenames[epoch]
            checkpoint = torch.load(os.path.join(checkpoint_base, filename))
            model = eval(checkpoint['arch'])()
            model.load_state_dict(checkpoint['state_dict'])
            params = np.zeros((0, ))
            for p in model.parameters():
                params = np.append(params, p.cpu().data.numpy().flatten())
            all_params.append(params)
            all_accs.append(checkpoint['logger'].entries[epoch -
                                                         1]['accuracy'])

    all_params = np.array(all_params)
    # Project the flattened parameter vectors onto their first two principal components.
    all_params = IncrementalPCA(2, batch_size=6).fit_transform(all_params)
    x = [p[0] for p in all_params]
    y = [p[1] for p in all_params]

    seg = len(range(1, epoch_max, epoch_step))
    plt.subplot(121)
    plt.title('All layers (DeepMnistCNN)')
    for i in range(1, 9):
        x_ = x[(i - 1) * seg:i * seg]
        y_ = y[(i - 1) * seg:i * seg]
        a_ = all_accs[(i - 1) * seg:i * seg]
        plt.plot(x_, y_, 'o:', color=cmap((i - 1) / 7))
        for xi, yi, ai in zip(x_, y_, a_):
            plt.annotate(str('{:.1f}'.format(ai * 100)),
                         xy=(xi, yi),
                         xytext=(xi + 0.004, yi + 0.004),
Example #57
0
def pca_compress_channel(X, k):
    incr_pca = IncrementalPCA(n_components=k)
    X_reduced = incr_pca.fit_transform(X).astype(np.float16)
    return (X_reduced, incr_pca)
Example #58
0
from sklearn.decomposition import IncrementalPCA
from sklearn import svm
from sklearn.externals import joblib

datadir = sys.argv[1]
pcadir = sys.argv[2]
n_comp = 300
step = 1000

fnames = [
    os.path.basename(filename) for filename in glob.glob("%s/*.txt" % datadir)
]

used = len(fnames)

ipca = IncrementalPCA(n_components=n_comp)

for batch_start in xrange(0, used, step):
    batch_end = min(batch_start + step, used)
    print "Loading from %d to %d" % (batch_start, batch_end)
    data = []
    labels = []
    for fname in fnames[batch_start:batch_end]:
        print "Loading image", fname
        data.append(np.loadtxt("%s/%s" % (datadir, fname)))
    print np.array(data).shape
    ipca.partial_fit(data)

if os.path.isdir(pcadir):
    shutil.rmtree(pcadir)
os.mkdir(pcadir)
Example #59
0
File: empca_v1.py Project: edgarxi/empca
from __future__ import division

import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA
#import tensorflow as tf
X = np.random.random((3, 4))

pca = PCA(n_components=1)
inc_pnca = IncrementalPCA(n_components=1)
pca.fit(X.T)
phi = pca.components_
print phi
inc_pnca.fit(X.T)
print inc_pnca.components_


def EMPCA(X, n_var, n_obs, n_epochs):
    X = X - np.mean(X)
    assert X.shape == (n_var, n_obs), "shape error in dataset"
    phi = np.random.rand(n_var)
    c = np.zeros(n_obs)
    for i in range(n_epochs):  #repeat until convergence
        for j, x_j in enumerate(X.T):  #E-step
            #print np.dot(x_j, phi)
            #print x_j.shape, phi.shape
            c[j] = np.dot(x_j, phi)
            #print ",",(c*X).shape
            #print np.sum(c*X,axis=1).shape
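        # M-step: update the direction phi from the current coefficients c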
        phi = np.sum(c * X, axis=1) / np.sum((c**2))
        #print phi.shape
        phi = phi / np.linalg.norm(phi)
Example #60
0
PARSER.add_argument(
    '-c',
    '--chunksize',
    type=int,
    default=50000,
    help='the number of lines to be read from the INPUT file ' +
    'at a time and stored in memory, the default value is 50000')

ARGS = PARSER.parse_args()

READER = pd.read_csv(ARGS.input,
                     sep=ARGS.delimiter,
                     chunksize=ARGS.chunksize,
                     header=None,
                     error_bad_lines=False)
PCA = IncrementalPCA(n_components=ARGS.dimensions)

total_read = 0
total_written = 0

if ARGS.model and os.path.isfile(ARGS.model):
    PCA = joblib.load(ARGS.model)
else:
    for chunk in READER:
        PCA.partial_fit(chunk)
        total_read += ARGS.chunksize
        #print(str(total_read) + ' vectors read ...\n')
    if ARGS.model:
        if not ARGS.model.endswith('.pkl'):
            ARGS.model += '.pkl'
        joblib.dump(PCA, ARGS.model)