def ipca():
    train_features, test_features = gf.get_tfidf()
    vectorizer = gf.get_tfidf()
    n_components = 250
    ipca = IncrementalPCA(n_components=n_components, batch_size=1250)

    start_time = time.time()
    print('start ipca on train')
    X_ipca = ipca.fit_transform(train_features)
    runtime = time.time() - start_time
    print('-----')
    print('%.2f seconds to ipca on train' % runtime)
    print('-----')
    train_features = None  # free memory
    print('ipca train done')

    np.savetxt('train_features.csv', X_ipca, fmt='%.8e', delimiter=",")
    X_ipca = None
    print('ipca train file done')

    test_features = gf.get_tfidf(vectorizer, False)
    # Use transform (not fit_transform) so the test set is projected with the
    # components learned from the training set instead of re-fitting on test data.
    Y_ipca = ipca.transform(test_features)
    test_features, vectorizer = None, None
    print('ipca test done')

    np.savetxt('test_features.csv', Y_ipca, fmt='%.8e', delimiter=",")
    Y_ipca = None
    print('ipca test file done')
def reduceDataset(self, nr=3, method='PCA'):
    '''Reduces the dimensionality of a given dataset using different techniques
    provided by the scikit-learn library.
    Methods available:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    #dataset=self.dataset[Model.in_columns]
    #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
    # PCA
    if method == 'PCA':
        sklearn_pca = sklearnPCA(n_components=nr)
        reduced = sklearn_pca.fit_transform(dataset)
    # Factor Analysis
    elif method == 'FactorAnalysis':
        fa = FactorAnalysis(n_components=nr)
        reduced = fa.fit_transform(dataset)
    # Kernel PCA with RBF kernel
    elif method == 'KPCArbf':
        kpca = KernelPCA(nr, kernel='rbf')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with polynomial kernel
    elif method == 'KPCApoly':
        kpca = KernelPCA(nr, kernel='poly')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with cosine kernel
    elif method == 'KPCAcosine':
        kpca = KernelPCA(nr, kernel='cosine')
        reduced = kpca.fit_transform(dataset)
    # Kernel PCA with sigmoid kernel
    elif method == 'KPCAsigmoid':
        kpca = KernelPCA(nr, kernel='sigmoid')
        reduced = kpca.fit_transform(dataset)
    # Incremental PCA
    elif method == 'IPCA':
        ipca = IncrementalPCA(nr)
        reduced = ipca.fit_transform(dataset)
    # Fast ICA
    elif method == 'FastICAParallel':
        fip = FastICA(nr, algorithm='parallel')
        reduced = fip.fit_transform(dataset)
    elif method == 'FastICADeflation':
        fid = FastICA(nr, algorithm='deflation')
        reduced = fid.fit_transform(dataset)
    elif method == 'All':
        self.dimensionalityReduction(nr=nr)
        return self
    self.ModelInputs.update({method: reduced})
    self.datasetsAvailable.append(method)
    return self
def get_pca_array(list_chunks, topology):
    """
    Takes a list of mdtraj.Trajectory objects, featurizes them into backbone
    alpha-carbon pairwise distances, and performs 2-component incremental PCA
    on the featurized trajectory.

    Parameters
    ----------
    list_chunks: list of mdtraj.Trajectory objects
    topology: str
        Name of the topology file

    Returns
    -------
    Y: np.array shape(frames, features)
    """
    pca = IncrementalPCA(n_components=2)
    top = md.load_prmtop(topology)
    ca_backbone = top.select("name CA")
    pairs = top.select_pairs(ca_backbone, ca_backbone)
    pair_distances = []
    for chunk in list_chunks:
        X = md.compute_distances(chunk, pairs)
        pair_distances.append(X)
    distance_array = np.concatenate(pair_distances)
    print("No. of data points: %d" % distance_array.shape[0])
    print("No. of features (pairwise distances): %d" % distance_array.shape[1])
    Y = pca.fit_transform(distance_array)
    return Y
def dimensionalityReduction(self, nr=5):
    '''Applies all the dimensionality reduction techniques available in this class.
    Techniques available:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    # Keep the parallel/deflation results aligned with the keys defined below.
    ficaP = fip.fit_transform(dataset)
    ficaD = fid.fit_transform(dataset)
    '''isomap=Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
    try:
        lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
    except ValueError:
        lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset)
    try:
        lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid, i_components,
              ficaD, ficaP]  # ,isomap,lle1,lle2,lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine',
            'KPCAsigmoid', 'IPCA', 'FastICADeflation',
            'FastICAParallel']  # ,'Isomap','LLE','LLEmodified','LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    [self.datasetsAvailable.append(key) for key in keys]
    # debug
    #dataset=pd.DataFrame(self.ModelInputs['Dataset'])
    #dataset['Output']=self.ModelOutput
    #self.debug['Dimensionalityreduction']=dataset
    ###
    return self
def reduce_data(features, out_dir, dim=10, first_column=True):
    array = np.load(features)
    subarray = array
    if not first_column:
        subarray = array[:, 1:]
    ipca = IncrementalPCA(n_components=dim, copy=False, batch_size=500000)
    # fit_transform returns the reduced data; it does not modify subarray in place.
    new_array = ipca.fit_transform(subarray)
    # when it cannot fit into memory do it incrementally like below
    # new_array_1 = tsvd.fit_transform(subarray[:1500000, :])
    # new_array_2 = tsvd.fit_transform(subarray[1500000:3400000, :])
    # new_array_3 = tsvd.fit_transform(subarray[3400000:, :])
    # new_array = np.vstack([new_array_1, new_array_2, new_array_3])
    if not first_column:
        new_array = np.c_[array[:, 0], new_array]
    assert new_array.shape[0] == array.shape[0]
    np.save(os.path.join(out_dir, os.path.basename(features) + "_pca"), new_array)
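# A minimal sketch of the "do it incrementally" idea mentioned in the comment above,
# assuming `subarray` and `dim` as defined in reduce_data. Unlike re-running
# fit_transform on each slice (which learns different components per slice),
# partial_fit accumulates a single set of components and transform then projects
# every slice with it. The slice size is illustrative.
from sklearn.decomposition import IncrementalPCA
import numpy as np

slice_size = 1_500_000  # hypothetical chunk size; tune to available memory
ipca = IncrementalPCA(n_components=dim)
for start in range(0, subarray.shape[0], slice_size):      # pass 1: learn components
    ipca.partial_fit(subarray[start:start + slice_size])
new_array = np.vstack([                                     # pass 2: project slices
    ipca.transform(subarray[start:start + slice_size])
    for start in range(0, subarray.shape[0], slice_size)
])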
def ipca(data, labels, new_dimension):
    print("start incremental pca...")
    if hasattr(data, "todense"):
        data = np.array(data.todense())
    start = time.time()
    pca = IncrementalPCA(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def run_pca(n_components, n_sites, order_dict, sim_mat):
    output_file = open('pca_100000_100', 'w')
    ipca = IncrementalPCA(n_components=n_components, batch_size=8000)
    sim_mat_ipca = ipca.fit_transform(sim_mat)
    var_sim_ipca = ipca.explained_variance_ratio_
    output_file.write(",".join(str(x) for x in var_sim_ipca) + '\n')
    for siteid in order_dict:
        stringa = ' '.join([siteid,
                            str(sim_mat_ipca[order_dict[siteid], 0]),
                            str(sim_mat_ipca[order_dict[siteid], 1]),
                            str(sim_mat_ipca[order_dict[siteid], 2]),
                            str(sim_mat_ipca[order_dict[siteid], 3]),
                            str(sim_mat_ipca[order_dict[siteid], 4]),
                            str(sim_mat_ipca[order_dict[siteid], 5]),
                            str(sim_mat_ipca[order_dict[siteid], 6])])
        output_file.write(stringa + '\n')
    output_file.close()  # flush buffered output before writing the binned file

    n_bins = 1000.
    binned = np.empty((n_sites, 5)).astype(np.int32)
    for k in range(5):
        delta = (sim_mat_ipca[:, k].max() - sim_mat_ipca[:, k].min()) / n_bins
        min_k = sim_mat_ipca[:, k].min()
        for i in range(n_sites):
            binned[i, k] = int((sim_mat_ipca[i, k] - min_k) / delta)

    f = open('pc_100000_100.csv', 'w')
    for siteid in order_dict:
        stringa = ' '.join([siteid,
                            str(binned[order_dict[siteid], 0]),
                            str(binned[order_dict[siteid], 1]),
                            str(binned[order_dict[siteid], 2]),
                            str(binned[order_dict[siteid], 3]),
                            str(binned[order_dict[siteid], 4])])
        f.write(stringa + '\n')
    f.close()
def test_incremental_pca():
    """Incremental PCA on dense arrays."""
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)
    X_transformed = ipca.fit_transform(X)
    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
                        pca.explained_variance_ratio_.sum(), 1)
    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]))
class PCASK(AbstractFeature):
    def __init__(self, n_components):
        AbstractFeature.__init__(self)
        self.n_components = n_components
        #for key in options:
        #    setattr(self, key, options[key])

    def compute(self, X, y):
        if X.ndim == 3:
            X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
        self.ipca = IncrementalPCA(n_components=self.n_components, batch_size=None)
        return self.ipca.fit_transform(X)

    def extract(self, X):
        if X.ndim == 2:
            X = X.reshape((X.shape[0] * X.shape[1]))
        return list(self.ipca.transform([X])[0])

    def __repr__(self):
        return "PCASK"
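# Hypothetical usage sketch for PCASK; `images` and `labels` are illustrative
# placeholders, not names from the original code base.
import numpy as np

images = np.random.rand(100, 32, 32)     # 100 images of 32x32 pixels
labels = np.random.randint(0, 2, 100)

feat = PCASK(n_components=10)
train_features = feat.compute(images, labels)   # fits IncrementalPCA, returns shape (100, 10)
single = feat.extract(images[0])                # projects one image with the fitted model
print(len(single))                              # -> 10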
def pca( data: Union[AnnData, np.ndarray, spmatrix], n_comps: int = N_PCS, zero_center: Optional[bool] = True, svd_solver: str = 'auto', random_state: int = 0, return_info: bool = False, use_highly_variable: Optional[bool] = None, dtype: str = 'float32', copy: bool = False, chunked: bool = False, chunk_size: Optional[int] = None, ) -> Union[AnnData, np.ndarray, spmatrix]: """Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. Parameters ---------- data The (annotated) data matrix of shape ``n_obs`` × ``n_vars``. Rows correspond to cells and columns to genes. n_comps Number of principal components to compute. zero_center If `True`, compute standard PCA from covariance matrix. If ``False``, omit zero-centering variables (uses :class:`~sklearn.decomposition.TruncatedSVD`), which allows to handle sparse input efficiently. Passing ``None`` decides automatically based on sparseness of the data. svd_solver SVD solver to use: ``'arpack'`` for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) ``'randomized'`` for the randomized algorithm due to Halko (2009). ``'auto'`` (the default) chooses automatically depending on the size of the problem. random_state Change to use different initial states for the optimization. return_info Only relevant when not passing an :class:`~anndata.AnnData`: see “**Returns**”. use_highly_variable Whether to use highly variable genes only, stored in ``.var['highly_variable']``. By default uses them if they have been determined beforehand. dtype Numpy data type string to which to convert the result. copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked If ``True``, perform an incremental PCA on segments of ``chunk_size``. The incremental PCA automatically zero centers and ignores settings of ``random_seed`` and ``svd_solver``. If ``False``, perform a full PCA. chunk_size Number of observations to include in each chunk. Required if ``chunked=True`` was passed. Returns ------- X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray` If `data` is array-like and ``return_info=False`` was passed, this function only returns `X_pca`… adata : :class:`~anndata.AnnData` …otherwise if ``copy=True`` it returns or else adds fields to ``adata``: ``.obsm['X_pca']`` PCA representation of data. ``.varm['PCs']`` The principal components containing the loadings. ``.uns['pca']['variance_ratio']``) Ratio of explained variance. ``.uns['pca']['variance']`` Explained variance, equivalent to the eigenvalues of the covariance matrix. """ # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.` This will likely ' 'become the Scanpy default in the future.') data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: adata = data.copy() if copy else data else: adata = AnnData(data) logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4) if adata.n_vars < n_comps: n_comps = adata.n_vars - 1 logg.msg('reducing number of computed PCs to', n_comps, 'as dim of data is only', adata.n_vars, v=4) if use_highly_variable is True and 'highly_variable' not in adata.var.keys( ): raise ValueError( 'Did not find adata.var[\'highly_variable\']. 
' 'Either your data already only consists of highly-variable genes ' 'or consider running `pp.filter_genes_dispersion` first.') if use_highly_variable is None: use_highly_variable = True if 'highly_variable' in adata.var.keys( ) else False adata_comp = adata[:, adata. var['highly_variable']] if use_highly_variable else adata if chunked: if not zero_center or random_state or svd_solver != 'auto': logg.msg('Ignoring zero_center, random_state, svd_solver', v=4) from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) else: if zero_center is None: zero_center = not issparse(adata_comp.X) if zero_center: from sklearn.decomposition import PCA if issparse(adata_comp.X): logg.msg( ' as `zero_center=True`, ' 'sparse input is densified and may ' 'lead to huge memory consumption', v=4) X = adata_comp.X.toarray( ) # Copying the whole adata_comp.X here, could cause memory problems else: X = adata_comp.X pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) else: from sklearn.decomposition import TruncatedSVD logg.msg( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely', v=4) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state) X = adata_comp.X X_pca = pca_.fit_transform(X) if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca if use_highly_variable: adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) adata.varm['PCs'][ adata.var['highly_variable']] = pca_.components_.T else: adata.varm['PCs'] = pca_.components_.T adata.uns['pca'] = {} adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.msg(' finished', t=True, end=' ', v=4) logg.msg( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4) return adata if copy else None else: if return_info: return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_ else: return X_pca
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, IncrementalPCA

iris = load_iris()
x = iris.data
y = iris.target
target_names = iris.target_names
n_components = 2

pca = PCA(n_components=n_components)
x_pca = pca.fit_transform(x)

ipca = IncrementalPCA(n_components=n_components, batch_size=10)  # process in batches of 10 samples
x_ipca = ipca.fit_transform(x)

colors = ['navy', 'turquoise', 'darkorange']
for x_transformed, title, number in [(x_pca, 'PCA', 1), (x_ipca, 'Incremental PCA', 2)]:
    plt.figure(number, figsize=(8, 8))
    for color, i, target_name in zip(colors, [0, 1, 2], target_names):
        plt.scatter(x_transformed[y == i, 0], x_transformed[y == i, 1],
                    color=color, lw=2, label=target_name)
    if 'Incremental' in title:
        err = np.abs(np.abs(x_pca) - np.abs(x_ipca)).mean()
def feature_extraction_pca(raw_data_features, raw_data_labels, timestamps):
    """
    Args:
        raw_data_features: The fourth column is the barometer data.

    Returns:
        features: Features extracted from the data, where features[:, 0] is the
            mean magnitude of acceleration and features[:, 1] is its variance,
            followed by skew and kurtosis, the FFT band power aggregated over
            equally-spaced frequency bands and over bands in logarithmic scale,
            and finally the slope of pressure. The feature matrix is then
            reduced to 5 components with incremental PCA.
    """
    features = None
    labels = None
    accel_magnitudes = np.sqrt((raw_data_features[:, 0]**2).reshape(-1, 1) +
                               (raw_data_features[:, 1]**2).reshape(-1, 1) +
                               (raw_data_features[:, 2]**2).reshape(-1, 1))
    # The window size for feature extraction
    segment_size = 128
    for i in range(0, accel_magnitudes.shape[0] - segment_size, 64):
        # Compute mean, variance, skew and kurtosis of acceleration for each segment
        segment = accel_magnitudes[i:i + segment_size]
        accel_mean = np.mean(segment)
        accel_var = np.var(segment)
        accel_var_skew = skew(segment)
        accel_var_kurt = kurtosis(segment)
        segment_fft_powers = np.abs(np.fft.fft(segment))**2
        #print(segment_fft_powers)
        # Aggregate band power within frequency ranges, either equally spaced
        # (window size = 32) or in logarithmic scale.
        # Band power of equally-spaced bands: 4 features
        equal_band_power = list()
        window_size = 32
        for j in range(0, len(segment_fft_powers), window_size):
            equal_band_power.append(sum(segment_fft_powers[j: j + 32]).tolist()[0])
        # Band power of bands in logarithmic scale: 7 features
        log_band_power = list()
        freqs = [0, 2, 4, 8, 16, 32, 64, 128]
        for j in range(len(freqs) - 1):
            log_band_power.append(sum(segment_fft_powers[freqs[j]: freqs[j + 1]]).tolist()[0])
        # Slope of barometer data
        # bar_slope = raw_data_features[i+segment_size-1, 3] - raw_data_features[i, 3]
        bar_slope = np.polyfit(timestamps[i:i + segment_size],
                               raw_data_features[i:i + segment_size, 3], 1)[0]
        # bar_slope = np.polyfit([x*0.1 for x in range(segment_size)], raw_data_features[i:i+segment_size, 3], 1)[0]
        feature = [accel_mean, accel_var, accel_var_skew, accel_var_kurt] + \
            equal_band_power + log_band_power + [bar_slope]
        if features is None:
            features = np.array([feature])
        else:
            features = np.append(features, [feature], axis=0)
        label = Counter(raw_data_labels[i:i + segment_size][:, 0].tolist()).most_common(1)[0][0]
        if labels is None:
            labels = np.array([label])
        else:
            labels = np.append(labels, [label], axis=0)
    pca = IncrementalPCA(n_components=5)
    features = pca.fit_transform(features)
    return features, labels
pca.fit(X_train)
print(pca.components_)
colnames = list(X_train.columns)
pcs_df = pd.DataFrame({'PC1': pca.components_[0],
                       'PC2': pca.components_[1],
                       'Feature': colnames})
pcs_df.head()

explained_variance_ratio_ = np.around(pca.explained_variance_ratio_, decimals=3)
explained_variance_ratio_

fig = plt.figure(figsize=(12, 8))
plt.plot(np.cumsum(pca.explained_variance_ratio_), 'ro-', linewidth=2)
plt.title('Scree Plot')
plt.xlabel('Principal components')
plt.ylabel('Cumulative explained variance')
plt.show()

from sklearn.decomposition import IncrementalPCA

pca_final = IncrementalPCA(n_components=2)
df_pca = pca_final.fit_transform(
    cleaned_master_data.drop(['Hospital overall rating'], axis=1))
df_pca.shape
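# Minimal sketch of reading the n_components choice off the scree plot above
# programmatically. Assumes `pca` was fit on X_train as shown; the 95% variance
# threshold is an illustrative choice, not from the original notebook.
import numpy as np

cumvar = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.searchsorted(cumvar, 0.95) + 1)  # smallest k explaining >= 95% of variance
print(f"Keep {n_keep} components ({cumvar[n_keep - 1]:.1%} variance explained)")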
# Every entry has 13 features and one binary label. In order to avoid "out-of-memory"
# errors, we need to preprocess our dataset. For this purpose I use StandardScaler for
# scaling and IncrementalPCA for preprocessing the data.
import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler

# I select 10000 rows per chunk to be sure that it will finish (I have only 2 GB RAM).
chunk_size = 10000
# Only three components is not a good idea, but I have a huge dataset and the
# preprocessed data can also be large.
components = 3

for chunk in pd.read_csv('train.tar.gz', compression='gzip', sep=';', header=0,
                         quotechar='"', chunksize=chunk_size):
    # Reset the index so the concat below aligns with the 0-based PCA frame.
    labels = chunk.iloc[:, 2].reset_index(drop=True)
    selected_features = chunk.iloc[:, 3:16]
    scaled_features = StandardScaler().fit_transform(selected_features)
    # Note: a fresh scaler and IncrementalPCA are fitted on every chunk, so each
    # chunk gets its own components; appending with mode='a' keeps every chunk
    # instead of overwriting the file on each iteration.
    ipca = IncrementalPCA(n_components=components)
    principalComponents = ipca.fit_transform(scaled_features)
    preprocessed_data = pd.DataFrame(data=principalComponents)
    merged_data = pd.concat([preprocessed_data, labels], axis=1)
    merged_data.to_csv('preprocessed_data.csv', mode='a', sep=';', header=False)

# P.S. Of course chunksize and the number of components depend on your available
# computer resources.
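# A hedged sketch of a multi-pass variant, under the same train.tar.gz layout and
# the `chunk_size`/`components` settings above: one shared scaler and one shared
# IncrementalPCA are learned across all chunks, then every chunk is projected with
# the same components. A sketch of the idea, not the original author's script.
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

read_kwargs = dict(compression='gzip', sep=';', header=0, quotechar='"',
                   chunksize=chunk_size)

scaler = StandardScaler()
ipca = IncrementalPCA(n_components=components)

for chunk in pd.read_csv('train.tar.gz', **read_kwargs):        # pass 1: scaler statistics
    scaler.partial_fit(chunk.iloc[:, 3:16])
for chunk in pd.read_csv('train.tar.gz', **read_kwargs):        # pass 2: PCA components
    ipca.partial_fit(scaler.transform(chunk.iloc[:, 3:16]))
for i, chunk in enumerate(pd.read_csv('train.tar.gz', **read_kwargs)):  # pass 3: project
    pcs = pd.DataFrame(ipca.transform(scaler.transform(chunk.iloc[:, 3:16])))
    out = pd.concat([pcs, chunk.iloc[:, 2].reset_index(drop=True)], axis=1)
    out.to_csv('preprocessed_data.csv', mode='w' if i == 0 else 'a',
               header=False, sep=';', index=False)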
def fit_pca(self, matrix):
    """Fit a PCA on the matrix and save the fitted sklearn model."""
    reducer = IncrementalPCA(n_components=800, batch_size=2500)
    reduced_matrix = reducer.fit_transform(matrix)
    self.rev_matrix_pca = reduced_matrix
    self.pca_model = reducer
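# Because fit_pca keeps the fitted estimator in self.pca_model, new rows can later
# be projected into the same 800-dimensional space without refitting. Hypothetical
# usage sketch; `obj` and `new_matrix` are illustrative names, not from the original.
obj.fit_pca(matrix)                                 # learns components, stores obj.pca_model
new_reduced = obj.pca_model.transform(new_matrix)   # projects new data with the same components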
def pca( data: Union[AnnData, np.ndarray, spmatrix], n_comps: Optional[int] = None, zero_center: Optional[bool] = True, svd_solver: str = 'arpack', random_state: AnyRandom = 0, return_info: bool = False, use_highly_variable: Optional[bool] = None, dtype: str = 'float32', copy: bool = False, chunked: bool = False, chunk_size: Optional[int] = None, ) -> Union[AnnData, np.ndarray, spmatrix]: """\ Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. .. versionchanged:: 1.5.0 In previous versions, computing a PCA on a sparse matrix would make a dense copy of the array for mean centering. As of scanpy 1.5.0, mean centering is implicit. While results are extremely similar, they are not exactly the same. If you would like to reproduce the old results, pass a dense array. Parameters ---------- data The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. n_comps Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. zero_center If `True`, compute standard PCA from covariance matrix. If `False`, omit zero-centering variables (uses :class:`~sklearn.decomposition.TruncatedSVD`), which allows to handle sparse input efficiently. Passing `None` decides automatically based on sparseness of the data. svd_solver SVD solver to use: `'arpack'` (the default) for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) `'randomized'` for the randomized algorithm due to Halko (2009). `'auto'` chooses automatically depending on the size of the problem. `'lobpcg'` An alternative SciPy solver. .. versionchanged:: 1.4.5 Default value changed from `'auto'` to `'arpack'`. Efficient computation of the principal components of a sparse matrix currently only works with the `'arpack`' or `'lobpcg'` solvers. random_state Change to use different initial states for the optimization. return_info Only relevant when not passing an :class:`~anndata.AnnData`: see “**Returns**”. use_highly_variable Whether to use highly variable genes only, stored in `.var['highly_variable']`. By default uses them if they have been determined beforehand. dtype Numpy data type string to which to convert the result. copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked If `True`, perform an incremental PCA on segments of `chunk_size`. The incremental PCA automatically zero centers and ignores settings of `random_seed` and `svd_solver`. If `False`, perform a full PCA. chunk_size Number of observations to include in each chunk. Required if `chunked=True` was passed. Returns ------- X_pca : :class:`~scipy.sparse.spmatrix`, :class:`~numpy.ndarray` If `data` is array-like and `return_info=False` was passed, this function only returns `X_pca`… adata : anndata.AnnData …otherwise if `copy=True` it returns or else adds fields to `adata`: `.obsm['X_pca']` PCA representation of data. `.varm['PCs']` The principal components containing the loadings. `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. 
""" logg_start = logg.info(f'computing PCA') # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.`') data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: adata = data.copy() if copy else data else: adata = AnnData(data) if use_highly_variable is True and 'highly_variable' not in adata.var.keys( ): raise ValueError( 'Did not find adata.var[\'highly_variable\']. ' 'Either your data already only consists of highly-variable genes ' 'or consider running `pp.highly_variable_genes` first.') if use_highly_variable is None: use_highly_variable = True if 'highly_variable' in adata.var.keys( ) else False if use_highly_variable: logg.info(' on highly variable genes') adata_comp = (adata[:, adata.var['highly_variable']] if use_highly_variable else adata) if n_comps is None: min_dim = min(adata_comp.n_vars, adata_comp.n_obs) if settings.N_PCS >= min_dim: n_comps = min_dim - 1 else: n_comps = settings.N_PCS logg.info(f' with n_comps={n_comps}') random_state = check_random_state(random_state) X = adata_comp.X if chunked: if not zero_center or random_state or svd_solver != 'arpack': logg.debug('Ignoring zero_center, random_state, svd_solver') from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((X.shape[0], n_comps), X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) elif (not issparse(X) or svd_solver == "randomized") and zero_center: from sklearn.decomposition import PCA if issparse(X) and svd_solver == "randomized": # This is for backwards compat. Better behaviour would be to either error or use arpack. logg.warning( "svd_solver 'randomized' does not work with sparse input. Densifying the array. " "This may take a very large amount of memory.") X = X.toarray() pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) X_pca = pca_.fit_transform(X) elif issparse(X) and zero_center: from sklearn.decomposition import PCA if svd_solver == "auto": svd_solver = "arpack" if svd_solver not in {'lobpcg', 'arpack'}: raise ValueError( 'svd_solver: {svd_solver} can not be used with sparse input.\n' 'Use "arpack" (the default) or "lobpcg" instead.') output = _pca_with_sparse(X, n_comps, solver=svd_solver, random_state=random_state) # this is just a wrapper for the results X_pca = output['X_pca'] pca_ = PCA(n_components=n_comps, svd_solver=svd_solver) pca_.components_ = output['components'] pca_.explained_variance_ = output['variance'] pca_.explained_variance_ratio_ = output['variance_ratio'] elif not zero_center: from sklearn.decomposition import TruncatedSVD logg.debug( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely' ) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state, algorithm=svd_solver) X_pca = pca_.fit_transform(X) else: raise Exception("This shouldn't happen. 
Please open a bug report.") if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca adata.uns['pca'] = {} adata.uns['pca']['params'] = { 'zero_center': zero_center, 'use_highly_variable': use_highly_variable, } if use_highly_variable: adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) adata.varm['PCs'][ adata.var['highly_variable']] = pca_.components_.T else: adata.varm['PCs'] = pca_.components_.T adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.info(' finished', time=logg_start) logg.debug( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)') return adata if copy else None else: logg.info(' finished', time=logg_start) if return_info: return ( X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_, ) else: return X_pca
def main(): print('\033[1m' + 'Loading all the datasets...' + '\033[0m') arffs_dic = obtain_arffs('./datasets/') # Extract an specific database dataset_name = 'breast-w' # possible datasets ('hypothyroid', 'breast-w', 'waveform') dat1 = arffs_dic[dataset_name] df1 = pd.DataFrame(dat1[0]) # original data in pandas dataframe groundtruth_labels = df1[df1.columns[ len(df1.columns) - 1]].values # original labels in a numpy array df1 = df1.drop(df1.columns[len(df1.columns) - 1], 1) if dataset_name == 'hypothyroid': df1 = df1.drop( 'TBG', 1 ) # This column only contains NaNs so does not add any value to the clustering data1 = df1.values # original data in a numpy array without labels load = Preprocess() data_x = load.preprocess_method(data1) data_x = data_x.astype(np.float64) le = LabelEncoder() le.fit(np.unique(groundtruth_labels)) groundtruth_labels = le.transform(groundtruth_labels) num_clusters = len( np.unique(groundtruth_labels)) # Number of different labels # -------------------------------------------------------------------------------Compute covariance and eigenvectors original_mean = np.mean(data_x, axis=0) cov_m = compute_covariance(data_x, original_mean) eig_vals, eig_vect = np.linalg.eig(cov_m) idxsort = eig_vals.argsort()[::-1] eig_vals = eig_vals[idxsort].real eig_vect = eig_vect[:, idxsort].real # ---------------------------------------------------------------------Decide the number of features we want to keep prop_variance = 0.9 k = proportion_of_variance(eig_vals, prop_variance) print('\nThe value of K selected to obtain a proportion of variance = ' + str(prop_variance) + ' is: ' + str(k) + '\n') eig_vals_red = eig_vals[:k] eig_vect_red = eig_vect[:, :k] # Eigenvectors are in columns (8xk) # ---------------------------------------------------------------------------------Reduce dimensionality of the data # A1) Using our implementation of PCA transf_data_x = np.dot((eig_vect_red.T), (data_x - original_mean).T).T # B1) Using the PCA implementation of sklearn pca = PCA(n_components=k) transf_data_x_sklearn = pca.fit_transform(data_x) # C1) Using the incremental PCA implementation of sklearn incrementalpca = IncrementalPCA(n_components=k) transf_data_x_sklearn2 = incrementalpca.fit_transform(data_x) # --------------------------------------------------------------------------------------------------Reconstruct data # A2) Reconstruct data with our method reconstruct_data_x = np.dot(eig_vect_red, transf_data_x.T) reconstruct_data_x = reconstruct_data_x.T + original_mean # B2) Reconstruct data with PCA sklearn reconstruct_data_x1 = np.dot(pca.components_.T, transf_data_x_sklearn.T) reconstruct_data_x1 = reconstruct_data_x1.T + original_mean # C2) Reconstruct data with incremental PCA sklearn reconstruct_data_x2 = np.dot(incrementalpca.components_.T, transf_data_x_sklearn2.T) reconstruct_data_x2 = reconstruct_data_x2.T + original_mean # ----------------------------------------------------------------Error between original data and reconstructed data # A3) Error between original data and reconstruct data error = reconstruct_data_x - data_x total_error = (np.sum(abs(error)) / np.sum(abs(data_x))) * 100 print( 'The relative error after reconstructing the original matrix with K = ' + str(k) + ' is ' + '\033[1m' + '\033[' '94m' + str(round(total_error, 2)) + '%' + '\033[0m' + ' [using our implementation of PCA]') # B3) Error between original data and reconstruct data 1 error1 = reconstruct_data_x1 - data_x total_error1 = (np.sum(abs(error1)) / np.sum(abs(data_x))) * 100 print( 'The relative 
error after reconstructing the original matrix with K = ' + str(k) + ' is ' + '\033[1m' + '\033[' '94m' + str(round(total_error1, 2)) + '%' + '\033[0m' + ' [using pca.fit_transform of Sklearn]') # C3) Error between original data and reconstruct data 2 error2 = reconstruct_data_x2 - data_x total_error2 = (np.sum(abs(error2)) / np.sum(abs(data_x))) * 100 print( 'The relative error after reconstructing the original matrix with K = ' + str(k) + ' is ' + '\033[1m' + '\033[' '94m' + str(round(total_error2, 2)) + '%' + '\033[0m' + ' [using incrementalpca.fit_transform of Sklearn]') # ------------------------------------------------------------------------------Kmeans with dimensionality reduction print( '\n---------------------------------------------------------------------------------------------------------' ) print('K-MEANS APPLIED TO THE ORIGINAL DATA') tester_kmeans(data_x, groundtruth_labels) print( '\n---------------------------------------------------------------------------------------------------------' ) print( 'K-MEANS APPLIED TO THE TRANSFORMED DATA USING OUR IMPLEMENTATION OF PCA' ) labels = tester_kmeans(transf_data_x, groundtruth_labels) print( '\n---------------------------------------------------------------------------------------------------------' ) print( 'K-MEANS APPLIED TO THE TRANSFORMED DATA USING pca.fit_transform OF SKLEARN' ) tester_kmeans(transf_data_x_sklearn, groundtruth_labels) print( '\n---------------------------------------------------------------------------------------------------------' ) print( 'K-MEANS APPLIED TO THE TRANSFORMED DATA USING incrementalpca.fit_transform OF SKLEARN' ) tester_kmeans(transf_data_x_sklearn2, groundtruth_labels) print( '\n---------------------------------------------------------------------------------------------------------' ) # -----------------------------------------------------------------------------------------------------Scatter plots ploting_boolean = False plot_scatters = False # only change to True for a database with not too many features (like breast-w) if ploting_boolean: # Plot eigenvector plt.plot(eig_vals, 'ro-', linewidth=2, markersize=6) plt.title('Magnitude of the eigenvalues') plt.show() if plot_scatters: # Plottings: scatter plots # Original data with groundtruth labels ploting_v(data_x, num_clusters, groundtruth_labels, 'original data with groundtruth labels') # Transfomed data with our implementation of PCA and with groundtruth labels ploting_v(transf_data_x, num_clusters, groundtruth_labels, 'transformed data (our PCA) with groundtruth ' 'labels') # Transfomed data with pca.fit_transform and with groundtruth labels ploting_v( transf_data_x_sklearn, num_clusters, groundtruth_labels, 'transformed data (Sklearn PCA v1) ' 'with groundtruth labels') # Transfomed data with incrementalpca.fit_transform and with groundtruth labels ploting_v( transf_data_x_sklearn2, num_clusters, groundtruth_labels, 'transformed data (Sklearn PCA v2) ' 'with groundtruth labels') # ------------------------------------------------------------------------------------------------------3D plots # Plottings: 3D plots # Original data without labels ploting_v3d(data_x, 1, np.zeros(len(groundtruth_labels)), 'original data without labels') # Original data with groundtruth labels ploting_v3d(data_x, num_clusters, groundtruth_labels, 'original data with groundtruth labels') # Reconstructed data without labels ploting_v3d(reconstruct_data_x, 1, np.zeros(len(groundtruth_labels)), 'reconstructed data without labels') # Transfomed data with our 
implementation of PCA and without labels ploting_v3d(transf_data_x, 1, np.zeros(len(groundtruth_labels)), 'transformed data without labels') # Transfomed data with our implementation of PCA and with groundtruth_labels ploting_v3d(transf_data_x, num_clusters, groundtruth_labels, 'transformed data with groundtruth labels') # Transfomed data with our implementation of PCA and with the labels obtained with our K-means ploting_v3d(transf_data_x, num_clusters, labels, 'transformed data with labels from our K-means') # Plot of the correlation matrix of the dataset plot_corr_matrix(data_x, legend=False)
# ## 2. Dimensionality Reduction
#
# We use Principal Component Analysis to reduce the remaining 176 bands to just
# 4 principal components. This is done with the IncrementalPCA class imported from
# the scikit-learn library. Incremental PCA is chosen because of the large number
# of features in the input data, since it works well when run in small batches.

# In[15]:

# Using the IncrementalPCA method from the scikit-learn library
from sklearn.decomposition import IncrementalPCA

IPCA = IncrementalPCA(n_components=4, batch_size=5)
X_IPCA = IPCA.fit_transform(X_flat_transposed)
print(X_IPCA.shape)

# In[16]:

# The array is transposed to be consistent with the original format
X_IPCA = np.transpose(X_IPCA)
X_IPCA.shape
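# The fit_transform call above still passes the whole flattened cube at once. A
# hedged sketch of the same reduction done batch-wise, for the case where
# X_flat_transposed (pixels x 176 bands) is too large for a single call; the
# batch size here is illustrative.
import numpy as np
from sklearn.decomposition import IncrementalPCA

IPCA = IncrementalPCA(n_components=4)
batch = 10000
for start in range(0, X_flat_transposed.shape[0], batch):       # learn components
    IPCA.partial_fit(X_flat_transposed[start:start + batch])
X_IPCA = np.vstack([IPCA.transform(X_flat_transposed[start:start + batch])
                    for start in range(0, X_flat_transposed.shape[0], batch)])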
print(stanames)
AttSta.setInPaths(InPath)
Files = AttSta.getatt(stanames, 'InPath')

net = LCB_net()
net.AddFilesSta(Files)
X = net.getvarallsta(var=var, by='H', how='mean', From=From, To=To)

# X = X[0:100]
# for i in range(17):
#     X = pd.concat([X, X], axis=1)
# print X.shape

tic = time.clock()
pca = IncrementalPCA(n_components=2, batch_size=3)
K = pca.fit_transform(X)
toc = time.clock()
print(toc - tic)

# For comparison, compute a full PCA
tic = time.clock()
pca = PCA(n_components=2)
H = pca.fit_transform(X)
toc = time.clock()
print(toc - tic)

plt.figure()
plt.plot(K)
plt.figure()
plt.plot(H)
vec.fit(description)
features = vec.transform(description)
US_df['category_id'].nunique()

cls = MiniBatchKMeans(n_clusters=16, random_state=0)
cls.fit(features)
cls.predict(features)

from sklearn.metrics import homogeneity_score
homogeneity_score(US_df.category_id, cls.predict(features))

from sklearn.metrics import completeness_score
completeness_score(US_df.category_id, cls.predict(features))

# reduce the features to 2D
ipca = IncrementalPCA(n_components=2, batch_size=100)
reduced_features = ipca.fit_transform(features.toarray())

# reduce the cluster centers to 2D
reduced_cluster_centers = ipca.transform(cls.cluster_centers_)

# plot the 2D projections rather than the raw high-dimensional tf-idf matrix
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=cls.predict(features))
plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:, 1],
            marker='x', s=150, c='b')

# CLUSTERING TITLE
# TF-IDF
vec = TfidfVectorizer(stop_words="english")
vec.fit(US_df["title"])
def prep_data( train, test, feature_groups, scale_data=False, ): """Feature selection and data preprocessing. Args: feature_groups (string): Which features to use. Options: "all" - All features "submodels" - Only features from submodels "basic_text" - Only basic text features like word counts, etc. "non_linguistic" - All main features not including pos,dep,ent features from spacy "other" - review_stars, grade_level, polarity, subjectivity "spacy_linguistic" - Only pos,ent.dep features from spacy "top_features" - Top 15 features chosen from feature selection steps "pca" - Top 20 PCA Features scale_data (bool, optional): Whether or not to standard scale data. Defaults to False. """ # Feature Selection feature_options = {} feature_options["submodels"] = [ "nb_prob", "svm_pred", "ft_prob", "lda_t1", "lda_t2", "lda_t3", "lda_t4", "lda_t5", ] feature_options["other"] = [ "review_stars", "grade_level", "polarity", "subjectivity", ] feature_options["basic_text"] = [ "word_cnt", "character_cnt", "num_cnt", "uppercase_cnt", "#@_cnt", "sentence_cnt", "lexicon_cnt", "syllable_cnt", "avg_word_len", "token_cnt", "stopword_cnt", "stopword_pct", "ent_cnt", "ent_pct", ] feature_options["spacy_linguistic"] = [ "pos_adj_pct", "pos_adj_cnt", "pos_adp_pct", "pos_adp_cnt", "pos_adv_pct", "pos_adv_cnt", "pos_aux_pct", "pos_aux_cnt", "pos_conj_pct", "pos_conj_cnt", "pos_det_pct", "pos_det_cnt", "pos_intj_pct", "pos_intj_cnt", "pos_noun_pct", "pos_noun_cnt", "pos_num_pct", "pos_num_cnt", "pos_part_pct", "pos_part_cnt", "pos_pron_pct", "pos_pron_cnt", "pos_propn_pct", "pos_propn_cnt", "pos_punct_pct", "pos_punct_cnt", "pos_sconj_pct", "pos_sconj_cnt", "pos_sym_pct", "pos_sym_cnt", "pos_verb_pct", "pos_verb_cnt", "pos_x_pct", "pos_x_cnt", "dep_root_pct", "dep_root_cnt", "dep_acl_pct", "dep_acl_cnt", "dep_acomp_pct", "dep_acomp_cnt", "dep_advcl_pct", "dep_advcl_cnt", "dep_advmod_pct", "dep_advmod_cnt", "dep_agent_pct", "dep_agent_cnt", "dep_amod_pct", "dep_amod_cnt", "dep_appos_pct", "dep_appos_cnt", "dep_attr_pct", "dep_attr_cnt", "dep_aux_pct", "dep_aux_cnt", "dep_auxpass_pct", "dep_auxpass_cnt", "dep_case_pct", "dep_case_cnt", "dep_cc_pct", "dep_cc_cnt", "dep_ccomp_pct", "dep_ccomp_cnt", "dep_compound_pct", "dep_compound_cnt", "dep_conj_pct", "dep_conj_cnt", "dep_csubj_pct", "dep_csubj_cnt", "dep_csubjpass_pct", "dep_csubjpass_cnt", "dep_dative_pct", "dep_dative_cnt", "dep_dep_pct", "dep_dep_cnt", "dep_det_pct", "dep_det_cnt", "dep_dobj_pct", "dep_dobj_cnt", "dep_expl_pct", "dep_expl_cnt", "dep_intj_pct", "dep_intj_cnt", "dep_mark_pct", "dep_mark_cnt", "dep_meta_pct", "dep_meta_cnt", "dep_neg_pct", "dep_neg_cnt", "dep_nmod_pct", "dep_nmod_cnt", "dep_npadvmod_pct", "dep_npadvmod_cnt", "dep_nsubj_pct", "dep_nsubj_cnt", "dep_nsubjpass_pct", "dep_nsubjpass_cnt", "dep_nummod_pct", "dep_nummod_cnt", "dep_oprd_pct", "dep_oprd_cnt", "dep_parataxis_pct", "dep_parataxis_cnt", "dep_pcomp_pct", "dep_pcomp_cnt", "dep_pobj_pct", "dep_pobj_cnt", "dep_poss_pct", "dep_poss_cnt", "dep_preconj_pct", "dep_preconj_cnt", "dep_predet_pct", "dep_predet_cnt", "dep_prep_pct", "dep_prep_cnt", "dep_prt_pct", "dep_prt_cnt", "dep_punct_pct", "dep_punct_cnt", "dep_quantmod_pct", "dep_quantmod_cnt", "dep_relcl_pct", "dep_relcl_cnt", "dep_xcomp_pct", "dep_xcomp_cnt", "ent_cardinal_pct", "ent_cardinal_cnt", "ent_date_pct", "ent_date_cnt", "ent_event_pct", "ent_event_cnt", "ent_fac_pct", "ent_fac_cnt", "ent_gpe_pct", "ent_gpe_cnt", "ent_language_pct", "ent_language_cnt", "ent_law_pct", "ent_law_cnt", "ent_loc_pct", "ent_loc_cnt", "ent_money_pct", 
"ent_money_cnt", "ent_norp_pct", "ent_norp_cnt", "ent_ordinal_pct", "ent_ordinal_cnt", "ent_org_pct", "ent_org_cnt", "ent_percent_pct", "ent_percent_cnt", "ent_person_pct", "ent_person_cnt", "ent_product_pct", "ent_product_cnt", "ent_quantity_pct", "ent_quantity_cnt", "ent_time_pct", "ent_time_cnt", "ent_work_of_art_pct", "ent_work_of_art_cnt", ] feature_options["top_features"] = [ "svm_pred", "ft_prob", "nb_prob", "token_cnt", "review_stars", "polarity", "subjectivity", "grade_level", "character_cnt", "avg_word_len", "lda_t1", "lda_t2", "lda_t3", "lda_t4", "lda_t5", ] feature_options["all"] = ( feature_options["submodels"] + feature_options["other"] + feature_options["basic_text"] + feature_options["spacy_linguistic"] ) feature_options["non_linguistic"] = ( feature_options["submodels"] + feature_options["other"] + feature_options["basic_text"] ) feature_options["pca"] = feature_options["all"] features = feature_options[feature_groups] + [ "review_id", "target_clf", "target_reg", ] train = train[features] test = test[features] # Data Split (Train/Test) X_train = train.drop(columns=["review_id", "target_clf", "target_reg"]) X_test = test.drop(columns=["review_id", "target_clf", "target_reg"]) y_train = train["target_clf"] y_test = test["target_clf"] print("\nData Split Complete") print(f"X_train Shape: {X_train.shape}") print(f"X_test Shape: {X_test.shape}") print(f"y_train Shape: {y_train.shape}") print(f"y_test Shape: {y_test.shape}") # Preprocessing Options if scale_data and feature_groups != "pca": start = time.perf_counter() standard_scaler = StandardScaler() X_train_scaled = standard_scaler.fit_transform(X_train) X_test_scaled = standard_scaler.transform(X_test) end = time.perf_counter() print("\nTrain and Test Data Scaled") print(f"Preprocessing took {(end-start):.2f} seconds.") print(f"X_train Shape: {X_train_scaled.shape}") print(f"X_test Shape: {X_test_scaled.shape}") print(f"y_train Shape: {y_train.shape}") print(f"y_test Shape: {y_test.shape}") return (X_train_scaled, X_test_scaled, y_train, y_test) elif feature_groups == "pca": start = time.perf_counter() standard_scaler = StandardScaler() X_train_scaled = standard_scaler.fit_transform(X_train) X_test_scaled = standard_scaler.transform(X_test) end = time.perf_counter() print("\nTrain and Test Data Scaled") print(f"Feature Scaling took {(end-start):.2f} seconds.") start = time.perf_counter() pca = IncrementalPCA(n_components=20) X_train_pca = pca.fit_transform(X_train_scaled) X_test_pca = pca.transform(X_test_scaled) end = time.perf_counter() print("\nTrain and Test Data PCA Complete") print(f"PCA took {(end-start):.2f} seconds.") print(f"X_train Shape: {X_train_pca.shape}") print(f"X_test Shape: {X_test_pca.shape}") print(f"y_train Shape: {y_train.shape}") print(f"y_test Shape: {y_test.shape}") return (X_train_pca, X_test_pca, y_train, y_test) else: return (X_train, X_test, y_train, y_test)
from PIL import Image
import numpy as np
from pybooru import Danbooru
from pathlib import Path
import urllib.request
import os
from sklearn.externals import joblib
from sklearn.decomposition import IncrementalPCA

classifier = joblib.load("model.pkl")
ipca = IncrementalPCA(n_components=20)

hoge = [0.5] * 200
hoge = np.array(hoge)
#hoge.reshape(1,-1)
#print(hoge.shape)

data = []
data.append(hoge)
data = np.array(data)
print(data.shape)

# Note: fitting a fresh IncrementalPCA here cannot work, because n_components (20)
# may not exceed the number of samples (1). The PCA fitted on the training data
# should be loaded and applied with transform() instead (see the sketch below).
data = ipca.fit_transform(data)
print(data.shape)

pr_label = classifier.predict(data)
print(pr_label)
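# Hedged sketch of the intended inference flow: reuse the PCA fitted at training
# time instead of refitting on a single sample. The "pca.pkl" filename is
# hypothetical; only "model.pkl" appears in the original script.
import numpy as np
from sklearn.externals import joblib

classifier = joblib.load("model.pkl")
ipca = joblib.load("pca.pkl")                    # IncrementalPCA fitted on the training features

sample = np.array([0.5] * 200).reshape(1, -1)    # one 200-dimensional feature vector
reduced = ipca.transform(sample)                 # project into the 20-dim training space
print(classifier.predict(reduced))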
        print(i.shape)
        i = np.pad(i, pad_width=(0, max - i.shape[0]), mode='constant',
                   constant_values=0).flatten()
        print(i.shape)
    return array_list, max


def switch_list_to_ndarray(array_list, max):
    # np.array() needs an argument and ndarrays have no append();
    # stacking the list directly builds the array in one call.
    return np.array(array_list)


spectrograms, max_length = clean_and_pad_mfcc(spectrograms)
spectrograms = np.array(spectrograms)
print(type(spectrograms[0]))
print(spectrograms[0].shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(spectrograms, labels,
                                                     test_size=0.4, random_state=0)

from sklearn.decomposition import IncrementalPCA
# batch_size must be at least n_components, otherwise IncrementalPCA raises an error.
ipca = IncrementalPCA(n_components=100, batch_size=100)
# fit_transform returns the reduced data; assign the result if the projected
# X_train is needed later.
ipca.fit_transform(X_train)
def parse_args():
    parser = argparse.ArgumentParser(
        description='Tag wikipedia article names with artist names')
    parser.add_argument('--infile', default='apsp.npy', help='APSP matrix')
    parser.add_argument('--outfile', default='model.pkl', help='Model save file')
    parser.add_argument('--num_components', default=100, type=int,
                        help='Number of latent topics in the model')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    model = IncrementalPCA(n_components=args.num_components)
    apsp = np.load(args.infile)
    # apsp_gpu = gpuarray.GPUArray(np.shape(apsp), np.float32, order="F")
    # apsp_gpu.set(apsp)
    print('Fitting model')
    # model = model.fit_transform(apsp_gpu)
    # Note: rebinding `model` to fit_transform's output means the transformed
    # matrix (not the fitted IncrementalPCA estimator) is what gets dumped below.
    model = model.fit_transform(apsp)
    print('Saving model')
    joblib.dump(model, args.outfile)
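# Hedged variant for the case where the fitted estimator itself should be saved,
# as the '--outfile' help text suggests, alongside the embedding; the
# 'embedding.npy' filename is illustrative, not from the original script.
ipca = IncrementalPCA(n_components=args.num_components)
embedding = ipca.fit_transform(apsp)
joblib.dump(ipca, args.outfile)        # the fitted IncrementalPCA model
np.save('embedding.npy', embedding)    # the reduced APSP matrix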
def main(): """ Get data from db and save it as csv """ bq = BQHandler() io = IO(gs_bucket=options.gs_bucket) viz = Viz(io=io) starttime, endtime = io.get_dates(options) logging.info('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = options.label_params + options.feature_params + options.meta_params aggs = io.get_aggs_from_param_names(options.feature_params) if options.model == 'rf': model = RandomForestRegressor( n_estimators=options.n_estimators, n_jobs=-1, min_samples_leaf=options.min_samples_leaf, min_samples_split=options.min_samples_split, max_features=options.max_features, max_depth=options.max_depth, bootstrap=options.bootstrap) elif options.model == 'lr': model = SGDRegressor(warm_start=True, max_iter=options.n_loops, shuffle=options.shuffle, power_t=options.power_t, penalty=options.regularizer, learning_rate=options.learning_rate, eta0=options.eta0, alpha=options.alpha, tol=0.0001) elif options.model == 'svr': model = SVR() elif options.model == 'ard': model = ARDRegression(n_iter=options.n_loops, alpha_1=options.alpha_1, alpha_2=options.alpha_2, lambda_1=options.lambda_1, lambda_2=options.lambda_2, threshold_lambda=options.threshold_lambda, fit_intercept=options.fit_intercept, copy_X=options.copy_X) elif options.model == 'gp': k_long_term = 66.0**2 * RBF(length_scale=67.0) k_seasonal = 2.4**2 * RBF(length_scale=90.0) * ExpSineSquared( length_scale=150, periodicity=1.0, periodicity_bounds=(0, 10000)) k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78) k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel( noise_level=0.19**2) #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise model = GaussianProcessRegressor( kernel=kernel_gpml, #alpha=0, optimizer=None, normalize_y=True) elif options.model == 'llasso': model = LocalizedLasso(num_iter=options.n_loops, batch_size=options.batch_size) elif options.model == 'nlasso': model = NetworkLasso(num_iter=options.n_loops, batch_size=options.batch_size) graph_data = pd.read_csv(options.graph_data, names=[ 'date', 'start_hour', 'src', 'dst', 'type', 'sum_delay', 'sum_ahead', 'add_delay', 'add_ahead', 'train_count' ]) #stations_to_pick = options.stations_to_pick.split(',') #graph = model.fetch_connections(graph_data, stations_to_pick) model.fetch_connections(graph_data) if options.pca: ipca = IncrementalPCA(n_components=options.pca_components, whiten=options.whiten, copy=False) rmses, maes, r2s, skills, start_times, end_times, end_times_obj = [], [], [], [], [], [], [] X_complete = [] # Used for feature selection start = starttime end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) if end > endtime: end = endtime while end <= endtime and start < end: logging.info('Processing time range {} - {}'.format( start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M'))) # Load data ############################################################ try: logging.info('Reading data...') data = bq.get_rows(start, end, loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names, only_winters=options.only_winters) data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['train_count', 'delay'], aggs=aggs) # Filter 
only timesteps with large distribution in the whole network if options.filter_delay_limit is not None: data = io.filter_delay_with_limit(data, options.filter_delay_limit) if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) if options.y_avg: data = io.calc_delay_avg(data) data.sort_values(by=['time', 'trainstation'], inplace=True) if options.impute: logging.info('Imputing missing values...') data.drop(columns=['train_type'], inplace=True) data = imputer.fit_transform(data) data.loc[:, 'train_type'] = None if options.month: logging.info('Adding month to the dataset...') data['month'] = data['time'].map(lambda x: x.month) if 'month' not in options.feature_params: options.feature_params.append('month') if options.model == 'ard' and len(data) > options.n_samples: logging.info('Sampling {} values from data...'.format( options.n_samples)) data = data.sample(options.n_samples) l_data = data.loc[:, options.label_params] f_data = data.loc[:, options.feature_params] except ValueError as e: f_data, l_data = [], [] if len(f_data) < 2 or len(l_data) < 2: start = end end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) continue logging.info('Processing {} rows...'.format(len(f_data))) train, test = train_test_split(data, test_size=0.1) X_train = train.loc[:, options.feature_params].astype(np.float32).values y_train = train.loc[:, options.label_params].astype( np.float32).values.ravel() X_test = test.loc[:, options.feature_params].astype(np.float32).values y_test = test.loc[:, options.label_params].astype( np.float32).values.ravel() logging.debug('Features shape: {}'.format(X_train.shape)) if options.normalize: logging.info('Normalizing data...') xscaler, yscaler = StandardScaler(), StandardScaler() X_train = xscaler.fit_transform(X_train) X_test = xscaler.transform(X_test) if len(options.label_params) == 1: y_train = yscaler.fit_transform(y_train.reshape(-1, 1)).ravel() #y_test = yscaler.transform(y_test.reshape(-1, 1)).ravel() else: y_train = yscaler.fit_transform(y_train) #y_test = yscaler.transform(y_test) if options.pca: logging.info('Doing PCA analyzis for the data...') X_train = ipca.fit_transform(X_train) fname = options.output_path + '/ipca_explained_variance.png' viz.explained_variance(ipca, fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) X_test = ipca.fit_transform(X_test) if options.model == 'llasso': graph_data = pd.read_csv(options.graph_data, names=[ 'date', 'start_hour', 'src', 'dst', 'type', 'sum_delay', 'sum_ahead', 'add_delay', 'add_ahead', 'train_count' ]) graph = model.fetch_connections(graph_data) logging.debug('Features shape after pre-processing: {}'.format( X_train.shape)) # FIT ################################################################## if options.cv: logging.info('Doing random search for hyper parameters...') if options.model == 'rf': param_grid = { "n_estimators": [10, 100, 200, 800], "max_depth": [3, 20, None], "max_features": ["auto", "sqrt", "log2", None], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4, 10], "bootstrap": [True, False] } elif options.model == 'lr': param_grid = { "penalty": [None, 'l2', 'l1'], "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1], "l1_ratio": [0.1, 0.15, 0.2, 0.5], "shuffle": [True, False], "learning_rate": ['constant', 'optimal', 'invscaling'], "eta0": [0.001, 0.01, 0.1], "power_t": [0.1, 0.25, 0.5] } elif options.model == 'svr': param_grid = { "C": [0.001, 0.01, 0.1, 1, 10], "epsilon": [0.01, 0.1, 0.5], "kernel": ['rbf', 'linear', 'poly', 
'sigmoid', 'precomputed'], "degree": [2, 3, 4], "shrinking": [True, False], "gamma": [0.001, 0.01, 0.1], "coef0": [0, 0.1, 1] } else: raise ("No param_grid set for given model ({})".format( options.model)) random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=int( options.n_iter_search), n_jobs=-1) random_search.fit(X_train, y_train) logging.info("RandomizedSearchCV done.") fname = options.output_path + '/random_search_cv_results.txt' io.report_cv_results(random_search.cv_results_, fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) sys.exit() else: logging.info('Training...') if options.model in ['rf', 'svr', 'ard', 'gp']: model.fit(X_train, y_train) if options.feature_selection: X_complete = X_train y_complete = y_train meta_complete = data.loc[:, options.meta_params] elif options.model in ['llasso']: model.fit(X_train, y_train, stations=train.loc[:, 'trainstation'].values) elif options.model in ['nlasso']: model.partial_fit(X_train, y_train, stations=train.loc[:, 'trainstation'].values) else: model.partial_fit(X_train, y_train) if options.feature_selection: try: X_complete = np.append(X_complete, X_train) y_complete = np.append(Y_complete, y_train) meta_complete = meta_complete.append( data.loc[:, options.meta_params]) except (ValueError, NameError): X_complete = X_train y_complete = y_train meta_complete = data.loc[:, options.meta_params] # EVALUATE ############################################################# # Check training score to estimate amount of overfitting # Here we assume that we have a datetime index (from time columns) y_pred_train = model.predict(X_train) rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train)) mae_train = np.sqrt(mean_squared_error(y_train, y_pred_train)) logging.info('Training data RMSE: {} and MAE: {}'.format( rmse_train, mae_train)) #try: if True: print(train) #range = ('2013-02-01','2013-02-28') range = ('2010-01-01', '2010-01-02') X_train_sample = train.loc[range[0]:range[1], options.feature_params].astype( np.float32).values target = train.loc[range[0]:range[1], options.label_params].astype( np.float32).values.ravel() y_pred_sample = model.predict(X_train_sample) times = train.loc[range[0]:range[1], 'time'].values df = pd.DataFrame(times + y_pred_sample) print(df) sys.exit() # Draw visualisation fname = '{}/timeseries_training_data.png'.format( options.output_path) viz.plot_delay(times, target, y_pred, 'Delay for station {}'.format(stationName), fname) fname = '{}/scatter_all_stations.png'.format(options.vis_path) viz.scatter_predictions(times, target, y_pred, savepath=options.vis_path, filename='scatter_{}'.format(station)) #except KeyError: # pass # Mean delay over the whole dataset (both train and validation), # used to calculate Brier Skill if options.y_avg: mean_delay = 3.375953418071136 else: mean_delay = 6.011229358531166 if options.model == 'llasso': print('X_test shape: {}'.format(X_test.shape)) y_pred, weights = model.predict(X_test, test.loc[:, 'trainstation'].values) else: y_pred = model.predict(X_test) if options.normalize: y_pred = yscaler.inverse_transform(y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) rmse_stat = math.sqrt( mean_squared_error(y_test, np.full_like(y_test, mean_delay))) skill = 1 - rmse / rmse_stat rmses.append(rmse) maes.append(mae) r2s.append(r2) skills.append(skill) start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S')) end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S')) 
end_times_obj.append(end) if options.model in ['rf', 'lr', 'ard', 'gp']: logging.info('R2 score for training: {}'.format( model.score(X_train, y_train))) logging.info('RMSE: {}'.format(rmse)) logging.info('MAE: {}'.format(mae)) logging.info('R2 score: {}'.format(r2)) logging.info('Brier Skill Score score: {}'.format(skill)) start = end end = start + timedelta(days=int(options.day_step), hours=int(options.hour_step)) if end > endtime: end = endtime # SAVE ##################################################################### io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file) if options.normalize: fname = options.save_path + '/xscaler.pkl' io.save_scikit_model(xscaler, filename=fname, ext_filename=fname) fname = options.save_path + '/yscaler.pkl' io.save_scikit_model(yscaler, filename=fname, ext_filename=fname) if options.model == 'rf': fname = options.output_path + '/rfc_feature_importance.png' viz.rfc_feature_importance(model.feature_importances_, fname, feature_names=options.feature_params) #io._upload_to_bucket(filename=fname, ext_filename=fname) try: fname = options.output_path + '/learning_over_time.png' viz.plot_learning_over_time(end_times_obj, rmses, maes, r2s, filename=fname) #io._upload_to_bucket(filename=fname, ext_filename=fname) except Exception as e: logging.error(e) error_data = { 'start_times': start_times, 'end_times': end_times, 'rmse': rmses, 'mae': maes, 'r2': r2s, 'skill': skills } fname = '{}/training_time_validation_errors.csv'.format( options.output_path) io.write_csv(error_data, filename=fname, ext_filename=fname) # FEATURE SELECTION ######################################################## if options.feature_selection: logging.info('Doing feature selection...') selector = SelectFromModel(model, prefit=True) print(pd.DataFrame(data=X_complete)) X_selected = selector.transform(X_complete) selected_columns = f_data.columns.values[selector.get_support()] logging.info( 'Selected following parameters: {}'.format(selected_columns)) data_sel = meta_complete.join( pd.DataFrame(data=y_complete, columns=options.label_params)).join( pd.DataFrame(data=X_selected, columns=selected_columns)) print(pd.DataFrame(data=X_selected, columns=selected_columns)) print(data_sel)
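A note on the PCA step above: ipca.fit_transform is called a second time on X_test, so the test split ends up projected onto a different basis than the training split. A minimal sketch of the leak-free pattern on stand-in arrays (shapes and n_components are illustrative, not taken from the pipeline above):

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X_train, X_test = rng.rand(200, 30), rng.rand(50, 30)   # stand-ins for the split above

xscaler = StandardScaler()
X_train = xscaler.fit_transform(X_train)   # statistics learned on train only
X_test = xscaler.transform(X_test)         # reuse the train statistics

ipca = IncrementalPCA(n_components=10)
X_train = ipca.fit_transform(X_train)      # components learned on train only
X_test = ipca.transform(X_test)            # project test onto the same components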
# print(train_data)
# print(train_label)
end_time = time.clock()
print("Loading Complete \nTime =", end_time - start_time)  # Time =

# Data decomposition
print("Now Decomposing Data")
start_time = time.clock()
#from sklearn.decomposition import TruncatedSVD
#decomp = TruncatedSVD(n_components=1000,n_iter=5)
#decomp.fit(train_data)
train_data = pca.fit_transform(train_data)
end_time = time.clock()
print("Decomposing Complete \nTime =", end_time - start_time)  # Time =
print(train_data)

# Saving decomposed data as csv
csv_decomp_train_path = 'csv_pca900decomp_alphabets_train.csv'
with open(csv_decomp_train_path, 'w') as f:
    writer = csv.writer(f, lineterminator='\n')
    # writerows writes one CSV row per sample; writerow would dump the whole
    # matrix into a single row
    writer.writerows(train_data)
    #writer.writerow('\n')

########## Learning ###################################
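The block above saves only the transformed training matrix; if the same projection is needed later for test data, the fitted decomposer itself also has to be persisted. A small sketch with joblib (the object and filename are illustrative; in the script above the already-fitted pca would be dumped instead):

import joblib
import numpy as np
from sklearn.decomposition import IncrementalPCA

# Stand-in for the decomposer fitted above
rng = np.random.RandomState(0)
demo_pca = IncrementalPCA(n_components=5).fit(rng.rand(100, 20))

joblib.dump(demo_pca, 'pca_alphabets.pkl')      # persist the fitted model
pca_loaded = joblib.load('pca_alphabets.pkl')   # reload it in another script
# pca_loaded.transform(test_data) now reproduces the training-time projection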
    def projectPCA(self):
        return np.dot(self.data, self.eigenvector)[:, :2]


if __name__ == '__main__':
    rng = np.random.RandomState(1)

    # Load Data
    data = np.dot(rng.rand(2, 2), rng.randn(2, 200)).T
    plt.scatter(data[:, 0], data[:, 1])  # Show Data
    plt.axis('equal')
    plt.savefig('PCAData.png')

    # Numpy PCA
    pca = PCANumpy(data=data)

    # Print projected PCA
    print('Numpy PCA: ')
    print(pca.projectPCA())
    print('\n============\n')

    # Scikitlearn PCA
    pcaSklearn = IncrementalPCA(n_components=2, batch_size=10)
    newSklearnPCA = pcaSklearn.fit_transform(data)
    print('Scikit-learn PCA: ')
    print(newSklearnPCA)

    # Distance between matrices
    print('\n============\n')
    print("Distance Between Matrices:")
    print(np.linalg.norm(pca.projectPCA() - newSklearnPCA))
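Only the projectPCA method of the PCANumpy helper appears above; a minimal sketch of what the full class could look like, assuming the constructor centers the data and stores the covariance eigenvectors sorted by descending eigenvalue (the real class may differ):

import numpy as np

class PCANumpy:
    def __init__(self, data):
        # Center the data so the covariance is taken around the mean
        self.data = data - data.mean(axis=0)
        cov = np.cov(self.data, rowvar=False)
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        order = np.argsort(eigenvalues)[::-1]
        self.eigenvalue = eigenvalues[order]
        self.eigenvector = eigenvectors[:, order]

    def projectPCA(self):
        # Project onto the top-2 principal axes, as in the method above
        return np.dot(self.data, self.eigenvector)[:, :2]

Eigenvectors are only defined up to sign, so the matrix distance printed above can be large even when the NumPy and scikit-learn projections span the same subspace.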
def post_eval(data, params, replot_runtimeplots=False): fontsize = 18 if replot_runtimeplots: print('Plotting runtime plots...') try: les = np.loadtxt(params['saveto'] + 'les.out') plot_les(les, params, fontsize) except: print('LE plotting failed.') try: pws = np.loadtxt(params['saveto'] + 'pws.out') plot_traj(pws, params, fontsize) except: print('Projected trajectory plotting failed.') d_rng = intersection(params['freeze_d_its'], (params['start_lam_it'], params['max_iter'] - 1)) g_rng = intersection(params['freeze_g_its'], (params['start_lam_it'], params['max_iter'] - 1)) if d_rng is not None and g_rng is not None: both_rng = intersection(list(d_rng), list(g_rng)) else: both_rng = None print('Plotting gradient norms...') try: ds = np.loadtxt(params['saveto'] + 'd_norm.out') fig = plt.figure() ax = plt.subplot(111) plt.plot(range(len(ds)), ds, 'k-') if d_rng is not None: plt.plot(d_rng, ds[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, ds[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, ds[both_rng], '-', color='lime') ax.set_ylabel('Discriminator Gradient L2 Norm', fontsize=fontsize) ax.set_xlabel('Iteration', fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) plt.title('Final Norm: {:.3e}'.format(ds[-1]), fontsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [loc for loc in locs if loc >= 0 and loc < len(ds)] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'd_norm.pdf') except: print('d_norm.out not found.') try: gs = np.loadtxt(params['saveto'] + 'g_norm.out') fig = plt.figure() ax = plt.subplot(111) plt.plot(range(len(gs)), gs, 'k-') if d_rng is not None: plt.plot(d_rng, gs[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, gs[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, gs[both_rng], '-', color='lime') ax.set_ylabel('Generator Gradient L2 Norm', fontsize=fontsize) ax.set_xlabel('Iteration', fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) plt.title('Final Norm: {:.3e}'.format(gs[-1]), fontsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [loc for loc in locs if loc >= 0 and loc < len(gs)] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'g_norm.pdf') except: print('g_norm.out not found.') print('Plotting loss...') try: fs = np.loadtxt(params['saveto'] + 'loss.out') fig = plt.figure() ax = plt.subplot(111) plt.plot(range(len(fs)), np.array(fs), 'k-') if d_rng is not None: plt.plot(d_rng, fs[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, fs[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, fs[both_rng], '-', color='lime') ax.set_ylabel('Minimax Loss', fontsize=fontsize) ax.set_xlabel('Iteration', fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) plt.title('Final Loss: {:.3e}'.format(fs[-1]), fontsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [loc for loc in locs if loc >= 0 and loc < len(fs)] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'loss.pdf') except: print('loss.out not found.') print('Loading weights from saved files...') weights = [] for w_i in range(params['start_lam_it'], params['max_iter'], params['weights_every']): w_D = flatten_nested( pickle.load( 
open(params['saveto'] + 'weights/D_' + str(w_i) + '.pkl', 'rb'))) w_G = flatten_nested( pickle.load( open(params['saveto'] + 'weights/G_' + str(w_i) + '.pkl', 'rb'))) weights.append(np.hstack([w_D, w_G])) weights = np.vstack(weights) d_rng = shift_range(d_rng, shift=-params['start_lam_it'], keep_every=params['weights_every']) g_rng = shift_range(g_rng, shift=-params['start_lam_it'], keep_every=params['weights_every']) both_rng = shift_range(both_rng, shift=-params['start_lam_it'], keep_every=params['weights_every']) print('Plotting PCA of trajectory...') ipca = IncrementalPCA(n_components=2, batch_size=10) X_ipca = ipca.fit_transform(weights) fig, ax = plt.subplots() path = mpath.Path(X_ipca) verts = path.interpolated(steps=1).vertices x, y = verts[:, 0], verts[:, 1] z = np.linspace(0, 1, len(x)) colorline(x, y, z, cmap=plt.get_cmap('Greys'), linewidth=1.0) if d_rng is not None: plt.plot(X_ipca[d_rng, 0], X_ipca[d_rng, 1], '-', color='dodgerblue', lw=0.5) if g_rng is not None: plt.plot(X_ipca[g_rng, 0], X_ipca[g_rng, 1], 'r-', lw=0.5) if both_rng is not None: plt.plot(X_ipca[both_rng, 0], X_ipca[both_rng, 1], '-', color='lime', lw=0.5) ax.set_xlim([X_ipca[:, 0].min(), X_ipca[:, 0].max()]) ax.set_ylim([X_ipca[:, 1].min(), X_ipca[:, 1].max()]) plt.title('Weights Trajectory Projected onto Top-2 PCs\n' + r'($p2p_x,p2p_y$)' + ' = ({:.3f},{:.3f})'.format(np.ptp(x), np.ptp(y)), fontsize=fontsize) plt.tick_params(axis='both', which='major', bottom=False, top=False, left=False, right=False) ax.set_yticklabels([]) ax.set_xticklabels([]) fig.tight_layout() fig.savefig(params['saveto'] + 'weights_pca.pdf') plt.close(fig) print('Plotting PCA of normalized trajectory...') ipca2 = IncrementalPCA(n_components=2, batch_size=10) weights_normalized = (weights - weights.min(axis=0)) / ( np.ptp(weights, axis=0) + 1e-10) X_ipca2 = ipca2.fit_transform(weights_normalized) fig, ax = plt.subplots() path2 = mpath.Path(X_ipca2) verts2 = path2.interpolated(steps=1).vertices x2, y2 = verts2[:, 0], verts2[:, 1] z2 = np.linspace(0, 1, len(x2)) colorline(x2, y2, z2, cmap=plt.get_cmap('Greys'), linewidth=1.0) if d_rng is not None: plt.plot(X_ipca2[d_rng, 0], X_ipca2[d_rng, 1], '-', color='dodgerblue', lw=0.5) if g_rng is not None: plt.plot(X_ipca2[g_rng, 0], X_ipca2[g_rng, 1], 'r-', lw=0.5) if both_rng is not None: plt.plot(X_ipca2[both_rng, 0], X_ipca2[both_rng, 1], '-', color='lime', lw=0.5) plt.title('Normalized Weights Trajectory\nProjected onto Top-2 PCs\n' + r'($p2p_x,p2p_y$)' + ' = ({:.3f},{:.3f})'.format(np.ptp(x2), np.ptp(y2)), fontsize=fontsize) plt.tick_params(axis='both', which='major', bottom=False, top=False, left=False, right=False) ax.set_yticklabels([]) ax.set_xticklabels([]) fig.tight_layout() fig.savefig(params['saveto'] + 'weights_pca2.pdf') plt.close(fig) print('Plotting norm of weights over trajectory...') w_norms = np.linalg.norm(weights, axis=1) fig = plt.figure() plt.plot(range(len(w_norms)), w_norms, 'k-') if d_rng is not None: plt.plot(d_rng, w_norms[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, w_norms[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, w_norms[both_rng], '-', color='lime') plt.xlabel('Iteration', fontsize=fontsize) plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize) plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' + '={:.3f})'.format(np.ptp(w_norms)), fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [loc for loc in locs if loc >= 0 and loc < 
len(w_norms)] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'weight_norms.pdf') plt.close(fig) print('Plotting distance of weights from mean over trajectory...') weights_mean = weights.mean(axis=0) w_mean_norms = np.linalg.norm(weights - weights_mean, axis=1) fig = plt.figure() plt.plot(range(len(w_mean_norms)), w_mean_norms, 'k-') if d_rng is not None: plt.plot(d_rng, w_mean_norms[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, w_mean_norms[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, w_mean_norms[both_rng], '-', color='lime') plt.xlabel('Iteration', fontsize=fontsize) plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize) plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' + '={:.3f})'.format(np.ptp(w_mean_norms)), fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [loc for loc in locs if loc >= 0 and loc < len(w_mean_norms)] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'weight_mean_norms.pdf') plt.close(fig) print('Plotting angular distance of weights from mean over trajectory...') w_mean_angles = 180 / np.pi * np.arccos( np.sum(weights * weights_mean, axis=1) / w_norms / np.linalg.norm(weights_mean)) fig = plt.figure() plt.plot(range(len(w_mean_angles)), w_mean_angles, 'k-') if d_rng is not None: plt.plot(d_rng, w_mean_angles[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, w_mean_angles[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, w_mean_angles[both_rng], '-', color='lime') plt.title('Angular Deviation of Weights\nfrom Mean Over Trajectory\n' + r'($p2p$' + '={:.3f})'.format(np.ptp(w_mean_angles)), fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) plt.ylabel('Angles in degrees', fontsize=fontsize) plt.xlabel('Iteration', fontsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [ loc for loc in locs if loc >= 0 and loc < len(w_mean_angles) ] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'weight_mean_angles.pdf') plt.close(fig) print( 'Plotting distance of weights from closest vector over trajectory...') D = pairwise_distances(weights) closest = weights[D.sum(axis=1).argmin()] w_closest_norms = np.linalg.norm(weights - closest, axis=1) fig = plt.figure() plt.plot(range(len(w_closest_norms)), w_closest_norms, 'k-') if d_rng is not None: plt.plot(d_rng, w_closest_norms[d_rng], '-', color='dodgerblue') if g_rng is not None: plt.plot(g_rng, w_closest_norms[g_rng], 'r-') if both_rng is not None: plt.plot(both_rng, w_closest_norms[both_rng], '-', color='lime') plt.title('Norm of Weights Over Trajectory\n' + r'($p2p$' + '={:.3f})'.format(np.ptp(w_closest_norms)), fontsize=fontsize) plt.tick_params(axis='both', which='major', labelsize=fontsize) plt.xlabel('Iteration', fontsize=fontsize) plt.ylabel(r'Norm of Weights ($||w||$)', fontsize=fontsize) locs, _ = plt.xticks() if locs[-1] >= 10000: newlocs = [ loc for loc in locs if loc >= 0 and loc < len(w_closest_norms) ] xlabels = [str(int(loc) // 1000) + 'k' for loc in newlocs] plt.xticks(newlocs, xlabels) plt.tight_layout() fig.savefig(params['saveto'] + 'weight_closest_norms.pdf') plt.close(fig) print('Plotting sample series over epochs...') if params['n_viz'] 
> 0: np_samples = [] for viz_i in range(0, params['max_iter'], params['viz_every']): np_samples.append( np.load(params['saveto'] + 'samples/' + str(viz_i) + '.npy')) data.plot_series(np_samples, params) print('Complete.')
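The colorline helper used for the trajectory plots above is not defined in this excerpt; a sketch of one common implementation based on matplotlib's LineCollection (the project's own helper may differ in details):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

def colorline(x, y, z, cmap=plt.get_cmap('Greys'), linewidth=1.0):
    # Build one segment per consecutive pair of points
    points = np.array([x, y]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)
    # Color each segment by the midpoint of its z values (here, iteration progress)
    z = np.asarray(z)
    lc = LineCollection(segments, cmap=cmap)
    lc.set_array(0.5 * (z[:-1] + z[1:]))
    lc.set_linewidth(linewidth)
    plt.gca().add_collection(lc)
    return lc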
def train_cluster(data_type=0, dimension_reduction=0, cluster_way=0, n_components=50, threshold=2, n_clusters=210, branching_factor=50, linkage=0, max_iter=500, eps=1.0): if data_type == 0: train_data = load_stage2_tf_idf("") elif data_type == 1: train_data = load_stage2_tf_idf("") nn_data = load_nn_stage2_features() train_data = pd.merge(train_data, nn_data, 'left', on="file_name") elif data_type == 2: train_data = load_nn_stage2_features() elif data_type == 3: train_data = load_stage2_tf_idf("1000") nn_data = load_nn_stage2_features() train_data = pd.merge(train_data, nn_data, 'left', on="file_name") dll = load_stage2_tf_idf("_dll") train_data = pd.merge(train_data, dll, 'left', on="file_name") dll = load_stage2_tf_idf("_hkey", "first") train_data = pd.merge(train_data, dll, 'left', on="file_name") dll = load_stage2_tf_idf("_hkey", "last") train_data = pd.merge(train_data, dll, 'left', on="file_name") train_data.fillna(0, inplace=True) elif data_type == 4: train_data = load_stage2_tf_idf("1000") nn_data = load_nn_stage2_features() train_data = pd.merge(train_data, nn_data, 'left', on="file_name") dll = load_stage2_tf_idf("_dll") train_data = pd.merge(train_data, dll, 'left', on="file_name") dll = load_stage2_tf_idf("_hkey", "first") train_data = pd.merge(train_data, dll, 'left', on="file_name") dll = load_stage2_tf_idf("_hkey", "last") train_data = pd.merge(train_data, dll, 'left', on="file_name") dll = load_clustering_statics_files() train_data = pd.merge(train_data, dll, 'left', on="file_name") train_data.fillna(0, inplace=True) file_name = train_data["file_name"] train_data.drop(columns=["file_name"], inplace=True) X = StandardScaler(with_mean=False).fit_transform(train_data) origin_data = X if dimension_reduction == 0: pass elif dimension_reduction == 1: model = IncrementalPCA(n_components=n_components) X = model.fit_transform(X) elif dimension_reduction == 2: model = NMF(n_components=n_components, init='random', random_state=0, max_iter=max_iter) X = model.fit_transform(X) elif dimension_reduction == 3: model = PCA(n_components=n_components) X = model.fit_transform(X) print(len(X[0])) if cluster_way == 0: mode = ["ward", "complete", "average", "single"] db = AgglomerativeClustering(n_clusters=n_clusters, linkage=mode[linkage]).fit(X) labels = db.labels_ pd.DataFrame(data={ "id": file_name, "family_id": db.labels_ }).to_csv(os.path.join( "predictions", "aggcl" + "_" + str(n_clusters) + "_" + str(data_type) + "_" + str(dimension_reduction) + "_" + str(n_components) + ".csv"), index=False) print(len(set(labels))) elif cluster_way == 1: db = Birch(branching_factor=branching_factor, n_clusters=n_clusters, threshold=threshold).fit(X) labels = db.predict(X) pd.DataFrame(data={ "id": file_name, "family_id": db.labels_ }).to_csv(os.path.join("predictions", "birch" + ".csv"), index=False) print(len(set(labels))) elif cluster_way == 2: db = hdbscan.HDBSCAN(min_cluster_size=40) db.fit(X) labels = db.labels_ pd.DataFrame(data={ "id": file_name, "family_id": db.labels_ }).to_csv(os.path.join("predictions", "hdb_40" + ".csv"), index=False) print(len(set(labels))) elif cluster_way == 3: db = DBSCAN(eps=eps, n_jobs=-1).fit(X) labels = db.labels_ pd.DataFrame(data={ "id": file_name, "family_id": db.labels_ }).to_csv(os.path.join( "predictions", "db" + "_" + str(eps) + "_" + str(dimension_reduction) + ".csv"), index=False) print(len(set(labels))) elif cluster_way == 4: labels = np.zeros((len(file_name), )) pd.DataFrame(data={ "id": file_name, "family_id": np.zeros((len(file_name), )) 
        }).to_csv(os.path.join("predictions", "zeros" + ".csv"), index=False)
    elif cluster_way == 5:
        db = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
        labels = db.labels_
        pd.DataFrame(data={
            "id": file_name,
            "family_id": db.labels_
        }).to_csv(os.path.join("predictions",
                               "kmeans" + str(n_clusters) + ".csv"),
                  index=False)
        print(len(set(labels)))
    elif cluster_way == 6:
        # AffinityPropagation must be fitted before its labels can be read;
        # otherwise `labels` would be undefined for this branch below
        db = AffinityPropagation().fit(X)
        labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)

    scores = evaluate_cluster_performance(origin_data, labels)
    evaluate_cluster_performance(X, labels)

    return scores
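evaluate_cluster_performance, called at the end of train_cluster, is not shown here; a hypothetical sketch of such a helper built on scikit-learn's internal, label-free clustering metrics (the real function may compute different scores or return a different shape):

from sklearn import metrics

def evaluate_cluster_performance(X, labels):
    # Internal metrics that need no ground-truth labels; all require >= 2 clusters
    scores = {}
    if len(set(labels)) > 1:
        scores['silhouette'] = metrics.silhouette_score(X, labels)
        scores['calinski_harabasz'] = metrics.calinski_harabasz_score(X, labels)
        scores['davies_bouldin'] = metrics.davies_bouldin_score(X, labels)
    return scores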
# In[5]: if __name__ == "__main__": # Load in data file df = pd.read_table(_ROOT_DIR + "data/2018_07_11_pca_te_enhancers/test.tsv") # Get features used in PCA features_df = df.loc[:, "aaaaaa":"tttttt"] # Get labels labels_df = df.loc[:, "label"] # Create the PCA ipca = IncrementalPCA(n_components=N_COMPONENTS) features_transformed = ipca.fit_transform(features_df) # Label the transformed coordinates transformed_df = pca.label_coordinates( transformed_coordinates=features_transformed, labels=labels_df) # Get a list of all unique labels labels_list = list(set(labels_df)) # Create combinations of principal components to plot components = (1, 2, 3, 4, 5) combinations_list = generate_combinations(components) # Plot different combinations of principal components for combination in combinations_list:
import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
import warnings

warnings.filterwarnings('ignore')

# Read the data
f = open(r"breast.txt")
line = f.readline()
data_list = []
while line:
    num = list(map(float, line.split()))
    data_list.append(num)
    line = f.readline()
f.close()
data_array = np.array(data_list)
X = data_array[:, :-1]
y = data_array[:, -1]

# Dimensionality reduction
pca = IncrementalPCA(n_components=1)
X = pca.fit_transform(X)

# Clustering
clst = AgglomerativeClustering(n_clusters=2)
clst.fit(X)

# Compute NMI
result_NMI = metrics.normalized_mutual_info_score(y, clst.labels_)
print("result_NMI:", result_NMI)
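Assuming breast.txt is a plain whitespace-separated numeric table with the label in the last column, the manual read loop above can also be written with np.loadtxt:

import numpy as np

data_array = np.loadtxt("breast.txt")
X = data_array[:, :-1]
y = data_array[:, -1]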
vectors.append(model[word]) labels.append(word) print('- found ' + str(len(labels)) + ' entities x ' + str(len(vectors[0])) + ' dimensions') # convert both lists into numpy vectors for reduction vectors = np.asarray(vectors) labels = np.asarray(labels) print('- done') # if specified, reduce using IncrementalPCA first (down # to a smaller number of dimensions before the final reduction) if run_init_reduction: print('reducing to ' + str(init_dimensions) + 'D using IncrementalPCA...') ipca = IncrementalPCA(n_components=init_dimensions) vectors = ipca.fit_transform(vectors) print('- done') # save reduced vector space to file print('- saving as csv...') with open( 'ModelsAndData/' + model_name + '-' + str(init_dimensions) + 'D.csv', 'w') as f: for i in range(len(labels)): f.write(labels[i] + ',' + ','.join(map(str, vectors[i])) + '\n') # reduce using t-SNE print('reducing to ' + str(num_dimensions) + 'D using t-SNE...') print('- may take a really, really (really) long time :)') vectors = np.asarray(vectors) tsne = TSNE(n_components=num_dimensions, random_state=0)
def pca(data, n_comps=None, zero_center=True, svd_solver='auto', random_state=0, return_info=False, dtype='float32', copy=False, chunked=False, chunk_size=None): """Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. Parameters ---------- data : :class:`~anndata.AnnData`, `np.ndarray`, `sp.sparse` The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. n_comps : `int`, optional (default: 50) Number of principal components to compute. zero_center : `bool` or `None`, optional (default: `True`) If `True`, compute standard PCA from covariance matrix. If `False`, omit zero-centering variables (uses *TruncatedSVD* from scikit-learn), which allows to handle sparse input efficiently. svd_solver : `str`, optional (default: 'auto') SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or 'randomized' for the randomized algorithm due to Halko (2009). 'auto' chooses automatically depending on the size of the problem. random_state : `int`, optional (default: 0) Change to use different intial states for the optimization. return_info : `bool`, optional (default: `False`) Only relevant when not passing an :class:`~anndata.AnnData`: see "Returns". dtype : `str` (default: 'float32') Numpy data type string to which to convert the result. copy : `bool`, optional (default: `False`) If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked : `bool`, optional (default: `False`) If `True`, perform an incremental PCA on segments of `chunk_size`. The incremental PCA automatically zero centers and ignores settings of `random_seed` and `svd_solver`. If `False`, perform a full PCA. chunk_size : `int`, optional (default: `None`) Number of observations to include in each chunk. Required if `chunked` is `True`. Returns ------- If `data` is array-like and `return_info == False`, only returns `X_pca`,\ otherwise returns or adds to `adata`: X_pca : `.obsm` PCA representation of data. PCs : `.varm` The principal components containing the loadings. variance_ratio : `.uns['pca']` Ratio of explained variance. variance : `.uns['pca']` Explained variance, equivalent to the eigenvalues of the covariance matrix. """ # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. 
For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.` This will likely ' 'become the Scanpy default in the future.') if n_comps is None: n_comps = N_PCS if isinstance(data, AnnData): data_is_AnnData = True adata = data.copy() if copy else data else: data_is_AnnData = False adata = AnnData(data) logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4) if adata.n_vars < n_comps: n_comps = adata.n_vars - 1 logg.msg('reducing number of computed PCs to', n_comps, 'as dim of data is only', adata.n_vars, v=4) if chunked: if not zero_center or random_state or svd_solver != 'auto': logg.msg('Ignoring zero_center, random_state, svd_solver', v=4) from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((adata.X.shape[0], n_comps), adata.X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) else: zero_center = zero_center if zero_center is not None else False if issparse( adata.X) else True if zero_center: from sklearn.decomposition import PCA if issparse(adata.X): logg.msg( ' as `zero_center=True`, ' 'sparse input is densified and may ' 'lead to huge memory consumption', v=4) X = adata.X.toarray( ) # Copying the whole adata.X here, could cause memory problems else: X = adata.X pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) else: from sklearn.decomposition import TruncatedSVD logg.msg( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely', v=4) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state) X = adata.X X_pca = pca_.fit_transform(X) if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca adata.varm['PCs'] = pca_.components_.T adata.uns['pca'] = {} adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.msg(' finished', t=True, end=' ', v=4) logg.msg( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4) return adata if copy else None else: if return_info: return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_ else: return X_pca
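The chunked branch above learns the components with partial_fit and then transforms the data chunk by chunk; a standalone sketch of that same two-pass pattern on a plain NumPy array (chunk size and shapes are illustrative):

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X = rng.rand(1000, 100)                  # stand-in for adata.X
n_comps, chunk_size = 20, 250

pca_ = IncrementalPCA(n_components=n_comps)
for start in range(0, X.shape[0], chunk_size):
    pca_.partial_fit(X[start:start + chunk_size])    # first pass: learn components

X_pca = np.zeros((X.shape[0], n_comps), dtype=X.dtype)
for start in range(0, X.shape[0], chunk_size):
    end = min(start + chunk_size, X.shape[0])
    X_pca[start:end] = pca_.transform(X[start:end])  # second pass: project each chunk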
print("SparsePCA transforming...") pca = SparsePCA(n_components=args.pl_dim) Train = pca.fit_transform(Train) Devel = pca.fit_transform(Devel) elif args.kpca: print("KernelPCA transforming...") pca = KernelPCA(n_components=args.pl_dim) Train = pca.fit_transform(Train) Devel = pca.fit_transform(Devel) elif args.ipca: print("i-PCA transforming...") ipca = IncrementalPCA(batch_size=args.ipca_batch, copy=True, n_components=args.pl_dim, whiten=True) Train = ipca.fit_transform(Train) Devel = ipca.fit_transform(Devel) elif args.lda: print("LDA transforming...") lda = LDA(n_components=args.pl_dim) if args.arousal: labels = Train_L[:, 0] elif args.valence: labels = Train_L[:, 1] elif args.liking: labels = Train_L[:, 2] lda = lda.fit(Train, labels) #learning the projection matrix Train = lda.transform(Train) Devel = lda.transfrom(Devel)
def main(date, takeSubset=False): """ Reduces the dimensionality of the training data to 3 dimensions, plots the transformed data in 3d space. The idea is to bring out separability between the resistance classes which may be hidden in the dimensionality of the data. :param date: (string) Data collection date YYYY_MMDD :param takeSubset: (boolean) Transform and plot a random subset of the trainng data? :return: (None) """ mkl.set_num_threads(8) # Load the training and testing data into memory trainX, trainY = FileIO.loadTrainingData(date) if takeSubset: indices = np.random.choice(range(0, len(trainY)), size=NUM_SAMPLES, replace=False) X = trainX[indices,:] y = trainY[indices] else: X = trainX y = trainY X = np.nan_to_num(X) # Break the data into resistance classes susIndex = Constants.LABEL_TO_INDEX[Constants.SUSCEPTIBLE] drIndex = Constants.LABEL_TO_INDEX[Constants.DR_RESISTANT] grIndex = Constants.LABEL_TO_INDEX[Constants.GR_RESISTANT] susX = X[y==susIndex, :] drX = X[y==drIndex, :] grX = X[y==grIndex, :] # Transform the data using PCA pca = IncrementalPCA(n_components=6) pointsSUS = pca.fit_transform(susX) pointsGR= pca.fit_transform(grX) pointsDR = pca.fit_transform(drX) # Plot the transformed data in 3D space traceSUS = go.Scatter3d( x=pointsSUS[:, 0], y=pointsSUS[:, 1], z=pointsSUS[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(255, 0, 0, 0)', width=0.1 ), opacity=0 ) ) traceDR = go.Scatter3d( x=pointsDR[:, 0], y=pointsDR[:, 1], z=pointsDR[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(0, 255, 0, 0)', width=0.1 ), opacity=0 ) ) traceGR = go.Scatter3d( x=pointsGR[:, 0], y=pointsGR[:, 1], z=pointsGR[:, 2], mode='markers', marker=dict( size=5, line=dict( color='rgba(0, 0, 255, 0)', width=0.1 ), opacity=0 ) ) data = [traceSUS, traceDR, traceGR] fig = go.Figure(data=data) py.iplot(fig, filename='3D PCA Wavelength Plot') # Plot the principle components eigenSpectra = pca.components_ plt.subplot(3,1,1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[0, :]) plt.title("Principle Components 1 - 3") plt.subplot(3,1,2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[1, :]) plt.subplot(3,1,3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[2, :]) plt.xlabel("Wavelength (nm)") plt.show() plt.clf() plt.subplot(3,1,1) plt.plot(Constants.WAVELENGTHS, eigenSpectra[3, :]) plt.title("Principle Components 4 - 6") plt.subplot(3,1,2) plt.plot(Constants.WAVELENGTHS, eigenSpectra[4, :]) plt.subplot(3,1,3) plt.plot(Constants.WAVELENGTHS, eigenSpectra[5, :]) plt.xlabel("Wavelength (nm)") plt.show()
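One caveat about the transform step above: a separate IncrementalPCA projection is fitted for each resistance class, so the three point clouds are expressed in three different coordinate systems. If the classes are meant to be compared in a shared space, the usual pattern is to fit once on all samples and transform each class with that single model; a sketch on stand-in arrays:

import numpy as np
from sklearn.decomposition import IncrementalPCA

# Stand-ins for X, susX, drX and grX built above
rng = np.random.RandomState(0)
X = rng.rand(300, 240)
susX, drX, grX = X[:100], X[100:200], X[200:]

pca = IncrementalPCA(n_components=6)
pca.fit(X)                        # one basis learned from all classes together
pointsSUS = pca.transform(susX)   # the three projections now share that basis
pointsDR = pca.transform(drX)
pointsGR = pca.transform(grX)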
def get_image_features(data_type, block): """ Method which returns the data type expected """ if data_type == 'lab': block_file_path = '/tmp/lab_img.png' block.save(block_file_path) data = transform.get_LAB_L_SVD_s(Image.open(block_file_path)) if data_type == 'mscn': img_mscn_revisited = transform.rgb_to_mscn(block) # save tmp as img img_output = Image.fromarray(img_mscn_revisited.astype('uint8'), 'L') mscn_revisited_file_path = '/tmp/mscn_revisited_img.png' img_output.save(mscn_revisited_file_path) img_block = Image.open(mscn_revisited_file_path) # extract from temp image data = compression.get_SVD_s(img_block) """if data_type == 'mscn': img_gray = np.array(color.rgb2gray(np.asarray(block))*255, 'uint8') img_mscn = transform.calculate_mscn_coefficients(img_gray, 7) img_mscn_norm = transform.normalize_2D_arr(img_mscn) img_mscn_gray = np.array(img_mscn_norm*255, 'uint8') data = compression.get_SVD_s(img_mscn_gray) """ if data_type == 'low_bits_6': low_bits_6 = transform.rgb_to_LAB_L_low_bits(block, 6) data = compression.get_SVD_s(low_bits_6) if data_type == 'low_bits_5': low_bits_5 = transform.rgb_to_LAB_L_low_bits(block, 5) data = compression.get_SVD_s(low_bits_5) if data_type == 'low_bits_4': low_bits_4 = transform.rgb_to_LAB_L_low_bits(block, 4) data = compression.get_SVD_s(low_bits_4) if data_type == 'low_bits_3': low_bits_3 = transform.rgb_to_LAB_L_low_bits(block, 3) data = compression.get_SVD_s(low_bits_3) if data_type == 'low_bits_2': low_bits_2 = transform.rgb_to_LAB_L_low_bits(block, 2) data = compression.get_SVD_s(low_bits_2) if data_type == 'low_bits_4_shifted_2': data = compression.get_SVD_s(transform.rgb_to_LAB_L_bits( block, (3, 6))) if data_type == 'sub_blocks_stats': block = np.asarray(block) width, height, _ = block.shape sub_width, sub_height = int(width / 4), int(height / 4) sub_blocks = segmentation.divide_in_blocks(block, (sub_width, sub_height)) data = [] for sub_b in sub_blocks: # by default use the whole lab L canal l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b)) # get information we want from svd data.append(np.mean(l_svd_data)) data.append(np.median(l_svd_data)) data.append(np.percentile(l_svd_data, 25)) data.append(np.percentile(l_svd_data, 75)) data.append(np.var(l_svd_data)) area_under_curve = utils.integral_area_trapz(l_svd_data, dx=100) data.append(area_under_curve) # convert into numpy array after computing all stats data = np.asarray(data) if data_type == 'sub_blocks_stats_reduced': block = np.asarray(block) width, height, _ = block.shape sub_width, sub_height = int(width / 4), int(height / 4) sub_blocks = segmentation.divide_in_blocks(block, (sub_width, sub_height)) data = [] for sub_b in sub_blocks: # by default use the whole lab L canal l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b)) # get information we want from svd data.append(np.mean(l_svd_data)) data.append(np.median(l_svd_data)) data.append(np.percentile(l_svd_data, 25)) data.append(np.percentile(l_svd_data, 75)) data.append(np.var(l_svd_data)) # convert into numpy array after computing all stats data = np.asarray(data) if data_type == 'sub_blocks_area': block = np.asarray(block) width, height, _ = block.shape sub_width, sub_height = int(width / 8), int(height / 8) sub_blocks = segmentation.divide_in_blocks(block, (sub_width, sub_height)) data = [] for sub_b in sub_blocks: # by default use the whole lab L canal l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b)) area_under_curve = utils.integral_area_trapz(l_svd_data, dx=50) data.append(area_under_curve) # convert into numpy 
array after computing all stats data = np.asarray(data) if data_type == 'sub_blocks_area_normed': block = np.asarray(block) width, height, _ = block.shape sub_width, sub_height = int(width / 8), int(height / 8) sub_blocks = segmentation.divide_in_blocks(block, (sub_width, sub_height)) data = [] for sub_b in sub_blocks: # by default use the whole lab L canal l_svd_data = np.array(transform.get_LAB_L_SVD_s(sub_b)) l_svd_data = utils.normalize_arr(l_svd_data) area_under_curve = utils.integral_area_trapz(l_svd_data, dx=50) data.append(area_under_curve) # convert into numpy array after computing all stats data = np.asarray(data) if data_type == 'mscn_var_4': data = _get_mscn_variance(block, (100, 100)) if data_type == 'mscn_var_16': data = _get_mscn_variance(block, (50, 50)) if data_type == 'mscn_var_64': data = _get_mscn_variance(block, (25, 25)) if data_type == 'mscn_var_16_max': data = _get_mscn_variance(block, (50, 50)) data = np.asarray(data) size = int(len(data) / 4) indices = data.argsort()[-size:][::-1] data = data[indices] if data_type == 'mscn_var_64_max': data = _get_mscn_variance(block, (25, 25)) data = np.asarray(data) size = int(len(data) / 4) indices = data.argsort()[-size:][::-1] data = data[indices] if data_type == 'ica_diff': current_image = transform.get_LAB_L(block) ica = FastICA(n_components=50) ica.fit(current_image) image_ica = ica.fit_transform(current_image) image_restored = ica.inverse_transform(image_ica) final_image = utils.normalize_2D_arr(image_restored) final_image = np.array(final_image * 255, 'uint8') sv_values = utils.normalize_arr(compression.get_SVD_s(current_image)) ica_sv_values = utils.normalize_arr(compression.get_SVD_s(final_image)) data = abs(np.array(sv_values) - np.array(ica_sv_values)) if data_type == 'svd_trunc_diff': current_image = transform.get_LAB_L(block) svd = TruncatedSVD(n_components=30, n_iter=100, random_state=42) transformed_image = svd.fit_transform(current_image) restored_image = svd.inverse_transform(transformed_image) reduced_image = (current_image - restored_image) U, s, V = compression.get_SVD(reduced_image) data = s if data_type == 'ipca_diff': current_image = transform.get_LAB_L(block) transformer = IncrementalPCA(n_components=20, batch_size=25) transformed_image = transformer.fit_transform(current_image) restored_image = transformer.inverse_transform(transformed_image) reduced_image = (current_image - restored_image) U, s, V = compression.get_SVD(reduced_image) data = s if data_type == 'svd_reconstruct': reconstructed_interval = (90, 200) begin, end = reconstructed_interval lab_img = transform.get_LAB_L(block) lab_img = np.array(lab_img, 'uint8') U, s, V = lin_svd(lab_img, full_matrices=True) smat = np.zeros((end - begin, end - begin), dtype=complex) smat[:, :] = np.diag(s[begin:end]) output_img = np.dot(U[:, begin:end], np.dot(smat, V[begin:end, :])) output_img = np.array(output_img, 'uint8') data = compression.get_SVD_s(output_img) if 'sv_std_filters' in data_type: # convert into lab by default to apply filters lab_img = transform.get_LAB_L(block) arr = np.array(lab_img) images = [] # Apply list of filter on arr images.append(medfilt2d(arr, [3, 3])) images.append(medfilt2d(arr, [5, 5])) images.append(wiener(arr, [3, 3])) images.append(wiener(arr, [5, 5])) # By default computation of current block image s_arr = compression.get_SVD_s(arr) sv_vector = [s_arr] # for each new image apply SVD and get SV for img in images: s = compression.get_SVD_s(img) sv_vector.append(s) sv_array = np.array(sv_vector) _, length = sv_array.shape 
sv_std = [] # normalize each SV vectors and compute standard deviation for each sub vectors for i in range(length): sv_array[:, i] = utils.normalize_arr(sv_array[:, i]) sv_std.append(np.std(sv_array[:, i])) indices = [] if 'lowest' in data_type: indices = utils.get_indices_of_lowest_values(sv_std, 200) if 'highest' in data_type: indices = utils.get_indices_of_highest_values(sv_std, 200) # data are arranged following std trend computed data = s_arr[indices] # with the use of wavelet if 'wave_sv_std_filters' in data_type: # convert into lab by default to apply filters lab_img = transform.get_LAB_L(block) arr = np.array(lab_img) images = [] # Apply list of filter on arr images.append(medfilt2d(arr, [3, 3])) # By default computation of current block image s_arr = compression.get_SVD_s(arr) sv_vector = [s_arr] # for each new image apply SVD and get SV for img in images: s = compression.get_SVD_s(img) sv_vector.append(s) sv_array = np.array(sv_vector) _, length = sv_array.shape sv_std = [] # normalize each SV vectors and compute standard deviation for each sub vectors for i in range(length): sv_array[:, i] = utils.normalize_arr(sv_array[:, i]) sv_std.append(np.std(sv_array[:, i])) indices = [] if 'lowest' in data_type: indices = utils.get_indices_of_lowest_values(sv_std, 200) if 'highest' in data_type: indices = utils.get_indices_of_highest_values(sv_std, 200) # data are arranged following std trend computed data = s_arr[indices] # with the use of wavelet if 'sv_std_filters_full' in data_type: # convert into lab by default to apply filters lab_img = transform.get_LAB_L(block) arr = np.array(lab_img) images = [] # Apply list of filter on arr kernel = np.ones((3, 3), np.float32) / 9 images.append(cv2.filter2D(arr, -1, kernel)) kernel = np.ones((5, 5), np.float32) / 25 images.append(cv2.filter2D(arr, -1, kernel)) images.append(cv2.GaussianBlur(arr, (3, 3), 0.5)) images.append(cv2.GaussianBlur(arr, (3, 3), 1)) images.append(cv2.GaussianBlur(arr, (3, 3), 1.5)) images.append(cv2.GaussianBlur(arr, (5, 5), 0.5)) images.append(cv2.GaussianBlur(arr, (5, 5), 1)) images.append(cv2.GaussianBlur(arr, (5, 5), 1.5)) images.append(medfilt2d(arr, [3, 3])) images.append(medfilt2d(arr, [5, 5])) images.append(wiener(arr, [3, 3])) images.append(wiener(arr, [5, 5])) wave = w2d(arr, 'db1', 2) images.append(np.array(wave, 'float64')) # By default computation of current block image s_arr = compression.get_SVD_s(arr) sv_vector = [s_arr] # for each new image apply SVD and get SV for img in images: s = compression.get_SVD_s(img) sv_vector.append(s) sv_array = np.array(sv_vector) _, length = sv_array.shape sv_std = [] # normalize each SV vectors and compute standard deviation for each sub vectors for i in range(length): sv_array[:, i] = utils.normalize_arr(sv_array[:, i]) sv_std.append(np.std(sv_array[:, i])) indices = [] if 'lowest' in data_type: indices = utils.get_indices_of_lowest_values(sv_std, 200) if 'highest' in data_type: indices = utils.get_indices_of_highest_values(sv_std, 200) # data are arranged following std trend computed data = s_arr[indices] if 'sv_entropy_std_filters' in data_type: lab_img = transform.get_LAB_L(block) arr = np.array(lab_img) images = [] kernel = np.ones((3, 3), np.float32) / 9 images.append(cv2.filter2D(arr, -1, kernel)) kernel = np.ones((5, 5), np.float32) / 25 images.append(cv2.filter2D(arr, -1, kernel)) images.append(cv2.GaussianBlur(arr, (3, 3), 0.5)) images.append(cv2.GaussianBlur(arr, (3, 3), 1)) images.append(cv2.GaussianBlur(arr, (3, 3), 1.5)) images.append(cv2.GaussianBlur(arr, (5, 
5), 0.5)) images.append(cv2.GaussianBlur(arr, (5, 5), 1)) images.append(cv2.GaussianBlur(arr, (5, 5), 1.5)) images.append(medfilt2d(arr, [3, 3])) images.append(medfilt2d(arr, [5, 5])) images.append(wiener(arr, [3, 3])) images.append(wiener(arr, [5, 5])) wave = w2d(arr, 'db1', 2) images.append(np.array(wave, 'float64')) sv_vector = [] sv_entropy_list = [] # for each new image apply SVD and get SV for img in images: s = compression.get_SVD_s(img) sv_vector.append(s) sv_entropy = [ utils.get_entropy_contribution_of_i(s, id_sv) for id_sv, sv in enumerate(s) ] sv_entropy_list.append(sv_entropy) sv_std = [] sv_array = np.array(sv_vector) _, length = sv_array.shape # normalize each SV vectors and compute standard deviation for each sub vectors for i in range(length): sv_array[:, i] = utils.normalize_arr(sv_array[:, i]) sv_std.append(np.std(sv_array[:, i])) indices = [] if 'lowest' in data_type: indices = utils.get_indices_of_lowest_values(sv_std, 200) if 'highest' in data_type: indices = utils.get_indices_of_highest_values(sv_std, 200) # data are arranged following std trend computed s_arr = compression.get_SVD_s(arr) data = s_arr[indices] if 'convolutional_kernels' in data_type: sub_zones = segmentation.divide_in_blocks(block, (20, 20)) data = [] diff_std_list_3 = [] diff_std_list_5 = [] diff_mean_list_3 = [] diff_mean_list_5 = [] plane_std_list_3 = [] plane_std_list_5 = [] plane_mean_list_3 = [] plane_mean_list_5 = [] plane_max_std_list_3 = [] plane_max_std_list_5 = [] plane_max_mean_list_3 = [] plane_max_mean_list_5 = [] for sub_zone in sub_zones: l_img = transform.get_LAB_L(sub_zone) normed_l_img = utils.normalize_2D_arr(l_img) # bilateral with window of size (3, 3) normed_diff = convolution.convolution2D(normed_l_img, kernels.min_bilateral_diff, (3, 3)) std_diff = np.std(normed_diff) mean_diff = np.mean(normed_diff) diff_std_list_3.append(std_diff) diff_mean_list_3.append(mean_diff) # bilateral with window of size (5, 5) normed_diff = convolution.convolution2D(normed_l_img, kernels.min_bilateral_diff, (5, 5)) std_diff = np.std(normed_diff) mean_diff = np.mean(normed_diff) diff_std_list_5.append(std_diff) diff_mean_list_5.append(mean_diff) # plane mean with window of size (3, 3) normed_plane_mean = convolution.convolution2D( normed_l_img, kernels.plane_mean, (3, 3)) std_plane_mean = np.std(normed_plane_mean) mean_plane_mean = np.mean(normed_plane_mean) plane_std_list_3.append(std_plane_mean) plane_mean_list_3.append(mean_plane_mean) # plane mean with window of size (5, 5) normed_plane_mean = convolution.convolution2D( normed_l_img, kernels.plane_mean, (5, 5)) std_plane_mean = np.std(normed_plane_mean) mean_plane_mean = np.mean(normed_plane_mean) plane_std_list_5.append(std_plane_mean) plane_mean_list_5.append(mean_plane_mean) # plane max error with window of size (3, 3) normed_plane_max = convolution.convolution2D( normed_l_img, kernels.plane_max_error, (3, 3)) std_plane_max = np.std(normed_plane_max) mean_plane_max = np.mean(normed_plane_max) plane_max_std_list_3.append(std_plane_max) plane_max_mean_list_3.append(mean_plane_max) # plane max error with window of size (5, 5) normed_plane_max = convolution.convolution2D( normed_l_img, kernels.plane_max_error, (5, 5)) std_plane_max = np.std(normed_plane_max) mean_plane_max = np.mean(normed_plane_max) plane_max_std_list_5.append(std_plane_max) plane_max_mean_list_5.append(mean_plane_max) diff_std_list_3 = np.array(diff_std_list_3) diff_std_list_5 = np.array(diff_std_list_5) diff_mean_list_3 = np.array(diff_mean_list_3) diff_mean_list_5 = 
np.array(diff_mean_list_5) plane_std_list_3 = np.array(plane_std_list_3) plane_std_list_5 = np.array(plane_std_list_5) plane_mean_list_3 = np.array(plane_mean_list_3) plane_mean_list_5 = np.array(plane_mean_list_5) plane_max_std_list_3 = np.array(plane_max_std_list_3) plane_max_std_list_5 = np.array(plane_max_std_list_5) plane_max_mean_list_3 = np.array(plane_max_mean_list_3) plane_max_mean_list_5 = np.array(plane_max_mean_list_5) if 'std_max_blocks' in data_type: data.append(np.std(diff_std_list_3[0:int(len(sub_zones) / 5)])) data.append(np.std(diff_mean_list_3[0:int(len(sub_zones) / 5)])) data.append(np.std(diff_std_list_5[0:int(len(sub_zones) / 5)])) data.append(np.std(diff_mean_list_5[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_std_list_3[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_mean_list_3[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_std_list_5[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_mean_list_5[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_max_std_list_3[0:int(len(sub_zones) / 5)])) data.append( np.std(plane_max_mean_list_3[0:int(len(sub_zones) / 5)])) data.append(np.std(plane_max_std_list_5[0:int(len(sub_zones) / 5)])) data.append( np.std(plane_max_mean_list_5[0:int(len(sub_zones) / 5)])) if 'mean_max_blocks' in data_type: data.append(np.mean(diff_std_list_3[0:int(len(sub_zones) / 5)])) data.append(np.mean(diff_mean_list_3[0:int(len(sub_zones) / 5)])) data.append(np.mean(diff_std_list_5[0:int(len(sub_zones) / 5)])) data.append(np.mean(diff_mean_list_5[0:int(len(sub_zones) / 5)])) data.append(np.mean(plane_std_list_3[0:int(len(sub_zones) / 5)])) data.append(np.mean(plane_mean_list_3[0:int(len(sub_zones) / 5)])) data.append(np.mean(plane_std_list_5[0:int(len(sub_zones) / 5)])) data.append(np.mean(plane_mean_list_5[0:int(len(sub_zones) / 5)])) data.append( np.mean(plane_max_std_list_3[0:int(len(sub_zones) / 5)])) data.append( np.mean(plane_max_mean_list_3[0:int(len(sub_zones) / 5)])) data.append( np.mean(plane_max_std_list_5[0:int(len(sub_zones) / 5)])) data.append( np.mean(plane_max_mean_list_5[0:int(len(sub_zones) / 5)])) if 'std_normed' in data_type: data.append(np.std(diff_std_list_3)) data.append(np.std(diff_mean_list_3)) data.append(np.std(diff_std_list_5)) data.append(np.std(diff_mean_list_5)) data.append(np.std(plane_std_list_3)) data.append(np.std(plane_mean_list_3)) data.append(np.std(plane_std_list_5)) data.append(np.std(plane_mean_list_5)) data.append(np.std(plane_max_std_list_3)) data.append(np.std(plane_max_mean_list_3)) data.append(np.std(plane_max_std_list_5)) data.append(np.std(plane_max_mean_list_5)) if 'mean_normed' in data_type: data.append(np.mean(diff_std_list_3)) data.append(np.mean(diff_mean_list_3)) data.append(np.mean(diff_std_list_5)) data.append(np.mean(diff_mean_list_5)) data.append(np.mean(plane_std_list_3)) data.append(np.mean(plane_mean_list_3)) data.append(np.mean(plane_std_list_5)) data.append(np.mean(plane_mean_list_5)) data.append(np.mean(plane_max_std_list_3)) data.append(np.mean(plane_max_mean_list_3)) data.append(np.mean(plane_max_std_list_5)) data.append(np.mean(plane_max_mean_list_5)) data = np.array(data) if data_type == 'convolutional_kernel_stats_svd': l_img = transform.get_LAB_L(block) normed_l_img = utils.normalize_2D_arr(l_img) # bilateral with window of size (5, 5) normed_diff = convolution.convolution2D(normed_l_img, kernels.min_bilateral_diff, (5, 5)) # getting sigma vector from SVD compression s = compression.get_SVD_s(normed_diff) data = s if data_type == 'svd_entropy': l_img = 
transform.get_LAB_L(block) blocks = segmentation.divide_in_blocks(l_img, (20, 20)) values = [] for b in blocks: sv = compression.get_SVD_s(b) values.append(utils.get_entropy(sv)) data = np.array(values) if data_type == 'svd_entropy_20': l_img = transform.get_LAB_L(block) blocks = segmentation.divide_in_blocks(l_img, (20, 20)) values = [] for b in blocks: sv = compression.get_SVD_s(b) values.append(utils.get_entropy(sv)) data = np.array(values) if data_type == 'svd_entropy_noise_20': l_img = transform.get_LAB_L(block) blocks = segmentation.divide_in_blocks(l_img, (20, 20)) values = [] for b in blocks: sv = compression.get_SVD_s(b) sv_size = len(sv) values.append(utils.get_entropy(sv[int(sv_size / 4):])) data = np.array(values) return data
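Several branches above (svd_trunc_diff, ipca_diff) score a block by how much a low-rank reconstruction misses; a standalone sketch of that reconstruction-residual idea with IncrementalPCA on a plain 2-D array, using the same n_components and batch_size as the ipca_diff branch:

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
image = rng.rand(200, 200)                   # stand-in for the LAB L channel

transformer = IncrementalPCA(n_components=20, batch_size=25)
transformed = transformer.fit_transform(image)          # rows treated as samples
restored = transformer.inverse_transform(transformed)   # low-rank reconstruction
residual = image - restored                             # what 20 components miss
print(np.linalg.norm(residual))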
import json

import numpy as np
from gensim.models import Word2Vec
from sklearn.decomposition import IncrementalPCA
# from bhtsne import tsne

WORD2VEC_MODEL = 'GNews.model'
WORD2VEC_JSON = 'word2vec.json'

model = Word2Vec.load(WORD2VEC_MODEL)

words = []
vectors = np.empty((len(model.vocab.keys()), 300))
# vectors = np.empty((6, 300))
# for i, w in enumerate(['email', 'password', 'user', 'date', 'this', 'is']):
for i, w in enumerate(model.vocab.keys()):
    words.append(w)
    vectors[i] = model[w]

# vectors = tsne(vectors, dimensions=3, perplexity=50)
ipca = IncrementalPCA(n_components=2, batch_size=25000)
vectors = ipca.fit_transform(vectors)

json_vectors = {}
for i, w in enumerate(words):
    json_vectors[w] = vectors[i].tolist()

with open(WORD2VEC_JSON, 'w') as f:
    json.dump(json_vectors, f)
plt.xticks(fontsize=8) plt.yticks(fontsize=8) plt.tight_layout() #fig, ax = plt.subplots() #ax.legend(LABEL_COLOR_MAP, legend_names) #plt.legend(legend_names, loc='best') plt.title('Principal Component Analysis', fontsize=12) img_file = results_path.joinpath('Principal_Component_Scatter_Plot.png') plt.savefig(img_file) plt.show() # Looks like approx. 50 components are enough to describe 90% of the variance in the dataset # We'll choose 50 components for our modeling #Using incremental PCA for efficiency - saves a lot of time on larger datasets pca_final = IncrementalPCA(n_components=16) df_train_pca = pca_final.fit_transform(X_train_rus) print("df_train_pca.shape") print(df_train_pca.shape) #Creating correlation matrix for the principal components - I expect little to no correlation df_corr = data_df.corr() corrmat = np.corrcoef(df_train_pca.transpose()) plt.figure(figsize=(16, 16)) sns.set(font_scale=.8) sns.heatmap(corrmat, vmin=df_corr.values.min(), vmax=1, fmt='.1f', square=True, cmap="Blues", linewidths=0.1,
# sample = np.random.choice(len(data[0]), 2) # interpolation_coefficient = np.random.beta(2, 2) # interpolation_coefficient = 0.5 # new_data_point = interpolation_coefficient * data[0][sample[0]] + (1 - interpolation_coefficient) * data[0][sample[1]] # new_target = np.any([data[1][sample[0]].astype(int), data[1][sample[1]].astype(int)], axis=0) # X_training_list.append(new_data_point) # y_training_list.append(new_target) # X_training = np.array(X_training_list) # y_training = np.array(y_training_list) # print(X_training.shape) # print(y_training.shape) print("PCA ...") print("original data: ", dataset_matrix.shape) pca = IncrementalPCA(n_components=800, batch_size=1000) X_training = pca.fit_transform(X_training) X_testing = pca.transform(X_testing) print("training data: ", X_training.shape) print("testing data: ", X_testing.shape) device = torch.device('cuda') activation_functions = {nn.ReLU(), torch.tanh} size_hidden1 = range(100, 500, 50) size_hidden2 = range(100, 500, 50) regularization_coefficients = [ 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2 ] print( cross_validation(X_training, y_training,
# License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn.decomposition import PCA, IncrementalPCA iris = load_iris() X = iris.data y = iris.target print(X) print(y) n_components = 2 ipca = IncrementalPCA(n_components=n_components, batch_size=10) X_ipca = ipca.fit_transform(X) print(X_ipca) pca = PCA(n_components=n_components) X_pca = pca.fit_transform(X) print("pca:") print(X_pca) colors = ['navy', 'turquoise', 'darkorange'] for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: plt.figure(figsize=(8, 8)) for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names): plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1], color=color,
def main(): """ Get data from db and save it as csv """ bq = _bq.BQHandler() io = _io.IO(gs_bucket=options.gs_bucket) viz = _viz.Viz() starttime, endtime = io.get_dates(options) print('Using dataset {} and time range {} - {}'.format( options.feature_dataset, starttime.strftime('%Y-%m-%d'), endtime.strftime('%Y-%m-%d'))) all_param_names = options.label_params + options.feature_params + options.meta_params aggs = io.get_aggs_from_param_names(options.feature_params) if options.pca: ipca = IncrementalPCA(n_components=options.pca_components, whiten=options.whiten, copy=False) rmses, maes, r2s, vars, start_times, end_times, end_times_obj = [], [], [], [], [], [], [] start = starttime end = endtime print('Processing time range {} - {}'.format( start.strftime('%Y-%m-%d %H:%M'), end.strftime('%Y-%m-%d %H:%M'))) try: print('Reading data...') data = bq.get_rows(start, end, loc_col='trainstation', project=options.project, dataset=options.feature_dataset, table=options.feature_table, parameters=all_param_names) data = io.filter_train_type(labels_df=data, train_types=options.train_types, sum_types=True, train_type_column='train_type', location_column='trainstation', time_column='time', sum_columns=['delay'], aggs=aggs) if options.y_avg_hours is not None: data = io.calc_running_delay_avg(data, options.y_avg_hours) data.sort_values(by=['time', 'trainstation'], inplace=True) if options.impute: print('Imputing missing values...') data.drop(columns=['train_type'], inplace=True) data = imputer.fit_transform(data) data.loc[:, 'train_type'] = None if options.model == 'ard' and len(data) > options.n_samples: print('Sampling {} values from data...'.format(options.n_samples)) data = data.sample(options.n_samples) #l_data = data.loc[:,options.meta_params + options.label_params] #f_data = data.loc[:,options.meta_params + options.feature_params] except ValueError as e: f_data, l_data = [], [] #f_data.rename(columns={'trainstation':'loc_name'}, inplace=True) #logging.debug('Labels shape: {}'.format(l_data.shape)) print('Processing {} rows...'.format(len(data))) #assert l_data.shape[0] == f_data.shape[0] target = data.loc[:, options.label_params].astype(np.float32).values #print(f_data.columns) #features = f_data.drop(columns=['loc_name', 'time']).astype(np.float32).values features = data.loc[:, options.feature_params].astype(np.float32).values X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33) logging.debug('Features shape: {}'.format(X_train.shape)) n_samples, n_dims = X_train.shape if options.normalize: print('Normalizing data...') print(X_train) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.fit_transform(X_test) if options.pca: print('Doing PCA analyzis for the data...') X_train = ipca.fit_transform(X_train) fname = options.output_path + '/ipca_explained_variance.png' viz.explained_variance(ipca, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) X_test = ipca.fit_transform(X_test) logging.debug('Features shape after pre-processing: {}'.format( X_train.shape)) print('Training...') print(X_train.shape) input_dim = X_train.shape[1] #k1 = gpflow.kernels.Matern52(input_dim, lengthscales=0.3) #k_seasonal = gpflow.kernels.Periodic(input_dim=input_dim, period=2190, name='k_seasonal') #k_small = gpflow.kernels.Periodic(input_dim=input_dim, period=120, name='k_small') k_weather = gpflow.kernels.RBF(input_dim=input_dim, ARD=True) #k_noise = gpflow.kernels.White(input_dim=input_dim) #k = k_seasonal + k_weather + k_noise k = k_weather Z = 
np.random.rand(150, input_dim) if options.cv: logging.info('Doing random search for hyper parameters...') param_grid = {"length_scale": [0.1, 1, 2], "whiten": [True, False]} model = GP(dim=input_dim, Z=Z) random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=int(options.n_iter_search), n_jobs=-1) random_search.fit(X_train, y_train) logging.info("RandomizedSearchCV done.") sys.exit() else: model = GP(dim=input_dim, Z=Z) model.fit(X_train.astype(np.float64), y_train.reshape((-1, 1)).astype(np.float64)) model.save(options.save_file) print('Training finished') print(model.model) # Z_list = options.z_list.split(',') #for size in Z_list: # with tf.Session() as sess: #custom_config = gpflow.settings.get_settings() #custom_config.verbosity.tf_compile_verb = True #with gpflow.settings.temp_settings(custom_config), gpflow.session_manager.get_session().as_default(): #Z = X_train[::5].copy() # Z = np.random.rand(int(size), 19) # print('Training with inducing points: {}'.format(Z.shape)) # # # model = gpflow.models.SVGP(X_train.astype(np.float64), # # y_train.reshape((-1,1)).astype(np.float64), # # kern=k, # # likelihood=gpflow.likelihoods.Gaussian(), # # Z=Z, # # #Z=X_train.copy(), # # minibatch_size=100, # # whiten=options.normalize # # ) # # #model.likelihood.variance = 0.01 # # # # model.compile(session=sess) # # opt = gpflow.train.ScipyOptimizer() # # opt.minimize(model) # # model = GP(dim=19, # Z=Z # ) # model.fit(X_train.astype(np.float64), # y_train.reshape((-1,1)).astype(np.float64)) # # model.save(options.save_file) # # print('Training finished') # print(model.model) #fname=options.output_path+'/svga_performance.png' #viz.plot_svga(model, fname) # k_long_term = 66.0**2 * RBF(length_scale=67.0) # k_seasonal = 2.4**2 * RBF(length_scale=90.0)* ExpSineSquared(length_scale=150, periodicity=1.0, periodicity_bounds=(0,10000)) # k_medium_term = 0.66**2 * RationalQuadratic(length_scale=1.2, alpha=0.78) # k_noise = 0.18**2 * RBF(length_scale=0.134) + WhiteKernel(noise_level=0.19**2) # #kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise # kernel_gpml = k_long_term + k_seasonal + k_medium_term + k_noise # # model = GaussianProcessRegressor(kernel=kernel_gpml, #alpha=0, # optimizer=None, normalize_y=True) # Metrics y_pred, var = model.predict_f(X_test) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) rmses.append(rmse) maes.append(mae) r2s.append(r2) vars.append(var.mean()) start_times.append(start.strftime('%Y-%m-%dT%H:%M:%S')) end_times.append(end.strftime('%Y-%m-%dT%H:%M:%S')) end_times_obj.append(end) print('RMSE: {:.2f}'.format(rmse)) print('MAE: {:.2f}'.format(mae)) print('Variance: {:.2f}-{:.2f}'.format(var.min(), var.max())) print('R2 score: {:.2f}'.format(r2)) #io.save_scikit_model(model, filename=options.save_file, ext_filename=options.save_file) if options.model == 'rf': fname = options.output_path + '/rfc_feature_importance.png' viz.rfc_feature_importance(model.feature_importances_, fname) io._upload_to_bucket(filename=fname, ext_filename=fname) try: fname = options.output_path + '/learning_over_time.png' viz.plot_learning_over_time(end_times_obj, rmses, maes, r2s, filename=fname) io._upload_to_bucket(filename=fname, ext_filename=fname) except Exception as e: logging.error(e) error_data = { 'start_times': start_times, 'end_times': end_times, 'rmse': rmses, 'mae': maes, 'var': vars, 'r2': r2s } fname = '{}/training_time_validation_errors.csv'.format( options.output_path) 
io.write_csv(error_data, filename=fname, ext_filename=fname)
# Authors: Kyle Kastner # License: BSD 3 clause import numpy as np import matplotlib.pyplot as plt from sklearn.datasets import load_iris from sklearn.decomposition import PCA, IncrementalPCA iris = load_iris() X = iris.data y = iris.target n_components = 2 ipca = IncrementalPCA(n_components=n_components, batch_size=10) X_ipca = ipca.fit_transform(X) pca = PCA(n_components=n_components) X_pca = pca.fit_transform(X) colors = ['navy', 'turquoise', 'darkorange'] for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: plt.figure(figsize=(8, 8)) for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names): plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1], color=color, lw=2, label=target_name) if "Incremental" in title: err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean() plt.title(title + " of iris dataset\nMean absolute unsigned error "
nhalf = 40000 itrain = 0 print 'DFT Difference PC= 100' matname = 'X2.mat' clasname = 'cover_dft2.pkl' n_components = 100 mat = scipy.io.loadmat('myFile.mat') songData = mat['songData'][:,1:17000] labels = mat['songData'][:,0] songScaled = preprocessing.scale(songData) ipca = IncrementalPCA(n_components=n_components, batch_size=2000) X_ipca = ipca.fit_transform(songScaled,labels) def getTrainData(X_ipca,labels): i = 0 X_pos = [] y = [] while(i<15500): label = labels[i] j =0 temp = [] while(labels[i] == label): temp.append(X_ipca[i]) j=j+1 i=i+1