def task(args):
    import pandas
    data_set, = args
    logging.info("dataset = %s", data_set)

    # read the data set
    logging.info("Reading data...")
    data = pandas.read_hdf("data/%s.h5" % (data_set), data_type)
    logging.info(" * gene expression shape: %d x %d" % data.shape)
    x = data.as_matrix()  # use data.to_numpy() on recent pandas versions

    if normalize_data:
        # these shouldn't affect the results
        x -= np.mean(x)
        x /= np.std(x)
        x -= np.mean(x, axis=0)

    logging.info("Running PCA...")
    pca = sk_PCA()
    pca.fit(x)

    logging.info("Writing results...")
    res_dir = 'res/pca-explained-variance'
    res_filename = "%s/%s.txt" % (res_dir, data_set)
    ensure_dir_exists(res_dir)
    np.savetxt(res_filename, pca.explained_variance_ratio_)
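# Hedged side note (not part of the pipeline above): the file written by task()
# holds sk_PCA's explained_variance_ratio_, whose entries sum to 1 when all
# components are kept, so a cumulative sum over it gives the component count
# needed for any variance threshold. The data below is synthetic.
import numpy as np
from sklearn.decomposition import PCA as sk_PCA

_rng = np.random.RandomState(0)
_x = _rng.rand(200, 10)
_ratios = sk_PCA().fit(_x).explained_variance_ratio_
assert np.isclose(_ratios.sum(), 1.0)
print(int(np.searchsorted(np.cumsum(_ratios), 0.95)) + 1,
      "components reach 95% of the variance")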
def correct_tvn(df, embeds_cols, verbose=False):
    if verbose:
        print('Do TVN')
    # fit PCA whitening on the DMSO (negative control) embeddings only,
    # then apply the same transform to every row
    dmso = df.loc[(df['compound'] == 'DMSO'), embeds_cols].to_numpy(copy=True)
    p = sk_PCA(n_components=len(embeds_cols), whiten=True).fit(dmso)
    df.loc[:, embeds_cols] = p.transform(df.loc[:, embeds_cols])
    return df
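# Hedged usage sketch for correct_tvn(): build a tiny synthetic plate with the
# 'compound' column the function expects and a few embedding columns, whiten
# everything against the DMSO rows, and check that the controls come out with
# roughly unit variance. The data and column names 'e0'..'e3' are made up.
import numpy as np
import pandas as pd

_rng = np.random.RandomState(0)
_cols = ['e0', 'e1', 'e2', 'e3']
_df = pd.DataFrame(_rng.normal(size=(100, 4)), columns=_cols)
_df['compound'] = ['DMSO'] * 50 + ['drug_A'] * 50

_df = correct_tvn(_df, _cols, verbose=True)
print(_df.loc[_df['compound'] == 'DMSO', _cols].std().round(2))  # ~1.0 per column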
def assign_clusters(df_well, embeds_cols, min_cluster_size=10, min_samples=3):
    pca_image = sk_PCA(n_components=number_of_components_95(
        df_well, embeds_cols)).fit_transform(df_well[embeds_cols])
    tsne_image = sk_TSNE(metric='cosine', n_jobs=1).fit_transform(pca_image)
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size,
                        metric='manhattan',
                        min_samples=min_samples).fit(tsne_image)
    # tsne_image = sk_TSNE(metric='cosine', n_jobs=1).fit_transform(df_well[embeds_cols])
    # clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric='manhattan', min_samples=min_samples).fit(tsne_image)
    return clusterer.labels_, clusterer.labels_.max(), tsne_image
def PCA(self, X, cols, n_components):
    # sklearn works on host memory, so round-trip the cuDF frame through pandas
    X = X.to_pandas()
    pca = sk_PCA(n_components=n_components)
    pca.fit(X[cols])
    X = pca.transform(X[cols])
    X = pd.DataFrame(X)
    print(f'PCA number of used components: {len(pca.explained_variance_ratio_)}')
    features = list(np.arange(len(pca.explained_variance_ratio_)))
    return cudf.from_pandas(X), features
def number_of_components_95(df, embeds_cols):
    # PCA on all embeddings
    pca = sk_PCA().fit(df[embeds_cols])

    # Find the number of dimensions needed to explain 95% of the variance
    # (iterate over the actual number of components rather than a fixed 100)
    i = 0
    s = 0
    for j in range(len(pca.explained_variance_ratio_)):
        s += pca.explained_variance_ratio_[j]
        if s > 0.95:
            i = j
            break

    # There should be at least 8 dimensions
    if i < 8:
        return 8
    else:
        return i
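# Hedged usage sketch for number_of_components_95(): on a low-rank synthetic
# embedding table the 95%-variance cutoff is small, so the function falls back
# to its floor of 8 components. The DataFrame and column names are made up.
import numpy as np
import pandas as pd

_rng = np.random.RandomState(0)
_latent = _rng.normal(size=(300, 5))                          # 5 true factors
_mix = _rng.normal(size=(5, 64))
_emb = _latent @ _mix + 0.01 * _rng.normal(size=(300, 64))    # 64-d embeddings
_cols = [f'emb_{k}' for k in range(64)]
_df = pd.DataFrame(_emb, columns=_cols)

print(number_of_components_95(_df, _cols))  # expected: 8 (the minimum)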
def learn(self, x, y, log_file_prefix=None, callbacks=[]):
    # define optimizer and loss function
    self.encoder.compile(optimizer=self.optimizer, loss=self.loss)

    if (self.init_pca):
        # initialize weights of the first dense layer using PCA
        from sklearn.decomposition import PCA as sk_PCA
        #weights = self.encoder.layers[1].get_weights()
        weights = self.first_dense_enc_layer.get_weights()
        dim = weights[1].size
        w = sk_PCA(n_components=dim).fit(x).components_
        weights[0][:, :] = w.T
        weights[1][:] = -np.mean(np.dot(x, w.T), axis=0)
        #self.encoder.layers[1].set_weights(weights)
        self.first_dense_enc_layer.set_weights(weights)

    # optionally add callbacks
    keras_callbacks = []
    if log_file_prefix:
        #keras_callbacks.append(keras.callbacks.CSVLogger(log_file_prefix + ".log"))
        keras_callbacks.append(WeightLogger(self.encoder, log_file_prefix))
        keras_callbacks.append(LossLogger(log_file_prefix))
    for callback in callbacks:
        class CB(keras.callbacks.Callback):
            # bind the current function as a default argument so each CB keeps
            # its own callback (avoids late binding inside the loop)
            def on_train_begin(self, logs={}, cb=callback):
                cb()
            def on_epoch_end(self, epoch, logs={}, cb=callback):
                cb()
        keras_callbacks.append(CB())

    # train
    self.encoder.fit(x, y,
                     nb_epoch=self.n_epochs,
                     batch_size=self.batch_size,
                     shuffle=True,
                     callbacks=keras_callbacks,
                     verbose=2)
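# Hedged numpy check of the PCA weight initialization used above: a dense layer
# with kernel = components_.T and bias = -mean(x @ components_.T) reproduces
# sklearn's pca.transform(x). Synthetic data; no Keras needed for the check.
import numpy as np
from sklearn.decomposition import PCA as sk_PCA

_rng = np.random.RandomState(0)
_x = _rng.normal(size=(50, 12))
_dim = 4

_pca = sk_PCA(n_components=_dim).fit(_x)
_kernel = _pca.components_.T                          # dense-layer kernel
_bias = -np.mean(_x @ _pca.components_.T, axis=0)     # dense-layer bias

_dense_out = _x @ _kernel + _bias
print(np.allclose(_dense_out, _pca.transform(_x)))    # expected: True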
def task(args):
    data_type, seed, (algName, _, makeAlg) = args
    logging.info("datatype = %s, seed = %d, algorithm = %s",
                 data_type, seed, algName)

    # read the data sets
    logging.info("Reading data...")
    y_train, x_train, y_test, x_test = dataReader.main("%s_%d" %
                                                       (data_type, seed))
    data_dim = x_train.shape[1]
    logging.info(" * training set: %d x %d" % x_train.shape)
    logging.info(" * testing set: %d x %d" % x_test.shape)

    # init rng
    np.random.seed(seed)

    # evaluate on the training data itself
    x_test = x_train

    logging.info("Running and evaluating the algorithm...")

    # init the algorithm
    alg = makeAlg(data_dim, repr_dim)

    # create output dir if does not exist
    ensure_dir_exists('res')

    # use the PCA projection of the data as the encoding target
    from sklearn.decomposition import PCA as sk_PCA
    pca = sk_PCA(n_components=repr_dim)
    pca.fit(x_train)
    y_train = pca.transform(x_train)
    y_test = pca.transform(x_test)

    # define the progress saving function
    progress_filename = 'res/progress-enc-mse-%s-%d-%s.txt' % (data_type, seed,
                                                               algName)
    progress_file = open(progress_filename, 'w', encoding='utf-8')

    def save_progress():
        y_test_pred = alg.encode(x_test)
        rel_mse = relative_mean_squared_error(y_test, y_test_pred)
        progress_file.write("%g\n" % rel_mse)

    # fit
    alg.learn(x_train, y_train,
              log_file_prefix=("log/%s-%d-%s" % (data_type, seed, algName)),
              callbacks=[save_progress])
def learn(self, x, y=None, secondary_y=None,
          validation_split=0.0, validation_data=None,
          secondary_validation_data=None, log_file_prefix=None,
          per_epoch_callback_funs=[], callbacks=[]):
    # define optimizer and loss function
    if secondary_y is None:
        loss = self.params.loss
        loss_weights = None
    else:
        assert self.params.secondary_dims is not None
        loss = [self.params.loss, self.params.secondary_loss]
        loss_weights = [
            1 - self.params.secondary_loss_weight,
            self.params.secondary_loss_weight
        ]
    self.autoencoder.compile(optimizer=self.params.optimizer,
                             loss=loss,
                             loss_weights=loss_weights)

    if (self.params.init_pca):
        # initialize weights of the first and last layer using PCA
        from sklearn.decomposition import PCA as sk_PCA
        #weights = self.autoencoder.layers[1].get_weights()
        weights = self.first_dense_enc_layer.get_weights()
        dim = weights[1].size
        w = sk_PCA(n_components=dim).fit(x).components_
        weights[0][:, :] = w.T
        weights[1][:] = -np.mean(np.dot(x, w.T), axis=0)
        #self.autoencoder.layers[1].set_weights(weights)
        self.first_dense_enc_layer.set_weights(weights)

        #weights = self.autoencoder.layers[-1].get_weights()
        weights = self.last_dense_dec_layer.get_weights()
        weights[0][:, :] = w
        weights[1][:] = np.mean(x, axis=0)
        #self.autoencoder.layers[-1].set_weights(weights)
        self.last_dense_dec_layer.set_weights(weights)

    # possible validation data
    if validation_data is not None and y is None:
        if secondary_validation_data is not None:
            validation_data = (validation_data,
                               [validation_data, secondary_validation_data])
        else:
            validation_data = (validation_data, validation_data)
    validation = (validation_data is not None or validation_split > 0)

    # by default predict the data itself
    if y is None:
        y = x
    if secondary_y is not None:
        y = [y, secondary_y]

    # optionally add callbacks
    keras_callbacks = []

    # 'built-in' callbacks
    if log_file_prefix:
        #keras_callbacks.append(keras.callbacks.CSVLogger(log_file_prefix + ".log"))
        if self.params.log_weights:
            keras_callbacks.append(
                WeightLogger(self.autoencoder, log_file_prefix))
        if self.params.log_weights_diff_norm is not None:
            keras_callbacks.append(
                WeightDiffStatLogger(self.autoencoder, log_file_prefix,
                                     self.params.log_weights_diff_norm))
        if self.params.log_loss:
            keras_callbacks.append(
                LossLogger(log_file_prefix,
                           per_patch=self.log_loss_per_patch))
            if secondary_y is not None:
                keras_callbacks.append(
                    LossLogger(log_file_prefix,
                               loss='decoded_loss',
                               per_patch=self.log_loss_per_patch))
                keras_callbacks.append(
                    LossLogger(log_file_prefix,
                               loss='secondary_loss',
                               per_patch=self.log_loss_per_patch))
        if self.params.log_loss and validation:
            keras_callbacks.append(
                LossLogger(log_file_prefix, loss='val_loss'))
            if secondary_y is not None:
                keras_callbacks.append(
                    LossLogger(log_file_prefix, loss='val_decoded_loss'))
                keras_callbacks.append(
                    LossLogger(log_file_prefix, loss='val_secondary_loss'))

    # externally defined keras callback objects
    keras_callbacks.extend(callbacks)

    # externally defined callback functions
    for callback in per_epoch_callback_funs:
        class CB(keras.callbacks.Callback):
            # bind the current function as a default argument so each CB keeps
            # its own callback (avoids late binding inside the loop)
            def on_train_begin(self, logs={}, cb=callback):
                cb()
            def on_epoch_end(self, epoch, logs={}, cb=callback):
                cb()
        keras_callbacks.append(CB())

    if self.params.early_stopping:
        if self.params.early_stopping == True:
            monitor = ('val_loss' if validation else 'loss')
        else:
            monitor = self.params.early_stopping
        keras_callbacks.append(
            keras.callbacks.EarlyStopping(
                monitor=monitor,
                patience=self.params.early_stopping_patience))

    # optional pre-training
    if self.params.pre_train is not None:
        method, params = self.params.pre_train
        if method == "pca":
            # fit encoder and decoder separately to PCA output
            pre_n_epochs, = params
            from sklearn.decomposition import PCA as sk_PCA
            y = sk_PCA(
                n_components=self.params.output_dim).fit_transform(x)
            pretrain_combined = True
            if pretrain_combined:
                # (pre)train both encoder and decoder at the same time
                encoder_and_decoder = Model(
                    input=[self.input, self.encoded_input],
                    output=[self.encoded, self.decoded])
                encoder_and_decoder.compile(optimizer=self.params.optimizer,
                                            loss=self.params.loss)
                pretrain_keras_callbacks = keras_callbacks.copy()
                if log_file_prefix:
                    pretrain_keras_callbacks.append(
                        LossLogger(log_file_prefix, loss='encoded_loss'))
                    pretrain_keras_callbacks.append(
                        LossLogger(log_file_prefix, loss='decoded_loss'))
                encoder_and_decoder.fit([x, y], [y, x],
                                        nb_epoch=pre_n_epochs,
                                        batch_size=self.params.batch_size,
                                        shuffle=True,
                                        callbacks=pretrain_keras_callbacks,
                                        verbose=2)
            else:
                self.encoder.compile(optimizer=self.params.optimizer,
                                     loss=self.params.loss)
                self.encoder.fit(x, y,
                                 nb_epoch=pre_n_epochs,
                                 batch_size=self.params.batch_size,
                                 shuffle=True,
                                 callbacks=keras_callbacks,
                                 verbose=2)
                self.decoder.compile(optimizer=self.params.optimizer,
                                     loss=self.params.loss)
                self.decoder.fit(y, x,
                                 nb_epoch=pre_n_epochs,
                                 batch_size=self.params.batch_size,
                                 shuffle=True,
                                 callbacks=keras_callbacks,
                                 verbose=2)
        else:
            raise ValueError("Invalid pre-train method '%s'" % method)

    # train
    self.autoencoder.fit(x, y,
                         nb_epoch=self.params.n_epochs,
                         batch_size=self.params.batch_size,
                         shuffle=True,
                         validation_split=validation_split,
                         validation_data=validation_data,
                         callbacks=keras_callbacks,
                         verbose=2)
def init(self, input_dim, output_dim):
    self.pca = sk_PCA(n_components=output_dim)
    return self
def preprocess_features(npdata, n_components=16, method='PCA', n_jobs=1):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        n_components (int): dimension of the output
        method (str): 'PCA', 'UMAP', 'TSNE' or 'AdaptiveTSNE'
        n_jobs (int): number of parallel jobs for t-SNE

    Returns:
        np.array of dim N * n_components: data reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    # Apply PCA-whitening with Faiss
    if method == 'PCA':
        mat = faiss.PCAMatrix(ndim, n_components, eigen_power=-0.5)
        mat.train(npdata)
        assert mat.is_trained
        npdata = mat.apply_py(npdata)

    # Apply UMAP for dimensionality reduction
    elif method == 'UMAP':
        fit = UMAP(n_components=n_components, metric='cosine')
        npdata = np.ascontiguousarray(fit.fit_transform(npdata))

    # Apply t-SNE for dimensionality reduction
    elif method == 'TSNE':
        if n_components > 3:
            X = sk_PCA().fit_transform(npdata)
            PCAinit = X[:, :n_components] / np.std(X[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata),
                                          dtype='float32')
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(npdata))

    # Apply adaptive t-SNE for dimensionality reduction
    elif method == 'AdaptiveTSNE':
        pca = sk_PCA().fit(npdata)

        # Find the number of components that explain 95% of the variance
        i = 0
        s = 0
        for j in range(len(pca.explained_variance_ratio_)):
            s += pca.explained_variance_ratio_[j]
            if s > 0.95:
                i = j
                # Prevent smaller than 8
                if i < 8:
                    i = 8
                break

        # Fit and transform the data with the number of components that explain 95%
        pca95_well = sk_PCA(n_components=i).fit_transform(npdata)

        # Do a similarity measure with t-SNE on the PCA data
        if n_components > 3:
            PCAinit = pca95_well[:, :n_components] / np.std(
                pca95_well[:, 0]) * 0.0001
            fit = TSNE(n_components=n_components, init=PCAinit, n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))
        else:
            fit = sk_TSNE(n_components=n_components,
                          metric='cosine',
                          n_jobs=n_jobs)
            npdata = np.ascontiguousarray(fit.fit_transform(pca95_well))

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata
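# Hedged usage sketch for preprocess_features(): with method='PCA' the faiss
# package must be importable, as the function assumes. Random features go in,
# whitened 16-d rows with unit L2 norm come out. The input data is synthetic.
import numpy as np

_rng = np.random.RandomState(0)
_feats = _rng.normal(size=(500, 128)).astype('float32')

_out = preprocess_features(_feats, n_components=16, method='PCA')
print(_out.shape)                                       # (500, 16)
print(np.allclose(np.linalg.norm(_out, axis=1), 1.0))   # rows are L2-normalized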
def main():
    data_size_fix_dim = [
        (100, 2),
        (500, 2),
        (1000, 2),
        (5000, 2),
        (10000, 2),
        (15000, 2),
        (50000, 2),
        (100000, 2),
        (150000, 2),
        (1000000, 2),
    ]
    data_size_fix_num = [
        (10000, 5),
        (10000, 10),
        (10000, 30),
        (10000, 50),
        (10000, 100),
        (10000, 150),
        (10000, 300),
        (10000, 500),
        # (10000, 1000),
        # (10000, 1500),
    ]

    """fix dim"""
    result = []
    for data_size in data_size_fix_dim:
        print(data_size)
        fake_data = np.random.rand(*data_size)

        res = {'sk_pca': [], 'my_pca': []}
        for _ in range(10):
            sk_pca = sk_PCA(n_components=fake_data.shape[1])
            sk_res = calc_process_time({'X': fake_data}, sk_pca.fit)
            res['sk_pca'].append(sk_res)

            my_pca = my_PCA(dim=data_size[1])
            mine_res = calc_process_time({'X': fake_data}, my_pca.fit)
            res['my_pca'].append(mine_res)
        result.append(res)

    aves = {'sk_pca': [], 'my_pca': []}
    for _, res in enumerate(result):
        print('-' * 10)
        for k, v in res.items():
            print(k, sum(v) / len(v))
            aves[k].append(sum(v) / len(v))
        print('-' * 10)

    x_ax = np.array(data_size_fix_dim)[:, 0]
    plt.plot(x_ax, aves['sk_pca'], marker="o", label='sklearn')
    plt.plot(x_ax, aves['my_pca'], marker="o", label='mine')
    plt.legend(loc=2)
    timestamp = datetime.now().strftime(TIME_TEMPLATE)
    plt.savefig('./result/bench-res-fix-dim{}.png'.format(timestamp))
    plt.clf()
    plt.close()

    """fix data num"""
    result = []
    for data_size in data_size_fix_num:
        print(data_size)
        fake_data = np.random.rand(*data_size)

        res = {'sk_pca': [], 'my_pca': []}
        for _ in range(10):
            sk_pca = sk_PCA(n_components=fake_data.shape[1])
            sk_res = calc_process_time({'X': fake_data}, sk_pca.fit)
            res['sk_pca'].append(sk_res)

            my_pca = my_PCA(dim=data_size[1])
            mine_res = calc_process_time({'X': fake_data}, my_pca.fit)
            res['my_pca'].append(mine_res)
        result.append(res)

    aves = {'sk_pca': [], 'my_pca': []}
    for _, res in enumerate(result):
        print('-' * 10)
        for k, v in res.items():
            print(k, sum(v) / len(v))
            aves[k].append(sum(v) / len(v))
        print('-' * 10)

    x_ax = np.array(data_size_fix_num)[:, 1]
    plt.plot(x_ax, aves['sk_pca'], marker="o", label='sklearn')
    plt.plot(x_ax, aves['my_pca'], marker="o", label='mine')
    plt.legend(loc=2)
    timestamp = datetime.now().strftime(TIME_TEMPLATE)
    plt.savefig('./result/bench-res-fix-num{}.png'.format(timestamp))
    plt.clf()
    plt.close()
    a = np.dot(mean_x.T, mean_x)
    # numpy.linalg.eig() computes the eigenvalues and eigenvectors of a square matrix.
    # w, v = numpy.linalg.eig(a) returns the eigenvalues and right eigenvectors of
    # the square matrix a.
    # w: a vector of eigenvalues. Note: they are not sorted in any particular order
    #    and may contain complex values.
    # v: a matrix whose columns are the (normalized) eigenvectors; the i-th column
    #    v[:, i] corresponds to the i-th eigenvalue w[i].
    w, v = np.linalg.eig(a)
    z = np.dot(mean_x, v)
    return z


if __name__ == '__main__':
    np.random.seed(12)
    np.set_printoptions(precision=6, suppress=True, linewidth=120)
    data = np.random.random((6, 5))

    python_PCA = PCA(n_components=5)
    sk_PCA_reduction = sk_PCA(n_components=0.95)
    sk_PCA_all = sk_PCA(n_components=5)

    x = np.array(data)
    sk_reduction_out = sk_PCA_reduction.fit_transform(x)
    sk_all_out = sk_PCA_all.fit_transform(x)
    python_svg_out = python_PCA.fit_transform(x)
    python_eig_out = python_PCA.eig_transform(x)

    print("sklearn_reduction")
    print(sk_reduction_out)
    print("sklearn_all")
    print(sk_all_out)
    print("vanilla_svd")
    print(python_svg_out)
    print("python_eig")
    print(python_eig_out)
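# Hedged side note on the comparison above: sklearn's PCA uses the SVD of the
# centered data, while the eigendecomposition route diagonalizes X^T X of the
# centered data directly, so the two agree only up to component sign and
# ordering. A minimal self-contained check on synthetic data (eigh is used here
# because the scatter matrix is symmetric):
import numpy as np
from sklearn.decomposition import PCA as sk_PCA

_rng = np.random.RandomState(12)
_x = _rng.random_sample((6, 5))
_xc = _x - _x.mean(axis=0)

_w, _v = np.linalg.eigh(_xc.T @ _xc)   # eigenvectors of the scatter matrix
_z_eig = _xc @ _v[:, ::-1]             # reorder columns to decreasing eigenvalue

_z_svd = sk_PCA(n_components=5).fit_transform(_x)
print(np.allclose(np.abs(_z_eig), np.abs(_z_svd)))  # expected: True (up to sign flips)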
def f_PCA(n_comp=30, reuse=False):
    # LOADING TRAIN SET, Y IS LOADED FOR PLOTTING PURPOSES
    X_train = np.load('Data_files/X_train_processed.npy', mmap_mode='r')
    Y_train = np.load('Data_files/Y_train.npy', mmap_mode='r')

    if reuse:
        with open('Algos/PCA_folder/PCA.pkl', 'rb') as filehandler:
            pca = pickle.load(filehandler)
    else:
        # CREATES PCA WITH n_comp COMPONENTS TO KEEP
        # AND WHITENS THE DATA (NORMALIZATION)
        # NOTE THAT PCA AUTOMATICALLY CENTERS THE DATA
        pca = sk_PCA(n_components=n_comp, whiten=True)
        pca.fit(X_train)

    # EXTRACTING THE VARIANCES AND THE EIGENSPECTRA FOR VISUALIZATION
    variances = pca.explained_variance_ratio_
    eig_vec = pca.components_

    # LOADING THE WAVELENGTHS FOR PLOTTING
    wavelengths = np.load('Data_files/wavelengths.npy', mmap_mode='r')

    fig = plt.figure()
    # MEANINGFUL EIGENVECTORS/COMPONENTS SELECTED IN 1-BASED INDEXING
    indexs_list = [8, 11, 20, 26, 30]
    for i in indexs_list:
        plt.plot(wavelengths, eig_vec[i - 1], label='e' + str(i))
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('Wavelengths', fontsize=20)
    plt.ylabel('Intensity', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    # plt.savefig('Images/PCA_first_components.eps',
    #             format="eps", bbox_inches='tight')
    plt.show()
    del wavelengths

    # DICTIONARY CREATION TO ASSOCIATE TARGET TO COLOR
    dic = {}
    dic['0'] = 'b'
    dic['1'] = 'g'
    dic['2'] = 'r'
    dic['3'] = 'k'
    dic['4'] = 'c'

    coords = pca.transform(X_train)
    del X_train

    labels = ['QSO _', 'QSO BAL', 'GALAXY LRG', 'GALAXY ELG', 'STAR']

    # THE FOLLOWING FIGURES PLOT THE DISTRIBUTION OF LABELS
    # ON THE SELECTED COMPONENTS (COLUMNS OF coords)
    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 19],
                 coords[Y_train == i][:, 25],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e20', fontsize=20)
    plt.ylabel('e26', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_12.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 19],
                 coords[Y_train == i][:, 29],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e20', fontsize=20)
    plt.ylabel('e30', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_13.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    fig = plt.figure()
    for i in np.unique(Y_train):
        plt.plot(coords[Y_train == i][:, 25],
                 coords[Y_train == i][:, 29],
                 dic[str(int(i))] + '.',
                 label=labels[int(i)],
                 markersize=7)
    plt.legend(fontsize=18)
    plt.grid()
    plt.xlabel('e26', fontsize=20)
    plt.ylabel('e30', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.savefig('Images/PCA_distribution2D_23.eps',
                format="eps",
                bbox_inches='tight')
    plt.show()

    # SAVES THE PCA OBJECT
    with open('Algos/PCA_folder/PCA.pkl', 'wb') as filehandler:
        pickle.dump(pca, filehandler)

    return pca, variances, eig_vec