class _PLSSVDImpl:
    """Thin adapter that forwards ``fit`` and ``transform`` to a wrapped model.

    The wrapped model is an instance of ``Op`` (a name defined elsewhere in
    this module), built from the keyword hyperparameters given at
    construction time.
    """

    def __init__(self, **hyperparams):
        # Keep the raw keyword arguments; the same mapping configures ``Op``.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return this adapter.

        ``y`` is forwarded to the wrapped model only when it is not ``None``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return whatever the wrapped model's ``transform`` yields for ``X``."""
        transformed = self._wrapped_model.transform(X)
        return transformed
microbe_iv['group'] = microbe_iv['group'].map(catdict) metabolite_iv['group'] = metabolite_iv['group'].map(catdict) # highlight features with p-value <= 0.001 max_pval = 0.001 microbe_iv.loc[microbe_iv.pval > max_pval, 'group'] = 'None' print('Number of significant microbes: %d' % microbe_iv[microbe_iv['group'] != 'None'].shape[0]) metabolite_iv.loc[metabolite_iv.pval > max_pval, 'group'] = 'None' print('Number of significant metabolites: %d' % metabolite_iv[metabolite_iv['group'] != 'None'].shape[0]) plssvd = PLSSVD(n_components=3) plssvd.fit(X=clr(centralize(multiplicative_replacement(microbes))), Y=clr(centralize(multiplicative_replacement(metabolites)))) def standardize(A): A = (A - np.mean(A, axis=0)) / np.std(A, axis=0) return A pls_microbes = pd.DataFrame(standardize(plssvd.x_weights_), columns=['PCA1', 'PCA2', 'PCA3'], index=microbes.columns) pls_metabolites = pd.DataFrame(standardize(plssvd.y_weights_), columns=['PCA1', 'PCA2', 'PCA3'], index=metabolites.columns) color_map = {
def fit(self, X, y, split_type: str = "extreme"):
    """Split multi-label y dataset into train and test subsets.

    While ``self.is_fit`` is false, the method fits PLSSVD over mini-batches
    to obtain the projection ``self.U`` and runs k-means to obtain
    ``self.centroid_kmeans``; once ``self.is_fit`` is true both are reused.
    Examples are projected with ``self.U``, assigned a cluster id, the labels
    are rebuilt from those ids via ``LabelBinarizer.reassign_labels``, and
    the stratifier selected by ``split_type`` produces the split.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features).

    y : {array-like, sparse matrix} of shape (n_samples, n_labels).

    split_type : Splitting type of {naive, extreme, iterative}. Any value
        other than "extreme" or "iterative" selects the naive stratifier.

    Returns
    -------
    data partition : two lists of indices representing the resulted
        data split

    Raises
    ------
    Exception
        If X or y is None, or if ``check_type`` rejects X or y.
    AssertionError
        If X and y disagree on their first dimension.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; confirm it against the original file.
    if X is None:
        raise Exception("Please provide a dataset.")
    if y is None:
        raise Exception("Please provide labels for the dataset.")
    assert X.shape[0] == y.shape[0]

    unsupported_msg = "The method only supports scipy.sparse, numpy.ndarray, and list type of data"
    check, X = check_type(X=X, return_list=False)
    if not check:
        raise Exception(unsupported_msg)
    check, y = check_type(X=y, return_list=False)
    if not check:
        raise Exception(unsupported_msg)

    num_examples, num_labels = y.shape

    # check whether data is singly labeled
    if num_labels == 1:
        # transform it to multi-label data
        classes = list(set([i[0] if i else 0 for i in y.data]))
        mlb = LabelBinarizer(labels=classes)
        y = mlb.transform(y)

    # 1)- Compute covariance of X and y using SVD
    if not self.is_fit:
        model = PLSSVD(n_components=self.num_clusters, scale=True, copy=False)
        optimal_init = self.__optimal_learning_rate(alpha=self.lr)
        list_batches = np.arange(start=0, stop=num_examples, step=self.batch_size)
        num_batches = len(list_batches)
        total_progress = self.num_epochs * num_batches
        for epoch in np.arange(start=1, stop=self.num_epochs + 1):
            for idx, batch_idx in enumerate(list_batches):
                # Count the batches completed across all epochs so that the
                # reported percentage grows monotonically and reaches 100%
                # exactly on the final batch of the final epoch.
                current_progress = (epoch - 1) * num_batches + idx + 1
                desc = '\t>> Computing the covariance of X and y using PLSSVD: {0:.2f}%...'.format(
                    (current_progress / total_progress) * 100)
                if total_progress == current_progress:
                    print(desc)
                else:
                    # Carriage return keeps the progress on a single line.
                    print(desc, end="\r")
                model.fit(
                    X[batch_idx:batch_idx + self.batch_size].toarray(),
                    y[batch_idx:batch_idx + self.batch_size].toarray())
                # Perturb the freshly fitted weights with a step that shrinks
                # as the epoch number grows, then write them back.
                U = model.x_weights_
                learning_rate = 1.0 / (self.lr * (optimal_init + epoch - 1))
                U = U + learning_rate * (0.5 * 2 * U)
                U = U + learning_rate * (0.5 * np.sign(U))
                model.x_weights_ = U
        # NOTE(review): if no batch was processed (no epochs or no examples)
        # the model has no fitted weights and U is unbound here.
        self.U = lil_matrix(model.x_weights_)
        del U, model

    # 2)- Project X onto a low dimension via U orthonormal basis obtained from SVD
    desc = '\t>> Projecting examples onto the obtained low dimensional U orthonormal basis...'
    print(desc)
    Z = X.dot(self.U)

    # 3)- Cluster low dimensional examples
    if not self.is_fit:
        desc = '\t>> Clustering the resulted low dimensional examples...'
        print(desc)
        self.centroid_kmeans, label_kmeans = kmeans2(data=Z.toarray(), k=self.num_clusters,
                                                     iter=self.num_epochs, minit='++')
    else:
        # Reuse the stored centroids: one cluster id per projected example.
        label_kmeans = np.array(
            [np.argmin(z.dot(self.centroid_kmeans), 1)[0] for z in Z])

    mlb = LabelBinarizer(labels=list(range(self.num_clusters)))
    y = mlb.reassign_labels(y, mapping_labels=label_kmeans)
    self.is_fit = True

    # perform splitting
    if split_type == "extreme":
        st = ExtremeStratification(
            swap_probability=self.swap_probability,
            threshold_proportion=self.threshold_proportion,
            decay=self.decay, shuffle=self.shuffle,
            split_size=self.split_size,
            num_epochs=self.num_epochs, verbose=False)
        train_list, test_list = st.fit(X=X, y=y)
    elif split_type == "iterative":
        st = IterativeStratification(shuffle=self.shuffle, split_size=self.split_size,
                                     verbose=False)
        train_list, test_list = st.fit(y=y)
    else:
        st = NaiveStratification(shuffle=self.shuffle, split_size=self.split_size,
                                 batch_size=self.batch_size, num_jobs=self.num_jobs,
                                 verbose=False)
        train_list, test_list = st.fit(y=y)
    return train_list, test_list
plt.show()

# Pick the feature count that maximises validation accuracy and report the
# test accuracy obtained at that same position.
pos_max = np.argmax(acc_val)
num_opt_feat = rang_feat[pos_max]
test_acc_opt = acc_test[pos_max]

# Single-argument print(...) behaves the same on Python 2 and Python 3,
# unlike the former print statement, which is a SyntaxError on Python 3.
print('Number optimum of features: ' + str(num_opt_feat))
print("The optimum test accuracy is %2.2f%%" % (100 * test_acc_opt))

########################### PLS ###################################

from sklearn.cross_decomposition import PLSSVD

# PLSSVD is fitted with n_classes components; rang_feat below stops before
# N_feat_max, so at most n_classes - 1 of them are evaluated.
N_feat_max = n_classes

# 1. Obtain PLS projections
pls = PLSSVD(n_components=N_feat_max)
pls.fit(X_train, Y_train_bin)
X_train_pls = pls.transform(X_train)
X_val_pls = pls.transform(X_val)
X_test_pls = pls.transform(X_test)

# 2. Compute and plot accuracy evolution
rang_feat = np.arange(1, N_feat_max, 1)
[acc_tr, acc_val, acc_test] = SVM_accuracy_evolution(
    X_train_pls, Y_train, X_val_pls, Y_val, X_test_pls, Y_test,
    rang_feat, C, gamma)
plt.figure()
plot_accuracy_evolution(rang_feat, acc_tr, acc_val, acc_test)
plt.show()

# 3. Find the optimum number of features
pos_max = np.argmax(acc_val)
c=classColors[i, :]) plt.title('Projected data over the components') plt.xlim([-4, 4]) plt.ylim([-4, 4]) plt.show() if (0): #%% PARTIAL LEAST SQUARES #%% PLS SVD nComponents = np.arange(1, nClasses + 1) plsSvdScores = np.zeros((5, np.alen(nComponents))) for i, n in enumerate(nComponents): plssvd = PLSSVD(n_components=n) plssvd.fit(Xtrain, Ytrain) XtrainT = plssvd.transform(Xtrain) XtestT = plssvd.transform(Xtest) plsSvdScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest) plssvd = PLSSVD(n_components=2) xt, yt = plssvd.fit_transform(Xtrain, Ytrain) fig = plt.figure() util.plotData(fig, xt, labelsTrain, classColors) plt.title('First 2 components of projected data') #%% Plot accuracies for PLSSVD plt.figure() for i in range(5): plt.plot(nComponents, plsSvdScores[i, :], lw=3)
# NOTE(review): the next three statements use the index `i`; they presumably
# belong to a loop whose header precedes this excerpt. Confirm the nesting.
dataTrainT = pca.fit_transform(dataTrain)
dataTestT = pca.transform(dataTest)
pcaScores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)

# Training data with 2 dimensions
pca = PCA(n_components=2)
xtPCA = pca.fit_transform(dataTrain)
uPCA = pca.components_

#%% PARTIAL LEAST SQUARES
#%% PLS SVD
nComponents = np.arange(1, nClasses + 1)
# len() replaces np.alen, which newer NumPy releases no longer provide.
plsSvdScores = np.zeros((2, len(nComponents)))
for i, n in enumerate(nComponents):
    # Refit with n components and score the projected train/test data.
    plssvd = PLSSVD(n_components=n)
    plssvd.fit(dataTrain, Ytrain)
    dataTrainT = plssvd.transform(dataTrain)
    dataTestT = plssvd.transform(dataTest)
    plsSvdScores[:, i] = util.classify(dataTrainT, dataTestT, labelsTrain, labelsTest)

fig = plt.figure()
util.plotAccuracy(fig, nComponents, plsSvdScores)
plt.title('PLS SVD accuracy', figure=fig)

# Two-component projection of the training data for plotting.
plssvd = PLSSVD(n_components=2)
xt, yt = plssvd.fit_transform(dataTrain, Ytrain)
fig = plt.figure()
util.plotData(fig, xt, labelsTrain, classColors)
u = plssvd.x_weights_
# First arrow: the first two entries of the first x-weight column.
plt.quiver(u[0, 0], u[1, 0], color='k', edgecolor='k', lw=1, scale=0.1, figure=fig)
# Second arrow: the same pair rotated by 90 degrees, (-u[1, 0], u[0, 0]).
plt.quiver(-u[1, 0], u[0, 0], color='k', edgecolor='k', lw=1, scale=0.4, figure=fig)
# NOTE(review): `l` and `i` come from a loop whose header precedes this
# excerpt; the scatter call presumably belongs to that loop body. Confirm.
plt.scatter(xtPCA[labelsTrain == l, 0], xtPCA[labelsTrain == l, 1],
            alpha=0.5, c=classColors[i, :])
plt.title('Projected data over the components')
plt.xlim([-4, 4])
plt.ylim([-4, 4])
plt.show()

# NOTE(review): nesting reconstructed from a whitespace-collapsed source. All
# statements below are assumed to sit inside the disabled `if (0):` branch,
# because the final plotting loop needs names that are only created here.
if (0):
    #%% PARTIAL LEAST SQUARES
    #%% PLS SVD
    nComponents = np.arange(1, nClasses + 1)
    # len() replaces np.alen, which newer NumPy releases no longer provide.
    plsSvdScores = np.zeros((5, len(nComponents)))
    for i, n in enumerate(nComponents):
        # Refit with n components and score the projected train/test data.
        plssvd = PLSSVD(n_components=n)
        plssvd.fit(Xtrain, Ytrain)
        XtrainT = plssvd.transform(Xtrain)
        XtestT = plssvd.transform(Xtest)
        plsSvdScores[:, i] = util.classify(XtrainT, XtestT, labelsTrain, labelsTest)

    # Two-component projection of the training data for plotting.
    plssvd = PLSSVD(n_components=2)
    xt, yt = plssvd.fit_transform(Xtrain, Ytrain)
    fig = plt.figure()
    util.plotData(fig, xt, labelsTrain, classColors)
    plt.title('First 2 components of projected data')

    #%% Plot accuracies for PLSSVD
    plt.figure()
    for i in range(5):
        # One curve per row of the score matrix.
        plt.plot(nComponents, plsSvdScores[i, :], lw=3)