def test_toy_example_collapse_points():
    """Check that NCA collapses a three-point toy dataset onto one point.

    The dataset consists of two samples sharing a class and a third sample
    of another class placed exactly at their midpoint. After fitting, all
    transformed samples are expected to coincide. Indeed, the objective is
    2/(1 + exp(d/2)), with d the euclidean distance between the two samples
    of the same class. Because d >= 0, it is maximized at d=0, where the
    objective equals 1 (loss=-1.).
    """
    n_features = 5
    rng = np.random.RandomState(42)
    same_class_pair = rng.randn(2, n_features)
    midpoint = same_class_pair.mean(axis=0)[np.newaxis, :]
    X = np.vstack([same_class_pair, midpoint])
    y = [0, 0, 1]

    class LossStorer:
        """Keeps the loss computed at the most recent callback invocation."""

        def __init__(self, X, y):
            # Start from an infinite loss; the callback overwrites it.
            self.loss = np.inf
            # Stand-in estimator used only to recompute the loss value.
            stand_in = NeighborhoodComponentsAnalysis()
            stand_in.n_iter_ = np.inf
            self.fake_nca = stand_in
            self.X, checked_y, _ = stand_in._validate_params(X, y)
            # Pairwise mask: True where two samples carry the same label.
            self.same_class_mask = (
                checked_y[:, np.newaxis] == checked_y[np.newaxis, :])

        def callback(self, transformation, n_iter):
            """Record the loss evaluated at the given transformation."""
            current_loss, _ = self.fake_nca._loss_grad_lbfgs(
                transformation, self.X, self.same_class_mask, -1.0)
            self.loss = current_loss

    recorder = LossStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(random_state=42,
                                         callback=recorder.callback)
    X_t = nca.fit_transform(X, y)
    print(X_t)

    # Once collapsed, every row is (almost) equal to the first row.
    offsets_from_first = X_t - X_t[0]
    assert_array_almost_equal(offsets_from_first, 0.)
    assert abs(recorder.loss + 1) < 1e-10
random_state=1337) df = pd.DataFrame(data=X, columns=['V' + str(i) for i in range(1, 20 + 1)]) df.insert(0, 'ID', np.array(range(2000))) df['Target'] = y # check data #df.info() #df.isnull().values.sum() # mtx X = df.iloc[:, 1:21].values y = df['Target'].values # demension reduction nca = NeighborhoodComponentsAnalysis(random_state=1234) X = nca.fit_transform(X, y) # transformation mms = MinMaxScaler() X = mms.fit_transform(X) def run_simulation(n_neighbors=10, metric='minkowski', thres1=10, thres2=None): nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(X) # Let's find the k-neighbors of each point in object X. To do that we call the kneighbors() function on object X. distances, indices = nbrs.kneighbors(X) # get all the sub sub_id = df.loc[df.Target == 1, 'ID'].tolist() sub = indices[sub_id]
def _reduce_before_lda(features, targets, method):
    """Apply the pre-LDA reduction step named by `method` to `features`."""
    if method == 'NONE':
        return features
    if method == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        # Only the first element of the returned pair is used.
        return pls.fit_transform(features, targets)[0]
    if method == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        return pca.fit_transform(features)
    if method == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        kpca = KernelPCA(kernel="rbf", gamma=5)
        return kpca.fit_transform(features)
    if method == 'NCA':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        nca = NeighborhoodComponentsAnalysis()
        return nca.fit_transform(features, targets)
    # Previously an unknown value left the result unbound and surfaced later
    # as a NameError; fail here with an actionable message instead.
    raise ValueError(
        "preprocess_lda must be one of 'PLS', 'PCA', 'kernelPCA', 'NONE' "
        "or 'NCA', got %r" % (method,))


def plot_single_projection(holder,
                           labels,
                           class_name='Antioxidants',
                           fp_name='fps_e3fp_1024bit',
                           standardize=True,
                           preprocess_lda='PCA'):
    """Scatter-plot a 2-D LDA projection of one class against all others.

    Parameters
    ----------
    holder : dict
        Maps fingerprint file names to DataFrames; `holder[fp_name]` is the
        DataFrame that gets projected.
    labels : dict
        Maps DrugCombID (the DataFrame index values) to an ATC class.
    class_name : str
        Class to highlight; every other class is merged into a single
        "not <class_name>" group.
    fp_name : str
        Key selecting the DataFrame inside `holder`.
    standardize : bool
        If true, the features are standardized with mlxtend's `standardize`
        before any further processing.
    preprocess_lda : str
        Reduction applied before LDA: 'PLS', 'PCA', 'kernelPCA', 'NCA' or
        'NONE'.

    Returns
    -------
    matplotlib figure
        Scatter plot of both groups and their centroids; the title holds
        `class_name` and the distance between the two centroids.

    Raises
    ------
    ValueError
        If `preprocess_lda` is not one of the supported values.
    """
    from mlxtend.preprocessing import standardize as st
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    # mlxtend's LDA is used on purpose: with sklearn's LDA a dummy class
    # would have to be added to obtain 2 components after transformation.
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis
    from scipy.spatial.distance import pdist

    df_cluster = holder[fp_name].copy()
    df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
    df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]

    # The index holds the ids that `labels` is keyed on; remember it before
    # standardization replaces it with a positional index.
    sample_ids = df_cluster.index.copy()
    if standardize:
        df_cluster.reset_index(inplace=True, drop=True)
        df_cluster = st(df_cluster)

    # Human-readable class per row (positional match with df_cluster), with
    # everything outside the target class merged into one group.
    class_labels = pd.Series(sample_ids, index=df_cluster.index).map(labels)
    class_labels = class_labels.where(class_labels == class_name,
                                      'not ' + class_name)

    # Encode str labels as ints. The codes are kept in their own array
    # instead of being written back into the frame, so the readable labels
    # stay untouched and the targets keep an integer dtype.
    encoded = LabelEncoder().fit_transform(class_labels)

    reduced = _reduce_before_lda(df_cluster.values, encoded, preprocess_lda)

    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(reduced, encoded)
    projected = lda.transform(reduced)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            'Casting complex values to real discards the imaginary part')
        # In case of complex numbers. The builtin float replaces the
        # np.float alias, which no longer exists in current NumPy.
        projected = projected.astype(float)

    df = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=projected)
    df['classes'] = class_labels
    in_class = df.classes == class_name

    # A single-cluster KMeans per group yields that group's centroid.
    km = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km.fit(df.loc[~in_class, [0, 1]])
    km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km1.fit(df.loc[in_class, [0, 1]])

    d = pdist([km.cluster_centers_[0], km1.cluster_centers_[0]])
    d = str(round(d[0], 3))

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(df.loc[~in_class, 0],
               df.loc[~in_class, 1],
               marker=',',
               color='grey')
    ax.scatter(df.loc[in_class, 0],
               df.loc[in_class, 1],
               marker=',',
               color='orange')
    ax.scatter(km.cluster_centers_[:, 0],
               km.cluster_centers_[:, 1],
               marker='X',
               color='green',
               linewidths=30)
    ax.scatter(km1.cluster_centers_[:, 0],
               km1.cluster_centers_[:, 1],
               marker='X',
               color='red',
               linewidths=30)
    fig.suptitle(class_name + ' ' + d)
    return fig
random_state=1337) df = pd.DataFrame(data=X, columns=['V' + str(i) for i in range(1, 20 + 1)]) df.insert(0, 'ID', np.array(range(2000))) df['Target'] = y # check data #df.info() #df.isnull().values.sum() # mtx X = df.iloc[:, 1:21].values y = df['Target'].values # demension reduction nca = NeighborhoodComponentsAnalysis(random_state=1234) X = nca.fit_transform(X, y) # transformation mms = MinMaxScaler() X = mms.fit_transform(X) def run_simulation(n_neighbors=10, metric='minkowski', thres1=10, thres2=None): nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(X) # Let's find the k-neighbors of each point in object X. To do that we call the kneighbors() function on object X. distances, indices = nbrs.kneighbors(X) # get all the sub sub_id = df.loc[df.Target == 1, 'ID'].tolist() sub = indices[sub_id]