Example #1
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.neighbors import NeighborhoodComponentsAnalysis


def test_toy_example_collapse_points():
    """Test on a toy example of three points that should collapse

    We build a simple example: two points from the same class and a point from
    a different class in the middle of them. On this simple example, the new
    (transformed) points should all collapse into one single point. Indeed, the
    objective is 2/(1 + exp(d/2)), with d the Euclidean distance between the
    two samples from the same class. This is maximized for d=0 (because d>=0),
    with an objective equal to 1 (loss=-1.).

    """
    rng = np.random.RandomState(42)
    input_dim = 5
    two_points = rng.randn(2, input_dim)
    X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
    y = [0, 0, 1]

    class LossStorer:
        def __init__(self, X, y):
            self.loss = np.inf  # initialize the loss to a very high value
            # Initialize a fake NCA and variables needed to compute the loss:
            self.fake_nca = NeighborhoodComponentsAnalysis()
            self.fake_nca.n_iter_ = np.inf
            self.X, y, _ = self.fake_nca._validate_params(X, y)
            self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]

        def callback(self, transformation, n_iter):
            """Stores the last value of the loss function"""
            self.loss, _ = self.fake_nca._loss_grad_lbfgs(
                transformation, self.X, self.same_class_mask, -1.0)

    loss_storer = LossStorer(X, y)
    nca = NeighborhoodComponentsAnalysis(random_state=42,
                                         callback=loss_storer.callback)
    X_t = nca.fit_transform(X, y)
    print(X_t)
    # test that points are collapsed into one point
    assert_array_almost_equal(X_t - X_t[0], 0.)
    assert abs(loss_storer.loss + 1) < 1e-10
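A quick numeric sanity check of the objective quoted in the docstring (a sketch using only the formula above, nothing from scikit-learn internals):

import numpy as np

# Objective from the docstring: 2 / (1 + exp(d/2)), with d the distance
# between the two same-class points. At a perfect collapse d = 0, so the
# objective is 1 and the loss is -1, matching the test's assertions.
for d in (0.0, 1.0, 4.0):
    print(d, 2 / (1 + np.exp(d / 2)))  # decreases as the points move apart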
Example #2
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.neighbors import NeighborhoodComponentsAnalysis, NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

# NOTE: the head of this listing is truncated in the source; the call below is
# reconstructed from context (2000 rows, 20 features named 'V1'..'V20').
X, y = make_classification(n_samples=2000, n_features=20,
                           random_state=1337)
df = pd.DataFrame(data=X, columns=['V' + str(i) for i in range(1, 20 + 1)])
df.insert(0, 'ID', np.array(range(2000)))
df['Target'] = y

# check data
#df.info()
#df.isnull().values.sum()

# feature matrix and target vector
X = df.iloc[:, 1:21].values
y = df['Target'].values

# dimension reduction
nca = NeighborhoodComponentsAnalysis(random_state=1234)
X = nca.fit_transform(X, y)

# transformation
mms = MinMaxScaler()
X = mms.fit_transform(X)
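# Side sketch (an assumption, not part of the original example): the same
# NCA + MinMaxScaler preprocessing can be chained with a KNN classifier in a
# Pipeline, so the transform is refit inside cross-validation.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([
    ('nca', NeighborhoodComponentsAnalysis(random_state=1234)),
    ('scale', MinMaxScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=10)),
])
# e.g. cross_val_score(pipe, X, y, cv=5) evaluates the whole chain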


def run_simulation(n_neighbors=10, metric='minkowski', thres1=10, thres2=None):
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(X)

    # find the k nearest neighbors of each point in X by calling kneighbors()
    distances, indices = nbrs.kneighbors(X)

    # get the neighbor indices of every Target == 1 ("sub") sample
    sub_id = df.loc[df.Target == 1, 'ID'].tolist()
    sub = indices[sub_id]
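    # The listing stops here in the source. A hedged continuation sketch (the
    # semantics of thres1/thres2 are assumptions): count how many of each
    # Target == 1 sample's neighbors are themselves Target == 1 and keep the
    # samples whose count reaches thres1.
    counts = (y[sub] == 1).sum(axis=1)
    return np.asarray(sub_id)[counts >= thres1]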
Example #3
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


def plot_single_projection(holder,
                           labels,
                           class_name='Antioxidants',
                           fp_name='fps_e3fp_1024bit',
                           standardize=True,
                           preprocess_lda='PCA'):
    '''
    holder should be a dictionary mapping fingerprint file names to DataFrames;
    labels should be a mapping of DrugCombID -> ATC_class.
    '''

    from mlxtend.preprocessing import standardize as st
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis  # with sklearn's LDA I'd need to add a dummy class to get 2 components after transformation
    from scipy.spatial.distance import pdist

    df_cluster = holder[fp_name].copy()
    df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
    df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]

    if standardize:
        classes = df_cluster.index.copy()
        df_cluster.reset_index(inplace=True, drop=True)
        df_cluster = st(df_cluster)
    else:
        classes = df_cluster.index.copy()
    # our classes are mapped to the index via the labels dictionary
    df_cluster['classes'] = classes
    df_cluster['classes'] = df_cluster['classes'].map(labels)

    df_cluster.loc[df_cluster.classes != class_name,
                   'classes'] = 'not ' + class_name
    #dummy = [0]*(df_cluster.shape[1]-1) + ['dummy']
    #df_cluster.loc[df_cluster.shape[0]] = dummy

    # change labels from str to int
    enc = LabelEncoder()
    real_classes = df_cluster.loc[:, 'classes']
    df_cluster.loc[:, 'classes'] = enc.fit_transform(df_cluster['classes'])
    classes = df_cluster.pop('classes')

    if preprocess_lda == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        temp = pls.fit_transform(df_cluster.values, classes.values)[0]
    elif preprocess_lda == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        pca = KernelPCA(kernel="rbf", gamma=5)
        temp = pca.fit_transform(df_cluster.values)
    elif preprocess_lda == 'NONE':
        temp = df_cluster.values
    elif preprocess_lda == 'NCA':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        nca = NeighborhoodComponentsAnalysis()
        temp = nca.fit_transform(df_cluster.values, classes.values)

    #lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
    #lda.fit(temp, classes.values)
    #temp1 = lda.transform(temp)

    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(temp, classes.values)
    temp = lda.transform(temp)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            'Casting complex values to real discards the imaginary part')
        temp = temp.astype(float)  # np.float is removed in newer NumPy; drops imaginary part

    df = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=temp)
    df['classes'] = real_classes

    km = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km.fit(df.loc[df.classes != class_name, [0, 1]])

    km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km1.fit(df.loc[df.classes == class_name, [0, 1]])

    d = pdist([km.cluster_centers_[0], km1.cluster_centers_[0]])
    d = str(round(d[0], 3))

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(df.loc[df.classes != class_name, 0],
               df.loc[df.classes != class_name, 1],
               marker=',',
               color='grey')
    ax.scatter(df.loc[df.classes == class_name, 0],
               df.loc[df.classes == class_name, 1],
               marker=',',
               color='orange')

    ax.scatter(km.cluster_centers_[:, 0],
               km.cluster_centers_[:, 1],
               marker='X',
               color='green',
               linewidths=30)

    ax.scatter(km1.cluster_centers_[:, 0],
               km1.cluster_centers_[:, 1],
               marker='X',
               color='red',
               linewidths=30)

    fig.suptitle(class_name + ' ' + d)
    return fig
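A hedged usage sketch for plot_single_projection; the synthetic fingerprints, IDs, and labels below are made-up stand-ins for the real holder/labels structures described in the docstring:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
ids = ['DC%04d' % i for i in range(40)]  # fake DrugCombIDs (assumption)
fps = pd.DataFrame(rng.randint(0, 2, size=(40, 1024)), index=ids)
holder = {'fps_e3fp_1024bit': fps}
labels = {i: ('Antioxidants' if k < 20 else 'Analgesics')
          for k, i in enumerate(ids)}
fig = plot_single_projection(holder, labels,
                             class_name='Antioxidants',
                             preprocess_lda='PCA')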