from sklearn.decomposition import PCA
from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA


def extract_features(extraction_type, n_components):
    """Return an unfitted extractor for the requested method ('pca' or 'lda')."""
    if extraction_type == 'pca':
        return PCA(n_components=n_components)
    elif extraction_type == 'lda':
        return LDA(n_discriminants=n_components)
    else:
        raise ValueError("Enter a valid method ('pca' or 'lda')")
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''
    holder : dict mapping fingerprint filenames to DataFrames of fingerprints.
    labels : mapping of DrugCombID -> ATC class.
    '''
    if only_pca:
        from sklearn.decomposition import PCA

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'

    else:  # LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # With a binary target the between-class scatter matrix has rank 1, so
        # only one discriminant is informative; see
        # https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366

        df = dict()
        for ind, i in enumerate([
                'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
                'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
                'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
                'fps_gae_64bit_new'
        ]):

            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                from sklearn.preprocessing import MinMaxScaler

                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name

            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(
                df_cluster['classes'])
            classes = df_cluster.pop('classes')

            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values
            else:  # guard against a mistyped option leaving `temp` undefined
                raise ValueError(f"Unknown preprocess_lda: {preprocess_lda!r}")

            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                temp = temp.astype(float)  # discard imaginary parts from LDA's eigendecomposition
            df[ind] = pd.DataFrame(index=df_cluster.index,
                                   columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes

        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6),
          (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(13, 14))
    cm = plt.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))),
                 alpha=0.6)

    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
        [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9], df.keys(), df.values(),
        [
            'E3FP', 'Morgan_1024', 'Topo_1024', 'Infomax', 'VAE_256', 'VAE_16',
            'Trans_1024', 'Trans_64', 'GAE_64'
        ]):
        if not binarize_class:
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0],
                    x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i +
                    f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name +
                f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
            if cluster:
                from sklearn.cluster import KMeans
                from scipy.spatial.distance import pdist
                from sklearn.metrics import silhouette_score as sil
                from sklearn.metrics import calinski_harabasz_score as chs

                km = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km.fit(x.loc[x.classes != class_name, [0, 1]])

                km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
                km1.fit(x.loc[x.classes == class_name, [0, 1]])

                ax_n.scatter(km.cluster_centers_[:, 0],
                             km.cluster_centers_[:, 1],
                             marker='X',
                             color='darkblue',
                             s=100,
                             linewidth=3)
                ax_n.scatter(km1.cluster_centers_[:, 0],
                             km1.cluster_centers_[:, 1],
                             marker='X',
                             color='red',
                             s=100,
                             linewidth=3)

                d = round(
                    pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                          metric='euclidean')[0], 3)
                d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
                d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
                if return_distances:
                    cl_name = class_name + ' ' + name
                    distances[cl_name] = d
                    sil_scores[cl_name] = d_sc
                    chs_scores[cl_name] = d_chs
                name = name + '\n|d:' + str(d) + '|sil:' + str(
                    d_sc) + '|chs:' + str(d_chs)
                ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])

    legend_labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    else:
        return fig, distances, sil_scores, chs_scores
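

# Usage sketch (hypothetical data; the fingerprint DataFrames and the
# DrugCombID -> ATC-class mapping are assumptions, not defined above):
#
#     holder = {'fps_e3fp_1024bit': e3fp_df,
#               'fps_morgan_1024bit': morgan_df,
#               ...}  # one DataFrame per fingerprint key used in the function
#     fig, dists, sils, chss = plot_projections(holder, atc_labels,
#                                               preprocess_lda='PCA',
#                                               class_name='Antioxidants',
#                                               return_distances=True)
#     fig.savefig('projections.png', dpi=300)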
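# The tests below rely on module-level fixtures that the excerpt omits. A
# minimal sketch, assuming the Iris setup from mlxtend's own LDA test suite
# (the eigenvalues asserted in test_evals are consistent with Iris):

import numpy as np
import pytest
from numpy.testing import assert_almost_equal

from mlxtend.data import iris_data
from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA

X, y = iris_data()
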
def test_evals():
    lda = LDA(n_discriminants=2)
    res = lda.fit(X, y).transform(X)
    np.set_printoptions(suppress=True)
    print('%s' % lda.e_vals_)
    assert_almost_equal(lda.e_vals_, [20.90, 0.14, 0.0, 0.0], decimal=2)
def test_default_2components():
    lda = LDA(n_discriminants=2)
    lda.fit(X, y)
    res = lda.fit(X, y).transform(X)
    assert res.shape[1] == 2
def test_default_components():
    lda = LDA()
    lda.fit(X, y)
    res = lda.fit(X, y).transform(X)
    assert res.shape[1] == 4
def test_fail_array_transform():
    lda = LDA()
    lda.fit(X, y)
    with pytest.raises(ValueError):
        lda.transform(X[1])
def test_fail_array_fit():
    lda = LDA()
    with pytest.raises(ValueError):
        lda.fit(X[1], y[1])
def test_default_0components():
    with pytest.raises(AttributeError):
        LDA(n_discriminants=0)
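
# Run with pytest (the path shown is where mlxtend keeps these tests; adjust
# to your local file name):
#     pytest -q mlxtend/feature_extraction/tests/test_linear_discriminant_analysis.py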