예제 #1
0
    def run_tsne(self,
                 subsample=True,
                 perplexity=5.0,
                 n_iter=1e6,
                 seed=None,
                 **kwargs):
        """
        Fit a scikit-learn TSNE model to the SNP or subsampled SNP data set.

        Perplexity is the primary parameter affecting the TSNE, but any
        additional params supported by scikit-learn can be supplied as
        kwargs. Entries in kwargs override the defaults that are set here.

        Parameters:
        -----------
        subsample: (bool)
            If True the columns of .snps are selected with jsubsample_snps()
            before fitting; otherwise the full .snps array is used.
        perplexity: (float)
            Passed to TSNE as 'perplexity'.
        n_iter: (int or float)
            Passed to TSNE as 'n_iter' after conversion to int.
        seed: (int)
            Used for subsampling SNPs and also passed to TSNE as
            'random_state'. If None a seed is drawn from self._seed().
            An explicit seed of 0 is respected.
        kwargs:
            Additional keyword arguments for TSNE.

        Returns:
        --------
        None. The embedding is stored in .pcaxes[0], .variances[0] is set
        to the fixed list [-1.0, -2.0] (presumably a placeholder since no
        explained variance is computed here -- confirm), and ._model is set
        to "TSNE".
        """
        # Only draw a seed when none was supplied. Testing truthiness here
        # would silently replace an explicit seed of 0 with a random one.
        if seed is None:
            seed = self._seed()

        if subsample:
            data = self.snps[:, jsubsample_snps(self.snpsmap, seed)]
            print("Subsampling SNPs: {}/{}".format(data.shape[1],
                                                   self.snps.shape[1]))
        else:
            data = self.snps

        # init TSNE model object with params (sensitive); user kwargs are
        # applied last so that they take precedence over these defaults.
        tsne_kwargs = {
            'perplexity': perplexity,
            'init': 'pca',
            'n_iter': int(n_iter),
            'random_state': seed,
        }
        tsne_kwargs.update(kwargs)
        tsne_model = TSNE(**tsne_kwargs)

        # fit the model and store results on the object
        tsne_data = tsne_model.fit_transform(data)
        self.pcaxes = {0: tsne_data}
        self.variances = {0: [-1.0, -2.0]}
        self._model = "TSNE"
예제 #2
0
    def run_tsne(self, subsample=True, perplexity=5.0, n_iter=1e6, seed=None):
        """
        Fit a scikit-learn TSNE model to the SNP or subsampled SNP data set.

        Parameters:
        -----------
        subsample: (bool)
            If True the columns of .snps are selected with jsubsample_snps()
            before fitting; otherwise the full .snps array is used.
        perplexity: (float)
            Passed to TSNE as 'perplexity'.
        n_iter: (int or float)
            Passed to TSNE as 'n_iter' after conversion to int.
        seed: (int)
            Used for subsampling SNPs and also passed to TSNE as
            'random_state'. If None a seed is drawn from self._seed().
            An explicit seed of 0 is respected.

        Returns:
        --------
        None. The embedding is stored in .pcaxes[0] and .variances[0] is
        set to the fixed list [-1.0, -2.0] (presumably a placeholder since
        no explained variance is computed here -- confirm).
        """
        # Only draw a seed when none was supplied. Testing truthiness here
        # would silently replace an explicit seed of 0 with a random one.
        if seed is None:
            seed = self._seed()

        if subsample:
            data = self.snps[:, jsubsample_snps(self.snpsmap, seed)]
            print(
                "Subsampling SNPs: {}/{}"
                .format(data.shape[1], self.snps.shape[1])
            )
        else:
            data = self.snps

        # init TSNE model object with params (sensitive)
        tsne_model = TSNE(
            perplexity=perplexity,
            init='pca',
            n_iter=int(n_iter),
            random_state=seed,
        )

        # fit the model and store results on the object
        tsne_data = tsne_model.fit_transform(data)
        self.pcaxes = {0: tsne_data}
        self.variances = {0: [-1.0, -2.0]}
예제 #3
0
 def subsample_snps(self, random_seed=None, quiet=False):
     """
     Calls jitted subsample SNPs function to sample snps using snpsmap.

     Parameters:
     -----------
     random_seed: (int)
         Seed passed to jsubsample_snps(). If None a random integer in
         [0, 1e9) is drawn with numpy. An explicit seed of 0 is respected.
     quiet: (bool)
         If True the summary message is not printed.

     Returns:
     --------
     The array of columns from .snps selected by jsubsample_snps().
     """
     # Compare against None rather than testing truthiness so that a
     # caller-supplied seed of 0 is not replaced by a random seed.
     if random_seed is None:
         random_seed = np.random.randint(0, 10 ** 9)
     subarr = self.snps[:, jsubsample_snps(self.snpsmap, random_seed)]
     if not quiet:
         self._print("subsampled {} unlinked SNPs".format(subarr.shape[1]))
     return subarr
예제 #4
0
    def _impute_kmeans(self, topcov=0.9, niters=5, quiet=False):
        """
        Impute missing data using groups found by iterative kmeans
        clustering.

        Starts from a single 'global' group holding all of self.names. Each
        iteration re-filters the original data at a different mincov value
        (niters evenly spaced values from topcov to self.mincov), imputes
        missing data using the current grouping, and then, except on the
        final iteration, derives a new grouping by kmeans clustering on a
        PCA of the subsampled imputed SNPs.

        Parameters:
        -----------
        topcov: (float)
            The mincov value used on the first iteration.
        niters: (int)
            Number of iterations to run. The imputed array from the last
            of these iterations is returned.
        quiet: (bool)
            Not used in this function; self.quiet is what is passed on to
            SNPsExtracter and SNPsImputer.

        Returns:
        --------
        The imputed array produced by SNPsImputer on the final iteration.
        If niters is 0 the loop never runs and None is returned.

        Side effects: self.snpsmap is replaced on every iteration with the
        snpsmap of the newly filtered data.
        """
        # the ML models to fit; self.impute_method is used as the number
        # of kmeans clusters (K).
        pca_model = decomposition.PCA(n_components=None)
        kmeans_model = KMeans(n_clusters=self.impute_method)

        # start kmeans with a global imap
        kmeans_imap = {'global': self.names}

        # iterate over step values
        iters = np.linspace(topcov, self.mincov, niters)
        last_iter = niters - 1
        for it, kmeans_mincov in enumerate(iters):

            # start message
            kmeans_minmap = {i: self.mincov for i in kmeans_imap}
            self._print(
                "Kmeans clustering: iter={}, K={}, mincov={}, minmap={}".
                format(it, self.impute_method, kmeans_mincov, kmeans_minmap))

            # 1. Load orig data and filter with imap, minmap, mincov=step
            se = SNPsExtracter(
                self.data,
                imap=kmeans_imap,
                minmap=kmeans_minmap,
                mincov=kmeans_mincov,
                quiet=self.quiet,
            )
            se.parse_genos_from_hdf5()

            # update snpsmap to new filtered data to use for subsampling
            self.snpsmap = se.snpsmap

            # 2. Impute missing data using current kmeans clusters
            impdata = SNPsImputer(se.snps, se.names, kmeans_imap, "sample",
                                  self.quiet).run()

            # x. On final iteration return this imputed array as the result.
            # This is tied to niters rather than a fixed index so that any
            # number of iterations ends on the last (lowest) mincov step.
            if it == last_iter:
                return impdata

            # 3. subsample unlinked SNPs
            subdata = impdata[:, jsubsample_snps(se.snpsmap, self._seed())]

            # 4. PCA on new imputed data values
            pcadata = pca_model.fit_transform(subdata)

            # 5. Kmeans clustering to find new imap grouping: each cluster
            # label maps to the names of the samples assigned to it.
            kmeans_model.fit(pcadata)
            labels = np.unique(kmeans_model.labels_)
            kmeans_imap = {
                i:
                [se.names[j] for j in np.where(kmeans_model.labels_ == i)[0]]
                for i in labels
            }
            self._print(kmeans_imap)
            self._print("")
예제 #5
0
 def subsample_snps(self, random_seed=None, quiet=False):
     """
     Calls jitted functions to subsample 1 SNP per locus/linkage-block
     using snpsmap.

     Parameters:
     -----------
     random_seed: (int)
         Seed passed to jsubsample_snps(). If None a random integer in
         [0, 1e9) is drawn with numpy. An explicit seed of 0 is respected.
     quiet: (bool)
         If True the summary message is not printed.

     Returns:
     --------
     The array of columns from .snps selected by jsubsample_snps().
     """
     # Compare against None rather than testing truthiness so that a
     # caller-supplied seed of 0 is not replaced by a random seed.
     if random_seed is None:
         random_seed = np.random.randint(0, 10 ** 9)
     subarr = self.snps[:, jsubsample_snps(self.snpsmap, random_seed)]
     if not quiet:
         self._print("subsampled {} unlinked SNPs".format(subarr.shape[1]))
     return subarr
예제 #6
0
 def subsample_loci(self, random_seed=None, quiet=False):
     """
     Subsample loci/linkage-blocks with replacement to the same number as
     the original assembly. This does not subsample unlinked SNPs per
     locus.

     Not yet implemented: the arguments are accepted but ignored.

     Parameters:
     -----------
     random_seed: (int)
         Currently unused.
     quiet: (bool)
         Currently unused.

     Raises:
     -------
     NotImplementedError: always.
     """
     # TODO: implement resampling of loci with replacement. No code is kept
     # below the raise because it could never execute.
     raise NotImplementedError("TODO")
예제 #7
0
파일: pca.py 프로젝트: yaominzoe/ipyrad
    def run(self, seed=None, subsample=True):
        """
        Decompose genotype array (.snps) into principal component axes.

        Parameters:
        -----------
        seed: (int)
            Random number seed used if/when subsampling SNPs. If None a
            seed is drawn from self._seed(). An explicit seed of 0 is
            respected. The seed that is used is stored in self.seed.
        subsample: (bool)
            Subsample one SNP per RAD locus to reduce effect of linkage.

        Returns:
        --------
        A tuple with two numpy arrays. The first is the new data decomposed
        into principal coordinate space; the second is an array with the
        variance explained by each PC axis.
        """
        # update seed. Numba seed cannot be None, so get random int if None.
        # The check is against None (not truthiness) so that 0 is kept.
        if seed is None:
            self.seed = self._seed()
        else:
            self.seed = seed

        # sample one SNP per locus
        if subsample:
            data = self.snps[:, jsubsample_snps(self.snpsmap, self.seed)]
            self._print("Subsampling SNPs: {}/{}".format(
                data.shape[1], self.snps.shape[1]))
        else:
            data = self.snps

        # decompose pca call; n_components=None is passed to PCA
        model = decomposition.PCA(None)
        model.fit(data)
        newdata = model.transform(data)
        variance = model.explained_variance_ratio_

        # return tuple with new coordinates and variance explained
        return newdata, variance
예제 #8
0
    def _run(self, seed, subsample, quiet):
        """
        Perform one PCA decomposition; called inside .run().

        Parameters:
        -----------
        seed: (int)
            Passed to jsubsample_snps() when subsampling.
        subsample: (bool)
            If True only the columns of .snps chosen by jsubsample_snps()
            are decomposed; otherwise the full .snps array is used.
        quiet: (bool)
            If True the subsampling message is not printed.

        Returns:
        --------
        A tuple of (transformed data, explained variance ratio per axis).
        """
        if not subsample:
            genos = self.snps
        else:
            # keep only the columns picked by the jitted subsampler
            keep = jsubsample_snps(self.snpsmap, seed)
            genos = self.snps[:, keep]
            if not quiet:
                nkept = genos.shape[1]
                ntotal = self.snps.shape[1]
                print("Subsampling SNPs: {}/{}".format(nkept, ntotal))

        # fit the PCA on the matrix and then project that same matrix
        pca = decomposition.PCA(None)
        pca.fit(genos)
        coords = pca.transform(genos)

        return coords, pca.explained_variance_ratio_
예제 #9
0
파일: pca.py 프로젝트: Irvilma/ipyrad
    def run_umap(self, subsample=True, seed=123, n_neighbors=15, **kwargs):
        """
        Fit a UMAP model (from the umap package) to the SNP or subsampled
        SNP data set. The umap package is imported when this function is
        called, so it only needs to be installed to use this method.

        Parameters:
        -----------
        subsample: (bool)
            If True the columns of .snps are selected with jsubsample_snps()
            before fitting; otherwise the full .snps array is used.
        seed: (int)
            Used for subsampling SNPs and also passed to UMAP as
            'random_state'. If None a seed is drawn from self._seed().
            An explicit seed of 0 is respected.
        n_neighbors: (int)
            Passed to UMAP as 'n_neighbors'.
        kwargs:
            Additional keyword arguments for umap.UMAP. These override the
            defaults that are set here.

        Returns:
        --------
        None. The embedding is stored in .pcaxes[0], .variances[0] is set
        to the fixed list [-1.0, -2.0] (presumably a placeholder since no
        explained variance is computed here -- confirm), and ._model is set
        to "UMAP".

        Raises:
        -------
        ImportError: if the umap package cannot be imported.
        """
        # check just-in-time install
        try:
            import umap
        except ImportError:
            raise ImportError(
                "to use this function you must install umap with:\n"
                "  conda install umap-learn -c conda-forge "
                )

        # subsample SNPS. Only draw a seed when none was supplied; testing
        # truthiness would silently replace an explicit seed of 0.
        if seed is None:
            seed = self._seed()
        if subsample:
            data = self.snps[:, jsubsample_snps(self.snpsmap, seed)]
            print(
                "Subsampling SNPs: {}/{}"
                .format(data.shape[1], self.snps.shape[1])
            )
        else:
            data = self.snps

        # init UMAP model object with params; user kwargs are applied last
        # so that they take precedence over these defaults.
        umap_kwargs = {
            'n_neighbors': n_neighbors,
            'init': 'spectral',
            'random_state': seed,
        }
        umap_kwargs.update(kwargs)
        umap_model = umap.UMAP(**umap_kwargs)

        # fit the model and store results on the object
        umap_data = umap_model.fit_transform(data)
        self.pcaxes = {0: umap_data}
        self.variances = {0: [-1.0, -2.0]}
        self._model = "UMAP"