def run_tsne(
    self,
    subsample=True,
    perplexity=5.0,
    n_iter=1e6,
    seed=None,
    **kwargs
):
    """
    Fit the scikit-learn TSNE model to the SNP array (or a subsample of
    it) and store the resulting embedding on this object.

    Parameters:
    -----------
    subsample: (bool)
        If True, columns of .snps are selected with jsubsample_snps()
        using .snpsmap and the seed; otherwise all of .snps is used.
    perplexity: (float)
        Passed to TSNE as 'perplexity'.
    n_iter: (int or float)
        Cast to int and passed to TSNE as 'n_iter'.
    seed: (int)
        Used for SNP subsampling and as the TSNE 'random_state'. Any
        falsy value (None, but also 0) is replaced by self._seed().
    kwargs:
        Additional TSNE arguments. These are applied last, so they
        override the values set by this function.

    Results are stored rather than returned: .pcaxes[0] is the TSNE
    output, .variances[0] is set to the fixed values [-1.0, -2.0], and
    ._model is set to "TSNE".
    """
    # a falsy seed is swapped for one generated by the object
    if not seed:
        seed = self._seed()

    # choose the input matrix
    if not subsample:
        data = self.snps
    else:
        keep = jsubsample_snps(self.snpsmap, seed)
        data = self.snps[:, keep]
        msg = "Subsampling SNPs: {}/{}"
        print(msg.format(data.shape[1], self.snps.shape[1]))

    # defaults first, then user-supplied overrides
    params = dict(
        perplexity=perplexity,
        init='pca',
        n_iter=int(n_iter),
        random_state=seed,
    )
    params.update(kwargs)

    # fit and store the embedding
    embedding = TSNE(**params).fit_transform(data)
    self.pcaxes = {0: embedding}
    self.variances = {0: [-1.0, -2.0]}
    self._model = "TSNE"
def run_tsne(self, subsample=True, perplexity=5.0, n_iter=1e6, seed=None):
    """
    Fit the scikit-learn TSNE model to the SNP array (or a subsample of
    it) and store the result on this object.

    Parameters:
    -----------
    subsample: (bool)
        If True, columns of .snps are selected with jsubsample_snps()
        using .snpsmap and the seed; otherwise all of .snps is used.
    perplexity: (float)
        Passed to TSNE as 'perplexity'.
    n_iter: (int or float)
        Cast to int and passed to TSNE as 'n_iter'.
    seed: (int)
        Used for SNP subsampling and as the TSNE 'random_state'. Any
        falsy value (None, but also 0) is replaced by self._seed().

    Nothing is returned: .pcaxes[0] receives the TSNE output and
    .variances[0] is set to the fixed values [-1.0, -2.0].
    """
    # a falsy seed is swapped for one generated by the object
    if not seed:
        seed = self._seed()

    # choose the input matrix
    if not subsample:
        data = self.snps
    else:
        columns = jsubsample_snps(self.snpsmap, seed)
        data = self.snps[:, columns]
        report = "Subsampling SNPs: {}/{}".format(
            data.shape[1], self.snps.shape[1])
        print(report)

    # build the model; TSNE is initialised from PCA
    model_args = {
        'perplexity': perplexity,
        'init': 'pca',
        'n_iter': int(n_iter),
        'random_state': seed,
    }
    model = TSNE(**model_args)

    # fit and store the embedding
    self.pcaxes = {0: model.fit_transform(data)}
    self.variances = {0: [-1.0, -2.0]}
def subsample_snps(self, random_seed=None, quiet=False):
    """
    Return a column subset of .snps chosen by the jitted
    jsubsample_snps() function from .snpsmap.

    Parameters:
    -----------
    random_seed: (int)
        Seed handed to jsubsample_snps(). Any falsy value (None, but
        also 0) is replaced by a random integer drawn with numpy.
    quiet: (bool)
        If True the summary message is not printed.
    """
    # fall back to a random integer seed when none was supplied
    seed = random_seed or np.random.randint(0, 1e9)

    selected = jsubsample_snps(self.snpsmap, seed)
    subarr = self.snps[:, selected]

    if quiet:
        return subarr
    self._print("subsampled {} unlinked SNPs".format(subarr.shape[1]))
    return subarr
def _impute_kmeans(self, topcov=0.9, niters=5, quiet=False):
    """
    Impute missing data by iteratively refining kmeans sample groups.

    Starts from a single 'global' group containing all of self.names.
    Each iteration (1) re-extracts the data with a mincov value stepped
    linearly from `topcov` to self.mincov, (2) imputes with SNPsImputer
    using the current groups, and, on every iteration except the last,
    (3) subsamples SNPs, (4) runs PCA, and (5) runs KMeans with
    K=self.impute_method to define the groups for the next iteration.

    Parameters:
    -----------
    topcov: (float)
        mincov value applied on the first iteration.
    niters: (int)
        Number of iterations (steps from topcov to self.mincov).
    quiet: (bool)
        Currently unused; SNPsExtracter and SNPsImputer are passed
        self.quiet instead.

    Returns:
    --------
    The imputed array produced on the final iteration. If `niters` is 0
    the loop never runs and None is returned.

    Side effects:
    -------------
    self.snpsmap is overwritten on every iteration with the snpsmap of
    the newly filtered data.
    """
    # the ML models to fit
    pca_model = decomposition.PCA(n_components=None)
    kmeans_model = KMeans(n_clusters=self.impute_method)

    # start kmeans with a global imap
    kmeans_imap = {'global': self.names}

    # iterate over step values
    iters = np.linspace(topcov, self.mincov, niters)

    # index of the last step; derived from the schedule so that any
    # value of niters returns the result of its final iteration
    # (previously hard-coded to 4, which only matched niters=5).
    final_it = len(iters) - 1

    for it, kmeans_mincov in enumerate(iters):

        # start message
        kmeans_minmap = {i: self.mincov for i in kmeans_imap}
        self._print(
            "Kmeans clustering: iter={}, K={}, mincov={}, minmap={}".
            format(it, self.impute_method, kmeans_mincov, kmeans_minmap))

        # 1. Load orig data and filter with imap, minmap, mincov=step
        se = SNPsExtracter(
            self.data,
            imap=kmeans_imap,
            minmap=kmeans_minmap,
            mincov=kmeans_mincov,
            quiet=self.quiet,
        )
        se.parse_genos_from_hdf5()

        # update snpsmap to new filtered data to use for subsampling
        self.snpsmap = se.snpsmap

        # 2. Impute missing data using current kmeans clusters
        impdata = SNPsImputer(
            se.snps, se.names, kmeans_imap, "sample", self.quiet).run()

        # x. On final iteration return this imputed array as the result
        if it == final_it:
            return impdata

        # 3. subsample unlinked SNPs
        subdata = impdata[:, jsubsample_snps(se.snpsmap, self._seed())]

        # 4. PCA on new imputed data values
        pcadata = pca_model.fit_transform(subdata)

        # 5. Kmeans clustering to find new imap grouping
        kmeans_model.fit(pcadata)
        labels = np.unique(kmeans_model.labels_)
        kmeans_imap = {
            i: [se.names[j] for j in np.where(kmeans_model.labels_ == i)[0]]
            for i in labels
        }
        self._print(kmeans_imap)
        self._print("")
def subsample_snps(self, random_seed=None, quiet=False):
    """
    Subsample SNPs with the jitted jsubsample_snps() function, which is
    given .snpsmap and a seed, and return the selected columns of .snps
    (intended as 1 SNP per locus/linkage-block).

    Parameters:
    -----------
    random_seed: (int)
        Seed for the subsampling. Any falsy value (None, but also 0) is
        replaced by a random integer drawn with numpy.
    quiet: (bool)
        Suppress the summary message when True.
    """
    # draw a seed only when the caller did not provide a usable one
    if random_seed:
        seed = random_seed
    else:
        seed = np.random.randint(0, 1e9)

    column_idx = jsubsample_snps(self.snpsmap, seed)
    result = self.snps[:, column_idx]

    # report how many SNPs were retained
    if not quiet:
        nkept = result.shape[1]
        self._print("subsampled {} unlinked SNPs".format(nkept))
    return result
def subsample_loci(self, random_seed=None, quiet=False):
    """
    Placeholder for subsampling loci/linkage-blocks with replacement to
    the same number as the original assembly (without subsampling
    unlinked SNPs per locus).

    Not implemented yet: calling this always raises.

    Parameters:
    -----------
    random_seed: (int)
        Reserved for the future implementation; currently unused.
    quiet: (bool)
        Reserved for the future implementation; currently unused.

    Raises:
    -------
    NotImplementedError: always.
    """
    # The statements that used to follow this raise were unreachable and
    # merely repeated the per-SNP subsampling of subsample_snps(), which
    # is not resampling of loci, so they were removed.
    raise NotImplementedError("TODO")
def run(self, seed=None, subsample=True):
    """
    Run a PCA decomposition of the genotype array (.snps).

    Parameters:
    -----------
    seed: (int)
        Random number seed used when subsampling SNPs. It is stored on
        the object as .seed; any falsy value (None, but also 0) is
        replaced by self._seed().
    subsample: (bool)
        If True, columns of .snps are selected with jsubsample_snps()
        (one SNP per RAD locus, to reduce the effect of linkage);
        otherwise the full array is used.

    Returns:
    --------
    A tuple of two arrays: the data transformed onto the PC axes, and
    the explained variance ratio of each axis.
    """
    # the numba seed cannot be None, so fall back to a generated int
    self.seed = seed if seed else self._seed()

    # select the input matrix
    if not subsample:
        data = self.snps
    else:
        data = self.snps[:, jsubsample_snps(self.snpsmap, self.seed)]
        nsub, ntotal = data.shape[1], self.snps.shape[1]
        self._print("Subsampling SNPs: {}/{}".format(nsub, ntotal))

    # PCA with n_components=None, fit and then applied to the same data
    pca = decomposition.PCA(None)
    pca.fit(data)
    coords = pca.transform(data)

    # new coordinates and variance explained
    return coords, pca.explained_variance_ratio_
def _run(self, seed, subsample, quiet):
    """
    One PCA iteration; called inside .run().

    Parameters:
    -----------
    seed: (int)
        Seed passed to jsubsample_snps() when subsampling.
    subsample: (bool)
        If True, run on columns of .snps selected by jsubsample_snps()
        from .snpsmap; otherwise run on the full .snps array.
    quiet: (bool)
        If True the subsampling message is not printed.

    Returns:
    --------
    A tuple of (transformed data, explained variance ratio per axis).
    """
    data = self.snps

    # optionally reduce to one SNP per locus
    if subsample:
        data = data[:, jsubsample_snps(self.snpsmap, seed)]
        if not quiet:
            counts = (data.shape[1], self.snps.shape[1])
            print("Subsampling SNPs: {}/{}".format(*counts))

    # PCA with n_components=None, fit and then applied to the same data
    pca = decomposition.PCA(None)
    pca.fit(data)
    transformed = pca.transform(data)
    return transformed, pca.explained_variance_ratio_
def run_umap(self, subsample=True, seed=123, n_neighbors=15, **kwargs):
    """
    Fit a UMAP model to the SNP array (or a subsample of it) and store
    the resulting embedding on this object.

    Parameters:
    -----------
    subsample: (bool)
        If True, columns of .snps are selected with jsubsample_snps()
        using .snpsmap and the seed; otherwise all of .snps is used.
    seed: (int)
        Used for SNP subsampling and as the UMAP 'random_state'. Any
        falsy value (None or 0) is replaced by self._seed().
    n_neighbors: (int)
        Passed to UMAP as 'n_neighbors'.
    kwargs:
        Additional UMAP arguments. These are applied last, so they
        override the values set by this function.

    Raises:
    -------
    ImportError: if the optional 'umap' package cannot be imported.

    Results are stored rather than returned: .pcaxes[0] is the UMAP
    output, .variances[0] is set to the fixed values [-1.0, -2.0], and
    ._model is set to "UMAP".
    """
    # umap is an optional dependency, so it is imported only when needed
    try:
        import umap
    except ImportError:
        raise ImportError(
            "to use this function you must install umap with:\n"
            " conda install umap-learn -c conda-forge "
        )

    # a falsy seed is swapped for one generated by the object
    if not seed:
        seed = self._seed()

    # choose the input matrix
    if not subsample:
        data = self.snps
    else:
        keep = jsubsample_snps(self.snpsmap, seed)
        data = self.snps[:, keep]
        print("Subsampling SNPs: {}/{}".format(
            data.shape[1], self.snps.shape[1]))

    # UMAP settings: defaults first, then user-supplied overrides
    settings = dict(
        n_neighbors=n_neighbors,
        init='spectral',
        random_state=seed,
    )
    settings.update(kwargs)

    # fit and store the embedding
    embedding = umap.UMAP(**settings).fit_transform(data)
    self.pcaxes = {0: embedding}
    self.variances = {0: [-1.0, -2.0]}
    self._model = "UMAP"