# Script: embed the 2012 convention-speech corpus terms with PHATE and render a
# scattertext explorer.
# NOTE(review): this chunk is truncated — the final st.produce_pca_explorer(...)
# call is cut off mid-argument-list, so `projection` and `scores` are presumably
# passed in the missing arguments; confirm against the full script.
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import phate
import scattertext as st
from scipy.sparse.linalg import svds  # NOTE(review): svds is never used in the visible code

# Load the sample 2012 political convention speeches and parse each text.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)

# Build a stoplisted unigram corpus keyed by party, with speakers as metadata.
corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

# TF-IDF weight the term-document matrix, then project terms to 2-D with PHATE.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = phate.PHATE().fit_transform(embeddings).T
projection = pd.DataFrame({
    'term': corpus.get_metadata(),
    'x': projection_raw[0],
    'y': projection_raw[1]
}).set_index('term')

# Per-document indicator: 1 for documents in the 'democrat' category, else 0.
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)
          ).astype(int)

# Render the interactive explorer (call truncated in this chunk).
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='Democratic',
                               not_category_name='Republican',
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
def spectral(data, label, gamma_):
    """Spectral-cluster `data` into 10 clusters, score with ARI, visualize with PHATE.

    Two modes, selected by `gamma_`:

    * ``gamma_ != 0`` — "test" mode: cluster the full dataset once with the
      given RBF gamma, print the ARI against `label`, and plot a PHATE
      embedding colored by the predicted clusters.
    * ``gamma_ == 0`` — "train" mode: shuffle the rows, split into 10
      subsamples, grid-search gamma in [0, 2) (step 0.2) on each subsample,
      and print per-subsample and average best ARI/gamma.  The PHATE plot is
      drawn only for the last subsample.

    Parameters
    ----------
    data : pandas object with `.to_numpy()`
        Feature matrix, one row per sample.
    label : pandas object with `.to_numpy()`
        Ground-truth labels, aligned with `data` rows.
    gamma_ : float
        RBF kernel gamma; 0 triggers the subsample grid search.

    NOTE(review): relies on module-level `SpectralClustering`, `np`,
    `adjusted_rand_score`, and `phate` imports not visible in this chunk.
    """
    data_shape = data.shape
    label_shape = label.shape
    data = data.to_numpy()
    label = label.to_numpy()
    if gamma_ != 0:
        # test: single clustering run on the full data with the supplied gamma
        spectral_cluster = SpectralClustering(n_clusters=10, gamma=gamma_)
        y_pred = spectral_cluster.fit_predict(data)
        true = np.squeeze(label)
        ARI = adjusted_rand_score(true, y_pred)
        print('Testing ARI: ', ARI)
        phate_operator = phate.PHATE(t=25)
        spectral_phate = phate_operator.fit_transform(data)
        phate.plot.scatter2d(spectral_phate, c=y_pred)
    else:
        # train: shuffled subsample grid search over gamma
        index = np.arange(data_shape[0])
        np.random.shuffle(index)
        sample = 10
        subsample_size = int(data_shape[0] / sample)
        ARI_subsample = np.zeros(sample)
        gamma_subsample = np.zeros(sample)
        for i in range(sample):
            print('current:', i)
            start = int(i * subsample_size)
            end = int((i + 1) * subsample_size)
            # subsampling: gather the rows of this shuffled slice
            selected_data = np.zeros((subsample_size, data_shape[1]))
            selected_label = np.zeros((subsample_size, label_shape[1]))
            location = 0
            # NOTE(review): loop variable `id` shadows the builtin of the same name
            for id in index[start:end]:
                selected_data[location, :] = data[id, :]
                selected_label[location] = label[id]
                location += 1
            best_ARI_i = 0
            best_gamma = 0
            best_y_pred = np.zeros((subsample_size, label_shape[1]))
            # grid-search gamma; affinity defaults to 'rbf'
            for gamma_value in np.arange(0, 2, 0.2):
                spectral_cluster = SpectralClustering(
                    n_clusters=10, gamma=gamma_value)
                y_pred = spectral_cluster.fit_predict(selected_data)
                # idempotent after the first pass; flattens (n, 1) labels to (n,)
                selected_label = np.squeeze(selected_label)
                current_ARI = adjusted_rand_score(selected_label, y_pred)
                if current_ARI > best_ARI_i:
                    best_ARI_i = current_ARI
                    best_gamma = gamma_value
                    best_y_pred = y_pred
            ARI_subsample[i] = best_ARI_i
            gamma_subsample[i] = best_gamma
            # plot phate for the last subsample only
            if i == int(sample - 1):
                phate_operator = phate.PHATE(t=25)
                spectral_phate = phate_operator.fit_transform(selected_data)
                phate.plot.scatter2d(spectral_phate, c=best_y_pred)
        # summary of the grid search (only meaningful in train mode, so these
        # prints live inside the else branch)
        print('ARI: ', ARI_subsample)
        print('The average ARI: ', np.average(ARI_subsample))
        print('Gamma: ', gamma_subsample)
        print('The average gamma: ', np.average(gamma_subsample))
def phate(
    adata: AnnData,
    n_components: int = 2,
    k: int = 5,
    a: int = 15,
    n_landmark: int = 2000,
    t: Union[int, str] = 'auto',
    gamma: float = 1.0,
    n_pca: int = 100,
    knn_dist: str = 'euclidean',
    mds_dist: str = 'euclidean',
    mds: Literal['classic', 'metric', 'nonmetric'] = 'metric',
    n_jobs: Optional[int] = None,
    random_state: Optional[Union[int, RandomState]] = None,
    verbose: Union[bool, int, None] = None,
    copy: bool = False,
    **kwargs,
) -> Optional[AnnData]:
    """\
    PHATE [Moon17]_.

    Embed single-cell data in two or three dimensions with Potential of
    Heat-diffusion for Affinity-based Trajectory Embedding (PHATE) to
    visualize biological progressions.

    The object-oriented interface is described in the
    `PHATE documentation <https://phate.readthedocs.io/>`__; tutorials, bug
    reports, and R/MATLAB implementations live on the
    `PHATE GitHub page <https://github.com/KrishnaswamyLab/PHATE/>`__.
    For help using PHATE, go `here <https://krishnaswamylab.org/get-help>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        Dimensionality of the embedding.
    k
        Number of nearest neighbors on which the kernel is built.
    a
        Decay rate of the kernel tails; `None` disables the alpha-decaying
        kernel.
    n_landmark
        Number of landmarks used by fast PHATE.
    t
        Power to which the diffusion operator is raised (level of
        diffusion). With 'auto', t is chosen at the knee point of the
        Von Neumann Entropy of the diffusion operator.
    gamma
        Informational distance constant between -1 and 1. `gamma=1` gives
        the PHATE log potential, `gamma=0` a square root potential.
    n_pca
        Number of principal components used when computing neighborhoods.
        For extremely large datasets, n_pca < 20 lets neighborhoods be
        computed in log(n_samples) time.
    knn_dist
        Distance metric for the kNN graph; any metric from
        `scipy.spatial.distance` works, 'euclidean' and 'cosine' are
        recommended.
    mds_dist
        Distance metric for MDS; same choices as `knn_dist`.
    mds
        Which MDS algorithm performs the dimensionality reduction.
    n_jobs
        Number of parallel jobs. `None` uses `sc.settings.n_jobs`; -1 uses
        all CPUs; 1 disables parallelism (useful for debugging); below -1,
        (n_cpus + 1 + n_jobs) CPUs are used.
    random_state
        Random seed; defaults to the global `numpy` random number
        generator.
    verbose
        Print status messages when `True` or when the verbosity level is
        at least 2. `None` falls back to `sc.settings.verbosity`.
    copy
        Return a modified copy instead of writing to `adata`.
    kwargs
        Forwarded to `phate.PHATE`.

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_phate** : `np.ndarray`, (`adata.obs`, shape=[n_samples, n_components], dtype `float`)
        PHATE coordinates of data.

    Examples
    --------
    >>> from anndata import AnnData
    >>> import scanpy.external as sce
    >>> import phate
    >>> tree_data, tree_clusters = phate.tree.gen_dla(
    ...     n_dim=100,
    ...     n_branch=20,
    ...     branch_length=100,
    ... )
    >>> tree_data.shape
    (2000, 100)
    >>> adata = AnnData(tree_data)
    >>> sce.tl.phate(adata, k=5, a=20, t=150)
    >>> adata.obsm['X_phate'].shape
    (2000, 2)
    >>> sce.pl.phate(adata)
    """
    start = logg.info('computing PHATE')
    if copy:
        adata = adata.copy()
    # Resolve verbosity: fall back to the global setting, then coerce any
    # non-bool level to "at least 2".
    if verbose is None:
        verbose = settings.verbosity
    if not isinstance(verbose, bool):
        verbose = verbose >= 2
    if n_jobs is None:
        n_jobs = settings.n_jobs
    try:
        import phate
    except ImportError:
        raise ImportError(
            'You need to install the package `phate`: please run `pip install '
            '--user phate` in a terminal.')
    # Collect operator settings; duplicates in **kwargs raise TypeError at
    # the call, exactly as with direct keyword passing.
    phate_settings = dict(
        n_components=n_components,
        k=k,
        a=a,
        n_landmark=n_landmark,
        t=t,
        gamma=gamma,
        n_pca=n_pca,
        knn_dist=knn_dist,
        mds_dist=mds_dist,
        mds=mds,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
    )
    X_phate = phate.PHATE(**phate_settings, **kwargs).fit_transform(adata)
    # annotate samples with PHATE coordinates
    adata.obsm['X_phate'] = X_phate
    logg.info(
        ' finished',
        time=start,
        deep=('added\n'
              " 'X_phate', PHATE coordinates (adata.obsm)"),
    )
    if copy:
        return adata
    return None
def phate(adata,
          n_components=2,
          k=5,
          a=15,
          n_landmark=2000,
          t='auto',
          gamma=1,
          n_pca=100,
          knn_dist='euclidean',
          mds_dist='euclidean',
          mds='metric',
          n_jobs=None,
          random_state=None,
          verbose=None,
          copy=False,
          **kwargs):
    """PHATE [Moon17]_.

    Potential of Heat-diffusion for Affinity-based Trajectory Embedding (PHATE)
    embeds high dimensional single-cell data into two or three dimensions for
    visualization of biological progressions.

    For more information and access to the object-oriented interface, read the
    `PHATE documentation <https://phate.readthedocs.io/>`__.  For
    tutorials, bug reports, and R/MATLAB implementations, visit the `PHATE
    GitHub page <https://github.com/KrishnaswamyLab/PHATE/>`__. For help
    using PHATE, go `here <https://krishnaswamylab.org/get-help>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    n_components : `int`, optional (default: 2)
        number of dimensions in which the data will be embedded
    k : `int`, optional (default: 5)
        number of nearest neighbors on which to build kernel
    a : `int`, optional (default: 15)
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_landmark : `int`, optional (default: 2000)
        number of landmarks to use in fast PHATE
    t : `int` or 'auto', optional (default: 'auto')
        power to which the diffusion operator is powered
        sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    gamma : float, optional, default: 1
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    n_pca : `int`, optional (default: 100)
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        log(n_samples) time.
    knn_dist : string, optional (default: 'euclidean')
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph
    mds_dist : string, optional (default: 'euclidean')
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds : {'classic', 'metric', 'nonmetric'}, optional (default: 'metric')
        Selects which MDS algorithm is used for dimensionality reduction
    n_jobs : `int` or `None`, optional (default: `sc.settings.n_jobs`)
        The number of jobs to use for the computation.
        If `None`, `sc.settings.n_jobs` is used.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : `int`, `numpy.RandomState` or `None`, optional (default: `None`)
        Random seed. Defaults to the global `numpy` random number generator
    verbose : `bool`, `int` or `None`, optional (default: `sc.settings.verbosity`)
        If `True` or an integer `>= 2`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy : `bool` (default: `False`)
        Return a copy instead of writing to `adata`.
    kwargs : additional arguments to `phate.PHATE`

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    X_phate : `np.ndarray`, (`adata.obs`, shape=[n_samples, n_components], dtype `float`)
        PHATE coordinates of data.

    Examples
    --------
    >>> import scanpy.api as sc
    >>> import phate
    >>> tree_data, tree_clusters = phate.tree.gen_dla(n_dim=100,
                                                      n_branch=20,
                                                      branch_length=100)
    >>> tree_data.shape
    (2000, 100)
    >>> adata = sc.AnnData(tree_data)
    >>> sc.tl.phate(adata, k=5, a=20, t=150)
    >>> adata.obsm['X_phate'].shape
    (2000, 2)
    >>> sc.pl.phate(adata)
    """
    logg.info('computing PHATE', r=True)
    adata = adata.copy() if copy else adata
    # Resolve verbosity to a bool.  BUGFIX: previously an explicitly passed
    # `verbose` was discarded whenever `settings.verbosity` was a str/int
    # (the check was on `settings.verbosity`, not on the resolved value),
    # contradicting the documented default behavior.
    verbose = settings.verbosity if verbose is None else verbose
    if not isinstance(verbose, bool):
        if isinstance(verbose, int):
            # explicit (or global) integer level: threshold at 2
            verbose = verbose >= 2
        else:
            # str verbosity level (e.g. 'hint'): compare via settings helper
            verbose = _settings_verbosity_greater_or_equal_than(2)
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs
    try:
        import phate
    except ImportError:
        raise ImportError(
            'You need to install the package `phate`: please run `pip install '
            '--user phate` in a terminal.')
    X_phate = phate.PHATE(n_components=n_components,
                          k=k,
                          a=a,
                          n_landmark=n_landmark,
                          t=t,
                          gamma=gamma,
                          n_pca=n_pca,
                          knn_dist=knn_dist,
                          mds_dist=mds_dist,
                          mds=mds,
                          n_jobs=n_jobs,
                          random_state=random_state,
                          verbose=verbose,
                          **kwargs).fit_transform(adata)
    logg.info(
        '    finished',
        time=True,
        end=' ' if _settings_verbosity_greater_or_equal_than(3) else '\n')
    # update AnnData instance
    adata.obsm['X_phate'] = X_phate  # annotate samples with PHATE coordinates
    logg.hint('added\n'
              '    \'X_phate\', PHATE coordinates (adata.obsm)')
    return adata if copy else None
def test_simple():
    """Smoke test: PHATE embeds a generated DLA tree into two dimensions."""
    data, _clusters = phate.tree.gen_dla()
    embedding = phate.PHATE(k=15, t=100).fit_transform(data)
    n_points = data.shape[0]
    assert embedding.shape == (n_points, 2)
# NOTE(review): notebook-style pipeline fragment.  `adata`, `pdfp`, `start`,
# `gt` (presumably graphtools), `sparse`, and `rs` are defined earlier in the
# full script — confirm against the complete file.
if True:
    # save adata obj with batch correction (checkpoint before PHATE)
    adata.write(os.path.join(pdfp, 'mouse_200614.h5ad'))
    print('\n... saved @' +
          datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
print('... sc embeddings in {:.2f}-min'.format((time.time() - start) / 60))

# compute PHATE on the precomputed neighbor graph: add self-loops to the
# connectivity matrix and wrap it as a pygsp adjacency graph
G = gt.Graph(data=adata.uns['neighbors']['connectivities'] +
             sparse.diags([1] * adata.shape[0], format='csr'),
             precomputed='adjacency',
             use_pygsp=True)
G.knn_max = None

# gamma=0 gives the square-root potential; kernel G.K is passed as precomputed
phate_op = phate.PHATE(knn_dist='precomputed',
                       gamma=0,
                       n_jobs=-1,
                       random_state=rs)
adata.obsm['X_phate'] = phate_op.fit_transform(G.K)

if True:
    # save adata obj again, now including the PHATE embedding
    adata.write(os.path.join(pdfp, 'mouse_200614.h5ad'))
    print('\n... saved @' +
          datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
print('... full PHATE in {:.2f}-min'.format((time.time() - start) / 60))

if True:
    # MELD: encode genotype as a +/-1 sample indicator (SCA1 -> 1, else -1)
    adata.obs['res_sca1'] = [
        1 if i == 'SCA1' else -1 for i in adata.obs['genotype']
    ]
def run_phate_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # phate params
        n_components=2,
        t_phate='auto',
        gamma=1,
        mds_dist='euclidean',
        mds='metric',
        # output params
        output='phate.csv',
        validate=False):
    """Run PHATE on a file and save the embedding as CSV

    Loads the data, optionally filters/normalizes/transforms it, computes a
    PHATE embedding, and writes the coordinates to `output`.

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing gene names, list gives an array of gene names,
          `False` means no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing gene names, list gives an array of gene names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing cell names, list gives an array of cell names,
          `False` means no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing cell names, list gives an array of cell names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns.
        If cell_axis=='row', data is of shape [n_cells, n_genes].
        If cell_axis=='column', data is of shape [n_genes, n_cells].
        Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization.
        If `None`, library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used.
        If `None`, genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    n_components : int, optional, default: 2
        number of dimensions in which the data will be embedded for PHATE
    mds_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds : string, optional, default: 'metric'
        choose from ['classic', 'metric', 'nonmetric'].
        Selects which MDS algorithm is used for dimensionality reduction
    gamma : float, optional, default: 1
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    t_phate : int, optional, default: 'auto'
        power to which the diffusion operator is powered for PHATE.
        This sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    output : str, optional (default: 'phate.csv')
        Output CSV file to save low-dimensional embedding
    validate : bool, optional (default: False)
        If True, after saving, compare the embedding against the reference
        CSV published in the KrishnaswamyLab/phate-docker repository and
        raise if the values differ beyond atol=1e-14
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run PHATE
    # https://phate.readthedocs.io/
    phate_op = phate.PHATE(knn=knn,
                           decay=decay,
                           t=t_phate,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose,
                           n_components=n_components,
                           gamma=gamma,
                           mds_dist=mds_dist,
                           mds=mds)
    phate_data = phate_op.fit_transform(data)

    # save as csv, labeling columns PHATE1..PHATEn and reusing the data's
    # row index when it has one
    phate_data = pd.DataFrame(
        phate_data,
        columns=["PHATE{}".format(i + 1) for i in range(n_components)],
        index=data.index if hasattr(data, 'index') else np.arange(
            phate_data.shape[0]))
    if cell_axis in ['col', 'column']:
        # transpose back to the caller's cells-on-columns orientation
        phate_data = phate_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    phate_data.to_csv(output)
    tasklogger.log_info("Complete.".format(output))
    if validate:
        # compare against the published reference embedding; exact equality
        # first, then numerical closeness
        correct_phate_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/phate-docker/'
            'master/phate-validate.csv',
            sparse=False)
        try:
            np.testing.assert_equal(scprep.utils.toarray(phate_data),
                                    scprep.utils.toarray(correct_phate_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(phate_data),
                scprep.utils.toarray(correct_phate_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
# Script: build a Rotten Tomatoes movie pairplot with a PHATE category
# projection and write the interactive HTML to ./movie_pair_plot_phates.html.
import phate  # BUGFIX: was missing — phate.PHATE() below raised NameError
import scattertext as st

movie_df = st.SampleCorpora.RottenTomatoes.get_data()
# Map the raw sentiment labels to display names.
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative',
                      'fresh': 'Positive',
                      'plot': 'Plot'}[x])

# One category per movie; stoplisted unigrams as terms.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

# Project movie categories with PHATE instead of the default projector.
html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    #scaler=st.Scalers.scale_0_to_1,
    #show_halo=False,
    #d3_url_struct=st.D3URLs(
    #    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #    d3_url='scattertext/data/viz/scripts/d3.min.js'
    #),
    default_to_term_comparison=False
)

file_name = 'movie_pair_plot_phates.html'
# BUGFIX: use a context manager so the handle is flushed and closed reliably
# (previously `open(...).write(...)` leaked the file object).
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
embedding = umap.UMAP().fit_transform(subset_sc_df.loc[:, morph_features]) # Combine results with single cell dataframe embedding_df = pd.concat( [ subset_sc_df.drop(morph_features, axis="columns").reset_index(drop=True), pd.DataFrame(embedding, columns=["umap_0", "umap_1"]), ], axis="columns", ) all_sc_umap_embeddings.append(embedding_df.assign(grit_gene=gene)) # Apply PHATE phate_operator = phate.PHATE(n_jobs=-2) phate_operator.set_params(decay=20, t="auto", gamma=0, verbose=0) Y_phate = phate_operator.fit_transform( subset_sc_df.loc[:, morph_features]) # Combine results with single cell dataframe phate_embedding_df = pd.concat( [ subset_sc_df.drop(morph_features, axis="columns").reset_index(drop=True), pd.DataFrame(Y_phate, columns=["phate_0", "phate_1"]), ], axis="columns", ) all_sc_phate_embeddings.append(
def PHATE(data, verbose=False, n_jobs=-1, **kwargs):
    """Fit a `phate.PHATE` operator on `data` and return the embedding.

    Extra keyword arguments are forwarded to `phate.PHATE`.
    """
    operator = phate.PHATE(verbose=verbose, n_jobs=n_jobs, **kwargs)
    return operator.fit_transform(data)
def main():
    """CLI entry point: PHATE-embed each tumor's clustered cells and save
    scatter plots colored by cluster and by biomarker genes.

    Positional args: data_root, cancer_biomarker, comma-separated
    cell-type biomarkers.  Option -o/--out_dir: output directory.

    NOTE(review): depends on module-level names not visible in this chunk:
    OptionParser, sc (scanpy), H5COUNTS, join, DATA_F, TUMORS, AnnData,
    pd, plt, phate.
    """
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_dir", help="Directory to write output")
    (options, args) = parser.parse_args()

    data_root = args[0]
    cancer_biomarker = args[1]
    cell_type_biomarkers = args[2].split(',')
    out_dir = options.out_dir

    sc.settings.verbosity = 3
    sc.logging.print_versions()
    sc.settings.set_figure_params(dpi=80)

    # Load counts and attach precomputed clustering results.
    data = H5COUNTS(join(data_root, DATA_F))
    data.preprocess_data()
    data.add_clustering_results(path=join(data_root, 'interim/'))

    for tumor in TUMORS:
        ad = data.tumor_to_ad[tumor]
        # Keep only cells that were assigned a cluster.
        obs_filt = ad.obs.loc[ad.obs['cluster'].notnull()]
        indices = [int(x) for x in obs_filt.index]
        X_filt = ad.X.iloc[indices]
        X_filt = X_filt.set_index(obs_filt.index)
        ad_filt = AnnData(X=X_filt, obs=obs_filt, var=ad.var)
        phate_operator = phate.PHATE(n_jobs=-2, random_state=1)
        X_phate = phate_operator.fit_transform(ad_filt.X)
        # Replace obs with the 2-D PHATE coordinates plus cluster labels.
        ad_filt.obs = pd.DataFrame(
            data=[[x, y, cluster]
                  for (x, y), cluster in zip(X_phate, ad_filt.obs['cluster'])],
            columns=['PHATE 1', 'PHATE 2', 'cluster'])

        # Color points by cluster
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax = sc.pl.scatter(ad_filt,
                           x='PHATE 1',
                           y='PHATE 2',
                           color='cluster',
                           ax=ax,
                           legend_loc='right margin',
                           show=False)
        ax.set_xticks([])
        ax.set_yticks([])
        # NOTE(review): the position bounds are read but never used below
        l, b, w, h = fig.axes[-1].get_position().bounds
        ll, bb, ww, hh = fig.axes[0].get_position().bounds
        plt.tight_layout()
        fig.savefig(join(out_dir, '{}_color_by_cluster.png'.format(tumor)),
                    format='png',
                    dpi=150
                    #bbox_inches='tight'
                    )

        # Color by genes
        genes = [cancer_biomarker] + cell_type_biomarkers
        for gene in genes:
            fig, ax = plt.subplots(1, 1, figsize=(8, 6))
            ax = sc.pl.scatter(ad_filt,
                               x='PHATE 1',
                               y='PHATE 2',
                               color=gene,
                               ax=ax,
                               legend_loc='right margin',
                               show=False)
            ax.set_xticks([])
            ax.set_yticks([])
            l, b, w, h = fig.axes[-1].get_position().bounds
            ll, bb, ww, hh = fig.axes[0].get_position().bounds
            plt.tight_layout()
            fig.savefig(join(out_dir,
                             '{}_color_by_{}.png'.format(tumor, gene)),
                        format='png',
                        dpi=150
                        #bbox_inches='tight'
                        )
def test_tree():
    """Exercise PHATE on a DLA tree across MDS variants and precomputed inputs."""
    # Build a synthetic diffusion-limited-aggregation tree.
    tree_data, _clusters = phate.tree.gen_dla(n_dim=50,
                                              n_branch=4,
                                              branch_length=50,
                                              rand_multiplier=2,
                                              seed=37,
                                              sigma=4)
    n_points = tree_data.shape[0]

    # A single operator, reconfigured between runs so the diffusion
    # potential is reused rather than recomputed.
    op = phate.PHATE(
        n_components=2,
        decay=10,
        knn=5,
        t=30,
        mds="classic",
        knn_dist="euclidean",
        mds_dist="euclidean",
        n_jobs=-2,
        n_landmark=None,
        verbose=False,
    )

    # Classic MDS.
    print("DLA tree, classic MDS")
    emb_classic = op.fit_transform(tree_data)
    assert emb_classic.shape == (n_points, 2)

    # Metric MDS on the log potential.
    op.set_params(mds="metric")
    print("DLA tree, metric MDS (log)")
    emb_metric_log = op.fit_transform(tree_data)
    assert emb_metric_log.shape == (n_points, 2)

    # Metric MDS on the square-root potential (gamma=0).
    op.set_params(gamma=0)
    print("DLA tree, metric MDS (sqrt)")
    emb_metric_sqrt = op.fit_transform(tree_data)
    assert emb_metric_sqrt.shape == (n_points, 2)

    # Precomputed inputs: the auto-detected 'precomputed' runs should match
    # their explicitly typed counterparts.
    dist_matrix = squareform(pdist(tree_data))
    kernel = op.graph.kernel
    op.set_params(knn_dist="precomputed", random_state=42, verbose=False)
    emb_auto_dist = op.fit_transform(dist_matrix)
    emb_auto_kernel = op.fit_transform(kernel)
    op.set_params(knn_dist="precomputed_distance")
    emb_explicit_dist = op.fit_transform(dist_matrix)
    op.set_params(knn_dist="precomputed_affinity")
    emb_explicit_kernel = op.fit_transform(kernel)
    np.testing.assert_allclose(emb_auto_kernel,
                               emb_explicit_kernel,
                               atol=5e-4)
    np.testing.assert_allclose(emb_auto_dist,
                               emb_explicit_dist,
                               atol=5e-4)
    return 0
x="Model", y=['Accuracy', 'F1', 'Recall', 'Precision'], barmode='group', height=400) fig.update_yaxes(title_text="Model Metrics") fig.update_layout(title_text="Model Performance") fig.show() # - # ### PHATE # !pip install phate import phate p = phate.PHATE(random_state=42) X_phate = p.fit_transform(X_train_prepared) X_phate.shape fig, ax = plt.subplots(figsize=(6, 4)) phate.plot.scatter2d(p, c=y_train['Bankrupt?'], ax=ax, alpha=0.5) # + from warnings import simplefilter simplefilter(action='ignore', category=FutureWarning) from sklearn.exceptions import ConvergenceWarning simplefilter("ignore", category=ConvergenceWarning) models = get_model() names, results, result_df = bl_performance(X_phate, y_train, models) # -
def compute_diffusion_potential(data,
                                N,
                                decay,
                                gamma,
                                knn,
                                landmarks=2000,
                                n_jobs=10,
                                random_state=None):
    """Compute a PCA-compressed PHATE diffusion potential for `data`.

    Fits a PHATE operator on `data` (fit only — no MDS embedding), reduces
    its diffusion potential to at most 25 principal components, and keeps
    only the components explaining more than 1% of the variance.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data.
    N : unused
        NOTE(review): accepted but never read in this function; kept for
        caller compatibility — confirm with call sites before removing.
    decay : int
        Decay rate of the PHATE kernel tails.
    gamma : float
        Informational distance constant between -1 and 1.
    knn : int
        Number of nearest neighbors on which to build the kernel.
    landmarks : int or None, optional, default: 2000
        Number of landmarks for fast PHATE; automatically disabled when the
        dataset has fewer rows than this.
    n_jobs : int, optional, default: 10
        Number of parallel jobs used by PHATE.
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize PHATE and PCA. If an integer is
        given, it fixes the seed. Defaults to the global `numpy` random
        number generator

    Returns
    -------
    tuple
        (filtered PCA projection of the diffusion potential,
         fitted `phate.PHATE` operator,
         fitted `sklearn.decomposition.PCA` model)
    """
    with tasklogger.log_task("diffusion potential"):
        # Landmarking only helps when there are more points than landmarks.
        # (idiom fix: `is not None` instead of `!= None`)
        if landmarks is not None and landmarks > data.shape[0]:
            landmarks = None

        # n_pca=None: operate on the raw features, no PCA preprocessing.
        diff_op = phate.PHATE(
            verbose=False,
            n_landmark=landmarks,
            decay=decay,
            gamma=gamma,
            n_pca=None,
            knn=knn,
            n_jobs=n_jobs,
            random_state=random_state,
        )
        diff_op.fit(data)

        pca = sklearn.decomposition.PCA(n_components=25,
                                        random_state=random_state)
        diff_potential_pca = pca.fit_transform(diff_op.diff_potential)

        # Keep only components explaining > 1% of the total variance.
        return (
            diff_potential_pca[:, pca.explained_variance_ /
                               np.sum(pca.explained_variance_) > 0.01],
            diff_op,
            pca,
        )
def PHATE(X, *args, is_graph=False, knn_dist='euclidean', verbose=0, **kwargs):
    """Embed `X` with `phate.PHATE` and return the coordinates.

    When `knn_dist` is explicitly None and `X` is a graph
    (`is_graph=True`), the metric is switched to 'precomputed'.
    Positional and extra keyword arguments are forwarded to `phate.PHATE`.
    """
    if knn_dist is None and is_graph:
        knn_dist = 'precomputed'
    operator = phate.PHATE(*args, knn_dist=knn_dist, verbose=verbose, **kwargs)
    return operator.fit_transform(X)
def run_phate(
        filename,
        # data loading params
        sparse=None,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        delimiter=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        **phate_kws):
    """Run PHATE on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing gene names, list gives an array of gene names,
          `False` means no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing gene names, list gives an array of gene names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
          headers, `str` gives a path to a separate csv or tsv file
          containing cell names, list gives an array of cell names,
          `False` means no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv
          file containing cell names, list gives an array of cell names,
          or `False` means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data
        is of shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length',
        'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`, library size
        filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`, genes
        are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    **phate_kws : keyword arguments for PHATE
    """
    # --- determine the file type from the path -------------------------
    if os.path.isdir(filename):
        filetype = 'dir'
    elif os.path.isfile(filename):
        filetype = filename.split('.')[-1]
    else:
        raise RuntimeError("file {} not found".format(filename))

    # Map each loader-relevant parameter name to its value up front so
    # the validation loop below can check them without resorting to
    # eval() on variable names (same checks, no dynamic evaluation).
    load_params = {
        'gene_names': gene_names,
        'cell_names': cell_names,
        'cell_axis': cell_axis,
        'delimiter': delimiter,
        'sparse': sparse,
        'gene_labels': gene_labels,
        'allow_duplicates': allow_duplicates,
        'metadata_channels': metadata_channels,
    }
    # NOTE(review): `delimiter` is validated but never forwarded to any
    # loader below, so it must currently always be None — confirm whether
    # csv/tsv loading should pass it through.
    if filetype == 'zip':
        load_fn = scpreprocess.io.load_10X_zip
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates
        }
    elif filetype == 'dir':
        load_fn = scpreprocess.io.load_10X
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates
        }
    elif filetype in ['hdf5', 'h5']:
        load_fn = scpreprocess.io.load_10X_HDF5
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates,
            'genome': genome
        }
    elif filetype == 'tsv':
        load_fn = scpreprocess.io.load_tsv
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'csv':
        load_fn = scpreprocess.io.load_csv
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'mtx':
        load_fn = scpreprocess.io.load_mtx
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'fcs':
        load_fn = scpreprocess.io.load_fcs
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'metadata_channels': metadata_channels
        }
    else:
        raise RuntimeError("filetype {} not recognized. Expected 'csv', "
                           "'tsv', 'mtx', 'zip', 'hdf5', 'h5', 'fcs' or a "
                           "directory".format(filetype))
    # Each parameter used by the chosen loader must be set; every other
    # loader parameter must be left at None. `sparse` may always be None
    # (auto-detect), so it is skipped.
    for arg, value in load_params.items():
        if arg == 'sparse':
            continue
        if arg in load_kws:
            assert value is not None, \
                "Expected {} not None for filetype {}".format(arg, filetype)
        else:
            assert value is None, \
                "Expected {} to be None for filetype {}. Got {}".format(
                    arg, filetype, value)

    # --- choose the transformation -------------------------------------
    if transform == 'sqrt':
        transform_fn = scpreprocess.transform.sqrt_transform
        transform_kws = {}
    elif transform == 'log':
        transform_fn = scpreprocess.transform.log_transform
        # BUG FIX: the log transform takes `pseudocount` (was wrongly
        # given `cofactor`, contradicting the docstring above).
        transform_kws = {'pseudocount': pseudocount}
    elif transform == 'arcsinh':
        transform_fn = scpreprocess.transform.arcsinh_transform
        # BUG FIX: the arcsinh transform takes `cofactor` (was wrongly
        # given `pseudocount`).
        transform_kws = {'cofactor': cofactor}
    elif transform is None:
        transform_kws = {}
    else:
        raise RuntimeError("transformation {} not recognized. "
                           "Choose from ['sqrt', 'log', 'arcsinh', "
                           "None]".format(transform))
    transform_params = {'pseudocount': pseudocount, 'cofactor': cofactor}
    for arg, value in transform_params.items():
        if arg in transform_kws:
            assert value is not None, \
                "Expected {} not None for {} transformation".format(
                    arg, transform)
        else:
            assert value is None, \
                "Expected {} to be None for {} transformation. Got {}".format(
                    arg, transform, value)

    # --- load, filter, normalize, transform ----------------------------
    data = load_fn(filename, **load_kws)
    if min_library_size is not None:
        data = scpreprocess.filter.filter_library_size(
            data, cutoff=min_library_size)
    if min_cells_per_gene is not None:
        data = scpreprocess.filter.remove_rare_genes(
            data, cutoff=min_cells_per_gene)
    if library_size_normalize:
        data = scpreprocess.normalize.library_size_normalize(data)
    if transform is not None:
        data = transform_fn(data, **transform_kws)

    # --- embed with PHATE ----------------------------------------------
    phate_op = phate.PHATE(**phate_kws)
    phate_data = phate_op.fit_transform(data)
    return phate_data, phate_op
os.path.abspath(os.sep), "data", "lab", "DataSets", "Krause_2018_primary_parathyroid_adenoma", "ParaY9_HHT_cellranger", "filtered_gene_bc_matrices_h5.h5", # raw_gene_bc_matrices ), gene_labels='both', allow_duplicates=True) data = scprep.filter.remove_rare_genes(data, min_cells=3) data = scprep.normalize.library_size_normalize(data) data = scprep.transform.sqrt(data) ph = phate.PHATE(n_components=2) phate_data = ph.fit_transform(data) np.save("{}Phate2d.npy".format(data_name), phate_data) ph.set_params(n_components=3) phate3_data = ph.transform() np.save("{}Phate3d.npy".format(data_name), phate3_data) mg = magic.MAGIC() _ = mg.fit_transform(data) # reduce memory footprint del mg.graph.data del mg.graph.data_nu del mg.graph._kernel del mg.graph._diff_op