def test_mantonbm():
    """Compare the three Harmony embeddings on the MantonBM dataset.

    Embeddings already saved under ``./result`` are loaded; missing ones are
    computed from the source h5ad's ``X_pca``, timed, and saved. The reference
    embedding ``Z_R`` is always read from ``MantonBM_harmony_z.txt``. Metrics
    are reported for the 'r' and 'L2' norms, and UMAP plots are produced while
    fewer than four matching PDFs exist under ``./plots``.
    """
    print("Testing on MantonBM dataset...")

    cached = [name for name in os.listdir("./result") if re.match("MantonBM.*_z.(txt|npy)", name)]
    # The source data is only needed when an embedding or the result file is missing.
    if not (len(cached) >= 3 and os.path.exists("./result/MantonBM_result.h5ad")):
        adata = pg.read_input("./data/MantonBM/original_data.h5ad")
        # Last character of the part of the channel name before the first '_'.
        individual = adata.obs['Channel'].apply(lambda channel: channel.split('_')[0][-1])
        adata.obs['Individual'] = pd.Categorical(individual)

    if not os.path.exists("./result/MantonBM_torch_z.npy"):
        tic = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Channel')
        elapsed = time.time() - tic

        print("Time spent for harmony-pytorch = {:.2f}s.".format(elapsed))
        np.save("./result/MantonBM_torch_z.npy", Z_torch)
    else:
        Z_torch = np.load("./result/MantonBM_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")

    if not os.path.exists("./result/MantonBM_py_z.npy"):
        tic = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        elapsed = time.time() - tic

        print("Time spent for harmonypy = {:.2f}s.".format(elapsed))
        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/MantonBM_py_z.npy", Z_py)
    else:
        Z_py = np.load("./result/MantonBM_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/MantonBM_harmony_z.txt")

    for norm in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix="MantonBM", norm=norm)

    # When a result file already exists, plot_umap is handed None instead of data.
    if os.path.exists("./result/MantonBM_result.h5ad"):
        adata = None

    n_plots = len([name for name in os.listdir("./plots") if re.match("MantonBM.*.pdf", name)])
    if n_plots < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix="MantonBM", batch_key="Individual")
def test_pbmc():
    """Compare the three Harmony embeddings on the 10x pbmc dataset.

    Embeddings already saved under ``./result`` are loaded; missing ones are
    computed from the source h5ad's ``X_pca``, timed, and saved. The reference
    embedding ``Z_R`` is always read from ``pbmc_harmony_z.txt``. Metrics are
    reported for the 'r' and 'L2' norms, and UMAP plots are produced while
    fewer than four matching PDFs exist under ``./plots``.
    """
    print("Testing on 10x pbmc dataset...")

    cached = [name for name in os.listdir("./result") if re.match("pbmc.*_z.(txt|npy)", name)]
    # The source data is only needed when an embedding or the result file is missing.
    if not (len(cached) >= 3 and os.path.exists("./result/pbmc_result.h5ad")):
        adata = pg.read_input("./data/10x_pbmc/original_data.h5ad")

    if not os.path.exists("./result/pbmc_torch_z.npy"):
        tic = time.time()
        Z_torch = harmonize(adata.obsm['X_pca'], adata.obs, batch_key='Channel')
        elapsed = time.time() - tic

        print("Time spent for harmony-pytorch = {:.2f}s.".format(elapsed))
        np.save("./result/pbmc_torch_z.npy", Z_torch)
    else:
        Z_torch = np.load("./result/pbmc_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")

    if not os.path.exists("./result/pbmc_py_z.npy"):
        tic = time.time()
        ho = run_harmony(adata.obsm['X_pca'], adata.obs, ['Channel'])
        elapsed = time.time() - tic

        # The objective trace is printed before the timing line.
        print(ho.objective_harmony)
        print("Time spent for harmonypy = {:.2f}s.".format(elapsed))
        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/pbmc_py_z.npy", Z_py)
    else:
        Z_py = np.load("./result/pbmc_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/pbmc_harmony_z.txt")

    for norm in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix="pbmc", norm=norm)

    # When a result file already exists, plot_umap is handed None instead of data.
    if os.path.exists("./result/pbmc_result.h5ad"):
        adata = None

    n_plots = len([name for name in os.listdir("./plots") if re.match("pbmc.*.pdf", name)])
    if n_plots < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix="pbmc", batch_key="Channel")
def run(self, **kwargs):
    """
    Run the harmony algorithm (see https://github.com/slowkow/harmonypy for details). Resulting object
    is stored in 'harmony' attribute

    The columns of ``self.data`` listed in ``self.features`` are cast to float
    and passed to harmonypy together with ``self.meta``; ``"sample_id"`` is
    used as the batch variable.

    Parameters
    ----------
    kwargs:
        Additional keyword arguments passed to harmonypy.run_harmony

    Returns
    -------
    Harmony
        This object itself, so that calls can be chained
        (e.g. ``obj.run().harmony``).
    """
    data = self.data[self.features].astype(float)
    self.harmony = harmonypy.run_harmony(data_mat=data.values,
                                         meta_data=self.meta,
                                         vars_use="sample_id",
                                         **kwargs)
    # Previously a bare ``return`` yielded None, contradicting the documented
    # return value; returning self honours the docstring without affecting
    # callers that ignore the result.
    return self
# 296  t293_TAGAATTGTTGGTG  t293  3097  0.021769  t293
# 297  t293_CGGATAACACCACA  t293  3157  0.020411  t293
# 298  t293_GGTACTGAGTCGAT  t293  2685  0.027846  t293
# 299  t293_ACGCTGCTTCTTAC  t293  3513  0.021240  t293
# [300 rows x 5 columns]

# data_mat[:5,:5]
#
# array([[ 0.0071695 , -0.00552724, -0.0036281 , -0.00798025,  0.00028931],
#        [-0.011333  ,  0.00022233, -0.00073589, -0.00192452,  0.0032624 ],
#        [ 0.0091214 , -0.00940727, -0.00106816, -0.0042749 , -0.00029096],
#        [ 0.00866286, -0.00514987, -0.0008989 , -0.00821785, -0.00126997],
#        [-0.00953977,  0.00222714, -0.00374373, -0.00028554,  0.00063737]])

# Run Harmony on the PCs, adjusting for the variables named in vars_use.
ho = hm.run_harmony(data_mat, meta_data, vars_use)

# Write the adjusted PCs to a new file.
res = pd.DataFrame(ho.Z_corr)
res.columns = ['X{}'.format(i + 1) for i in range(res.shape[1])]
res.to_csv("data/adj.tsv.gz", sep="\t", index=False)

# Test 2
########################################################################

import pandas as pd
import numpy as np
from scipy.cluster.vq import kmeans
# pearsonr is imported from the public scipy.stats namespace; the private
# scipy.stats.stats module is deprecated.
from scipy.stats import pearsonr
import harmonypy as hm
def test_cell_lines():
    """Compare the three Harmony embeddings on the cell lines dataset.

    Embeddings already saved under ``./result`` are loaded; missing ones are
    computed from the PCA text file and metadata CSV, timed, and saved. The
    reference embedding ``Z_R`` is always read from
    ``cell_lines_harmony_z.txt``. Metrics are reported for the 'r' and 'L2'
    norms, UMAP plots are produced while fewer than four matching PDFs exist
    under ``./plots``, and kBET is reported for each representation once the
    result h5ad exists.
    """
    print("Testing on cell lines dataset...")

    cached = [name for name in os.listdir("./result") if re.match("cell_lines.*_z.(txt|npy)", name)]
    # The source data is only needed when an embedding or the result file is missing.
    if not (len(cached) >= 3 and os.path.exists("./result/cell_lines_result.h5ad")):
        X = np.loadtxt("./data/cell_lines/pca.txt")
        df_metadata = pd.read_csv("./data/cell_lines/metadata.csv")
        source_loaded = True

    if not os.path.exists("./result/cell_lines_torch_z.npy"):
        tic = time.time()
        Z_torch = harmonize(X, df_metadata, batch_key='dataset')
        elapsed = time.time() - tic

        print("Time spent for harmony-pytorch = {:.2f}s.".format(elapsed))
        np.save("./result/cell_lines_torch_z.npy", Z_torch)
    else:
        Z_torch = np.load("./result/cell_lines_torch_z.npy")
        print("Precalculated embedding by harmony-pytorch is loaded.")

    if not os.path.exists("./result/cell_lines_py_z.npy"):
        tic = time.time()
        ho = run_harmony(X, df_metadata, ['dataset'])
        elapsed = time.time() - tic

        print("Time spent for harmonypy = {:.2f}s.".format(elapsed))
        print(ho.objective_harmony)
        Z_py = np.transpose(ho.Z_corr)
        np.save("./result/cell_lines_py_z.npy", Z_py)
    else:
        Z_py = np.load("./result/cell_lines_py_z.npy")
        print("Precalculated embedding by harmonypy is loaded.")

    Z_R = np.loadtxt("./result/cell_lines_harmony_z.txt")

    for norm in ('r', 'L2'):
        check_metric(Z_torch, Z_py, Z_R, prefix="cell_lines", norm=norm)

    if not os.path.exists("./result/cell_lines_result.h5ad"):
        # No result file yet: wrap the PCs in an AnnData with an empty
        # (n_obs x 2) sparse X and build the neighbor graph and UMAP on them.
        n_cells = X.shape[0]
        adata = AnnData(X=csr_matrix((n_cells, 2)), obs=df_metadata)
        adata.obsm['X_pca'] = X

        pg.neighbors(adata, rep='pca')
        pg.umap(adata)
    else:
        adata = None

    n_plots = len([name for name in os.listdir("./plots") if re.match("cell_lines.*.pdf", name)])
    if n_plots < 4:
        plot_umap(adata, Z_torch, Z_py, Z_R, prefix="cell_lines", batch_key="dataset")

    if os.path.exists("./result/cell_lines_result.h5ad"):
        adata = pg.read_input("./result/cell_lines_result.h5ad", h5ad_mode='r')

        # One kBET report per representation, in the original order.
        kbet_reports = (
            ('harmony', "kBET for Harmony: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
            ('py', "kBET for harmonypy: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
            ('torch', "kBET for harmony-pytorch: statistic = {stat}, p-value = {pval}, ac rate = {ac_rate}"),
        )
        for rep, template in kbet_reports:
            stat, pvalue, ac_rate = pg.calc_kBET(adata, attr='dataset', rep=rep)
            print(template.format(stat=stat, pval=pvalue, ac_rate=ac_rate))
# Run Harmony on the PCs in input1.csv using the metadata in input2.csv and
# write the corrected embedding to output.csv, all next to this script.
import os

# Work from this script's own directory so the relative input/output file
# names below resolve regardless of the caller's working directory.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

import pandas as pd
import numpy as np
from scipy.cluster.vq import kmeans
# pearsonr is imported from the public scipy.stats namespace; the private
# scipy.stats.stats module is deprecated.
from scipy.stats import pearsonr
import harmonypy as hm

meta_data = pd.read_csv("input2.csv")

# The first row of input1.csv is consumed as a header
# (use header=None if the file has no header row).
data_mat = pd.read_csv("input1.csv")
data_mat = np.array(data_mat)
vars_use = ['batchidx']

ho = hm.run_harmony(data_mat, meta_data, vars_use)

# The corrected matrix is transposed before writing; the file is
# tab-separated and carries neither an index column nor a header row.
res = pd.DataFrame(ho.Z_corr.T)
res.to_csv("output.csv", sep="\t", index=False, header=False)
morphology_features = infer_cp_features( df, compartments=["Cells", "Cytoplasm", "Nuclei"]) metadata_cols = ["Image_Metadata_Well"] + infer_cp_features(df, metadata=True) # Fit PCA pca = PCA(n_components=num_pcs) pca.fit(df.loc[:, morphology_features]) # Transform PCA pc_df = pca.transform(df.loc[:, morphology_features]) pc_df = pd.DataFrame(pc_df).add_prefix("pca_") # Apply harmony per plate harmony_out = (hm.run_harmony(data_mat=pc_df, meta_data=df.loc[:, metadata_cols], vars_use=harmony_adjust_vars_perplate, random_state=harmony_random_state)) # Compile harmony output file harmony_df = pd.concat([ df.loc[:, metadata_cols], pd.DataFrame(harmony_out.Z_corr).transpose().add_prefix("harmonized_") ], axis="columns") # Output harmonized file output_file = pathlib.Path(f"{data_dir}/{plate}_{output_file_suffix}") harmony_df.to_csv(output_file, index=False, sep=",") # Apply an inverse transform to get back to original feature space inverse_harmony_df = pd.concat([
def harmony_integrate(
    adata: AnnData,
    key: str,
    basis: str = "X_pca",
    adjusted_basis: str = "X_pca_harmony",
    **kwargs,
):
    """\
    Use harmonypy [Korunsky19]_ to integrate different experiments.

    Harmony [Korunsky19]_ is an algorithm for integrating single-cell
    data from multiple experiments. This function uses the python
    port of Harmony, ``harmonypy``, to integrate single-cell data
    stored in an AnnData object. As Harmony works by adjusting the
    principal components, this function should be run after performing
    PCA but before computing the neighbor graph, as illustrated in the
    example below.

    Parameters
    ----------
    adata
        The annotated data matrix.
    key
        The name of the column in ``adata.obs`` that differentiates
        among experiments/batches.
    basis
        The name of the field in ``adata.obsm`` where the PCA table is
        stored. Defaults to ``'X_pca'``, which is the default for
        ``sc.tl.pca()``.
    adjusted_basis
        The name of the field in ``adata.obsm`` where the adjusted PCA
        table will be stored after running this function. Defaults to
        ``X_pca_harmony``.
    kwargs
        Any additional arguments will be passed to
        ``harmonypy.run_harmony()``.

    Returns
    -------
    Updates adata with the field ``adata.obsm[adjusted_basis]``,
    containing principal components adjusted by Harmony such that
    different experiments are integrated.

    Raises
    ------
    ImportError
        If ``harmonypy`` is not installed.

    Example
    -------
    First, load libraries and example dataset, and preprocess.

    >>> import scanpy as sc
    >>> import scanpy.external as sce
    >>> adata = sc.datasets.pbmc3k()
    >>> sc.pp.recipe_zheng17(adata)
    >>> sc.tl.pca(adata)

    We now arbitrarily assign a batch metadata variable to each cell
    for the sake of example, but during real usage there would already
    be a column in ``adata.obs`` giving the experiment each cell came
    from.

    >>> adata.obs['batch'] = 1350*['a'] + 1350*['b']

    Finally, run harmony. Afterwards, there will be a new table in
    ``adata.obsm`` containing the adjusted PC's.

    >>> sce.pp.harmony_integrate(adata, 'batch')
    >>> 'X_pca_harmony' in adata.obsm
    True
    """
    # harmonypy is an optional dependency, so it is imported lazily here.
    try:
        import harmonypy
    except ImportError as err:
        # Chain the original error so the underlying import failure stays
        # visible in the traceback.
        raise ImportError(
            "\nplease install harmonypy:\n\n\tpip install harmonypy"
        ) from err

    harmony_out = harmonypy.run_harmony(adata.obsm[basis], adata.obs, key, **kwargs)

    # Z_corr is transposed before being stored under ``adjusted_basis``.
    adata.obsm[adjusted_basis] = harmony_out.Z_corr.T
def _default_out_base(pca_file):
    """Return the default output basename, ``<pca_file name sans suffix>-harmony``."""
    stem = os.path.basename(pca_file)
    # Remove a real file suffix. str.rstrip('tsv.gz') strips any trailing
    # run of the characters t/s/v/./g/z, so it also eats the end of the
    # actual name (e.g. 'pcs.tsv.gz' -> 'pc').
    for suffix in ('.tsv.gz', '.tsv', '.gz'):
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
            break
    return '{}-harmony'.format(stem)


def main():
    """Run CLI.

    Reads a tab-delimited PC table and a tab-delimited metadata table (both
    indexed by ``cell_barcode``), runs Harmony on the selected PCs while
    adjusting for the requested metadata columns, and writes the adjusted
    embedding to ``<out_file>.tsv.gz``.

    Raises
    ------
    ValueError
        If ``--n_pcs`` exceeds the number of PC columns in ``--pca_file``.
    """
    parser = argparse.ArgumentParser(
        description="""
            Runs Harmony on PCs.
            """
    )

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__)
    )

    parser.add_argument(
        '-pc', '--pca_file',
        action='store',
        dest='pc',
        required=True,
        help='Tab-delimited file of PCs for each cell. First column is '
             'cell_barcode. Subsequent columns are PCs.'
    )

    parser.add_argument(
        '-mf', '--metadata_file',
        action='store',
        dest='mf',
        required=True,
        help='Tab-delimited metadata file, must have a column labelled '
             'cell_barcode that maps to pca_file.'
    )

    parser.add_argument(
        '-mc', '--metadata_columns',
        action='store',
        dest='mc',
        required=True,
        help='Comma separated string of columns to use in metadata_file.'
    )

    parser.add_argument(
        '-t', '--theta',
        action='store',
        dest='theta',
        default='',
        help='Comma separated string of theta values (corresponding to '
             'metadata_columns). If "" then sets theta to 2 for all '
             'columns. Larger values of theta result in more diverse '
             'clusters. '
             '(default: "")'
    )

    parser.add_argument(
        '-npc', '--n_pcs',
        action='store',
        dest='npc',
        default=0,
        type=int,
        help='Number of PCs to use. '
             '(default: maximum number in pca_file)'
    )

    parser.add_argument(
        '-of', '--out_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working '
             'directory. '
             '(default: <pca_file>-harmony)'
    )

    options = parser.parse_args()

    # Fixed settings.
    verbose = True

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        out_file_base = _default_out_base(options.pc)

    # Load the PCs.
    df_pca = pd.read_csv(options.pc, sep='\t', index_col='cell_barcode')

    # Check that nPCs is valid; 0 means "use every PC column in the file".
    n_available = len(df_pca.columns)
    n_pcs = options.npc
    if n_pcs == 0:
        n_pcs = n_available
    elif n_pcs > n_available:
        # ValueError subclasses Exception, so existing handlers still match.
        raise ValueError(
            '--n_pcs ({}) is > than n_pcs in --pca_file ({}).'.format(
                n_pcs, n_available
            )
        )
    if verbose:
        print('Using {} PCs.'.format(n_pcs))

    # Subset down to these PCs.
    df_pca = df_pca.iloc[:, :n_pcs]

    # Get the metadata_file columns that we want to adjust with Harmony.
    metadata_columns = options.mc.split(',')

    # Read in the metadata file.
    df_meta = pd.read_csv(options.mf, sep='\t', index_col='cell_barcode')

    # Ensure cell order in df_meta is the same as df_pca
    df_meta = df_meta.loc[df_pca.index, metadata_columns]

    # Also ensure that the metadata columns are categorical -- run_harmony
    # fails if not categorical. describe() has no 'unique' row when it
    # summarises numeric columns, which surfaces here as a KeyError. This is
    # a warning only; execution continues.
    try:
        df_meta[metadata_columns].describe().loc['unique']
    except KeyError:
        print(
            "metadata_columns contains non-categorical attributes. "
            "Harmony does not work with continuous variables. "
            "Either make attributes a string or use a different column."
        )

    # Get the theta values for each column (if none, set to 2 for all columns).
    theta = [2] * len(metadata_columns)
    if options.theta != '':
        theta = [float(i) for i in options.theta.split(',')]

    # Run Harmony
    harmony_embeddings = hm.run_harmony(
        data_mat=df_pca.values,  # Pandas dataframe to numpy.ndarray
        meta_data=df_meta,
        vars_use=metadata_columns,
        theta=theta,
        max_iter_kmeans=500,
        verbose=verbose
    )
    # NOTE: harmony_embeddings.result() == harmony_embeddings.Z_corr

    # Z_corr is transposed before being turned into the output table.
    df_harmony = pd.DataFrame(np.transpose(harmony_embeddings.Z_corr))
    harmony_cols = [
        'harmony{}'.format(i + 1) for i in range(df_harmony.shape[1])
    ]
    df_harmony.columns = harmony_cols

    # Put the cell barcode first, followed by the harmony columns.
    df_harmony['cell_barcode'] = df_pca.index
    final_col_order = ['cell_barcode'] + harmony_cols
    df_harmony = df_harmony.loc[:, final_col_order]

    # Save the adjusted embedding as a gzipped TSV.
    df_harmony.to_csv(
        '{}.tsv.gz'.format(out_file_base),
        sep='\t',
        index=False,
        quoting=csv.QUOTE_NONNUMERIC,
        na_rep='',
        compression='gzip'
    )