def mtx_to_scanpy(url, tmpdir):
    """
    Build a scanpy object with the .mtx files from an EBI scAtlas endpoint.
    Caution: creates a temporary directory at `tmpdir`, then deletes its
    contents right afterwards.

    :param url: single cell expression atlas url
    :param tmpdir: path for temporary write of *mtx* contents.
    :return: scanpy object with .X, .obs_names and .var_names filled.
    """
    opened_url = open_url(url)

    # Dump the zipped files into a temporary folder.
    zipped = zipfile.ZipFile(BytesIO(opened_url.read()))
    os.mkdir(tmpdir)
    zipped.extractall(tmpdir)

    # Grab the three filenames for access.
    filenames = zipped.namelist()
    mtxfile = [f for f in filenames if f.endswith(".mtx")][0]
    mtxfile = os.path.join(tmpdir, mtxfile)
    colsfile = [f for f in filenames if f.endswith(".mtx_cols")][0]
    colsfile = os.path.join(tmpdir, colsfile)
    rowsfile = [f for f in filenames if f.endswith(".mtx_rows")][0]
    rowsfile = os.path.join(tmpdir, rowsfile)

    # Fill the anndata object; transpose so cells are obs and genes are vars.
    anndata = sc.read_mtx(mtxfile).transpose()
    anndata.obs_names = pd.read_csv(colsfile, header=None, sep="\t")[1]
    anndata.var_names = pd.read_csv(rowsfile, header=None, sep="\t")[1]

    # Empty out the tmp dir.
    rmtree(tmpdir)
    return anndata
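# Usage sketch for mtx_to_scanpy above, for illustration only: the accession and
# download URL are hypothetical examples of a Single Cell Expression Atlas zip
# endpoint, and open_url is assumed to behave like urllib.request.urlopen.
atlas_url = ("https://www.ebi.ac.uk/gxa/sc/experiment/E-MTAB-0000/"
             "download/zip?fileType=quantification-filtered")  # hypothetical endpoint
adata = mtx_to_scanpy(atlas_url, tmpdir="./tmp")
print(adata)  # cells in .obs_names, genes in .var_names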
def scanpy_load_alevin_mtx(analysis_dir, *, force_var_names=None, force_obs_names=None):
    analysis_dir = Path(analysis_dir)
    quant_dir = analysis_dir / 'alevin'

    # Load the alevin quantification matrix plus its row/column labels.
    alevin = scanpy.read_mtx(quant_dir / 'quants_mat.mtx.gz')
    alevin_vars = pandas.read_csv(quant_dir / 'quants_mat_cols.txt', header=None).values.T
    alevin_obs = pandas.read_csv(quant_dir / 'quants_mat_rows.txt', header=None).values.T
    alevin.obs_names = alevin_obs[0]
    alevin.var_names = alevin_vars[0]

    # Reindex against the forced names (filling missing entries with 0),
    # or fall back to the names found in the quantification.
    alevin_df = alevin.to_df()
    if force_obs_names is not None:
        alevin_df = alevin_df.reindex(force_obs_names).fillna(0)
    else:
        force_obs_names = alevin_obs[0]
    if force_var_names is not None:
        alevin_df = alevin_df.T.reindex(force_var_names).T.fillna(0)
    else:
        force_var_names = alevin_vars[0]

    # Rebuild a sparse AnnData and attach simple per-cell QC metrics.
    alevin = anndata.AnnData(X=scipy.sparse.csr_matrix(alevin_df))
    alevin.obs_names = force_obs_names
    alevin.var_names = force_var_names
    alevin.obs['counts'] = alevin.X.sum(axis=1)
    alevin.obs['ngenes'] = numpy.array((alevin.X > 0).sum(axis=1))
    return alevin
def read_mtx(path):
    """\
    Read an mtx format data folder including:

    * matrix file: e.g. count.mtx or matrix.mtx, or their gz format
    * barcode file: e.g. barcode.txt
    * feature file: e.g. feature.txt

    Parameters
    ----------
    path
        the path storing the mtx files

    Returns
    -------
    AnnData
    """
    for filename in glob(path + '/*'):
        if ('count' in filename or 'matrix' in filename or 'data' in filename) and ('mtx' in filename):
            adata = sc.read_mtx(filename).T
    for filename in glob(path + '/*'):
        if 'barcode' in filename:
            barcode = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
            adata.obs = pd.DataFrame(index=barcode)
        if 'gene' in filename or 'peaks' in filename:
            gene = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
            adata.var = pd.DataFrame(index=gene)
        elif 'feature' in filename:
            gene = pd.read_csv(filename, sep='\t', header=None).iloc[:, 1].values
            adata.var = pd.DataFrame(index=gene)
    return adata
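# Usage sketch for read_mtx above; the folder path is hypothetical and is
# expected to contain e.g. matrix.mtx (optionally gzipped), barcode.txt and
# feature.txt.
adata = read_mtx('data/sample1')
print(adata)  # AnnData with barcodes as obs and features as var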
def read_mtx(path):
    """
    Read mtx format data folder including:
        matrix file: e.g. count.mtx or matrix.mtx
        barcode file: e.g. barcode.txt
        feature file: e.g. feature.txt
    """
    for filename in glob(path + '/*'):
        if ('count' in filename or 'matrix' in filename or 'data' in filename) and ('mtx' in filename):
            adata = sc.read_mtx(filename).T
    for filename in glob(path + '/*'):
        if 'barcode' in filename:
            barcode = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
            print(len(barcode), adata.shape[0])
            # Transpose if the matrix orientation does not match the barcodes.
            if len(barcode) != adata.shape[0]:
                adata = adata.transpose()
            adata.obs = pd.DataFrame(index=barcode)
        if 'gene' in filename or 'peaks' in filename or 'feature' in filename:
            gene = pd.read_csv(filename, sep='\t', header=None).iloc[:, -1].values
            # Transpose if the matrix orientation does not match the features.
            if len(gene) != adata.shape[1]:
                adata = adata.transpose()
            adata.var = pd.DataFrame(index=gene)
    return adata
def read_GSE132044():
    min_library_size = 5000
    min_genes = 1000
    folder = data_location + '/Single_Cell/Ding/'
    fname = data_location + '/Single_Cell/Ding/counts.read.txt'

    # Read in and form clusters
    data = sc.read_mtx(fname)
    data = sc.AnnData(X=data.X.transpose())
    data.uns['min_library_size'] = min_library_size
    data.uns['min_genes'] = min_genes
    data.uns['folder'] = folder

    samples = pd.read_csv(folder + "/cells.read.txt", skiprows=0, header=None, sep='\t')
    genes = pd.read_csv(folder + "/genes.read.txt", skiprows=0, header=None, sep='\t')
    genes = [item.split("_")[0] for item in genes[0].values.astype(str)]

    name_map = pd.read_csv(folder + "/map.CPM.names.Count.names.txt", sep='\t', index_col=0)
    samples_meta = pd.read_csv(folder + "/meta.txt", sep="\t", index_col=0)
    name_map = name_map.merge(samples_meta, how='left', left_index=True, right_index=True)
    samples = samples.merge(name_map, how='left', left_on=0, right_index=True).set_index(0)

    # The authors do not provide metadata for all samples; presumably these
    # are excluded from the analysis for a good reason.
    sel = ~samples.Method.isnull().values & ~samples.CellType.isnull().values \
        & (np.array(data.X.sum(1)).reshape(-1) >= min_library_size) \
        & (np.array(data.X.astype(bool).sum(axis=1)).reshape(-1) >= min_genes)

    return None
def load_bustools_counts(prefix):
    prefix = str(prefix)
    data = sc.read_mtx(str(prefix) + '.mtx')
    data.obs.index = pd.read_csv(prefix + '.barcodes.txt', header=None)[0].values
    data.var.index = pd.read_csv(prefix + '.genes.txt', header=None)[0].values
    return data
def add_modality(self, modality: str, file_x: str, file_obs: str = None, file_var: str = None,
                 obs_index: str = None, var_index: str = None, parent_folder: str = "",
                 transpose_x=False, overwrite=False):
    """
    Given up to 3 matrix files, creates an AnnData object and adds it to the
    Multimeasure object.

    Parameters
    ----------
    file_x
        Filename for the data matrix itself (as a Matrix Market file).
    file_obs
        Filename for the observation annotation matrix in csv format.
    file_var
        Filename for the variables annotation matrix in csv format.
    obs_index
        Column label in obs for the column that should be assigned to the index.
        Optional, but required for plotting and some filtering with scanpy.
    var_index
        Column label in var for the column that should be assigned to the index.
        Optional, but required for plotting and some filtering with scanpy.
    """
    if modality not in SUPPORTED_MODALITIES:
        raise AttributeError('Unsupported modality. Must be one of ' + str(SUPPORTED_MODALITIES))

    X = sc.read_mtx(os.path.join(parent_folder, file_x))
    if transpose_x:
        X = X.transpose()

    if file_obs:
        obs = pd.read_csv(os.path.join(parent_folder, file_obs))
    else:
        obs = None
    if file_var:
        var = pd.read_csv(os.path.join(parent_folder, file_var))
    else:
        var = None

    if modality in self.measures.keys():
        if not overwrite:
            raise AttributeError("Modality of type: {}, already exists in Multimeasure object".format(modality))
        else:
            logging.warning("Overwriting modality: {}".format(modality))

    # Only assign annotation frames that were actually provided; assigning
    # None to .obs/.var would raise inside AnnData.
    if obs is not None:
        X.obs = obs
    if var is not None:
        X.var = var
    if var_index:
        X.var_names = X.var[var_index].tolist()
    if obs_index:
        X.obs_names = X.obs[obs_index].tolist()

    self.measures[modality] = X
    print("Modality {} added.".format(modality))
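# Usage sketch for add_modality; the Multimeasure constructor call, the 'rna'
# modality name and all file names are assumptions for illustration.
mm = Multimeasure()
mm.add_modality(modality='rna',
                file_x='matrix.mtx',
                file_obs='obs.csv',
                file_var='var.csv',
                obs_index='barcode',
                var_index='gene_id',
                parent_folder='data/sample1',
                transpose_x=True)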
def get_matrix(countFile, outdir, barcodes, varlist, groups, annotation, type):
    # load adata
    adata = sc.read_mtx(countFile)

    # set obs.index and add other obs
    obs = pd.read_csv(barcodes, header=None)
    adata.obs.index = obs[0].values
    obs_new = obs[0].str.split("_", n=2, expand=True)
    n_col = obs_new.shape[1]
    if n_col > 1:
        adata.obs['sample'] = obs_new.iloc[:, 0:n_col - 1].apply(lambda x: '_'.join(x), axis=1).astype('category').values
        # the barcode is the last split field (column n_col-1, not always column 2)
        adata.obs['barcode'] = obs_new[n_col - 1].astype('category').values
    else:
        adata.obs['sample'] = 'one_sample'

    # set index
    if type == 'ECs':
        var = pd.read_csv(varlist, sep='\t', header=None, index_col=2)
        var.index = var.index.astype('str').values
        n_col = var.shape[1]
        if n_col > 2:
            var.drop([1] + list(range(3, n_col + 1)), axis=1, inplace=True)
        else:
            var.drop([1], axis=1, inplace=True)
        var.columns = ['geneID']
        adata.var = var
    elif type == 'genes':
        var = pd.read_csv(varlist, sep='\t', header=None, index_col=0)
        adata.var.index = var.index.values

    # extend variable df; assumes same index
    if annotation is not None and type == 'genes':
        an = pd.read_csv(annotation, sep='\t', header=None, index_col=0)
        adata.var = pd.merge(adata.var, an, left_index=True, right_index=True, how='left')
    elif annotation is not None and type == 'ECs':
        an = pd.read_csv(annotation, sep='\t', header=None, index_col=0)
        adata.var = pd.merge(adata.var, an, left_on='geneID', right_index=True, how='left')

    # add predefined groups to obs; assumes categorical data and same index and header
    group_keys = None
    if groups is not None:
        obs_groups = pd.read_csv(groups, sep="\t")
        obs_groups.set_index(obs_groups.keys()[0], inplace=True)
        group_keys = obs_groups.keys()
        # assumed category based, but maybe make this optional for continuous scales
        obs_groups = obs_groups[group_keys].astype('category')
        adata.obs = adata.obs.merge(obs_groups, how='left', left_index=True, right_index=True)

    # color by gene
    print("\nData before filtering:")
    print(adata)
    return adata, group_keys
def _load_bustools_count(self):
    data = sc.read_mtx(self.input_filename + '.mtx')
    data.var = pd.read_csv(self.input_filename + '.genes.txt', sep='\t', header=None, index_col=0)
    data.obs = pd.read_csv(self.input_filename + '.barcodes.txt', sep='\t', header=None, index_col=0)
    return data
def _load(self):
    if self.input_format == 'h5ad':
        return sc.read_h5ad(self.input_filename)
    elif self.input_format == 'loom':
        return sc.read_loom(self.input_filename)
    elif self.input_format == '10x':
        return self._load_10x()
    elif self.input_format in ('mtx', 'mex'):
        return sc.read_mtx(self.input_filename)
    elif self.input_format == 'bustools-count':
        return self._load_bustools_count()
    return None
def scanpy_load_kallisto_gene_mtx(analysis_dir, filter_file=None):
    analysis_dir = Path(analysis_dir)
    kallisto = scanpy.read_mtx(analysis_dir / 'gene.mtx')
    kallisto_vars = pandas.read_csv(analysis_dir / 'gene.genes.txt', header=None).values.T
    kallisto_obs = pandas.read_csv(analysis_dir / 'gene.barcodes.txt', header=None).values.T
    kallisto.obs_names = kallisto_obs[0]
    kallisto.var_names = kallisto_vars[0]
    kallisto.obs['counts'] = kallisto.X.sum(axis=1)
    kallisto.obs['ngenes'] = numpy.array((kallisto.X > 0).sum(axis=1))
    return kallisto
def read_dropest(dir_path, reorder=True):
    data_matrix = glob.glob(dir_path + "/*.mtx")[0]
    data_genes = glob.glob(dir_path + "/*features.*")[0]
    data_barcodes = glob.glob(dir_path + "/*barcodes.*")[0]
    adata = sc.read_mtx(data_matrix).T
    adata.var.index = pd.read_csv(data_genes, header=None)[0].values
    adata.obs.index = pd.read_csv(data_barcodes, header=None)[0].values
    adata.obs.index.name = 'Cells'
    adata.var.index.name = 'Genes'
    # Honor the reorder flag instead of reordering unconditionally.
    if reorder:
        adata = reorder_AnnData(adata, descending=True)
    adata.raw = adata
    return adata
def scanpy_load_solo_mtx(analysis_dir, mode='filtered'):
    assert mode in ['filtered', 'raw'], 'STAR Solo only produces raw or filtered files'
    analysis_dir = Path(analysis_dir)
    solo_dir = analysis_dir / 'Solo.out' / 'Gene' / mode

    solo = scanpy.read_mtx(solo_dir / 'matrix.mtx').T
    solo_vars = pandas.read_csv(solo_dir / 'features.tsv', header=None, sep='\t').values.T
    solo_obs = pandas.read_csv(solo_dir / 'barcodes.tsv', header=None, sep='\t').values.T
    solo.obs_names = solo_obs[0]
    solo.var_names = solo_vars[0]
    solo.obs['counts'] = solo.X.sum(axis=1)
    solo.obs['ngenes'] = numpy.array((solo.X > 0).sum(axis=1))
    return solo
def load_AnnData(file_x: str, file_obs: str = None, file_var: str = None,
                 obs_index: str = None, var_index: str = None,
                 parent_folder: str = "", transpose_x=False):
    """
    Given up to 3 matrix files, creates and returns an AnnData object.

    Parameters
    ----------
    file_x
        Filename for the data matrix itself (as a Matrix Market file).
    file_obs
        Filename for the observation annotation matrix in csv format.
    file_var
        Filename for the variables annotation matrix in csv format.
    obs_index
        Column label in obs for the column that should be assigned to the index.
        Optional, but required for plotting and some filtering with scanpy.
    var_index
        Column label in var for the column that should be assigned to the index.
        Optional, but required for plotting and some filtering with scanpy.
    """
    X = sc.read_mtx(os.path.join(parent_folder, file_x))
    if transpose_x:
        X = X.transpose()

    if file_obs:
        obs = pd.read_csv(os.path.join(parent_folder, file_obs))
    else:
        obs = None
    if file_var:
        var = pd.read_csv(os.path.join(parent_folder, file_var))
    else:
        var = None

    # Only assign annotation frames that were actually provided; assigning
    # None to .obs/.var would raise inside AnnData.
    if obs is not None:
        X.obs = obs
    if var is not None:
        X.var = var
    if var_index:
        X.var_names = X.var[var_index].tolist()
    if obs_index:
        X.obs_names = X.obs[obs_index].tolist()
    return X
def load_alevin(library_names, input_path):
    '''
    Mirrors the functionality of load_inDrops (see below).
    Imports data files generated by Salmon-Alevin, when run with the --dumpMtx option.
    Specifically, this function will expect files at the following locations:
      /input_path/library_name/alevin/quants_mat.mtx.gz
      /input_path/library_name/alevin/quants_mat_rows.txt
      /input_path/library_name/alevin/quants_mat_cols.txt
    where 'library_names' contains one or more inDrops.py output folders
    located at the indicated path.
    '''
    # Create a dictionary to hold data
    D = {}
    for j, s in enumerate(library_names):
        D[s] = {}

    # Load counts data, metadata, & convert to AnnData objects
    for s in library_names:
        # Load counts, gene names into AnnData structure
        D[s] = sc.read_mtx(input_path + '/' + s + '/alevin/quants_mat.mtx.gz', dtype='float32')
        D[s].var_names = np.loadtxt(input_path + '/' + s + '/alevin/quants_mat_cols.txt', dtype='str')
        D[s].obs['library_id'] = np.tile(s, [D[s].n_obs, 1])
        D[s].uns['library_id'] = s

        # Load cell barcodes into AnnData structure
        cell_bcds = np.loadtxt(input_path + '/' + s + '/alevin/quants_mat_rows.txt', dtype='str')

        # Append library name to each cell barcode to create unique cell IDs
        lib_cell_bcds = []
        for bcd in cell_bcds:
            lib_cell_bcds.append(s + '_' + bcd)
        D[s].obs['unique_cell_id'] = lib_cell_bcds

        # Compute total counts & number of genes per cell
        D[s].obs['n_counts'] = D[s].X.sum(1).A1
        D[s].obs['n_genes'] = D[s].X.astype(bool).sum(axis=1)

    return D
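# Usage sketch for load_alevin; the library names and input path are
# hypothetical. Each library folder must contain alevin/quants_mat.mtx.gz,
# quants_mat_rows.txt and quants_mat_cols.txt.
D = load_alevin(['lib1', 'lib2'], 'data/alevin_runs')
print(D['lib1'].obs[['n_counts', 'n_genes']].head())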
def mtx_to_h5ad(args):
    """Converts .mtx files from 10x CellRanger or DropEst to .h5ad"""
    if args.dropest:
        if args.verbose:
            print("Reading DropEst .mtx files from {}".format(args.dir))
        # get name of .mtx
        mtx_file = [x for x in os.listdir(args.dir) if x.endswith("_counts.mtx")][0]
        # define filenames
        mtx = args.dir + "/" + mtx_file
        genes = args.dir + "/" + mtx_file.split("_counts")[0] + "_features.txt"
        barcodes = args.dir + "/" + mtx_file.split("_counts")[0] + "_barcodes.txt"
        # read files
        a = sc.read_mtx(mtx)  # read matrix
        a = a.T  # transpose DropEst matrix to cells x genes
        g = pd.read_csv(genes, delimiter="\t", header=None)  # read genes
        b = pd.read_csv(barcodes, delimiter="\t", header=None)  # read barcodes
        # add gene and barcode names
        a.obs_names = b[0].values
        a.var_names = g[0].values
        if args.verbose:
            print("Writing counts to {}/{}.h5ad - {} cells and {} genes".format(
                args.outdir, mtx_file.split("_counts")[0], a.shape[0], a.shape[1]))
        a.write(
            "{}/{}.h5ad".format(args.outdir, mtx_file.split("_counts")[0]),
            compression="gzip",
        )
    else:
        if args.verbose:
            print("Reading 10x CellRanger .mtx files from {}".format(args.dir))
        a = sc.read_10x_mtx(args.dir)
        # extract name from 10x directory name
        name = args.dir.split("_gene_bc_matrices")[0]
        if args.verbose:
            print("Writing counts to {}/{}.h5ad - {} cells and {} genes".format(
                args.outdir, name, a.shape[0], a.shape[1]))
        a.write("{}/{}.h5ad".format(args.outdir, name), compression="gzip")
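# Usage sketch for mtx_to_h5ad; it expects an argparse-style namespace, so the
# attributes below mirror what the function reads (dir, outdir, dropest,
# verbose) and the paths are assumptions.
from types import SimpleNamespace

args = SimpleNamespace(dir='data/dropest_run', outdir='data/h5ad',
                       dropest=True, verbose=True)
mtx_to_h5ad(args)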
def atac(path, ad=False, path_mtx=None, path_genes=None, path_barcodes=None):
    # Import and transpose data
    if ad is True:
        df = sc.read(path_mtx).transpose()
        df = pd.DataFrame(data=df.X, index=df.obs_names, columns=df.var_names)
    else:
        df = sc.read_mtx(path_mtx).transpose()
        barcodes = pd.read_csv(path_barcodes, delimiter="\t", header=None)
        df.obs_names = barcodes[0].values
        genes = pd.read_csv(path_genes, delimiter="\t", header=None)
        df.var_names = genes[0].values
        df = pd.DataFrame(data=df.X.toarray(), index=df.obs_names, columns=df.var_names)

    # Use only cells not discarded by RNAseq filtering
    inds = pd.read_pickle(f"{path}RNAseq_prepped.pkl").index
    df = df.loc[inds]

    # Normalize data
    df[df.columns] = MaxAbsScaler().fit_transform(df[df.columns])

    # Term frequency - inverse document frequency
    tf_idf = TfidfTransformer(norm=None, sublinear_tf=False)
    transformed = tf_idf.fit_transform(df.values).toarray()
    transformed_scaled = MaxAbsScaler().fit_transform(transformed)
    df = pd.DataFrame(transformed_scaled, index=df.index, columns=df.columns)

    # Select top peaks: keep peaks detected in at least the 75th-percentile
    # number of cells, then keep the top-variance quartile of those.
    df = df.loc[:, df.columns[(df != 0).sum() >= (df[df.columns] != 0).sum().quantile(0.75)]]
    col_vars = df.var(axis=0)
    highly_variable_cols = df.columns[col_vars >= col_vars.quantile(0.75)]
    df = df.loc[:, highly_variable_cols]

    df.to_pickle(f"{path}ATACseq_ae_prepped.pkl")
    return df
def _read_raw_dataset(self):
    data_path = self._data_path
    matrix_path = next(data_path.glob('*UMI.count.matrix'))
    cells_path = next(data_path.glob('*cell.annotations.txt'))
    genes_path = next(data_path.glob('*gene.annotations.txt'))
    hash_sheet_path = next(data_path.glob('*hashSampleSheet.txt'))
    hash_table_out_path = next(data_path.glob('*hashTable.out.txt'))

    cells = pd.read_table(cells_path, delim_whitespace=True, usecols=[0],
                          index_col=0, names=[None])
    genes = pd.read_table(genes_path, delim_whitespace=True,
                          names=['gene_ids', 'gene_symbols'],
                          index_col='gene_symbols')
    genes.index.name = None

    treatment_map = pd.read_table(hash_sheet_path, delim_whitespace=True,
                                  names=['treatment', 'umi', 'umi_count'])
    cell_treatment_map = pd.read_table(
        hash_table_out_path, delim_whitespace=True,
        names=['sample', 'barcode', 'treatment', 'axis', 'umi_count'])

    matrix = sc.read_mtx(matrix_path)
    dataset = matrix.T
    dataset.obs = cells
    dataset.var = genes
    return dataset, treatment_map, cell_treatment_map
import scanpy as sc
import numpy as np
import pandas as pd

hemberg = sc.read_mtx(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas.mtx')
hemberg_coldata = pd.read_csv(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas_col.csv',
    index_col=0)
hemberg_genes = np.genfromtxt(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas_genes.csv',
    dtype=str)

hemberg = hemberg.T
hemberg.obs = hemberg_coldata
hemberg.var_names = hemberg_genes
hemberg.obs.columns = np.string_(hemberg.obs.columns)

hemberg.write_h5ad(
    '/tyrone-data/bharris/metaneighbor_protocol_data/hemberg.h5ad',
    compression='gzip',
    compression_opts=9)
def seurat_object_to_anndata(file_path_seurat_object, delete_tmp_file=True):
    """
    Convert a Seurat object into anndata.

    Args:
        file_path_seurat_object (str): File path of the Seurat object.
            The Seurat object should be saved in Rds format.
        delete_tmp_file (bool): Whether to delete the temporary files.

    Returns:
        anndata: anndata object.
    """
    # check file name
    print("input file name: " + file_path_seurat_object)
    if not file_path_seurat_object.lower().endswith(".rds"):
        raise ValueError("Seurat object should be saved as .Rds file")

    # run R script to extract information and make mtx files
    os.makedirs("tmp", exist_ok=True)
    command = f"Rscript {rscript_folder}/seurat_to_mtx.R {file_path_seurat_object}"
    exec_process(command, message=True, wait_finished=True, return_process=False)

    print("making AnnData ...")
    folder = "./tmp"

    # load data
    mm = sc.read_mtx(os.path.join(folder, "data.mtx"))
    meta = pd.read_csv(os.path.join(folder, "meta_data.csv"), index_col=0)
    meta_dtype = pd.read_csv(os.path.join(folder, "meta_data_dtype.csv"), index_col=0)
    categorical_info = meta_dtype[meta_dtype["dtype"] == "factor"].index.values
    cell_ids = pd.read_csv(os.path.join(folder, "cells_index.csv")).x.values
    variable_ids = pd.read_csv(os.path.join(folder, "variables_index.csv")).x.values

    if "raw_data.mtx" in os.listdir(folder):
        raw_data = sc.read_mtx(os.path.join(folder, "raw_data.mtx"))
        mat = _constructAnnData(mm, cell_ids, variable_ids, meta, categorical_info, raw_data)
    else:
        mat = _constructAnnData(mm, cell_ids, variable_ids, meta, categorical_info)

    # add variable gene info
    if "var_genes.csv" in os.listdir(folder):
        variable_genes = pd.read_csv(os.path.join(folder, "var_genes.csv")).x.values
        mat.var["variable_gene"] = mat.var.index.isin(variable_genes)

    # add color data
    color_df = pd.read_csv(os.path.join(folder, "cluster_color_hex.csv"), index_col=0)
    mat.uns["seurat_clusters_colors"] = color_df.colors_hex.values

    # delete temporary files
    if delete_tmp_file:
        shutil.rmtree(folder)

    return mat
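# Usage sketch for seurat_object_to_anndata; the .Rds path is hypothetical, and
# R plus the seurat_to_mtx.R helper script must be available since the function
# shells out to Rscript.
adata = seurat_object_to_anndata("data/pbmc_seurat.Rds")
adata.write_h5ad("data/pbmc_from_seurat.h5ad")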
def cluster(ctx, matrix, outdir, sample, barcodes, genes, n_top, min_genes, min_cells,
            n_genes_by_counts, pct_counts_mt, exclude_highly_expressed, max_fraction,
            n_top_genes, max_value, n_neighbors, n_pcs, debug):
    sample_outdir = outdir / sample / '06.cluster'
    sample_outdir.mkdir(parents=True, exist_ok=True)
    os.chdir(sample_outdir)
    logger.info('cluster start!')

    # Load the matrix and attach barcode/gene indexes.
    adata_mtx = sc.read_mtx(matrix).T
    obs = pd.read_csv(barcodes, index_col=0, header=None)
    var = pd.read_csv(genes, index_col=0, header=None)
    obs.index.set_names(None, inplace=True)
    var.index.set_names(None, inplace=True)
    adata_mtx.obs = obs
    adata_mtx.var = var
    result_file = sample_outdir / f'{sample}.h5ad'
    adata_mtx.var_names_make_unique()

    # QC plots and filtering.
    [sc.pl.highest_expr_genes(adata_mtx, n_top=n_top, save=f'_{sample}.{filetype}')
     for filetype in FILETYPE]
    sc.pp.filter_cells(adata_mtx, min_genes=min_genes)
    sc.pp.filter_genes(adata_mtx, min_cells=min_cells)
    adata_mtx.var['mt'] = adata_mtx.var_names.str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata_mtx, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
    [sc.pl.violin(adata_mtx, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                  jitter=0.4, multi_panel=True, save=f'_{sample}.{filetype}')
     for filetype in FILETYPE]
    [sc.pl.scatter(adata_mtx, x='total_counts', y='pct_counts_mt',
                   save=f'_pct_mt_{sample}.{filetype}') for filetype in FILETYPE]
    [sc.pl.scatter(adata_mtx, x='total_counts', y='n_genes_by_counts',
                   save=f'_n_genes_{sample}.{filetype}') for filetype in FILETYPE]
    adata_mtx = adata_mtx[adata_mtx.obs.n_genes_by_counts < n_genes_by_counts, :]
    adata_mtx = adata_mtx[adata_mtx.obs.pct_counts_mt < pct_counts_mt, :]

    # Normalization, HVG selection, scaling.
    sc.pp.normalize_total(adata_mtx, target_sum=1e6,
                          exclude_highly_expressed=exclude_highly_expressed,
                          max_fraction=max_fraction)
    sc.pp.log1p(adata_mtx)
    sc.pp.highly_variable_genes(adata_mtx, n_top_genes=n_top_genes)
    [sc.pl.highly_variable_genes(adata_mtx, save=f'_{sample}.{filetype}') for filetype in FILETYPE]
    adata_mtx.raw = adata_mtx
    adata_mtx = adata_mtx[:, adata_mtx.var.highly_variable]
    sc.pp.regress_out(adata_mtx, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata_mtx, max_value=max_value)

    # Dimensionality reduction and clustering.
    sc.tl.pca(adata_mtx, svd_solver='arpack')
    sc.pp.neighbors(adata_mtx, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata_mtx)
    sc.tl.leiden(adata_mtx)
    sc.tl.louvain(adata_mtx)
    [sc.pl.umap(adata_mtx, color=algo, save=f'_{algo}_{sample}.{filetype}')
     for filetype in FILETYPE for algo in CLUSTER_ALGORITHM]
    adata_mtx.write(result_file, compression='gzip')
    logger.info('cluster done!')

    stat_info = {
        'visible': {},
        'invisible': {}
    }
    img = {}
    pngs = (sample_outdir / 'figures').rglob('*.png')
    for png in pngs:
        img[png.name] = png.resolve()

    # report
    logger.info('generate report start!')
    Reporter(name='cluster', stat_json=stat_info, outdir=sample_outdir.parent, img=img)
    logger.info('generate report done!')
def readConos(inPath):
    from time import time
    from shutil import rmtree
    from scipy.io import mmread
    from os import mkdir, path
    import pandas as pd

    # Create a unique scratch directory for the exported files.
    dir_path = "/tmp/conos" + str(int(time()))
    while path.isdir(dir_path):
        dir_path += '2'
    dir_path += '/'
    mkdir(dir_path)

    # Export the conos object to disk via R.
    ro.r('library(conos)')
    ro.r(f'con <- readRDS("{inPath}")')
    ro.r('meta <- function(sobj) {return(sobj@meta.data)}')
    ro.r('metalist <- lapply(con$samples, meta)')
    ro.r('library(data.table)')
    ro.r('metaM <- do.call(rbind, unname(metalist))')
    ro.r(
        f'saveConosForScanPy(con, output.path="{dir_path}", pseudo.pca=TRUE, pca=TRUE, metadata.df=metaM)'
    )

    gene_df = pd.read_csv(dir_path + "genes.csv")
    metadata = pd.read_csv(dir_path + "metadata.csv")
    metadata.index = metadata.CellId
    del metadata["CellId"]
    embedding_df = pd.read_csv(dir_path + "embedding.csv")
    # Decide between using PCA or pseudo-PCA
    pseudopca_df = pd.read_csv(dir_path + "pseudopca.csv")
    # pca_df = pd.read_csv(dir_path + "pca.csv")
    graph_conn_mtx = mmread(dir_path + "graph_connectivities.mtx")
    graph_dist_mtx = mmread(dir_path + "graph_distances.mtx")

    adata = sc.read_mtx(dir_path + "raw_count_matrix.mtx")
    adata.var_names = gene_df["gene"].values
    adata.obs_names = metadata.index.values
    adata.obs = metadata.copy()
    # Depends on which PCA you loaded
    adata.X_pca = pseudopca_df.values
    adata.obsm['X_pca'] = pseudopca_df.values
    # Name according to embedding you saved
    adata.X_umap = embedding_df.values
    adata.obsm['X_umap'] = embedding_df.values
    adata.uns['neighbors'] = dict(connectivities=graph_conn_mtx.tocsr(),
                                  distances=graph_dist_mtx.tocsr())
    # Assign raw counts to .raw slot, load in normalised counts
    # adata.raw = adata
    # adata_temp = sc.read_mtx(DATA_PATH + "count_matrix.mtx")
    # adata.X = adata_temp.X

    rmtree(dir_path)
    return adata
def load_fry(frydir, which_counts={'X': ['S', 'A']}, verbose=False):
    """
    Parameters:
    frydir - The directory containing the alevin-fry quantification
        (i.e. the quant.json file & alevin subdirectory).
    verbose - True if messages (including error messages) should be printed
        out, False if the function should be quiet.
    which_counts - Dictionary specifying how a USA mode matrix should be
        returned or combined into the resulting output matrix. If the input is
        not a USA mode quantification directory, this parameter is ignored and
        the count matrix is returned in the `X` field of the returned `AnnData`
        object. If the input quantification directory contains a USA mode
        quantification, then there are 3 sub-matrices that can be referenced in
        the dictionary; 'U', 'S', 'A' containing, respectively, unspliced,
        spliced and ambiguous counts. The dictionary should have entries of the
        form `key` (str) : `value` (list[str]). The following constraints
        apply: there should be one key-value pair with the key `X`; the
        resulting counts will be returned in the `X` field of the AnnData
        object. There can be an arbitrary number of other key-value pairs, but
        each will be returned as a layer of the resulting AnnData object.
        Within the key-value pairs, the key refers to the layer name that will
        be given to the combined count matrix upon output, and the value should
        be a subset of `['U', 'S', 'A']` that defines which sub-matrices should
        be summed. For example: {'X' : ['S', 'A'], 'unspliced' : ['U']} will
        result in a returned AnnData object whose X field has a matrix in which
        each entry corresponds to the summed spliced and ambiguous counts for
        each gene in each cell, with an additional 'unspliced' layer whose
        counts are taken directly from the unspliced sub-matrix.

    Returns:
    An AnnData object with X and layers corresponding to the requested
    `which_counts`, or None if an error is encountered.
    """
    import json
    import os
    import pandas as pd

    # Since alevin-fry 0.4.1 the generic "meta_info.json" has been replaced by
    # a more informative name for each sub-command. For quantification, it is
    # "quant.json". We check for both files here, in order.
    meta_info_files = ["quant.json", "meta_info.json"]

    fpath = os.path.sep.join([frydir, meta_info_files[0]])
    # First, check for the new file; if we don't find it, check for the old one.
    if not os.path.exists(fpath):
        if verbose:
            print(f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.")
        fpath = os.path.sep.join([frydir, meta_info_files[1]])
        # If we don't find the old one either, then return None.
        if not os.path.exists(fpath):
            if verbose:
                print(f"Found no {meta_info_files[1]} file either; cannot proceed.")
            return None

    # If we got here then we had a valid json file, so use it to get the number
    # of genes, and whether or not we are in USA mode.
    meta_info = json.load(open(fpath))
    ng = meta_info['num_genes']
    usa_mode = meta_info['usa_mode']

    # If we are in USA mode
    if usa_mode:
        # Make sure that num_genes is a multiple of 3.
        if ng % 3 != 0:
            if verbose:
                print(f"Found USA mode, but num genes = {ng} is not a multiple of 3; cannot proceed.")
            return None
        # Each gene has 3 splicing statuses, so the actual number of distinct
        # genes is ng/3.
        ng = int(ng / 3)
        if verbose:
            print("processing input in USA mode, will return {}".format("+".join(which_counts)))
        # Make sure which_counts isn't empty.
        assert len(which_counts) > 0
        # Make sure the specification in which_counts is OK.
        if 'X' not in which_counts:
            if verbose:
                print('In USA mode some sub-matrices must be assigned to the "X" (default) output.')
            return None
        if verbose:
            print(f"will populate output field X with sum of counts from {which_counts['X']}.")
        for k, v in which_counts.items():
            valid_elem = len(set(v) - set(['U', 'S', 'A'])) == 0
            if not valid_elem:
                if verbose:
                    print(f'Found non-USA element in which_counts element list "{v}" for key "{k}"; cannot proceed.')
                return None
            if verbose and (k != 'X'):
                print(f'will combine {v} into output layer {k}.')
    elif verbose:
        print("Processing input in standard mode, will return processed count (which_counts will be ignored).")

    # Read the actual input matrix.
    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
    # Read the gene ids.
    afg = [l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
    afg_df = pd.DataFrame(afg, columns=["gene_ids"])
    afg_df = afg_df.set_index("gene_ids")
    # And the barcodes.
    abc = [l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines()]
    abc_df = pd.DataFrame(abc, columns=["barcodes"])
    abc_df.index = abc_df["barcodes"]

    x = af_raw.X
    # If we're not in USA mode, just combine this info into an AnnData object.
    if not usa_mode:
        af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
        af = af.T
    else:  # USA mode
        # Otherwise, combine the sub-matrices into the output object as
        # specified by `which_counts`.
        rd = {'S': range(0, ng), 'U': range(ng, 2 * ng), 'A': range(2 * ng, 3 * ng)}
        xcounts = which_counts['X']
        o = x[:, rd[xcounts[0]]]
        for wc in xcounts[1:]:
            o += x[:, rd[wc]]
        af = scanpy.AnnData(o.T, var=abc_df, obs=afg_df)
        af = af.T
        # Now, if there are other layers requested, populate those.
        for other_layer in which_counts.keys() - {'X'}:
            xcounts = which_counts[other_layer]
            o = x[:, rd[xcounts[0]]]
            for wc in xcounts[1:]:
                o += x[:, rd[wc]]
            af.layers[other_layer] = o
    return af
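# Usage sketch for load_fry, reusing the which_counts example from its
# docstring; 'pancreas_quant_res' is a hypothetical alevin-fry output directory.
adata = load_fry('pancreas_quant_res',
                 which_counts={'X': ['S', 'A'], 'unspliced': ['U']},
                 verbose=True)
if adata is not None:
    print(adata)  # X = spliced+ambiguous counts; layers['unspliced'] = U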
parser.add_argument('--obs')
parser.add_argument('--var')
parser.add_argument('--output')
args = parser.parse_args()

with open(args.params, 'r') as f:
    json_params = json.loads(f.read())

try:
    working_dir = os.getcwd()

    # get the parameter object
    params = json_params.get('parameters').get('input').get('anndata-spec')

    # get the matrix file
    logging.info('Now reading the matrix market file')
    adata = sc.read_mtx(os.path.join(working_dir, args.matrix)).transpose()
    logging.debug('adata variables:')
    logging.info(dir(adata))

    # get the var dataframe
    logging.info('Now reading var dataframe')
    df_var = pd.read_csv(os.path.join(working_dir, args.var),
                         delimiter=params.get('var').get('delimiter'))
    logging.debug('var columns are {}'.format(list(df_var.columns)))
    logging.debug('Now mapping pandas indexes')
    df_var.index = df_var[params.get('var').get('index_col')]
    df_var.index.name = None
    logging.debug('Now mapping var')
    adata.var = df_var

    # get the obs data
    logging.info('Now reading obs dataframe')
    df_obs = pd.read_csv(os.path.join(working_dir, args.obs),
                         # the source snippet is truncated mid-call here; the
                         # delimiter argument is an assumption that mirrors the
                         # var block above
                         delimiter=params.get('obs').get('delimiter'))
def get_matrix(sampleName, spliced_dir, unspliced_dir, prefix):
    spliced_dir = "velocity_quant/" + sampleName + "/" + spliced_dir + "/"      # spliced counts
    unspliced_dir = "velocity_quant/" + sampleName + "/" + unspliced_dir + "/"  # unspliced counts

    # get intersection of genes and barcodes for each sample
    s = scipy.io.mmread(spliced_dir + prefix + ".mtx")
    u = scipy.io.mmread(unspliced_dir + prefix + ".mtx")

    # get intersection of barcodes and perform on s and u
    df_s_bcs = pd.read_csv(spliced_dir + prefix + ".barcodes.txt", header=None)
    df_u_bcs = pd.read_csv(unspliced_dir + prefix + ".barcodes.txt", header=None)
    s_bcs = df_s_bcs[0].values.tolist()
    u_bcs = df_u_bcs[0].values.tolist()
    bcs_is = [i for i in s_bcs if i in u_bcs]
    s_bcs_is_int = [i for i in range(len(s_bcs)) if s_bcs[i] in bcs_is]
    u_bcs_is_int = [i for i in range(len(u_bcs)) if u_bcs[i] in bcs_is]
    s = s.tocsr()[s_bcs_is_int, :]
    u = u.tocsr()[u_bcs_is_int, :]
    s_bcs = df_s_bcs.iloc[s_bcs_is_int, :]
    u_bcs = df_u_bcs.iloc[u_bcs_is_int, :]

    # get intersection of genes and perform on s and u
    df_s_genes = pd.read_csv(spliced_dir + prefix + ".genes.txt", header=None)
    df_u_genes = pd.read_csv(unspliced_dir + prefix + ".genes.txt", header=None)
    s_genes = df_s_genes[0].values.tolist()
    u_genes = df_u_genes[0].values.tolist()
    genes_is = [i for i in s_genes if i in u_genes]
    s_genes_is_int = [i for i in range(len(s_genes)) if s_genes[i] in genes_is]
    u_genes_is_int = [i for i in range(len(u_genes)) if u_genes[i] in genes_is]
    s = s.tocsc()[:, s_genes_is_int]
    u = u.tocsc()[:, u_genes_is_int]
    s_genes = df_s_genes.iloc[s_genes_is_int, :]
    u_genes = df_u_genes.iloc[u_genes_is_int, :]

    # convert back to coo
    s = s.tocoo()
    u = u.tocoo()

    # save intersected matrix, barcodes and genes
    scipy.io.mmwrite(spliced_dir + prefix + "_isect.mtx", s)
    scipy.io.mmwrite(unspliced_dir + prefix + "_isect.mtx", u)
    # write the intersected barcodes (matching the intersected genes below)
    s_bcs.to_csv(spliced_dir + prefix + ".barcodes_isect.txt", header=None, index=False)
    u_bcs.to_csv(unspliced_dir + prefix + ".barcodes_isect.txt", header=None, index=False)
    s_genes.to_csv(spliced_dir + prefix + ".genes_isect.txt", header=None, index=False)
    u_genes.to_csv(unspliced_dir + prefix + ".genes_isect.txt", header=None, index=False)

    # reload the intersected matrices as AnnData
    s = sc.read_mtx(spliced_dir + prefix + "_isect.mtx")
    u = sc.read_mtx(unspliced_dir + prefix + "_isect.mtx")
    print(s_genes)
    print(u_genes)
    print(s_bcs)
    print(u_bcs)
    print(s)
    print(u)
    s.obs.index = s_bcs[0].values
    u.obs.index = u_bcs[0].values
    s.var.index = s_genes[0].values
    u.var.index = u_genes[0].values

    s_bcs["sample"] = sampleName
    u_bcs["sample"] = sampleName
    s_bcs.columns = ["bcs", "sample"]
    u_bcs.columns = ["bcs", "sample"]
    s_bcs.index = s_bcs["bcs"] + "." + s_bcs["sample"]
    u_bcs.index = u_bcs["bcs"] + "." + u_bcs["sample"]

    out = {'s': s, 'u': u, 's_bcs': s_bcs, 'u_bcs': u_bcs, 'genes': s_genes}
    return out
gene_names['gene_name'] = gene_names['extra'].str.extract(pat='gene_name "(.*?)";')
gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";')
gene_names['transcript_type'] = gene_names['extra'].str.extract(pat='transcript_type "(.*?)";')
gene_names = gene_names[gene_names['feature_type'] == 'gene']

annotation = gene_names[['gene_id', 'gene_name', 'chr']].groupby('gene_name').head(1)
annotation['gene_name'] = annotation['gene_name'].str.upper()
annotation = annotation.set_index('gene_id')

# %% Read and log normalize single cell data
adata = sc.read_mtx(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx'
)
data = adata.X.toarray().T
adata = sc.AnnData(data)

cols = pd.read_csv(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_cols',
    header=None)
rows = pd.read_csv(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_rows',
    sep='\t', header=None)

adata.var = rows.set_index(0).join(annotation)[['gene_name']].reset_index().set_index('gene_name')
adata.var.index = adata.var.index.astype('str')
import json
import os

import anndata
import scanpy as sc

# Processing alevin-fry count matrix
frydir = "pancreas_quant_res"
e2n_path = "data/geneid_to_name.txt"

meta_info = json.load(open(os.path.sep.join([frydir, "meta_info.json"])))
ng = meta_info['num_genes']
usa_mode = meta_info['usa_mode']
if usa_mode:
    print("processing input in USA mode, will return A+S as the spliced count, and U as the unspliced count")
else:
    print("please follow the previous steps to generate the count matrix in the USA mode")
    assert(False)

af_raw = sc.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
ng = int(ng / 3)
e2n = dict([l.rstrip().split() for l in open(e2n_path).readlines()])
var_names = [l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
var_names = [e2n[e] for e in var_names]
obs_names = [l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines()]

x = af_raw.X
spliced = x[:, range(0, ng)] + x[:, range(2 * ng, 3 * ng)]
unspliced = x[:, range(ng, 2 * ng)]

# creating AnnData using spliced and unspliced count matrix
adata = anndata.AnnData(X=spliced, layers=dict(spliced=spliced, unspliced=unspliced))
import scanpy as sc
import numpy as np
import pandas as pd

tasic = sc.read_mtx('tasic_counts.mtx')
tasic_coldata = pd.read_csv('tasic_col.csv', index_col=0)
tasic_genes = np.genfromtxt('tasic_genes.csv', dtype=str)

tasic = tasic.T
tasic.obs = tasic_coldata
tasic.var_names = tasic_genes
tasic.obs.columns = np.string_(tasic.obs.columns)

tasic.write_h5ad('tasic.h5ad', compression='gzip', compression_opts=9)
# (the source snippet begins mid-way through an argparse definition; the
#  '--dca' flag name below is an assumption made to complete the call)
parser.add_argument('--dca',
                    help='Run DCA?',
                    default=False,
                    action='store_true')
args = parser.parse_args()
print(args)

# ---------
# load data
# ---------

# matrix
if '.mtx' in args.data:
    print('Reading sparse matrix %s' % (args.data))
    x = scanpy.read_mtx(args.data)
else:
    print('Reading dense matrix %s' % (args.data))
    x = pd.read_csv(args.data, sep='\t', header=None, index_col=False)
    x = np.array(x)

# clusters
if args.clusters:
    clusters = pd.read_csv(args.clusters, sep='\t', header=None, index_col=False)
    clusters = np.array(clusters)
    g = 'groups'
else:
    clusters = ''
# (the source snippet begins mid-way through a matplotlib rcParams update; the
#  plt.rcParams.update( opener is an assumption made to complete the call)
plt.rcParams.update({
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

# %% Get genes
# note: genes for the second mouse are identical, so no need to duplicate
genes = pd.read_csv('data/raw-data/GSM344007/GSM3440071_SC01_genes.tsv',
                    sep='\t', names=['key', 'name'])
genes['name'] = genes['name'].str.upper()

# %% Read and log normalize single cell data
adata01 = sc.read_mtx('data/raw-data/GSM344007/GSM3440071_SC01_matrix.mtx')
adata02 = sc.read_mtx('data/raw-data/GSM344007/GSM3440072_SC02_matrix.mtx')

data01 = adata01.X.transpose()
data02 = adata02.X.transpose()

adata = sc.AnnData(scipy.sparse.vstack((data01, data02), format='csr'))
adata.var = genes.set_index('name')
adata.var_names_make_unique()
print('Healthy Mouse: ', adata.X.max())
sc.pp.log1p(adata, base=2)

# %% Exploratory plots
# Unused in the main analysis, but used to parameter search for the filtering cutoff.
def exploratory_plots(adata):
    ...  # body truncated in the source snippet