def runIntegration(inPath, outPath, method, hvg, batch, celltype=None):
    """
    params:
        method: integration function to call, with signature method(adata, batch[, celltype])
        batch: name of the `adata.obs` column holding the batch
        hvg: maximum number of HVGs
        celltype: optional name of the `adata.obs` column holding cell-type labels
    """
    adata = sc.read(inPath)

    if timing:
        if celltype is not None:
            integrated_tmp = scIB.metrics.measureTM(method, adata, batch, celltype)
        else:
            integrated_tmp = scIB.metrics.measureTM(method, adata, batch)

        integrated = integrated_tmp[2][0]
        integrated.uns['mem'] = integrated_tmp[0]
        integrated.uns['runtime'] = integrated_tmp[1]
    else:
        if celltype is not None:
            integrated = method(adata, batch, celltype)
        else:
            integrated = method(adata, batch)

    sc.write(outPath, integrated)
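# Minimal usage sketch (file paths and the no-op "integration" below are placeholders,
# not part of the original pipeline): `method` must be a callable with the signature
# method(adata, batch[, celltype]) that returns an integrated AnnData, and the module
# is assumed to define a `timing` flag.
timing = False  # assumed module-level flag used by runIntegration()

def _identity_integration(adata, batch):
    return adata.copy()

runIntegration("adata_preprocessed.h5ad", "adata_unintegrated.h5ad",
               method=_identity_integration, hvg=2000, batch="batch")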
def test_non_unique_names(self, adata: AnnData, path: Path, lin_key: str, _: int):
    names_key = _lin_names(lin_key)
    adata.uns[names_key][0] = adata.uns[names_key][1]

    sc.write(path, adata)

    with pytest.raises(ValueError):
        _ = cr.read(path)
def test_no_lineage(self, adata: AnnData, path: Path, lin_key: str, _: int):
    del adata.obsm[lin_key]

    sc.write(path, adata)
    adata_new = cr.read(path)

    assert adata_new is not adata  # sanity check
    assert lin_key not in adata_new.obsm.keys()
def prepare_and_load_edge2shoe(file_path,
                               restore=True,
                               save=True,
                               img_width=64,
                               img_height=64,
                               verbose=True):
    data_path = os.path.dirname(file_path)
    h5ad_path = os.path.join(data_path,
                             f"edges2shoes_{img_width}x{img_height}.h5ad")
    if restore and os.path.exists(h5ad_path):
        return sc.read(h5ad_path)

    tar = tarfile.open(file_path)
    images, edges = [], []
    counter = 0
    for member in tar.getmembers():
        if member.name.endswith(".jpg"):
            f = tar.extractfile(member)
            image = Image.open(f)
            # left half of the 512x256 image is the edge map, right half the photo
            edge, image = image.crop((0, 0, 256, 256)), image.crop((256, 0, 512, 256))
            edge = edge.resize((img_width, img_height), Image.BICUBIC)
            image = image.resize((img_width, img_height), Image.NEAREST)
            images.append(np.array(image))
            edges.append(np.array(edge))
            counter += 1
            if verbose and counter % 1000 == 0:
                print(counter)

    images = np.array(images)
    edges = np.array(edges)
    images = images.reshape(-1, np.prod(images.shape[1:]))
    edges = edges.reshape(-1, np.prod(edges.shape[1:]))

    data = anndata.AnnData(X=np.concatenate([images, edges], axis=0))
    data.obs['id'] = np.concatenate(
        [np.arange(images.shape[0]), np.arange(images.shape[0])])
    data.obs['condition'] = ['shoe'] * images.shape[0] + ['edge'] * images.shape[0]
    if save:
        sc.write(filename=h5ad_path, adata=data)
    return data
def save_adata(adata, filepath, ext='.h5ad', gcs=False):
    if gcs:
        temp = NamedTemporaryFile(suffix=ext, delete=False)
        temp.close()
        sc.write(temp.name, adata)
        subprocess.call('gsutil -m cp %s %s' % (temp.name, filepath), shell=True)
        subprocess.call('rm %s' % temp.name, shell=True)
    else:
        sc.write(filepath, adata)
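# Usage sketch (the bucket path is a placeholder): a local write calls sc.write
# directly, while gcs=True stages the object in a temporary file and copies it with
# `gsutil -m cp`, so the gsutil CLI must be available on PATH.
save_adata(adata, "results/pbmc_processed.h5ad")                   # local file
save_adata(adata, "gs://my-bucket/pbmc_processed.h5ad", gcs=True)  # GCS bucket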
def test_no_colors(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
    colors_key = _colors(lin_key)
    del adata.uns[colors_key]

    sc.write(path, adata)
    adata_new = cr.read(path)
    lins = adata_new.obsm[lin_key]

    assert isinstance(lins, Lineage)
    np.testing.assert_array_equal(lins.colors, _create_categorical_colors(n_lins))
    np.testing.assert_array_equal(lins.colors, adata_new.uns[colors_key])
def normalize_by_scanpy(adata, adata_path_filenames, exclude_highly_expressed=True, raw=False):
    """
    Normalize counts per spot (cell for scRNA-seq) with the scanpy function
    sc.pp.normalize_total(). If the target sum is 1e6, CPM normalisation is applied.
    By excluding highly expressed genes, lower expressed genes are weighted more
    strongly in the normalisation [Weinreb17].

    Parameters
    ----------
    adata : annData
    adata_path_filenames : str
    exclude_highly_expressed : bool
    raw : bool

    Returns
    -------
    adata : annData
        The count data has been normalized and log-transformed with an offset of 1.
        The offset of 1 ensures that zero counts map to zeros. Keep this data in the
        '.raw' part of the AnnData object as it will be used to visualize gene
        expression and perform statistical tests such as computing marker genes for
        clusters.
    """
    # Keep the count data in a counts layer
    adata.layers["counts"] = adata.X.copy()

    # returns a dictionary if inplace is False, otherwise updates adata
    x_norm = sc.pp.normalize_total(
        adata,
        inplace=False,
        exclude_highly_expressed=exclude_highly_expressed,
        target_sum=1e6)['X']
    adata.X = x_norm

    # log-transforms and updates adata
    sc.pp.log1p(adata)
    # modify resulting matrix
    adata.X = np.asarray(adata.X)

    # Store the full data set in 'raw' as log-normalised data for statistical testing
    adata.raw = adata

    # save adata object
    if raw:
        sc.write('{}_raw_QC_sizefactors.h5'.format(adata_path_filenames), adata=adata)
    else:
        sc.write('{}_QC_sizefactors.h5'.format(adata_path_filenames), adata=adata)

    return adata
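# Usage sketch (the path prefix is a placeholder): with target_sum=1e6 this performs
# CPM normalisation; excluding highly expressed genes keeps a handful of dominant
# genes from inflating the per-spot totals that all other genes are divided by.
adata = normalize_by_scanpy(adata, adata_path_filenames="output/sample1",
                            exclude_highly_expressed=True, raw=False)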
def test_normal_run(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
    colors = _create_categorical_colors(10)[-n_lins:]
    names = [f"foo {i}" for i in range(n_lins)]
    adata.uns[_colors(lin_key)] = colors
    adata.uns[_lin_names(lin_key)] = names

    sc.write(path, adata)
    adata_new = cr.read(path)
    lins_new = adata_new.obsm[lin_key]

    np.testing.assert_array_equal(lins_new.colors, colors)
    np.testing.assert_array_equal(lins_new.names, names)
def test_no_names(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
    names_key = _lin_names(lin_key)
    del adata.uns[names_key]

    sc.write(path, adata)
    adata_new = cr.read(path)
    lins = adata_new.obsm[lin_key]

    assert isinstance(lins, Lineage)
    np.testing.assert_array_equal(lins.names,
                                  [f"Lineage {i}" for i in range(n_lins)])
    np.testing.assert_array_equal(lins.names, adata_new.uns[names_key])
def test_wrong_names_length(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
    names_key = _lin_names(lin_key)
    adata.uns[names_key] = list(adata.uns[names_key])
    adata.uns[names_key] += ["foo", "bar", "baz"]

    sc.write(path, adata)
    adata_new = cr.read(path)
    lins = adata_new.obsm[lin_key]

    assert isinstance(lins, Lineage)
    np.testing.assert_array_equal(lins.names,
                                  [f"Lineage {i}" for i in range(n_lins)])
    np.testing.assert_array_equal(lins.names, adata_new.uns[names_key])
def test_writeable(self, adata: AnnData, interactions: Interactions_t, tmpdir):
    ligrec(adata, _CK, interactions=interactions, n_perms=5,
           copy=False, show_progress_bar=False, key_added="foo")
    res = adata.uns["foo"]

    sc.write(tmpdir / "ligrec.h5ad", adata)
    bdata = sc.read(tmpdir / "ligrec.h5ad")

    for key in ["means", "pvalues", "metadata"]:
        assert_frame_equal(res[key], bdata.uns["foo"][key])
def runPP(inPath, outPath, hvg, batch, rout, scale, seurat):
    """
    params:
        inPath: path of the anndata object
        outPath: path of the preprocessed file to be written
        hvg: number of highly variable genes to use
        batch: name of the `adata.obs` column holding the batch
        rout: set to true to save a Seurat object
        scale: set to true to activate scaling
        seurat: set to true to produce an HVG list instead of subsetting the object
    """
    adata = sc.read(inPath)
    hvgs = adata.var.index

    # remove HVG annotation if already precomputed
    if 'highly_variable' in adata.var:
        del adata.var['highly_variable']

    if hvg > 500:
        print("Computing HVGs ...")
        if seurat:
            hvgs = scib.preprocessing.hvg_batch(
                adata,
                batch_key=batch,
                target_genes=hvg,
                adataOut=False
            )
        else:
            adata = scib.preprocessing.hvg_batch(
                adata,
                batch_key=batch,
                target_genes=hvg,
                adataOut=True
            )

    if scale:
        print("Scaling data ...")
        adata = scib.preprocessing.scale_batch(adata, batch)

    if rout:
        print("Save as RDS")
        scib.preprocessing.saveSeurat(adata, outPath, batch, hvgs)
    else:
        print("Save as HDF5")
        sc.write(outPath, adata)
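# Usage sketch (paths are placeholders): with rout=False the preprocessed object is
# written as HDF5 via sc.write; with rout=True it is exported for Seurat instead.
runPP("adata_raw.h5ad", "adata_hvg2000_scaled.h5ad",
      hvg=2000, batch="batch", rout=False, scale=True, seurat=False)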
def normalize_by_sizefactor(adata, adata_path_filenames):
    """Normalise the count matrix using size factors

    The basic preprocessing assumes all size factors are equal (library-size
    normalisation to counts per million, CPM) and log-transforms the count data.

    Parameters
    ----------
    adata : annData
    adata_path_filenames : str

    Returns
    -------
    adata : annData
        The count data has been normalized and log-transformed with an offset of 1.
        The offset of 1 ensures that zero counts map to zeros. Keep this data in the
        '.raw' part of the AnnData object as it will be used to visualize gene
        expression and perform statistical tests such as computing marker genes for
        clusters.
    """
    # Keep the count data in a counts layer
    adata.layers["counts"] = adata.X.copy()

    # Normalize adata by the per-cell size factors
    adata.X /= adata.obs['size_factors'].values[:, None]

    # log-transform (Box-Cox transformation with lambda=0) and update adata:
    # count distributions follow a power law; after the transformation they are
    # closer to normal, which makes statistical tests easier to apply
    sc.pp.log1p(adata)
    # modify resulting matrix
    adata.X = np.asarray(adata.X)

    # Store the full data set in 'raw' as log-normalised data for statistical testing
    adata.raw = adata

    # save adata object
    sc.write('{}_QC_sizefactors.h5'.format(adata_path_filenames), adata=adata)

    return adata
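# normalize_by_sizefactor() expects a precomputed adata.obs['size_factors'] column.
# A minimal sketch that fills it with simple library-size factors (the original
# pipeline may compute them differently, e.g. with scran); the path prefix is a
# placeholder.
counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['size_factors'] = counts_per_cell / counts_per_cell.mean()
adata = normalize_by_sizefactor(adata, adata_path_filenames="output/sample1")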
def read_files(filename, sample):
    '''Import 10X data, based on filename (path to the matrix folder) and sample ID
    (assigned as unique ID).'''
    path = '%s/' % (filename)
    adata = sc.read(path + 'matrix.mtx', cache=True).transpose()
    try:
        adata.var_names = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
        adata.var['GeneName'] = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
        adata.var['GeneID'] = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 0]
    except OSError:
        # fall back to CellRanger v3 file names
        adata.var_names = np.genfromtxt(path + 'features.tsv.gz', dtype=str)[:, 1]
        adata.var['GeneName'] = np.genfromtxt(path + 'features.tsv.gz', dtype=str)[:, 1]
        adata.var['GeneID'] = np.genfromtxt(path + 'features.tsv.gz', dtype=str)[:, 0]
    adata.obs_names = np.genfromtxt(path + 'barcodes.tsv', dtype=str)
    adata.obs_names = [filename + "-" + x.strip("-1") for x in adata.obs_names]
    adata.obs['Sample'] = sample

    # calculate n_counts / n_genes per cell
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_genes'] = np.sum(adata.X > 0, axis=1)
    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['mito'] = (np.sum(adata.X[:, mito_genes], axis=1).A1) / (np.sum(adata.X, axis=1).A1)

    # filter cells
    clist = []
    clist.append(np.array(adata.obs['n_counts'] > 1000))
    clist.append(np.array(adata.obs['n_genes'] > 500))
    clist.append(np.array(adata.obs['n_genes'] < 7000))
    clist.append(np.array(adata.obs['mito'] < 0.5))
    c = np.column_stack(clist).all(axis=1)
    adata = adata[c].copy()

    sc.write('%s%s_filtered' % (version, sample), adata)
    return adata
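# Usage sketch (path, sample ID and the `version` prefix are placeholders): the
# function expects a 10X matrix folder containing matrix.mtx, genes.tsv (or
# features.tsv.gz) and barcodes.tsv, plus a module-level `version` string used in
# the output file name.
version = "v1_"
adata = read_files("data/sample01_filtered_feature_bc_matrix", sample="S01")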
def _create_dummy_adata(n_obs: int) -> AnnData:
    """
    Create a testing :class:`anndata.AnnData` object.

    Call this function to regenerate the above objects.

    Params
    ------
    n_obs
        Number of cells.

    Returns
    -------
    :class:`anndata.AnnData`
        The created adata object.
    """
    np.random.seed(42)
    adata = scv.datasets.toy_data(n_obs=n_obs)
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=1000)
    adata.raw = adata[:, 42:42 + 50].copy()
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.recover_dynamics(adata)
    scv.tl.velocity(adata, mode="dynamical")
    scv.tl.velocity_graph(adata, mode_neighbors="connectivities")
    scv.tl.latent_time(adata)

    adata.uns["iroot"] = 0
    sc.tl.dpt(adata)

    adata.uns["connectivity_variances"] = np.ones((n_obs, n_obs), dtype=np.float64)
    adata.uns["velocity_variances"] = np.ones((n_obs, n_obs), dtype=np.float64)

    sc.write(f"tests/_ground_truth_adatas/adata_{n_obs}.h5ad", adata)

    return adata
def main(configs, adata, save_folder):
    """Control which pre-processing steps shall be applied before downstream analysis,
    DGE analysis and pathway enrichment analysis

    Parameters
    ----------
    configs : configparser
        contains all parameters -> to add: thresholds and cut parameters
    adata : annData
    save_folder : str

    Returns
    -------
    norm_pp_adata : annData
    adata_filename : str
    """
    print("\n-------- Overview of data sets --------")
    if configs['data']['data_type'] == 'Spatial Transcriptomics':
        # 1.1 Add meta data like which samples belong to which donor (since 04.10.2020)
        adata, tissue_cell_labels, disease_labels, lesion_labels = add_metadata(adata)
        # 1.2 Remove spots having no tissue/cell labels (since 06.10.2020)
        adata = adata[np.where(adata.obs[tissue_cell_labels].to_numpy().any(axis=1))[0]]
        dataset_type = "st"
    else:
        dataset_type = "sc"

    # print info about sample 1
    sample_name = adata.obs['sample'].values[1]
    print("\nSample {} ".format(sample_name))
    print("Shape of filtered data set: ", adata.shape)
    print("Tissue associated No. of spots: ", adata.shape[0])
    print("Tissue associated No. of genes: ", adata.shape[1])
    print("Observables contained in data sets sorted by barcodes: ", adata.obs_keys())
    print("Variables contained in data sets sorted by gene names: ", adata.var_keys())

    if configs['data']['data_type'] == 'Spatial Transcriptomics':
        # plot spots on top of images (only for the first sample)
        plot_images(configs=configs, adata=adata, save_folder=save_folder)

    # 2. Pre-processing and visualization
    # apply the following steps 2.1 - 2.6 individually on each adata object
    print("\n-------- Start Pre-processing and Visualization --------")

    # 2.0
    # show the 20 highest expressed genes (HEG) in the data set and per sample
    determine_heg(adata=adata, save_folder=save_folder)

    print("\n Quality Control\n")
    # 2.1 QC (Quality control) of data - calculate QC covariates
    # 2.1.1 Cell QC
    # TODO Determine counts_threshold via Mean absolute deviation (MAD); find outliers :)
    adata_qc = sample_qc(adata=adata, save_folder=save_folder, counts_threshold=60000,
                         lower_filter_counts=2000, upper_filter_counts=2500,
                         upper_filter_genes=2000, log_scale=False,
                         raw=configs.getboolean("preprocessing", "read_raw_matrix"),
                         sample_name=sample_name)

    # 2.1.2 Threshold determination of UMI counts and genes
    cutted_adata, min_genes, min_shared_counts, mt_threshold, min_counts, max_counts, min_umi_genes, max_umi_genes = \
        apply_qc_filter(adata=adata_qc,
                        apply_mt_threshold=configs.getboolean("preprocessing", 'apply_mt_threshold'))

    if configs.getboolean('preprocessing', 'filter_doublets'):
        # 2.1.3 Filter out multiplets
        cutted_adata = doublet_detection.scrublet_algorithm(cutted_adata,
                                                            sample_name=sample_name,
                                                            save_folder=save_folder)

    # save QC adata object
    adata_filename = '{}_QC.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename), cutted_adata)

    # 2.2 Normalization
    print("\n Normalization\n")
    norm_adata = apply_normalisation(
        adata=cutted_adata, save_folder=save_folder,
        norm_type=configs['preprocessing']['normalisation_function'],
        exclude_highly_expressed=configs.getboolean("preprocessing", "exclude_highly_expressed"),
        raw=configs.getboolean("preprocessing", "read_raw_matrix"),
        adata_path_filenames=configs["data"]['output_path'])

    # save QC and normed adata object
    adata_filename = '{}_QC_normed.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename), norm_adata)

    # TODO plot normalised count data distribution

    # ------------------------------------------------ Optional ---------------------------------------------------- #
    # 2.2.1 Scale data
    if configs.getboolean("preprocessing", "apply_scaling"):
        norm_adata = scaling_and_regression.scaling(adata=norm_adata)

    # 2.2.2 Regress out uninteresting variation
    """ATTENTION:
    Regressing out biological covariates is generally done to isolate particular processes in the data that you are
    interested in, while losing global structure in the data.
    Cell cycle stage can be a major determinant in the difference between two cell types
    (e.g. stem cells and proliferating cells like transit amplifying cells).
    Removing this effect hides the distinction."""
    if configs.getboolean("preprocessing", "apply_remove_cc_effect"):
        norm_adata = scaling_and_regression.apply_regression_variables(adata=norm_adata)
    # ---------------------------------------------------------------------------------------------------------------- #

    print("\n Visualisation\n")
    # 2.3.1 Highly Variable Genes (HVG) for feature selection
    # HVG: highly expressed in some cells and lowly expressed in others
    norm_adata = highly_variable_genes.find_highly_variable_genes(
        norm_adata, type_dataset="uncorrected", save_folder=save_folder, num_top_genes=4000,
        raw=configs.getboolean("preprocessing", "read_raw_matrix"))

    # 2.3.2 Determine No. PCs
    pc_determination.pcs_combs(norm_adata, save_folder, use_highly_variable=True, copy=False,
                               return_info=False,
                               raw=configs.getboolean("preprocessing", "read_raw_matrix"),
                               type_dataset="uncorrected")

    # 2.3.3 Visualization
    try:
        n_comps = int(input("Please provide the No. principal components (default 50): "))
    except ValueError:
        n_comps = int(50)
    norm_adata = calculate_visualizations.calc_visualization(
        norm_adata, save_folder=save_folder,
        raw=configs.getboolean("preprocessing", "read_raw_matrix"),
        n_comps=n_comps, batch_key="uncorrected")

    if configs.getboolean("preprocessing", "get_cc_effect"):
        print("\n Cell Cycle\n")
        # 2.4 Cell cycle scoring
        # todo find another .csv file with cell cycle phases
        norm_adata = cell_cycle_storing.score_cell_cycle(
            cc_genes_file=configs['input_files']['cell_cycle'], adata=norm_adata,
            save_folder=save_folder,
            raw=configs.getboolean("preprocessing", "read_raw_matrix"))

    # 2.5 Apply Batch correction if samples are from the same (or different) data set but split into batches
    # Dr. Maren Buettner:
    # "During the QC step, we observed differences across samples, for instance in the library size per dataset.
    #  Such differences may contribute to the batch effect."
    if configs.getboolean("preprocessing", "sample_concat"):
        print("\n Batch Correction\n")
        norm_bc_adata = batch_correction.apply_batch_correction(
            norm_adata, save_folder=save_folder, n_comps=n_comps, batch_key='sample',
            possible_batch_effects=['project', 'phase', 'patient', 'disease', 'biopsy_type'])

        # 2.5.1 Run find highly variable genes again on the integrated dataset
        # HVG: highly expressed in some cells and lowly expressed in others
        norm_pp_adata = highly_variable_genes.find_highly_variable_genes(
            norm_bc_adata, type_dataset="batch_corrected", save_folder=save_folder,
            num_top_genes=4000, raw=configs.getboolean("preprocessing", "read_raw_matrix"))

        # Actually already calculated in the batch correction functions ..
        # 2.5.2 Determine No. PCs
        pc_determination.pcs_combs(norm_pp_adata, save_folder, type_dataset="batch_corrected",
                                   use_highly_variable=True, copy=False, return_info=False,
                                   raw=configs.getboolean("preprocessing", "read_raw_matrix"))

        # 2.5.3 Visualisation
        n_comps = int(input("Please provide the No. principal components (default 50): "))
        norm_pp_adata = calculate_visualizations.calc_visualization(
            norm_pp_adata, save_folder=save_folder,
            raw=configs.getboolean("preprocessing", "read_raw_matrix"),
            n_comps=n_comps, batch_key="batch_corrected")
    else:
        norm_pp_adata = norm_adata.copy()

    adata_filename = '{}_QC_normed_BC.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename), norm_pp_adata)

    plots_preprocessing.plot_visualization_results(
        adata=norm_pp_adata, save_folder=save_folder, batch_key="batch_corrected",
        raw=configs.getboolean("preprocessing", "read_raw_matrix"))

    print("-------- Finished: Pre-processing and Visualization --------")

    print("Start storing pre-processed AnnData object")
    # 2.7 save pre-processed annData object
    # transform float e.g. 0.25 -> 0_25
    mt_cut_splitted = str(mt_threshold).split(".")
    mt_cut = mt_cut_splitted[0] + str("_") + mt_cut_splitted[1]

    if max_umi_genes == 0:
        # save pre-processed annData object
        filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig_{}'.format(
            dataset_type, min_counts, max_counts, min_genes, min_shared_counts, mt_cut,
            min_umi_genes)
    else:
        # save pre-processed annData object
        filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig{}_maxumig_{}'.format(
            dataset_type, min_counts, max_counts, min_genes, min_shared_counts, mt_cut,
            min_umi_genes, max_umi_genes)

    adata_filename = '{}_pp.h5'.format(filter_name)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename), norm_pp_adata)

    return norm_pp_adata, adata_filename
if args.densify:
    input_counts = sc.AnnData(X=input_counts.values,
                              obs=pd.DataFrame(index=input_counts.index),
                              var=pd.DataFrame(index=input_counts.columns))
else:
    input_counts = sc.AnnData(X=sp.csr_matrix(input_counts.values),
                              obs=pd.DataFrame(index=input_counts.index),
                              var=pd.DataFrame(index=input_counts.columns))

if sp.issparse(input_counts.X) & args.densify:
    input_counts.X = np.array(input_counts.X.todense())

if args.tpm is None:
    tpm = compute_tpm(input_counts)
    sc.write(cnmf_obj.paths['tpm'], tpm)
elif args.tpm.endswith('.h5ad'):
    subprocess.call('cp %s %s' % (args.tpm, cnmf_obj.paths['tpm']), shell=True)
    tpm = sc.read(cnmf_obj.paths['tpm'])
else:
    if args.tpm.endswith('.npz'):
        tpm = load_df_from_npz(args.tpm)
    else:
        tpm = pd.read_csv(args.tpm, sep='\t', index_col=0)

    if args.densify:
        tpm = sc.AnnData(X=tpm.values,
                         obs=pd.DataFrame(index=tpm.index),
                         var=pd.DataFrame(index=tpm.columns))
    else:
        tpm = sc.AnnData(X=sp.csr_matrix(tpm.values),
                         obs=pd.DataFrame(index=tpm.index),
                         var=pd.DataFrame(index=tpm.columns))
k = cr.tl.transition_matrix(
    adata,
    weight_connectivities=0.2,
    mode="stochastic",
    n_jobs=n_jobs,
    softmax_scale=None,
    show_progress_bar=False,
)
g = cr.tl.estimators.GPCCA(k)
g.compute_schur(20)
g.compute_macrostates(9, cluster_key="clusters")
g.set_terminal_states_from_macrostates(["Alpha", "Beta", "Epsilon", "Delta"])

sc.write(ROOT / "adata_preprocessed.h5ad", adata)
g.terminal_states.to_csv(ROOT / "terminal_states.csv")

terminal_states = g.terminal_states


def compute_abs_probs(
    ixs: np.ndarray,
    adata: AnnData,
    terminal_states: pd.Series,
    c: cr.tl.kernels.ConnectivityKernel,
):
    res = []
    for i in ixs:
        try:
            conn = c.copy()
def prepare_and_load_celeba(file_path,
                            attr_path,
                            landmark_path,
                            gender='Male',
                            attribute='Smiling',
                            max_n_images=None,
                            restore=True,
                            save=True,
                            img_width=64,
                            img_height=78,
                            verbose=True):
    data_path = os.path.dirname(file_path)
    zip_filename = os.path.basename(file_path).split(".")[0]
    h5ad_path = os.path.join(
        data_path, f"celeba_{attribute}_{img_width}x{img_height}_{max_n_images}.h5ad")
    if restore and os.path.exists(h5ad_path):
        return sc.read(h5ad_path)

    def load_attr_list(file_path):
        indices = []
        attributes = []
        with open(file_path) as f:
            lines = f.read().splitlines()
            columns = lines[1].split(" ")
            columns.remove('')
            for i in range(2, len(lines)):
                elements = lines[i].split()
                indices.append(elements[0])
                attributes.append(list(map(int, elements[1:])))
        attr_df = pd.DataFrame(attributes)
        attr_df.index = indices
        attr_df.columns = columns
        if verbose:
            print(attr_df.shape[0])
        return attr_df

    def load_landmark_list(file_path):
        indices = []
        landmarks = []
        with open(file_path) as f:
            lines = f.read().splitlines()
            columns = lines[1].split(" ")
            for i in range(2, len(lines)):
                elements = lines[i].split()
                indices.append(elements[0])
                landmarks.append(list(map(int, elements[1:])))
        landmarks_df = pd.DataFrame(landmarks)
        landmarks_df.index = indices
        landmarks_df.columns = columns
        print(landmarks_df.shape[0])
        return landmarks_df

    images = []
    zfile = zipfile.ZipFile(file_path)
    counter = 0
    attr_df = load_attr_list(attr_path)
    landmarks = load_landmark_list(landmark_path)

    # keep only images with sufficiently separated facial landmarks
    landmarks = landmarks[abs(landmarks['lefteye_x'] - landmarks['righteye_x']) > 30]
    landmarks = landmarks[abs(landmarks['lefteye_x'] - landmarks['nose_x']) > 15]
    landmarks = landmarks[abs(landmarks['righteye_x'] - landmarks['nose_x']) > 15]

    attr_df = attr_df.loc[landmarks.index]
    print("# of images after preprocessing: ", attr_df.shape[0])

    indices = []
    for filename in attr_df.index.tolist():
        ifile = zfile.open(os.path.join(f"{zip_filename}/", filename))
        image = Image.open(ifile)
        image_landmarks = landmarks.loc[filename]

        # crop the face around the eye and mouth landmarks
        most_left_x = max(0, min(image_landmarks['lefteye_x'], image_landmarks['leftmouth_x']) - 15)
        most_right_x = min(178, min(image_landmarks['righteye_x'], image_landmarks['rightmouth_x']) + 15)
        most_up_y = max(0, image_landmarks['lefteye_y'] - 35)
        most_down_y = min(218, image_landmarks['rightmouth_y'] + 25)

        image_cropped = image.crop((most_left_x, most_up_y, most_right_x, most_down_y))
        image_cropped = image_cropped.resize((img_width, img_height), Image.NEAREST)
        image = np.reshape(image_cropped, (img_width, img_height, 3))

        if max_n_images is None:
            images.append(image)
            indices.append(filename)
            counter += 1
            if verbose and counter % 1000 == 0:
                print(counter)
        else:
            if counter < max_n_images:
                images.append(image)
                indices.append(filename)
                counter += 1
                if verbose and counter % 1000 == 0:
                    print(counter)
            else:
                break

    images = np.array(images)
    if verbose:
        print(images.shape)
    images_df = pd.DataFrame(images.reshape(-1, np.prod(images.shape[1:])))
    images_df.index = indices

    data = anndata.AnnData(X=images_df.values)
    attr_df = attr_df.loc[images_df.index]
    print(data.shape, attr_df.shape)
    data.obs['labels'] = attr_df[gender].values
    data.obs['condition'] = attr_df[attribute].values
    if save:
        sc.write(filename=h5ad_path, adata=data)
    return data
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
groupby = "donor_sex" if "donor_sex" in a.obs.columns else "leiden"
sc.pl.violin(a, groupby=groupby, keys="sex_ratio", ax=ax, show=False)
ax.axhline(0, linestyle="--", color="grey")
fig.savefig(
    output_prefix + ".single_cell.sex_ratio.svg",
    dpi=300,
    bbox_inches="tight",
)

# Make sure plotting order is random
a = a[np.random.choice(a.obs.index.tolist(), a.obs.shape[0], replace=False), :]

if not os.path.exists(output_prefix + ".filtered.h5ad"):
    sc.write(output_prefix + ".filtered.h5ad", a)
a = sc.read(output_prefix + ".filtered.h5ad")

sc.pl.pca_variance_ratio(a, log=True, show=False)
plt.gca().figure.savefig(
    output_prefix + ".single_cell.pca_variance_ratio.svg",
    dpi=300,
    bbox_inches="tight",
)

fig = sc.pl.pca(
    a,
    color=tech_attributes + attributes,
    components=["1,2", "2,3", "3,4", "4,5"],
    return_fig=True,
    cbar_kws={"label": "log difference from normal"}, rasterized=True)
clustermap_rasterize_heatmap(g)
clustermap_fix_label_orientation(g)
clustermap_rasterize_dendrogram(g)
savefig(g, os.path.join("results",
                        prefix + ".dca_denoised-zinb.CNVs.whole_genome.grouped.svg"))

cnv = sc.AnnData(p)
cnv.obs = adata.obs
sc.pp.pca(cnv)
sc.pp.neighbors(cnv)
sc.tl.umap(cnv)

cnv.obs.to_csv(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.obs.csv")
cnv.obs = pd.DataFrame(index=cnv.obs.index)
sc.write(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.h5ad", cnv)

prefix = "cll-time_course-scRNA-seq.all_samples.250-50_filter"
cnv = sc.read(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.h5ad")
cnv.obs = pd.read_csv(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.obs.csv",
                      index_col=0)

c = (
    pd.DataFrame(cnv.X, index=cnv.obs.index, columns=cnv.var.index)
    .T
    .join(gene_order.set_index("gene")))
c = natsort_dataframe(c, ['chr', 'start'])

c_cll = c.loc[:, cnv.obs.loc[cnv.obs['cell_type'] == "CLL", :].index]

chosen_cells = list()
def save_adata(adata, suffix="", subdir=""):
    filename = f"{adata.uns['sampleid']}{'-' + suffix if suffix else ''}-{timestamp()}.h5ad"
    sc.write(Path(adata.uns["output_dir"]) / subdir / filename, adata)
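# Usage sketch (values are placeholders): the function assumes adata.uns carries
# 'sampleid' and 'output_dir' and writes e.g. results/qc/sample01-filtered-<timestamp>.h5ad.
adata.uns["sampleid"] = "sample01"
adata.uns["output_dir"] = "results"
save_adata(adata, suffix="filtered", subdir="qc")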
def save_norm_counts(self, norm_counts):
    self._initialize_dirs()
    sc.write(self.paths['normalized_counts'], norm_counts)
def plot_spring(adata,
                smp=None,
                names=None,
                comps='1,2',
                cont=None,
                layout='2d',
                legendloc='right margin',
                cmap=None,
                pal=None,
                right_margin=None,
                size=3):
    """
    Scatter plots.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    smp : str, optional (default: first annotation)
        Sample/Cell annotation for coloring in the form "ann1,ann2,...". String
        annotation is plotted assuming categorical annotation, float and integer
        annotation is plotted assuming continuous annotation. Option 'cont' allows
        to switch between these default choices.
    names : str, optional (default: all names in smp)
        Allows to restrict groups in sample annotation (smp) to a few.
    comps : str, optional (default: '1,2')
        String in the form '1,2,3'.
    cont : bool, None (default: None)
        Switch on continuous layout, switch off categorical layout.
    layout : {'2d', '3d', 'unfolded 3d'}, optional (default: '2d')
        Layout of plot.
    legendloc : see matplotlib.legend, optional (default: 'lower right')
        Options for keyword argument 'loc'.
    cmap : str (default: 'viridis')
        String denoting matplotlib color map.
    pal : list of str (default: matplotlib.rcParams['axes.prop_cycle'].by_key()['color'])
        Colors cycle to use for categorical groups.
    right_margin : float (default: 0.2)
        Adjust how far the plotting panel extends to the right.
    size : float (default: 3)
        Point size.
    """
    Y = adata['X_spring']
    if True:
        # sett.m(0, 'set parameter add_steps > 0 to iterate. '
        #        'the current step is', dspring['istep'],
        #        '\n--> append, for example, "--plotparams add_steps 1", for a single step')
        from .. import plotting as plott
        smps = plott.scatter(
            adata,
            basis='spring',
            smp=smp,
            names=names,
            comps=comps,
            cont=cont,
            layout=layout,
            legendloc=legendloc,
            cmap=cmap,
            pal=pal,
            right_margin=right_margin,
            size=size,  # defined in plotting
            titles=['Fruchterman-Reingold step: 12'])
        writekey = sett.basekey + '_spring'
        writekey += '_' + ('-'.join(smps) if smps[0] is not None else '') + sett.plotsuffix
        plott.savefig(writekey)
        if not sett.savefigs and sett.autoshow:
            from ..compat.matplotlib import pyplot as pl
            pl.show()
    else:
        # unreachable branch (guarded by `if True` above); kept as in the original
        Adj = dspring['Adj']
        istep = dspring['istep']
        # TODO: don't save the adjacency matrix!!!
        import scanpy as sc
        sc.write(dspring['writekey'] + '_step{:02}'.format(istep), dspring)
        # compute the next steps
        istep_init = istep + 1
        add_steps = params['add_steps']
        del params['add_steps']
        for istep in istep_init + np.arange(add_steps, dtype=int):
            sett.mt(0, 'compute Fruchterman-Reingold layout: step', istep)
            Y = fruchterman_reingold_layout(Adj, Yinit=Y, iterations=step_size)
            sett.mt(0, 'finished computation')
            _plot({'Y': Y}, adata, istep, **params)
        # save state of Y to outfile
        dspring['Y'] = Y
        dspring['istep'] = istep
        sc.write(dspring['writekey'], dspring)
        cum = df[col]
    else:
        cum += df[col]
fig.savefig(
    prefix + f"patient_{pat}.global_projection.stacked_bar_by_{var}.svg",
    **figkws,
)

# Compare with newly designed space
sc.pp.combat(p, "processing_batch_categorical")
sc.pp.pca(p)
sc.pp.neighbors(p, n_neighbors=50)
sc.tl.umap(p)
sc.tl.leiden(p, resolution=0.5, key_added="cluster")
sc.write(prefix + f"{pat}.own_projection.processed.h5ad", p)

n_cols = max(n_cols, p.shape[1]) + 1
fig, axes = plt.subplots(2, n_cols, figsize=(n_cols * 4, 2 * 4))
for i, ch in enumerate(p.var.index):
    sc.pl.umap(p, color=[ch], ax=axes[0, i], show=False)
k = dict(show=False, size=4)
sc.pl.umap(p, color=["cluster"], cmap="rainbow", ax=axes[1, 0], **k)
sc.pl.umap(p, color=["time_symptoms"], cmap="rainbow", ax=axes[1, 1], **k)
for i, time in enumerate(times, 2):
    p.obs["plot"] = (p.obs["time_symptoms"] == time).astype(float)
    print(p.obs["plot"].sum())
    sc.pl.umap(
        p, color=["plot"], cmap="Reds", ax=axes[1, i], vmin=-0.25, **k
    )
    # save pre-processed annData object
    filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig{}_maxumig_{}'.format(
        dataset_type, min_counts, max_counts, min_genes, min_shared_counts, mt_cut,
        min_umi_genes, max_umi_genes)

    adata_filename = '{}_pp.h5'.format(filter_name)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename), norm_pp_adata)

    return norm_pp_adata, adata_filename


if __name__ == '__main__':
    output_path = os.path.join("..", "..", "output", str(date.today()))
    os.makedirs(output_path, exist_ok=True)

    adata_savepath = init_variables.init_vars()
    configs_file = ht.load_config(config_path=adata_savepath)

    # 1. Load data
    print("# -- >Load data and information< -- #")
    _, unpp_filtered_adata, _, _ = load_dataset(configs=configs_file)

    # save adata
    unppadata_filename = '{}_unpp.h5'.format(configs_file['data']['data_type'])
    sc.write(os.path.join(adata_savepath, unppadata_filename), unpp_filtered_adata)
    print("-------- Finished: Read out values --------")

    pp_adata, filename_adata = main(configs=configs_file,
                                    adata=unpp_filtered_adata,
                                    save_folder=output_path)
def run_NMF(sample_name, matrix_10X, threads, K_range, K_selection,
            density_threshold, iteration, run_K):
    outdir = "NMF_out/"  # used by both the preparation and the consensus steps
    if not run_K:
        logging.info("reading matrix")
        adata = sc.read_10x_mtx(matrix_10X, var_names='gene_symbols', cache=False)
        adata.var_names_make_unique()

        if not os.path.exists(outdir):
            os.mkdir(outdir)
        count_adat_fn = outdir + sample_name + '.h5ad'
        logging.info("writing h5 file")
        sc.write(count_adat_fn, adata)

        # Number of NMF replicates. Set this to a larger value ~200 for real data.
        # We set this to a relatively low value here for illustration at a faster speed.
        numiter = iteration
        # Number of over-dispersed genes to use for running the actual factorizations
        numhvgenes = 2000
        # Results will be saved to [output_directory]/[run_name]
        seed = 0  # Specify a seed for pseudorandom number generation for reproducibility
        numworkers = threads

        prepare_cmd = """python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
prepare --output-dir %s --name %s -c %s -k %s --n-iter %d \
--total-workers %d --seed %d --numgenes %d --beta-loss frobenius""" % (
            outdir, sample_name, count_adat_fn, K_range, numiter, numworkers, seed, numhvgenes)
        logging.info('Prepare command assuming parallelization with %d cores:\n%s'
                     % (numworkers, prepare_cmd))
        os.system(prepare_cmd)

        # Using GNU parallel
        worker_index = ' '.join([str(x) for x in range(numworkers)])
        factorize_cmd = """parallel python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
factorize --output-dir %s --name %s --worker-index {} ::: %s""" % (
            outdir, sample_name, worker_index)
        logging.info('Factorize command to simultaneously run factorization over %d cores using GNU parallel:\n%s'
                     % (numworkers, factorize_cmd))
        os.system(factorize_cmd)

        # combine
        combine_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
combine --output-dir %s --name %s' % (outdir, sample_name)
        logging.info(combine_cmd)
        os.system(combine_cmd)

        worker_index = ' '.join([str(x) for x in range(numworkers)])
        kselect_plot_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
k_selection_plot --output-dir %s --name %s' % (outdir, sample_name)
        logging.info('K selection plot command: %s' % kselect_plot_cmd)
        os.system(kselect_plot_cmd)

    # run_K
    consensus_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
consensus --output-dir %s --name %s --local-density-threshold %.2f \
--components %d --show-clustering' % (outdir, sample_name, density_threshold, K_selection)
    logging.info('Consensus command for K=%d:\n%s' % (K_selection, consensus_cmd))
    os.system(consensus_cmd)

    # Load the Z-scored GEPs, which reflect how enriched a gene is in each GEP relative to all of the others
    density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')
    gene_file = '{outdir}/{sample_name}/{sample_name}.gene_spectra_score.k_{K_selection}.dt_{density_threshold_str}.txt'.format(
        outdir=outdir, sample_name=sample_name, K_selection=K_selection,
        density_threshold_str=density_threshold_str)
    gene_scores = pd.read_csv(gene_file, sep='\t', index_col=0).T

    # Obtain the top 100 genes for each GEP in sorted order and combine them into a single dataframe
    top_genes = []
    ngenes = 100
    for gep in gene_scores.columns:
        top_genes.append(list(gene_scores.sort_values(by=gep, ascending=False).index[:ngenes]))
    top_genes = pd.DataFrame(top_genes, index=gene_scores.columns).T
    top_genes_file = '{outdir}/{sample_name}_top100_genes.tsv'.format(
        outdir=outdir, sample_name=sample_name)
    top_genes.to_csv(top_genes_file, sep="\t")

    usage_file = '{outdir}/{sample_name}/{sample_name}.usages.k_{K_selection}.dt_{density_threshold_str}.consensus.txt'.format(
        outdir=outdir, sample_name=sample_name, K_selection=K_selection,
        density_threshold_str=density_threshold_str)
    logging.info("usage_file:" + usage_file)
    usage = pd.read_csv(usage_file, sep='\t', index_col=0)
    usage.columns = ['Usage_%s' % i for i in usage.columns]
    usage_norm = usage.div(usage.sum(axis=1), axis=0)
    usage_norm_file = '{outdir}/{sample_name}_usage_norm.tsv'.format(
        outdir=outdir, sample_name=sample_name)
    usage_norm.to_csv(usage_norm_file, sep="\t")
def write(adata, version, name):
    '''Write adata to the file [version + name].'''
    name = version + name
    sc.write(name, adata)
    print("_".join(name.split(".")) + " = '%s'" % name)
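# Usage sketch (file name and version prefix are placeholders): writes 'v1_pbmc.h5ad'
# and prints a variable assignment that records the file name for later scripts.
write(adata, version="v1_", name="pbmc.h5ad")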
data_doublets = os.path.join(sc.settings.writedir, '..', 'doublets')
if not os.path.exists(data_doublets):
    os.makedirs(data_doublets)
for key in doublet_scores:
    np.savetxt(os.path.join(data_doublets, key + '_doublet_scores.txt'),
               doublet_scores[key])

doublet_scores_list = []
for key in doublet_scores:
    # print(key)
    doublet_scores_list += list(doublet_scores[key])
data.obs['doublet_scores'] = doublet_scores_list
len(doublet_scores_list)

predicted_doublets_mask = []
for key in predicted_doublets:
    # print(key)
    predicted_doublets_mask += list(predicted_doublets[key])
len(predicted_doublets_mask)

predicted_singletons_mask = [not i for i in predicted_doublets_mask]
data = data[np.array(predicted_singletons_mask), :].copy()
print('Removing %d cells due to doublet scoring'
      % (len(predicted_singletons_mask) - sum(predicted_singletons_mask)))

sc.write('SLX19841_LD_filtered_gene_bc_expression_minus_putative_doublets', data)
data_doublets = os.path.join(sc.settings.writedir, 'doublets')
if not os.path.exists(data_doublets):
    os.makedirs(data_doublets)
for key in doublet_scores:
    np.savetxt(os.path.join(data_doublets, key + '_doublet_scores.txt'),
               doublet_scores[key])

doublet_scores_list = []
for key in doublet_scores:
    doublet_scores_list += list(doublet_scores[key])
data.obs['doublet_scores'] = doublet_scores_list

# create the boolean mask to filter out predicted doublets
predicted_doublets_mask = []
for key in predicted_doublets:
    predicted_doublets_mask += list(predicted_doublets[key])
predicted_singletons_mask = [not i for i in predicted_doublets_mask]

data = data[np.array(predicted_singletons_mask), :].copy()
print('Removing %d cells due to doublet scoring'
      % (len(predicted_singletons_mask) - sum(predicted_singletons_mask)))

sc.write('SLX14831_12978_filtered_gene_bc_expression_minus_putative_doublets', data)