# Load the feature matrix (the `hvftrs` files) for modality `mod`, store it in
# `gxc_hvftrs`, and check that its cells are ordered like the metadata table.
# NOTE(review): the original ended the 'mc' branch with `continue`, so these
# statements presumably run once per modality inside a loop whose header is
# outside this view. The branch is written as if/else here, which skips exactly
# the same statements -- confirm indentation against the enclosing loop.
print(mod)
ti = time.time()

if settings[mod].mod_category == 'mc':
    # 'mc' modalities are stored as a tab-separated table and loaded with
    # pandas (first column as index, first row as header).
    f_mat = hvftrs_f.format(mod, 'tsv')
    gxc_hvftrs[mod] = pd.read_csv(f_mat, sep='\t', header=0, index_col=0)
    print(gxc_hvftrs[mod].shape, time.time() - ti)

    # Make sure cell names are in the same order as metas
    # (original author's note: important if the knn matrix is saved).
    assert np.all(
        gxc_hvftrs[mod].columns.values == metas[mod].index.values
    )
else:
    # Every other modality is stored as an npz matrix with separate gene and
    # cell files, read through the project loader.
    f_mat = hvftrs_f.format(mod, 'npz')
    f_gene = hvftrs_gene.format(mod)
    f_cell = hvftrs_cell.format(mod)
    _gxc_tmp = snmcseq_utils.load_gc_matrix(f_gene, f_cell, f_mat)

    # Re-wrap the loaded fields in a GC_matrix.
    gxc_hvftrs[mod] = GC_matrix(_gxc_tmp.gene, _gxc_tmp.cell, _gxc_tmp.data)

    # Make sure cell names are in the same order as metas
    # (original author's note: important if the knn matrix is saved).
    assert np.all(
        gxc_hvftrs[mod].cell == metas[mod].index.values
    )
    print(gxc_hvftrs[mod].data.shape, time.time() - ti)

# List of resolution values; how it is consumed is not visible in this view.
resolutions = [
    0.01, 0.02, 0.05,
    0.1, 0.2, 0.5,
    1, 2, 3, 4, 6, 8,
    12, 16, 20, 30, 40, 60, 80,
    100, 120,
]
# ns = [1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000]
# Read the metadata table and the raw count matrix for modality `mod`, record
# them in `metas` / `gxc_raws`, and run basic consistency checks.
# NOTE(review): `mod` and `ti` are set outside this view (presumably by an
# enclosing per-modality loop) -- confirm.
print(mod)

## read data
# read metadata
normalization_option = normalization_options[mod]
f_meta = f_meta_format.format(SRC_DIR, mod)  ##
meta = pd.read_csv(f_meta, sep="\t", index_col=0)
metas[mod] = meta

# Paths of the matrix itself and of its gene / cell label files.
f_data, f_data_gene, f_data_cell = (
    f_data_format.format(SRC_DIR, mod, '', suffix)
    for suffix in ('npz', 'gene', 'cell')
)

# read counts matrix
print(mod, "Reading in files {}".format(time.time()-ti))
# Original author's note: dimensions are checked for agreement internally.
gxc_raw = snmcseq_utils.load_gc_matrix(f_data_gene, f_data_cell, f_data)
gxc_raws[mod] = gxc_raw

# Total of the matrix divided by the number of metadata rows.
num_cells = len(meta)
num_reads = gxc_raw.data.sum().sum()/num_cells
num_reads_all[mod] = num_reads
print(gxc_raw.data.shape, num_cells, num_reads)

# check meta cells agree with gxc cells
assert np.all(meta.index.values == gxc_raw.cell)
# check genes are unique
assert len(gxc_raw.gene) == len(np.unique(gxc_raw.gene))

print(mod, "Total time used: {}".format(time.time()-ti))
gxc_raws = collections.OrderedDict() for mod in mods_selected: logging.info("Read data {}...".format(mod)) if settings[mod].mod_category == 'mc': f_gene = raw_f.format(DATA_DIR, mod, '', 'gene') f_cell = raw_f.format(DATA_DIR, mod, '', 'cell') f_data_c = raw_f.format(DATA_DIR, mod, 'CH_', 'npz') f_data_mc = raw_f.format(DATA_DIR, mod, 'mCH_', 'npz') gxc_raws[mod] = snmcseq_utils.load_gc_matrix_methylation( f_gene, f_cell, f_data_mc, f_data_c) else: f_gene = raw_f.format(DATA_DIR, mod, '', 'gene') f_cell = raw_f.format(DATA_DIR, mod, '', 'cell') f_data = raw_f.format(DATA_DIR, mod, '', 'npz') gxc_raws[mod] = snmcseq_utils.load_gc_matrix(f_gene, f_cell, f_data) # In[13]: f = output_clst_and_umap first_round_cluster_col = 'cluster_joint_r0.1' df_info = pd.read_csv( f, sep="\t", index_col='sample')[[first_round_cluster_col, 'modality']] print(df_info.shape) df_info.head() # In[20]: normalization_options = { 'smarter_nuclei': 'TPM', 'smarter_cells': 'TPM',