def collect_cells(self, out_file: str) -> None:
    # Verify that the previous punchcard subset exists
    parent = os.path.join(self.config.paths.build, "data", self.subset.card.name + ".loom")
    if not os.path.exists(parent):
        logging.error(f"Punchcard file '{parent}' was missing.")
        sys.exit(1)
    # Verify that there are some cells in the subset
    with loompy.connect(parent, mode="r") as ds:
        if (ds.ca.Subset == self.subset.name).sum() == 0:
            logging.info(f"Skipping {self.name} because the subset was empty")
            sys.exit(0)
    logging.info(f"Collecting cells for {self.name}")
    with loompy.new(out_file) as dsout:
        # Collect from the previous punchcard subset
        with loompy.connect(parent, mode="r") as ds:
            for (_, _, view) in ds.scan(
                    items=(ds.ca.Subset == self.subset.name),  # boolean mask over columns
                    axis=1,
                    key="Accession",
                    layers=["", "spliced", "unspliced"],
                    what=["layers", "col_attrs", "row_attrs"]):
                dsout.add_columns(view.layers, view.ca, row_attrs=view.ra)
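
# The scan/add_columns pattern above streams a column subset into a new file
# without loading the full matrix into memory. A standalone sketch of the same
# pattern (the file names and the "MySubset" value are hypothetical):
import loompy
import numpy as np

with loompy.connect("build/data/Root.loom", mode="r") as ds:
    selected = np.where(ds.ca.Subset == "MySubset")[0]  # explicit column indices
    with loompy.new("subset.loom") as dsout:
        for (_, _, view) in ds.scan(items=selected, axis=1):
            dsout.add_columns(view.layers, view.ca, row_attrs=view.ra)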
def create_subsetted_loom(loom, output_loom_filename, cellmask):
    """Deprecated. Will create a new loom file with cells specified according to a Boolean vector mask.

    Parameters
    ----------
    loom : LoomConnection object which will be subsetted
    output_loom_filename : string denoting the path and filename of the output loom file
    cellmask : Boolean numpy vector with length equal to the number of cells in "loom"

    Returns
    -------
    None
    """
    import loompy
    import numpy as np

    if len(cellmask) != loom.shape[1]:
        raise Exception(
            "cellmask must be a boolean mask with length equal to the number of columns of loom"
        )
    with loompy.new(output_loom_filename) as dsout:
        cells = np.where(cellmask)[0]
        for (ix, selection, view) in loom.scan(items=cells, axis=1, key="gene"):
            dsout.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)
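
# A minimal usage sketch for create_subsetted_loom (assumes a pre-existing
# "input.loom" with an "Age" column attribute; both are hypothetical here):
import loompy

with loompy.connect("input.loom") as ds:
    mask = ds.ca["Age"] < 30  # boolean mask over columns (cells)
    create_subsetted_loom(ds, "young_cells.loom", mask)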
def test_new() -> None:
    with loompy.new("test.loom") as ds:
        m = np.zeros((20, 100))
        ra = {"Gene": [x for x in "ABCDEFGHILJKLMNOPQRS"]}
        ca = {"Cell": np.arange(100)}
        ds.add_columns(m, ca, row_attrs=ra)
        ds.add_columns(m, ca, row_attrs=ra)
    with loompy.connect("test.loom") as ds:
        assert ds.shape == (20, 200)
def combine_loom_files(loom_file_list, library, species, organ, project_id, project_name, output_loom_file):
    expression_data_type_list = []
    optimus_output_schema_version_list = []
    pipeline_versions_list = []
    input_id_metadata_field_list = []
    input_name_metadata_field_list = []
    input_id_list = []
    input_name_list = []

    with loompy.new("intermediate.loom") as dsout:
        for i in range(len(loom_file_list)):
            loom_file = loom_file_list[i]
            with loompy.connect(loom_file) as ds:
                # add global attributes for this file to the running list of global attributes
                expression_data_type_list.append(ds.attrs["expression_data_type"])
                optimus_output_schema_version_list.append(ds.attrs["optimus_output_schema_version"])
                pipeline_versions_list.append(ds.attrs["pipeline_version"])
                input_id_metadata_field_list.append(ds.attrs["input_id_metadata_field"])
                input_name_metadata_field_list.append(ds.attrs["input_name_metadata_field"])
                input_id_list.append(ds.attrs["input_id"])
                input_name_list.append(ds.attrs["input_name"])

                # check that the ordering is the same for the matrices being combined
                if dsout.shape[0] != 0:
                    assert np.array_equal(dsout.ra["ensembl_ids"], ds.ra["ensembl_ids"])

                # filter out low-count cells, keeping only those with n_molecules >= 100 UMIs
                UMIs = ds.ca['n_molecules']
                cells = np.where(UMIs >= 100)[0]
                for (ix, selection, view) in ds.scan(items=cells, axis=1):
                    # suffix cell names with the file index to keep barcodes unique across inputs
                    view.ca['cell_names'] = view.ca['cell_names'] + "-" + str(i)
                    dsout.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)

    # read back the intermediate file and convert the main matrix to sparse
    with loompy.connect("intermediate.loom") as ds:
        row_attrs = ds.ra[:]
        col_attrs = ds.ca[:]
        sp = ds.sparse()

    # Write out a new loom file with the sparse matrix
    loompy.create(output_loom_file, sp, row_attrs, col_attrs)

    # add the global attributes to the loom file
    with loompy.connect(output_loom_file) as ds:
        ds.attrs["library_preparation_protocol.library_construction_approach"] = library
        ds.attrs["donor_organism.genus_species"] = species
        ds.attrs["specimen_from_organism.organ"] = organ
        ds.attrs["project.provenance.document_id"] = project_id
        ds.attrs["project.project_core.project_name"] = project_name
        ds.attrs["expression_data_type"] = ", ".join(set(expression_data_type_list))
        ds.attrs["optimus_output_schema_version"] = ", ".join(set(optimus_output_schema_version_list))
        ds.attrs["pipeline_version"] = ", ".join(set(pipeline_versions_list))
        ds.attrs["input_id_metadata_field"] = ", ".join(set(input_id_metadata_field_list))
        ds.attrs["input_name_metadata_field"] = ", ".join(set(input_name_metadata_field_list))
        ds.attrs["input_id"] = ", ".join(input_id_list)
        ds.attrs["input_name"] = ", ".join(input_name_list)
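
# A minimal usage sketch for combine_loom_files (all file names and metadata
# values below are hypothetical placeholders, not from the pipeline itself):
combine_loom_files(
    loom_file_list=["lane1.loom", "lane2.loom"],
    library="10X 3' v3",
    species="Homo sapiens",
    organ="lung",
    project_id="0000-0000",
    project_name="ExampleProject",
    output_loom_file="combined.loom",
)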
import numpy as np
import scipy.sparse as sparse
import loompy

loom_file_in = snakemake.input['loom']
loom_test_out = snakemake.output['loom_test']
loom_train_out = snakemake.output['loom_train']

# Load the barcode list for cells from the loom file
ds = loompy.connect(loom_file_in, 'r')

# Randomly split the cells into equally sized test and train sets
test_ii = np.random.choice(ds.shape[1], size=ds.shape[1] // 2, replace=False)
train_ii = np.setdiff1d(np.arange(ds.shape[1]), test_ii)

test_ii = np.sort(test_ii)
train_ii = np.sort(train_ii)

ds_test_out = loompy.new(loom_test_out)
for (ix, selection, view) in ds.scan(items=test_ii, axis=1):
    ds_test_out.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)

ds_train_out = loompy.new(loom_train_out)
for (ix, selection, view) in ds.scan(items=train_ii, axis=1):
    ds_train_out.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)

ds.close()
ds_test_out.close()
ds_train_out.close()
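
# For a reproducible split, one could seed a NumPy Generator before drawing the
# test indices above. A self-contained sketch (the seed and n_cells are
# arbitrary, not part of the original script):
rng = np.random.default_rng(0)
n_cells = 100
test_demo = np.sort(rng.choice(n_cells, size=n_cells // 2, replace=False))
train_demo = np.setdiff1d(np.arange(n_cells), test_demo)
assert len(np.intersect1d(test_demo, train_demo)) == 0  # disjoint by construction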
ind_healthy = meta['Compartment'].str.startswith('Normal').values
meta = meta[ind_healthy]
counts = counts[:, ind_healthy]

print('Turns out now we can convert to dense')
counts = counts.todense()

# FIXME: figure out what those annotations actually mean!!
print('Set output file')
fdn_out = '../data_full/Young_2018/'
fn_out = fdn_out + 'dataset.loom'
os.makedirs(fdn_out, exist_ok=True)

print('Write to loom')
with loompy.new(fn_out) as dsl:
    dsl.add_columns(
        layers={'': counts},
        row_attrs={
            'GeneName': meta_genes['Symbol'].values,
        },
        col_attrs={
            'CellID': meta.index.values,
            'CellType': meta['ClusterID'].values,
            'NumberOfGenes': meta['nGenes'].astype(int).values,
            'NumberOfUMI': meta['nUMI'].astype(int).values,
            'Subject': meta['Source'].values,
            'Location': meta['Compartment'].values,
        },
    )
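
# A quick sanity check of the file written above (a sketch, reusing fn_out):
with loompy.connect(fn_out) as dsl:
    print(dsl.shape)      # (n_genes, n_cells)
    print(dsl.ca.keys())  # the column attributes stored above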
def clr(x):
    # CLR-style transform: log1p the counts, then center each row (cell) to mean zero
    x = np.log(x + 1)
    x = x.subtract(x.mean(axis=1), axis=0)
    return x


ab_clr = clr(ab)

# import matplotlib.pyplot as plt
# plt.figure()
# plt.plot(ab_clr['CD3'], ab_clr['CD4'], 'o', ms=2)
# #plt.plot(ab_clr['CD14'], ab_clr['CD4'], 'o', ms=2)
# plt.show()
#
# plt.figure()
# plt.hist(np.log10(ab['CD4'] + 1), 30)
# plt.show()

# gate monocytes on high CD14 in CLR space
is_mono = (ab_clr['CD14'] > 2).values
ab_mono = ab.loc[is_mono]

with loompy.connect(in_loom, mode='r') as ds:
    with loompy.new(out_loom) as ds_out:
        view = ds.view[:, is_mono]
        ds_out.add_columns(view.layers, col_attrs=view.ca, row_attrs=view.ra)

ab_mono.to_csv(out_ab, sep="\t", compression='gzip')
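
# A minimal sketch of clr() on a toy antibody-count DataFrame (the counts are
# invented for illustration only):
import numpy as np
import pandas as pd

ab_demo = pd.DataFrame(
    {'CD3': [5, 0, 2], 'CD4': [9, 1, 0], 'CD14': [0, 40, 1]},
    index=['cell1', 'cell2', 'cell3'],
)
print(clr(ab_demo))  # each row of log1p counts is centered to mean zero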
def submit(loomfile, model, hapcode, chunk, submit_start, submit_end, outdir, email, queue, mem, walltime, systype, dryrun):
    LOG.warn('Loom file: %s' % loomfile)
    LOG.warn('Models: %s, %s' % (model[0], model[1]))
    LOG.warn('HPC system type: %s' % systype)
    if dryrun:
        LOG.warn('Showing submission script only')

    with loompy.connect(loomfile) as ds:
        ds.attrs.HapCode = hapcode
        num_genes, num_cells = ds.shape
        if submit_end == 0:
            submit_end = num_genes
        # indices of genes that survived selection, within [submit_start, submit_end)
        gsurv = np.where(ds.ra.Selected[submit_start:submit_end])[0] + submit_start
        num_gsurv = len(gsurv)
        LOG.warn('The number of selected genes: %d' % num_gsurv)
        LOG.warn('The number of selected cells: %d' % num_cells)
        LOG.warn('%d jobs will be submitted' % int(np.ceil(num_gsurv / chunk)))

    processed = 0
    if systype == 'pbs':
        mat_layer, pat_layer = hapcode
        for idx_start in range(0, num_gsurv, chunk):
            idx_end = min(idx_start + chunk, num_gsurv - 1)
            start = gsurv[idx_start]
            if idx_end < num_gsurv - 1:
                end = gsurv[idx_end]
                genes = gsurv[idx_start:idx_end]
            else:  # idx_end == num_gsurv - 1
                end = submit_end
                genes = gsurv[idx_start:]
            LOG.info('Chunk start: %d, end %d' % (start, end))
            infile = os.path.join(outdir, '_chunk.%05d-%05d.npz' % (start, end))
            LOG.debug('Genes: %s' % ' '.join(genes.astype(str)))
            LOG.debug('Total %d genes submitted in this job' % len(genes))
            # Dump this chunk of the loom file to a compressed npz for the worker job
            data_dict = dict()
            data_dict['shape'] = (len(genes), num_cells)
            with loompy.connect(loomfile, 'r') as ds:
                data_dict['GeneID'] = ds.ra.GeneID[genes]
                cur_chunk = dict()
                cur_chunk[mat_layer] = ds.layers[mat_layer][genes, :]
                cur_chunk[pat_layer] = ds.layers[pat_layer][genes, :]
                data_dict['Counts'] = cur_chunk
                data_dict['Size'] = ds.ca.Size
                data_dict['Selected'] = np.ones(len(genes))  # select all
            np.savez_compressed(infile, **data_dict)
            outfile = os.path.join(outdir, '_scbase.%05d-%05d.param.npz' % (start, end))
            job_par = 'ASE_MODEL=%s,TGX_MODEL=%s,MAT_HAPCODE=%s,PAT_HAPCODE=%s,OUTFILE=%s,INFILE=%s' % \
                (model[0], model[1], hapcode[0], hapcode[1], outfile, infile)
            cmd = ['qsub']
            if email is not None:
                cmd += ['-M', email]
            if queue is not None:
                cmd += ['-q', queue]
            if mem > 0:
                cmd += ['-l', 'mem=%d' % mem]
            if walltime > 0:
                cmd += ['-l', 'walltime=%d:00:00' % walltime]
            cmd += ['-v', job_par]
            cmd += [os.path.join(os.path.dirname(os.environ['_']), 'run_mcmc_on_cluster.sh')]
            if dryrun:
                print(" ".join(cmd))
            else:
                LOG.info(" ".join(cmd))
                call(cmd)
                time.sleep(1.0)
            processed += len(genes)
        LOG.debug('Total %d genes were submitted' % processed)
        LOG.warn('Job submission complete')

    elif systype == 'pbs-with-whole-loom':
        # Do not use this: loom is not stable
        for idx_start in range(0, num_gsurv, chunk):
            idx_end = min(idx_start + chunk, num_gsurv - 1)
            start = gsurv[idx_start]
            if idx_end < num_gsurv - 1:
                end = gsurv[idx_end]
                genes = gsurv[idx_start:idx_end]
            else:  # idx_end == num_gsurv - 1
                end = num_genes
                genes = gsurv[idx_start:]
            LOG.info('Chunk start: %d, end %d' % (start, end))
            LOG.debug('Genes: %s' % ' '.join(genes.astype(str)))
            LOG.debug('Total %d genes submitted in this job' % len(genes))
            outfile = os.path.join(outdir, '_scbase.%05d-%05d.param.npz' % (start, end))
            job_par = 'ASE_MODEL=%s,TGX_MODEL=%s,MAT_HAPCODE=%s,PAT_HAPCODE=%s,START=%d,END=%d,OUTFILE=%s,INFILE=%s' % \
                (model[0], model[1], hapcode[0], hapcode[1], start, end, outfile, loomfile)
            cmd = ['qsub']
            if email is not None:
                cmd += ['-M', email]
            if queue is not None:
                cmd += ['-q', queue]
            if mem > 0:
                cmd += ['-l', 'mem=%d' % mem]
            if walltime > 0:
                cmd += ['-l', 'walltime=%d:00:00' % walltime]
            cmd += ['-v', job_par]
            cmd += [os.path.join(os.path.dirname(os.environ['_']), 'run_mcmc_on_cluster.sh')]
            if dryrun:
                print(" ".join(cmd))
            else:
                LOG.info(" ".join(cmd))
                call(cmd)
                time.sleep(1.0)
            processed += len(genes)
        LOG.debug('Total %d genes were submitted' % processed)
        LOG.warn('Job submission complete')

    elif systype == 'pbs-with-loom-chunks':
        # Do not use this: loompy does not support this
        for idx_start in range(0, num_gsurv, chunk):
            idx_end = min(idx_start + chunk, num_gsurv - 1)
            start = gsurv[idx_start]
            if idx_end < num_gsurv - 1:
                end = gsurv[idx_end]
                genes = gsurv[idx_start:idx_end]
            else:  # idx_end == num_gsurv - 1
                end = num_genes
                genes = gsurv[idx_start:]
            LOG.info('Chunk start: %d, end %d' % (start, end))
            infile = os.path.join(outdir, '_chunk.%05d-%05d.loom' % (start, end))
            LOG.debug('Genes: %s' % ' '.join(genes.astype(str)))
            LOG.debug('Total %d genes submitted in this job' % len(genes))
            # Copy this gene chunk into its own small loom file for the worker job
            with loompy.connect(loomfile, 'r') as ds:
                with loompy.new(infile) as dsout:
                    for (_, selection, view) in ds.scan(items=genes, axis=0):
                        LOG.debug('Genes in this view: %s' % ' '.join(selection.astype(str)))
                        dsout.add_columns(view.layers, col_attrs=view.col_attrs, row_attrs=view.row_attrs)
            outfile = os.path.join(outdir, '_scbase.%05d-%05d.param.npz' % (start, end))
            job_par = 'ASE_MODEL=%s,TGX_MODEL=%s,MAT_HAPCODE=%s,PAT_HAPCODE=%s,OUTFILE=%s,INFILE=%s' % \
                (model[0], model[1], hapcode[0], hapcode[1], outfile, infile)
            cmd = ['qsub']
            if email is not None:
                cmd += ['-M', email]
            if queue is not None:
                cmd += ['-q', queue]
            if mem > 0:
                cmd += ['-l', 'mem=%d' % mem]
            if walltime > 0:
                cmd += ['-l', 'walltime=%d:00:00' % walltime]
            cmd += ['-v', job_par]
            cmd += [os.path.join(os.path.dirname(os.environ['_']), 'run_mcmc_on_cluster.sh')]
            if dryrun:
                print(" ".join(cmd))
            else:
                LOG.info(" ".join(cmd))
                call(cmd)
                time.sleep(1.0)
            processed += len(genes)
        LOG.debug('Total %d genes were submitted' % processed)
        LOG.warn('Job submission complete')

    elif systype == 'lsf':
        raise NotImplementedError('LSF submission is not yet supported')
    else:
        raise RuntimeError('No plan to support other job scheduling system until we see many requests')
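
# A toy illustration of the chunk-naming scheme above (values invented):
# surviving gene indices are split into blocks of `chunk`, and each block's
# file is named by its first and last global gene index. (The real code
# additionally clamps the last chunk's end to submit_end.)
gsurv_demo = np.array([3, 7, 8, 12, 20, 21, 33])
for idx_start_demo in range(0, len(gsurv_demo), 3):
    genes_demo = gsurv_demo[idx_start_demo:idx_start_demo + 3]
    print('_chunk.%05d-%05d.npz' % (genes_demo[0], genes_demo[-1]))
# -> _chunk.00003-00008.npz, _chunk.00012-00021.npz, _chunk.00033-00033.npz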
def fit(self, ds: loompy.LoomConnection) -> None:
    logging.info("Computing pseudoage")
    ages = np.array([age_to_num(x) for x in ds.ca.Age])
    knn = ds.col_graphs.KNN
    k = knn.nnz / knn.shape[0]
    # Pseudoage of each cell is the average age of its k nearest neighbors
    ds.ca.PseudoAge = (knn.astype("bool") @ ages) / k

    logging.info("Slicing pseudoage")
    slice_names: List[str] = []
    with TemporaryDirectory() as tempfolder:
        slices = np.percentile(ds.ca.PseudoAge, np.arange(0, 101, 5))
        logging.info("Collecting cells")
        for (ix, _, view) in ds.scan(axis=1):
            # Overlapping slices, each spanning two consecutive 5-percentile bins
            for i in range(len(slices) - 2):
                s1 = slices[i]
                s2 = slices[i + 2]
                slice_name = f"Age{s1:05.2f}to{s2:05.2f}".replace(".", "") + ".loom"
                if slice_name not in slice_names:
                    slice_names.append(slice_name)
                cells = (view.ca.PseudoAge >= s1) & (view.ca.PseudoAge < s2)
                if cells.sum() == 0:
                    continue
                fname = os.path.join(tempfolder, slice_name)
                if not os.path.exists(fname):
                    with loompy.new(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)
                else:
                    with loompy.connect(fname) as dsout:
                        dsout.add_columns(view.layers[:, cells], col_attrs=view.ca[cells], row_attrs=view.ra)

        for slice_name in slice_names:
            fname = os.path.join(tempfolder, slice_name)
            logging.info("Cytograph on " + slice_name)
            # Use a separate name for the slice connection so the full-file `ds` is not shadowed
            with loompy.connect(fname) as ds_slice:
                Cytograph(config=load_config()).fit(ds_slice)

        # Use dynamic programming to find the deepest tree (forest), as given by
        # the total number of cells along each branch
        logging.info("Computing pseudolineage")
        clusters = "Clusters"
        min_pct = 0.1

        # List of matrices giving the bipartite graph between each pair of layers, weighted by number of shared cells
        overlaps = []
        n_nodes = []  # List of number of nodes (clusters) in each layer
        n_cells = []  # List of arrays giving the number of cells in each cluster
        n_layers = len(slice_names)

        # Compute the bipartite graphs between layers
        for t in range(n_layers):
            # Link clusters from layer t to clusters from layer t + 1
            logging.info(slice_names[t])
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds1:
                n_nodes.append(ds1.ca[clusters].max() + 1)
                n_cells.append(np.zeros(n_nodes[t]))
                for c in range(n_nodes[t]):
                    n_cells[t][c] = (ds1.ca[clusters] == c).sum()
                if t >= n_layers - 1:
                    break
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds2:
                    overlap = np.zeros((np.unique(ds1.ca[clusters]).shape[0], np.unique(ds2.ca[clusters]).shape[0]), dtype="int")
                    for i in np.unique(ds1.ca[clusters]):
                        cells1 = ds1.ca.CellID[ds1.ca[clusters] == i]
                        for j in np.unique(ds2.ca[clusters]):
                            cells2 = ds2.ca.CellID[ds2.ca[clusters] == j]
                            overlap[i, j] = np.intersect1d(cells1, cells2).shape[0]
                    overlaps.append(overlap)

        # List of arrays keeping track of the depth of the deepest tree starting at each node in the layer
        # Depth defined as sum of the number of shared cells along the branch
        depths = [np.zeros(n, dtype="int") for n in n_nodes]
        # List of arrays giving the predecessor of each cluster (or -1 if no predecessor)
        edges = [np.zeros(n, dtype="int") for n in n_nodes[1:]]
        for t in range(0, n_layers - 1):
            for i in range(n_nodes[t + 1]):
                # Find the widest, deepest branch from any node j in layer t to node i in layer t + 1
                # Widest, deepest meaning: greatest sum of depth up to node j in layer t plus number of shared cells,
                # but disallowing any branch with less than min_pct % shared cells
                best_j = -1
                best_depth = 0
                for j in range(n_nodes[t]):
                    pct_overlapping = 100 * overlaps[t][j, i] / (n_cells[t][j] + n_cells[t + 1][i])
                    if pct_overlapping > min_pct:
                        depth = depths[t][j] + overlaps[t][j, i]
                        if depth > best_depth:
                            best_depth = depth
                            best_j = j
                depths[t + 1][i] = best_depth  # record accumulated depth so deeper branches win downstream
                edges[t][i] = best_j

        # Now we have
        #
        # edges: List of arrays giving the index of the predecessor of each cluster (or -1 if no predecessor exists)
        # overlaps: List of matrices giving the number of cells shared between clusters in layer t and t + 1
        # n_nodes: List of number of nodes (clusters) in each layer
        # n_cells: List of arrays of number of cells in each node (cluster)

        # Now position the nodes of each layer such that no edges cross
        ypositions = [np.arange(n_nodes[0])]
        for t in range(len(edges)):
            pos = np.full(n_nodes[t + 1], -1)
            for i in range(pos.shape[0]):
                prev = edges[t][i]
                if prev >= 0:
                    pos[i] = ypositions[t][prev]
            ordering = np.argsort(pos)
            mapping = dict(zip(ordering, range(len(ordering))))
            ypositions.append(np.array([mapping[i] for i in range(len(ordering))]))

        # Make the positions proportional to the number of cells (cumulative)
        max_pos = 0
        for i, pos in enumerate(ypositions):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                n_clusters = ds0.ca[clusters].max() + 1
                ncells = np.array([(ds0.ca[clusters] == c).sum() for c in range(n_clusters)])
                total = 0
                new_pos = np.zeros_like(pos)
                for j in range(len(pos)):
                    cluster = np.where(pos == j)[0]
                    new_pos[cluster] = total + ncells[cluster] / 2
                    total += ncells[cluster]
                ypositions[i] = new_pos / 1000
            max_pos = max(max_pos, max(ypositions[i]))

        for i, pos in enumerate(ypositions):
            ypositions[i] += (max_pos - np.max(pos)) / 2

        # Then position the layers properly in time
        xpositions = []
        for i in range(n_layers):
            with loompy.connect(os.path.join(tempfolder, slice_names[i])) as ds0:
                xpositions.append(np.mean(ds0.ca.PseudoAge))

        # Now project each individual cell to the pseudolineage
        logging.info("Projecting cells to pseudolineage")
        cell_to_xy = {}
        for t in range(len(n_nodes) - 1):
            with loompy.connect(os.path.join(tempfolder, slice_names[t])) as ds0:
                with loompy.connect(os.path.join(tempfolder, slice_names[t + 1])) as ds1:
                    for i in range(n_nodes[t + 1]):
                        if edges[t][i] != -1:
                            y1 = ypositions[t][edges[t][i]]
                            y2 = ypositions[t + 1][i]
                            offset = (xpositions[t + 1] - xpositions[t]) / 4
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge < slices[t + 2])
                            # Smooth ribbon from the predecessor's position to this cluster's position
                            crs = np.array(CatmullRomSpline(n_points=100).fit_transform(np.array([
                                [slices[t + 1] - offset, y1],
                                [slices[t + 1], y1],
                                [slices[t + 2], y2],
                                [slices[t + 2] + offset, y2],
                            ])))
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            f = interp1d(crs[:, 0], crs[:, 1], fill_value="extrapolate")
                            fw = interp1d(crs[:, 0], widths, fill_value="extrapolate")
                            y = f(ds1.ca.PseudoAge[overlapping_cells]) + np.random.normal(scale=fw(ds1.ca.PseudoAge[overlapping_cells]) / 6, size=overlapping_cells.sum())
                            for k, ix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[k]]

                    # Draw the leftmost pseudoage slice
                    if t == 0:
                        for i in range(n_nodes[0]):
                            y1 = ypositions[0][i]
                            widths = np.linspace(n_cells[t][i], n_cells[t][i], num=100) / 1500
                            overlapping_cells = (ds0.ca[clusters] == i) & (ds0.ca.PseudoAge < slices[1])
                            y = y1 + np.random.normal(scale=widths[0] / 6, size=overlapping_cells.sum())
                            for k, ix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds0.ca.CellID[ix]] = [ds0.ca.PseudoAge[ix], y[k]]

                    # Draw the rightmost pseudoage slice
                    if t == len(n_nodes) - 2:
                        for i in range(n_nodes[-1]):
                            y2 = ypositions[t + 1][i]
                            widths = np.linspace(n_cells[t][edges[t][i]], n_cells[t + 1][i], num=100) / 1500
                            overlapping_cells = (ds1.ca[clusters] == i) & (ds1.ca.PseudoAge > slices[-2])
                            y = y2 + np.random.normal(scale=widths[-1] / 6, size=overlapping_cells.sum())
                            for k, ix in enumerate(np.where(overlapping_cells)[0]):
                                cell_to_xy[ds1.ca.CellID[ix]] = [ds1.ca.PseudoAge[ix], y[k]]

        logging.info("Saving pseudolineage projection back in original file")
        xy = np.zeros((ds.shape[1], 2))
        for cellid in cell_to_xy.keys():
            j = np.where(ds.ca.CellID == cellid)[0]
            xy[j] = cell_to_xy[cellid]
        ds.ca.PseudoLineage = xy
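
# A toy sketch of the branch-linking rule used above (all numbers invented):
# given an overlap matrix between clusters in consecutive slices, each cluster
# in the later slice picks the predecessor that maximizes accumulated shared
# cells, subject to the min_pct threshold.
import numpy as np

overlap_demo = np.array([[50, 0, 2], [1, 30, 0]])  # 2 clusters -> 3 clusters
n_cells_t = np.array([60, 40])
n_cells_t1 = np.array([55, 35, 10])
depths_t = np.zeros(2, dtype=int)
min_pct_demo = 0.1
for i in range(3):
    best_j, best_depth = -1, 0
    for j in range(2):
        pct = 100 * overlap_demo[j, i] / (n_cells_t[j] + n_cells_t1[i])
        if pct > min_pct_demo:
            depth = depths_t[j] + overlap_demo[j, i]
            if depth > best_depth:
                best_depth, best_j = depth, j
    print(f"cluster {i} in slice t+1 links to predecessor {best_j} (depth {best_depth})")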