def test_preprocess_random_data(tmp_path):
    """Smoke-test the panopticon preprocessing pipeline on random counts.

    Builds a 100-gene x 1000-cell loom of random integers, runs count
    normalization, incremental PCA, embedding and clustering, then checks
    that the expected row/column attributes were written.
    """
    import loompy
    from panopticon.preprocessing import generate_count_normalization
    from panopticon.analysis import generate_incremental_pca, generate_embedding, generate_clustering, generate_masked_module_score

    workdir = tmp_path / "sub"
    workdir.mkdir()
    loomfile = str(workdir / "test.loom")
    counts = np.random.randint(1000, size=(100, 1000))
    loompy.create(loomfile,
                  counts,
                  row_attrs={'gene': np.arange(100)},
                  col_attrs={'cell': np.arange(1000)})
    db = loompy.connect(loomfile)
    layername = 'log2(TP100k+1)'
    generate_count_normalization(db, '', layername)
    generate_incremental_pca(db, layername)
    generate_embedding(db, layername)
    generate_clustering(db,
                        layername,
                        n_clustering_iterations=2,
                        clusteringcachedir=str(workdir / "clusteringcachedir/"))
    print(db.ca.keys())
    print(db.ra.keys())
    assert 'cell' in db.ca.keys()
    assert 'gene' in db.ra.keys()
    assert 'log2(TP100k+1) PCA UMAP embedding 1' in db.ca.keys()
    db.close()
def main():
    """Convert a delimited count matrix plus a cell-features table into a loom file.

    Command-line arguments:
        argv[1]: count matrix file (genes x cells)
        argv[2]: cell features (column metadata) file
        argv[3]: output .loom path

    Raises:
        Exception: if any cell-feature vector's length does not match the
            number of cells in the matrix.
    """
    # Read args
    mat_file = sys.argv[1]
    cell_features_file = sys.argv[2]
    out_loom_file = sys.argv[3]
    delimiter = get_delimiter_from_extension(mat_file)

    # Get count matrix, gene names, cell names
    count_matrix = read_matrix(mat_file, delimiter)
    cell_names = read_names(mat_file, delimiter, "column")
    gene_names = read_names(mat_file, delimiter, "row")

    # Getting features i.e. col and row metadata
    cell_features = read_cell_features(
        cell_features_file, get_delimiter_from_extension(cell_features_file))
    cell_features["CellID"] = cell_names
    row_features = {"Gene": gene_names}

    # Making sure the number of cells in features matches number of cells in matrix
    ncells = count_matrix.shape[1]
    for key in cell_features:
        if ncells != cell_features[key].shape[0]:
            # BUG FIX: original message had no space after the key name and
            # misspelled "same" ("...in <key>is not the sames as...").
            raise Exception("Number of cells in " + key +
                            " is not the same as number of cells in matrix")

    # Make loom file
    loompy.create(
        out_loom_file,
        count_matrix,
        row_attrs=row_features,
        col_attrs=cell_features,
    )
def create_loom_files(args):
    """This function creates the loom file or folder structure in output_loom_path in
       format file_format, with input_id from the input folder analysis_output_path

    Args:
        args: namespace carrying
            input_id (str): sample or cell id
            input_name (str or None): optional human-readable name
            pipeline_version (str): pipeline version string written as a file attr
            rsem_genes_results_file (str): the file for the expression count
            output_loom_path (str): location of the output loom
    """
    # generate a dictionary of column attributes
    col_attrs = generate_col_attr(args)

    # add the expression count matrix data
    # generate a dictionary of row attributes plus TPM and counts matrices
    row_attrs, expr_tpms, expr_counts = generate_row_attr_and_matrix(
        args.rsem_genes_results_file)

    attrDict = dict()
    attrDict['input_id'] = args.input_id
    if args.input_name is not None:
        attrDict['input_name'] = args.input_name
    attrDict['pipeline_version'] = args.pipeline_version

    #generate loom file (TPMs in the default layer)
    loompy.create(args.output_loom_path,
                  expr_tpms,
                  row_attrs,
                  col_attrs,
                  file_attrs=attrDict)

    # FIX: use a context manager so the connection is closed even if
    # writing the counts layer raises (plain connect/close leaked on error).
    with loompy.connect(args.output_loom_path) as ds:
        ds.layers['estimated_counts'] = expr_counts
def create_subsetted_loom_with_genemask(loom, output_loom, cellmask, genemask):
    """Deprecated.  Write a gene- and cell-subset of `loom` to a new loom file,
    copying all layers.

    Parameters
    ----------
    loom : loompy connection to the source loom file
    output_loom : path of the new, subsetted loom file
    cellmask : index/mask over columns (cells)
    genemask : index/mask over rows (genes)

    Returns
    -------
    None
    """
    print("THIS FUNCTION IS DEPRECATED, USE loompy.new INSTEAD!!!")
    import loompy
    from panopticon.utilities import recover_meta
    if '' not in loom.layers.keys():
        raise Exception("Expecting '' layer, yet none found")
    rowmeta, colmeta = recover_meta(loom)
    loompy.create(output_loom, loom[''][genemask, :][:, cellmask],
                  rowmeta[genemask].to_dict("list"),
                  colmeta[cellmask].to_dict("list"))
    with loompy.connect(output_loom) as smallerloom:
        # BUG FIX: was `loom.layer.keys()` — the connection attribute is
        # `layers` (as used in the guard above); `layer` raised AttributeError,
        # so extra layers were never copied.
        for layer in [x for x in loom.layers.keys() if x != '']:
            smallerloom[layer] = loom[layer][:, cellmask][genemask, :]
def test_get_coordinates(loom_file):
    """A custom 'Embedding' column attribute round-trips through
    Loom.get_coordinates (the y coordinate comes back negated)."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    # POC for custom data in the loom file.
    num_cells = 100
    quarter = int(num_cells / 4)
    _X = np.concatenate(
        [rg.normal(center, 0.1, quarter) for center in range(-2, 2)])
    _Y = rg.normal(0, 0.1, num_cells)
    main_embedding = pd.DataFrame(columns=["_X", "_Y"])
    main_embedding["_X"] = _X
    main_embedding["_Y"] = _Y
    col_attrs["Embedding"] = Loom.dfToNamedMatrix(main_embedding)
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        expected = {
            "x": _X,
            "y": -_Y,
            "cellIndices": list(range(num_cells)),
        }
        np.testing.assert_equal(wrapper.get_coordinates(-1), expected)
def create_markers_file(loom_file_path: str, marker_n: int = 100, overwrite: bool = False) -> None:
    """Create a .markers (loom format) file that contains a (marker x cell)
    table and all annotation necessary to plot it.

    Args
    ----
    loom_file_path: the path to the .loom file
    marker_n: the total number of genes will be approximately
        N_clusters * marker_n / 3.
    overwrite: if True delete any previous .markers file; otherwise rename
        the previous version with a .bak suffix.

    Returns
    -------
    Nothing. Saves a file next to loom_file_path with a .markers extension.
    """
    # Guard clause: fail fast if the source loom is missing.
    if not os.path.exists(loom_file_path):
        raise IOError("%s does not exist" % loom_file_path)

    marker_file_path = os.path.splitext(loom_file_path)[0] + ".markers"
    if os.path.exists(marker_file_path):
        if overwrite:
            logging.debug("Removing old version of %s" % marker_file_path)
            os.remove(marker_file_path)
        else:
            logging.debug("Previous version of %s was found, saving a backup" % marker_file_path)
            os.rename(marker_file_path, marker_file_path + '.bak')

    ds, df, cols_df, rows_df = loompy2data_annot(loom_file_path)
    df_markers, rows_annot, cols_annot, accession_list, gene_cluster, mus = prepare_heat_map(
        df, cols_df, rows_df, marker_n=marker_n)
    loompy.create(
        marker_file_path,
        df_markers.values,
        {key: np.array(vals) for key, vals in rows_annot.T.to_dict("list").items()},
        {key: np.array(vals) for key, vals in cols_annot.T.to_dict("list").items()})
def convert_to_loom(df):
    """Convert a dataframe of expression values to a loom file.

    Rows of `df` become genes (row attr "gene_names" from the index) and
    columns become cells ("cell_names" from the columns).  Returns the
    path of the created loom file.

    FIX: `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed
    in pandas 1.0; `to_numpy()` is the supported replacement.
    """
    path = _get_temp_path(".loom")
    loompy.create(path, df.to_numpy(), {"gene_names": df.index.values},
                  {"cell_names": df.columns.values})
    return path
def create_loom_from_figshare(fn_loom):
    '''Create a loom file from our FigShare files'''
    print('Convert TSV into a loom file for convenience')
    fn_counts = fdn_data + 'table_counts_lungimmune.tsv.gz'
    fn_meta = fdn_data + 'table_cellmetadata_lungimmune.tsv.gz'

    print('Load metadata')
    samplesheet = pd.read_csv(fn_meta, sep='\t', index_col=0,
                              compression='gzip')

    print('Load counts')
    counts = pd.read_csv(fn_counts, sep='\t', index_col=0,
                         compression='gzip').astype(np.float32)

    print('Normalize by coverage')
    counts = (1e6 * counts / samplesheet['Coverage']).astype(np.float32)

    # One column attribute per metadata column, plus the cell identifiers.
    col_attrs = {name: samplesheet[name].values for name in samplesheet.columns}
    col_attrs['CellID'] = samplesheet.index.values
    row_attrs = {'GeneName': counts.index.values}

    loompy.create(
        fn_loom,
        layers={'': counts.values},
        col_attrs=col_attrs,
        row_attrs=row_attrs,
    )
def write_loom(filename: PathLike, adata: AnnData, write_obsm_varm: bool = False):
    """Write an AnnData object to a loom file, replacing any existing file.

    var/obs metadata become row/column attributes (as numpy arrays), X the
    default layer (transposed) and each entry of adata.layers an extra layer.

    Args:
        filename: destination path; removed first if it exists.
        adata: the object to export.
        write_obsm_varm: if True also export multi-dimensional obsm/varm
            annotations; otherwise a warning lists the keys being dropped.

    Raises:
        ValueError: if adata.X is None.
    """
    filename = Path(filename)
    row_attrs = {k: np.array(v) for k, v in adata.var.to_dict('list').items()}
    row_attrs['var_names'] = adata.var_names.values
    col_attrs = {k: np.array(v) for k, v in adata.obs.to_dict('list').items()}
    col_attrs['obs_names'] = adata.obs_names.values
    if adata.X is None:
        raise ValueError('loompy does not accept empty matrices as data')
    if write_obsm_varm:
        for key in adata.obsm.keys():
            col_attrs[key] = adata.obsm[key]
        for key in adata.varm.keys():
            row_attrs[key] = adata.varm[key]
    else:
        if len(adata.obsm.keys()) > 0 or len(adata.varm.keys()) > 0:
            # BUG FIX: KeysView objects do not support `+`, so building this
            # warning raised TypeError.  Concatenate as lists instead.
            logger.warning(
                'The loom file will lack these fields:\n{}\n'
                'Use write_obsm_varm=True to export multi-dimensional annotations'
                .format(list(adata.obsm.keys()) + list(adata.varm.keys())))
    layers = {'': adata.X.T}
    for key in adata.layers.keys():
        layers[key] = adata.layers[key].T
    from loompy import create
    if filename.exists():
        filename.unlink()
    create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs)
def create_loom_files(args):
    """This function creates the loom file or folder structure in
    output_loom_path in format file_format, with sample_id from the input
    folder analysis_output_path.

    Args:
        args (argparse.Namespace): input arguments for the run
    """
    # schema version recorded as a global file attribute
    version = "1.0.0"

    # row (gene) and column (cell) attribute dictionaries
    row_attrs = generate_row_attr(args)
    col_attrs = generate_col_attr(args)

    # the expression count matrix data (transposed/sparse)
    expr_sp_t = generate_matrix(args)

    # global attributes
    attrDict = {
        'expression_data_type': args.expression_data_type,
        'optimus_output_schema_version': version,
        'sample_id': args.sample_id,
    }

    #generate loom file
    loompy.create(args.output_loom_path,
                  expr_sp_t,
                  row_attrs,
                  col_attrs,
                  file_attrs=attrDict)
def write_loom(filename: PathLike, adata: AnnData, write_obsm_varm: bool = False):
    """Export `adata` to a loom file at `filename`, overwriting it.

    var/obs dataframes become row/column attributes (named after the index
    name when present, else "var_names"/"obs_names"), X the default layer
    (transposed) and each entry of adata.layers a further layer.  Multi-
    dimensional obsm/varm annotations are written only when
    `write_obsm_varm` is True; otherwise a warning lists the dropped keys.

    Raises:
        ValueError: if adata.X is None.
    """
    filename = Path(filename)

    row_attrs = {key: np.array(vals)
                 for key, vals in adata.var.to_dict("list").items()}
    row_names = adata.var_names
    # use the index's own name when it has one
    row_dim = "var_names" if row_names.name is None else row_names.name
    row_attrs[row_dim] = row_names.values

    col_attrs = {key: np.array(vals)
                 for key, vals in adata.obs.to_dict("list").items()}
    col_names = adata.obs_names
    col_dim = "obs_names" if col_names.name is None else col_names.name
    col_attrs[col_dim] = col_names.values

    if adata.X is None:
        raise ValueError("loompy does not accept empty matrices as data")

    if write_obsm_varm:
        for key in adata.obsm.keys():
            col_attrs[key] = adata.obsm[key]
        for key in adata.varm.keys():
            row_attrs[key] = adata.varm[key]
    elif len(adata.obsm.keys()) > 0 or len(adata.varm.keys()) > 0:
        logger.warning(
            f"The loom file will lack these fields:\n"
            f"{adata.obsm.keys() | adata.varm.keys()}\n"
            f"Use write_obsm_varm=True to export multi-dimensional annotations"
        )

    layers = {"": adata.X.T}
    for key in adata.layers.keys():
        layers[key] = adata.layers[key].T

    from loompy import create
    if filename.exists():
        filename.unlink()
    create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs)
def test_file_with_empty_col_attrs_is_valid(self) -> None:
    """A loom file created with empty col_attrs/row_attrs must validate."""
    tmp = NamedTemporaryFile(suffix=".loom")
    tmp.close()  # closing removes the placeholder; loompy recreates the path
    loompy.create(tmp.name, np.zeros((5, 5)), {}, {})
    try:
        is_valid = LoomValidator().validate(tmp.name)
        self.assertTrue(
            is_valid,
            "File with empty col_attrs or row_attrs should be valid")
    finally:
        os.remove(tmp.name)
def test_get_abs_file_path(loom_file):
    """The Loom wrapper reports the path it was constructed with."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    create_kwargs = dict(filename=str(LOOM_PATH),
                         layers=matrix,
                         row_attrs=row_attrs,
                         col_attrs=col_attrs,
                         file_attrs=attrs)
    lp.create(**create_kwargs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        assert wrapper.get_abs_file_path() == LOOM_PATH
def test_infer_species(loom_file):
    """Without species annotation the loom infers ("Unknown", {})."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        result = wrapper.infer_species()
        assert result == ("Unknown", {})
def test_has_motif_and_track_regulons(loom_file):
    """The fixture loom is expected to carry motif and track regulons."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        assert wrapper.has_motif_and_track_regulons() == True
def batch_add_sparse(loom_file, layers, row_attrs, col_attrs, append=False,
                     empty_base=False, batch_size=512):
    """
    Batch adds sparse matrices to a loom file

    Args:
        loom_file (str): Path to output loom file
        layers (dict): Keys are names of layers, values are matrices to include
            Matrices should be features by observations
        row_attrs (dict): Attributes for rows in loom file
        col_attrs (dict): Attributes for columns in loom file
        append (bool): If true, append new cells. If false, overwrite file
        empty_base (bool): If true, add an empty array to the base layer
        batch_size (int): Size of batches of cells to add

    Raises:
        ValueError: if a layer is not sparse, or layer shapes disagree
    """
    # Check layers
    feats = set()
    obs = set()
    for key in layers:
        if not sparse.issparse(layers[key]):
            raise ValueError('Expects sparse matrix input')
        feats.add(layers[key].shape[0])
        obs.add(layers[key].shape[1])
    if len(feats) != 1 or len(obs) != 1:
        raise ValueError('Matrix dimension mismatch')
    # Get size of batches
    obs_size = obs.pop()
    feat_size = feats.pop()
    # PERF FIX: convert each layer to CSC once, up front; the original
    # called .tocsc() inside every batch iteration, redoing the (O(nnz))
    # conversion per batch.
    csc_layers = {key: layers[key].tocsc() for key in layers}
    batches = np.array_split(np.arange(start=0, stop=obs_size, step=1),
                             np.ceil(obs_size / batch_size))
    for batch in batches:
        batch_layer = dict()
        if empty_base:
            batch_layer[''] = np.zeros((feat_size, batch.shape[0]), dtype=int)
        for key in csc_layers:
            batch_layer[key] = csc_layers[key][:, batch].toarray()
        batch_col = dict()
        for key in col_attrs:
            batch_col[key] = col_attrs[key][batch]
        if append:
            with loompy.connect(filename=loom_file) as ds:
                ds.add_columns(layers=batch_layer,
                               row_attrs=row_attrs,
                               col_attrs=batch_col)
        else:
            loompy.create(filename=loom_file,
                          layers=batch_layer,
                          row_attrs=row_attrs,
                          col_attrs=batch_col)
            append = True
def convert_to_loom(df):
    """Convert a dataframe of expression values to a loom file.

    Rows of `df` are cells (row attr "cell_name" from the index, plus fake
    QC values) and columns are genes ("gene_name").  Returns the path of
    the created loom file.

    FIX: `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed
    in pandas 1.0; `to_numpy()` is the supported replacement.
    """
    path = _get_temp_path(".loom")
    qcs = fake_qc_values(NUM_QC_VALUES, df.index, seed=df.values.sum())
    row_attrs = qcs.to_dict(orient='list')
    row_attrs["cell_name"] = df.index.values
    loompy.create(path, df.to_numpy(), row_attrs,
                  {"gene_name": df.columns.values})
    return path
def test_get_global_attribute_by_name(loom_file):
    """A file attribute written at creation is readable back by name."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        genome = wrapper.get_global_attribute_by_name("Genome")
        assert genome == "Nomen dubium"
def setUp(self) -> None:
    """Create a throwaway 5x5 loom file with integer 'key' row/col attrs."""
    self.file = NamedTemporaryFile(suffix=".loom")
    self.file.close()
    # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int (which np.int aliased) behaves identically here.
    loompy.create(
        self.file.name,
        np.random.random((5, 5)),
        row_attrs={
            "key": np.fromiter(range(5), dtype=int)
        },
        col_attrs={
            "key": np.fromiter(range(5), dtype=int)
        })
def test_get_cell_ids(loom_file):
    """Cell ids come back as Cell_1 ... Cell_N in column order."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        num_cells = ds.shape[1]
        expected = np.array([f"Cell_{n}" for n in range(1, num_cells + 1)])
        assert (wrapper.get_cell_ids() == expected).all()
def save_cluster_avg(input_file: str, output_file: str) -> None:
    """Write a loom file of per-cluster mean expression.

    Averages expression over the cells of each cluster (cells with
    _Valid == 1 only) and saves a (genes x clusters) loom file carrying
    the source file name and cluster ids as column attributes.

    Args:
        input_file: source loom with "_Valid"/"Clusters" column attributes
            and "Accession"/"Gene" row attributes.
        output_file: destination loom path.
    """
    # FIX: the connection was never closed; a context manager releases the
    # file handle even if an error occurs mid-scan.
    with loompy.connect(input_file) as ds:
        cells = np.where(ds.col_attrs["_Valid"] == 1)[0]
        labels = ds.col_attrs["Clusters"][cells]
        Nclust = np.max(labels) + 1
        ca = {"Cluster": np.arange(Nclust),
              "OriginalFile": np.array([input_file] * Nclust)}
        ra = {"Accession": ds.row_attrs["Accession"],
              "Gene": ds.row_attrs["Gene"]}
        m = np.empty((ds.shape[0], Nclust))
        # accumulate per-cluster means, one gene batch at a time
        for (ix, selection, vals) in ds.batch_scan(cells=cells, genes=None, axis=0):
            vals_avg = npg.aggregate_numba.aggregate(labels, vals, func="mean", axis=1)
            m[selection, :] = vals_avg
    loompy.create(output_file, m, ra, ca)
def test_get_meta_data_cluster_by_clustering_id_and_cluster_id(loom_file):
    """Cluster 0 of clustering 0 is described as 'Unannotated Cluster 1'."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        meta = wrapper.get_meta_data_cluster_by_clustering_id_and_cluster_id(0, 0)
        assert meta["description"] == "Unannotated Cluster 1"
def test_sparse() -> None:
    """Round-trip a sparse matrix through loompy.create and an extra layer."""
    G, C = 1000, 100
    S = sparse.eye(G, C)
    dense = S.toarray()
    coo_data = S.tocoo().data
    loompy.create('test.loom', S, {'g_id': np.arange(G)}, {'c_id': np.arange(C)})
    with loompy.connect("test.loom") as ds:
        ds["layer"] = S
        assert np.all(ds[:, :] == dense)
        assert np.all(ds.sparse().data == coo_data)
        assert np.all(ds.layers["layer"][:, :] == dense)
        assert np.all(ds.layers["layer"].sparse().data == coo_data)
def write_loom(filename: Union[Path, str], adata: AnnData):
    """Store an AnnData object as a loom file, replacing any existing file.

    Gene (var) metadata becomes row attributes, cell (obs) metadata column
    attributes, and X is written transposed; a sparse X is converted to
    COO first (loompy densifies it on write, as the log message notes).
    """
    # accept either a Path or a plain string
    path = str(filename)

    row_attrs = dict(adata.var.to_dict('list'),
                     var_names=adata.var_names.values)
    col_attrs = dict(adata.obs.to_dict('list'),
                     obs_names=adata.obs_names.values)

    matrix = adata.X.T
    if issparse(matrix):
        logg.info("... writing to '.loom' file densifies sparse matrix")
        matrix = matrix.tocoo()

    from loompy import create
    if os.path.exists(path):
        os.remove(path)
    create(path, matrix, row_attrs=row_attrs, col_attrs=col_attrs)
def save_loom(self, filename: str) -> None:
    """Save an ExpressionMatrix as a loom file

    Parameters
    ----------
    filename : str
        Name of loom file

    Notes
    -----
    Row attributes come from the 'cells' coordinates and column attributes
    from the 'genes' coordinates.  NOTE(review): loom convention is
    genes-as-rows / cells-as-columns — confirm self.data is oriented
    (cells x genes) as this mapping implies.
    """
    import loompy
    cell_coords = self['cells'].coords
    gene_coords = self['genes'].coords
    row_attrs = {name: self['cells'][name].values for name in cell_coords}
    col_attrs = {name: self['genes'][name].values for name in gene_coords}
    loompy.create(filename, self.data, row_attrs, col_attrs)
def to_dataset_file(self, filename, fmt=None, **kwargs):
    '''Store dataset into an integrated dataset file

    Args:
        filename (str): path of the file to write to.
        fmt (str or None): file format. If None, infer from the file
            extension.
        **kwargs (keyword arguments): depend on the format. The additional
            keyword arguments for the supported formats are:
            - loom:
                - axis_samples: `rows` or `columns` (default)

    Raises:
        ValueError: if the format is not supported (only 'loom' is).
    '''
    if fmt is None:
        fmt = filename.split('.')[-1]

    # Guard clause: only loom output is implemented.
    if fmt != 'loom':
        raise ValueError('File format not supported')

    import loompy

    matrix = self.counts.values
    row_attrs = {name: self.featuresheet[name].values
                 for name in self.featuresheet}
    col_attrs = {name: self.samplesheet[name].values
                 for name in self.samplesheet}

    # Add attributes for the indices no matter what
    feat_index = self.featuresheet.index
    if feat_index.name is not None:
        row_attrs[feat_index.name] = feat_index.values
    else:
        row_attrs['_index'] = feat_index.values
    samp_index = self.samplesheet.index
    if samp_index.name is not None:
        col_attrs[samp_index.name] = samp_index.values
    else:
        col_attrs['_index'] = samp_index.values

    if kwargs.get('axis_samples', 'columns') != 'columns':
        matrix = matrix.T
        row_attrs, col_attrs = col_attrs, row_attrs

    loompy.create(filename, matrix, row_attrs, col_attrs)
def get_gene_expression(loom_file):
    """Gene expression lookups honour the log-transform flag."""
    matrix, row_attrs, col_attrs, attrs = loom_file
    lp.create(filename=str(LOOM_PATH),
              layers=matrix,
              row_attrs=row_attrs,
              col_attrs=col_attrs,
              file_attrs=attrs)
    with lp.connect(LOOM_PATH, mode="r", validate=False) as ds:
        wrapper = Loom(LOOM_PATH, LOOM_PATH, ds, LOOM_FILE_HANDLER)
        logged = wrapper.get_gene_expression("Gene_1", True, False)
        np.testing.assert_equal(logged, np.log1p(matrix[0]))
        raw = wrapper.get_gene_expression("Gene_100", False, False)
        np.testing.assert_equal(raw, matrix[99])
def write_loom(filename: PathLike, adata: AnnData):
    """Save `adata` as a loom file, replacing any existing file.

    var/obs metadata become row/column attributes; X is written transposed
    and, if sparse, converted to COO first (loompy densifies it on disk,
    as the log message notes).
    """
    path = Path(filename)

    row_attrs = dict(adata.var.to_dict('list'),
                     var_names=adata.var_names.values)
    col_attrs = dict(adata.obs.to_dict('list'),
                     obs_names=adata.obs_names.values)

    matrix = adata.X.T
    if issparse(matrix):
        logg.info("... writing to '.loom' file densifies sparse matrix")
        matrix = matrix.tocoo()

    from loompy import create
    if path.exists():
        path.unlink()
    create(fspath(path), matrix, row_attrs=row_attrs, col_attrs=col_attrs)
def write_loom(filename: PathLike, adata: AnnData):
    """Write `adata` to a loom file at `filename`, overwriting it.

    var/obs metadata are stored as row/column attributes (coerced to numpy
    arrays); X goes into the default layer and every entry of adata.layers
    into a layer of the same name, all transposed.
    """
    filename = Path(filename)

    row_attrs = {key: np.array(col)
                 for key, col in adata.var.to_dict('list').items()}
    row_attrs['var_names'] = adata.var_names.values

    col_attrs = {key: np.array(col)
                 for key, col in adata.obs.to_dict('list').items()}
    col_attrs['obs_names'] = adata.obs_names.values

    layers = {'': adata.X.T}
    layers.update((key, adata.layers[key].T) for key in adata.layers.keys())

    from loompy import create
    if filename.exists():
        filename.unlink()
    create(fspath(filename), layers, row_attrs=row_attrs, col_attrs=col_attrs)
def to_loom(self, loom_file_name) -> None:
    """Write this matrix to a loom file.

    REGIONS-axis coordinates become row attributes, FEATURES-axis
    coordinates column attributes, and self.attrs the file attributes.
    """
    region_coords = self[MATRIX_AXES.REGIONS].coords
    feature_coords = self[MATRIX_AXES.FEATURES].coords
    row_attrs = {name: arr.values for name, arr in region_coords.items()}
    col_attrs = {name: arr.values for name, arr in feature_coords.items()}
    loompy.create(loom_file_name,
                  self.values,
                  row_attrs,
                  col_attrs,
                  file_attrs=self.attrs)