示例#1
0
def read_loom(filename: PathLike, sparse: bool = False) -> AnnData:
    """Read ``.loom``-formatted hdf5 file.

    This reads the whole file into memory.

    Beware that you have to explicitly state when you want to read the file as
    sparse data.

    Parameters
    ----------
    filename
        The filename.
    sparse
        Whether to read the data matrix as sparse.

    Returns
    -------
    An ``AnnData`` built from the transposed loom matrix, with the loom
    column attributes as ``obs`` and the loom row attributes as ``var``.
    """
    filename = fspath(filename)  # allow passing pathlib.Path objects
    from loompy import connect

    if not sparse:
        # The dense matrix is read straight from the hdf5 dataset.
        with h5py.File(filename, 'r') as f:
            X = f['matrix'][()]
    # A single loompy connection serves both the sparse matrix and the
    # attributes; the context manager closes it, so no explicit close().
    with connect(filename, 'r') as lc:
        if sparse:
            X = lc.sparse()
        adata = AnnData(
            X.T,
            obs=dict(lc.col_attrs),  # not ideal: make the generator a dict...
            var=dict(lc.row_attrs))
    return adata
def get_attr_index(loom_file,
                   attr=None,
                   columns=False,
                   as_bool=True,
                   inverse=False):
    """
    Builds an index over the rows or columns of a loom file

    Args:
        loom_file (str): Path to loom file
        attr (str): Optional attribute used to restrict the index
            If falsy (e.g. None), every element is included
        columns (boolean): Selects the axis to index
            True: column attributes
            False: row attributes
        as_bool (bool): Return a boolean mask (True) or positions (False)
        inverse (bool): If true, the mask is negated before returning

    Returns:
        idx (1D array): Index of attributes to use
            boolean if as_bool, numerical if not as_bool

    Raises:
        ValueError: If positions are requested for a mask that is not 1D

    Assumptions:
        attr specifies a boolean array attribute in loom_file
    """
    with loompy.connect(filename=loom_file, mode='r') as ds:
        if attr:
            attribute_source = ds.ca if columns else ds.ra
            idx = attribute_source[attr].astype(bool)
        else:
            # No restriction requested: select everything along the axis.
            axis_length = ds.shape[1] if columns else ds.shape[0]
            idx = np.ones((axis_length, ), dtype=bool)

    if inverse:
        idx = np.logical_not(idx)

    if not as_bool:
        if idx.ndim != 1:
            raise ValueError('idx must be one dimensional')
        idx = np.where(idx)[0]
    return idx
示例#3
0
def load_exp_matrix_as_loom(
        fname,
        attribute_name_cell_id: str = ATTRIBUTE_NAME_CELL_IDENTIFIER,
        attribute_name_gene: str = ATTRIBUTE_NAME_GENE) -> pd.DataFrame:
    """
    Load expression matrix from loom file.

    :param fname: The name of the loom file to load.
    :param attribute_name_cell_id: Column attribute used to label the cells.
    :param attribute_name_gene: Row attribute used to label the genes.
    :return: A 2-dimensional dataframe (rows = cells x columns = genes).
    """
    with lp.connect(fname, mode='r', validate=False) as ds:
        # Loom stores genes as rows and cells (or aggregates of cells) as
        # columns, so the frame is built gene-major and transposed at the end.
        matrix = ds[:, :]
        gene_labels = ds.ra[attribute_name_gene]
        cell_labels = ds.ca[attribute_name_cell_id]
        genes_by_cells = pd.DataFrame(data=matrix,
                                      index=gene_labels,
                                      columns=cell_labels)
        return genes_by_cells.T
示例#4
0
文件: io.py 项目: nealpsmith/pegasus
def load_loom_file(input_loom: str,
                   genome: str,
                   ngene: int = None) -> "MemData":
    """Load count matrix from a LOOM file. Currently only support HCA DCP Loom spec.

    Column attribute ``CellID`` is renamed to ``barcodekey``; row attributes
    ``Accession`` and ``Gene`` are renamed to ``featurekey`` and
    ``featurename``. All other attributes keep their names.

    Parameters
    ----------

    input_loom : `str`
        The LOOM file, containing the count matrix.
    genome : `str`
        The genome reference.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing a genome-Array2D pair.

    Examples
    --------
    >>> io.load_loom_file('example.loom', genome = 'GRCh38', ngene = 200)
    """
    import loompy

    col_trans = {"CellID": "barcodekey"}
    row_trans = {"Accession": "featurekey", "Gene": "featurename"}

    data = MemData()
    with loompy.connect(input_loom) as ds:
        # The loom matrix is transposed before being handed to Array2D.
        mat = csr_matrix(ds.sparse().T)
        barcode_metadata = {
            col_trans.get(name, name): column
            for name, column in ds.col_attrs.items()
        }
        feature_metadata = {
            row_trans.get(name, name): row
            for name, row in ds.row_attrs.items()
        }

    array2d = Array2D(barcode_metadata, feature_metadata, mat)
    array2d.filter(ngene=ngene)
    data.addData(genome, array2d)

    return data
示例#5
0
def _load_loom(path_to_file: str,
               gene_names_attribute_name: str = "Gene") -> AnnData:
    """Load a loom file into an ``AnnData``, dropping cells with no counts.

    Parameters
    ----------
    path_to_file
        Path of the loom file to read.
    gene_names_attribute_name
        Row attribute whose values become ``var_names``. If the file has no
        such row attribute, ``var_names`` is left at the AnnData default.

    Returns
    -------
    An ``AnnData`` with cells as rows and genes as columns. Row attributes go
    to ``var``, column attributes to ``obs`` (or ``obsm`` when they hold more
    than one value per cell) and global attributes to ``uns``.
    """
    import loompy

    var_dict, obs_dict, uns_dict, obsm_dict = {}, {}, {}, {}
    gene_names = None

    # The context manager closes the file even if reading fails part-way.
    with loompy.connect(path_to_file) as dataset:
        matrix = dataset[:, :]  # read once; reused for the mask and for X
        select = matrix.sum(
            axis=0) > 0  # Take out cells that don't express any gene
        if not select.all():
            warnings.warn("Removing empty cells")

        for row_key in dataset.ra:
            if row_key == gene_names_attribute_name:
                gene_names = dataset.ra[gene_names_attribute_name].astype(str)
                continue
            row_values = dataset.ra[row_key]
            if isinstance(row_values, np.ndarray):
                row_values = row_values.ravel()
            var_dict[row_key] = row_values

        for column_key in dataset.ca:
            column_values = dataset.ca[column_key][select]
            if not isinstance(column_values, np.ndarray):
                obs_dict[column_key] = column_values
            elif len(column_values) == len(column_values.ravel()):
                # One value per cell: a regular observation column.
                obs_dict[column_key] = column_values.ravel()
            else:
                # Several values per cell: stored as a multi-dimensional
                # annotation instead.
                obsm_dict[column_key] = column_values

        for global_key in dataset.attrs:
            global_value = dataset.attrs[global_key]
            if isinstance(global_value, np.ndarray):
                global_value = global_value.ravel()
            uns_dict[global_key] = global_value

    # The per-cell attributes above are already restricted to `select`, so X
    # must be restricted the same way before building the AnnData; otherwise
    # obs and X disagree on the number of cells whenever a cell is removed.
    data = matrix[:, select].T  # change matrix to cells by genes

    adata = AnnData(X=data,
                    obs=obs_dict,
                    var=var_dict,
                    uns=uns_dict,
                    obsm=obsm_dict)
    if gene_names is not None:
        adata.var_names = gene_names

    return adata
示例#6
0
def main():
    """Copy a loom file and add two column attributes to the copy.

    Reads the input and output paths from ``sys.argv[1]`` and
    ``sys.argv[2]``. The input is copied to the output path, and the copy
    gets the ``insdc_run_accessions`` and ``file_name`` column attributes
    produced by ``get_sra``.
    """
    loom_in_file = sys.argv[1]
    loom_out_file = sys.argv[2]

    copy2(loom_in_file, loom_out_file)

    # The context manager guarantees the copy is closed even if get_sra or
    # one of the attribute writes raises.
    with lp.connect(loom_out_file) as loom_out:
        # get sra ids
        new_features = get_sra(loom_out)

        # adds them to file
        loom_out.ca.insdc_run_accessions = new_features["insdc_run_accessions"]
        loom_out.ca.file_name = new_features["file_name"]
示例#7
0
def run_mcmc(loomfile, model, hapcode, start, end, outfile):
    """Fit the ASE and TGX models gene by gene and save the fit summaries.

    Args:
        loomfile: Path of the loom file to read (opened read-only).
        model: Pair of model names; ``'<name>.pkl'`` is resolved through
            ``get_data`` for the ASE (index 0) and TGX (index 1) model.
        hapcode: Pair of layer names ``(maternal, paternal)``.
        start: First row index to process (0-based).
        end: One past the last row index; ``None`` means all rows.
        outfile: Output ``.npz`` path; ``None`` selects
            ``'_scbase.<start>-<end>.param.npz'``.

    Only rows whose ``Selected`` row attribute is truthy are fitted. The
    result maps each ``GeneID`` to a dict with keys ``'ase'`` and ``'tgx'``.
    """
    LOG.warn('Quantifying allele-specific expression in each cell')
    LOG.info('Level-1 verbose is on')
    LOG.debug('Level-2 verbose is also on')
    model_file_ase = '%s.pkl' % model[0]
    model_file_tgx = '%s.pkl' % model[1]

    # NOTE(review): pickle.load runs arbitrary code from the file; this
    # presumably only ever loads models shipped with the package - confirm
    # that get_data cannot resolve to user-supplied paths.
    LOG.warn('ASE model file: %s' % get_data(model_file_ase))
    with open(get_data(model_file_ase), 'rb') as model_fh:
        stan_model_ase = pickle.load(model_fh)
    LOG.debug('ASE model code\n%s' % stan_model_ase.model_code)
    LOG.warn('TGX model file: %s' % get_data(model_file_tgx))
    with open(get_data(model_file_tgx), 'rb') as model_fh:
        stan_model_tgx = pickle.load(model_fh)
    LOG.debug('TGX model code\n%s' % stan_model_tgx.model_code)

    # The context manager closes the loom file even when a fit raises.
    with loompy.connect(loomfile, 'r') as ds:
        if end is None:
            end = ds.shape[0]
        LOG.warn('Genes from %d to %d (0-based indexing)' % (start, end))
        # Per-cell size scaled by the median size.
        c = ds.ca.Size / np.median(ds.ca.Size)
        LOG.debug('c: %s' % '\t'.join(c[:6].astype(str)))
        param = dict()
        processed = 0
        mat_layer, pat_layer = hapcode
        for g in range(start, end):
            if not ds.ra.Selected[g]:
                continue
            LOG.warn('Loading data for Gene %s' % ds.ra['GeneID'][g])
            x = ds.layers[mat_layer][g]
            y = ds.layers[pat_layer][g]
            n = x + y  # total count = maternal + paternal
            LOG.debug('x: %s ...' % '\t'.join(x[:6].astype(int).astype(str)))
            LOG.debug('n: %s ...' % '\t'.join(n[:6].astype(int).astype(str)))
            cur_param = dict()
            LOG.warn('Fitting ASE with %s model' % model[0])
            cur_param['ase'] = __mcmc4ase(x, n, stan_model_ase).summary()['summary']
            LOG.warn('Fitting TGX with %s model' % model[1])
            cur_param['tgx'] = __mcmc4tgx(n, c, stan_model_tgx).summary()['summary']
            param[ds.row_attrs['GeneID'][g]] = cur_param
            processed += 1
        LOG.info("All {:,d} genes have been processed.".format(processed))
        if outfile is None:
            outfile = '_scbase.%05d-%05d.param.npz' % (start, end)
        np.savez_compressed(outfile, **param)
def high_mem_get_data(loom_file,
                      layer,
                      feat_attr,
                      cell_attr,
                      valid_ra,
                      valid_ca,
                      remove_version,
                      verbose):
    """
    Loads the selected counts of a loom layer into a dense dataframe

    Args:
        loom_file (str): Path to loom file
        layer (str): Layer in loom_file containing counts
        feat_attr (str): Row attribute containing unique feature IDs
        cell_attr (str): Column attribute containing unique cell IDs
        valid_ra (str/None): Row attribute specifying rows to include
        valid_ca (str/None): Column attribute specifying columns to include
        remove_version (bool): If True, remove GENCODE version ID
        verbose (bool): If true, print logging messages

    Returns:
        dat (dataframe): Counts with cell IDs as index and feature IDs
            as columns
    """
    if verbose:
        int_log.info('Obtaining counts from layer {0} in {1}'.format(layer, loom_file))

    # Positional (not boolean) indices for the rows and columns to keep
    shared_kwargs = dict(loom_file=loom_file, as_bool=False, inverse=False)
    row_idx = utils.get_attr_index(attr=valid_ra,
                                   columns=False,
                                   **shared_kwargs)
    col_idx = utils.get_attr_index(attr=valid_ca,
                                   columns=True,
                                   **shared_kwargs)

    # Pull the selected sub-matrix and label it
    with loompy.connect(loom_file) as ds:
        counts = ds.layers[layer].sparse(row_idx, col_idx).todense()
        feature_ids = ds.ra[feat_attr][row_idx]
        cell_ids = ds.ca[cell_attr][col_idx]
        dat = pd.DataFrame(counts, index=feature_ids, columns=cell_ids)

    if remove_version:
        dat.index = utils.remove_gene_version(dat.index.values)
    # Features were rows in the loom file; callers get cells as rows
    return dat.T
示例#9
0
 def load_loom_file(self,
                    file_path: Path,
                    abs_file_path: Path,
                    mode: str = "r") -> Optional[Loom]:
     """Open a loom file and register it through ``add_loom``.

     The file at ``abs_file_path`` is opened without validation in the
     requested ``mode``. If opening or registering raises ``KeyError``, the
     error is logged, ``file_path`` is removed from disk and ``None`` is
     returned. Any other exception propagates.
     """
     try:
         connection = lp.connect(abs_file_path.as_posix(),
                                 mode=mode,
                                 validate=False)
         registered = self.add_loom(
             file_path=file_path,
             abs_file_path=abs_file_path,
             loom_connection=connection,
         )
     except KeyError as err:
         logger.error(err)
         os.remove(file_path)
         logger.warning(f"Deleting malformed loom {file_path}")
         return None
     return registered
    def __init__(self,
                 loom,
                 schema=None,
                 cell_type_fields=None,
                 validate_loom=True):
        """
        loom: path to loom file
        schema: path to JSON schema file.  If schema not specified, attempts to use package
         or repo version.
        cell_type_fields: optionally specify one or more fields used to record cell type
        validate_loom: passed to loompy.connect as its ``validate`` argument

        If no schema is given and none of the default locations holds the
        schema file, a warning is issued and construction continues with the
        last location tried."""
        if not schema:
            # Package installation first, then running from the repo.
            default_locations = (
                "json_schema/expression_matrix_semantic_map.json",
                "../json_schema/expression_matrix_semantic_map.json",
            )
            for resource in default_locations:
                try:
                    schema = pkg_resources.resource_filename(
                        "matrix_semantic_map", resource)
                except Exception:
                    # Best-effort lookup: a location that cannot be resolved
                    # is treated the same as one that does not exist.
                    continue
                if os.path.isfile(schema):
                    break
            else:
                warnings.warn(
                    "Schema file (expression_matrix_semantic_map.json) "
                    "not found in expected default location for package"
                    " installation or running from repo. Please specify"
                    " location via schema argument.")

        self.loom = loom  # Connect and close when used.
        self.validate_loom = validate_loom
        self.semantic_map = {"semantic_map": []}
        with loompy.connect(loom, validate=self.validate_loom) as lc:
            # Reuse a semantic map already stored in the file, if any.
            if 'semantic_map' in lc.attrs.keys():
                self.semantic_map = json.loads(lc.attrs.semantic_map)
        self.validator = get_validator(schema)
        if cell_type_fields:
            for f in cell_type_fields:
                self.map_cell_type_field(f)
            # NOTE(review): self.ols is only set when cell_type_fields is
            # given, and only after the fields are mapped - confirm that
            # map_cell_type_field does not need it and that this placement
            # is intended.
            self.ols = OLSQueryWrapper()
示例#11
0
文件: loaders.py 项目: ritwik7/scvae
def _load_loom_data_set(paths):
    """Read values and annotations from the loom file at paths["all"]["full"].

    Returns a dictionary with the keys "values" (matrix transposed so that
    examples are rows), "labels", "example names", "feature names" and
    "batch indices". "labels" and "batch indices" are None when the file
    lacks the "ClusterID" and "BatchID" column attributes respectively;
    missing "Cell"/"Gene" names are replaced by generated 1-based names.
    """
    labels = None
    batch_indices = None

    with loompy.connect(paths["all"]["full"]) as data_file:

        values = data_file[:, :].T
        n_examples, n_features = values.shape
        column_attributes = data_file.ca

        if "ClusterID" in column_attributes:
            labels = column_attributes["ClusterID"].flatten()

            if "CellTypes" in data_file.attrs:
                # Cluster IDs index into the global list of cell type names.
                class_names = numpy.array(data_file.attrs["CellTypes"])

                def class_name_from_class_id(class_id):
                    return class_names[int(class_id)]

                labels = numpy.vectorize(class_name_from_class_id)(labels)

        if "Cell" in column_attributes:
            example_names = column_attributes["Cell"].flatten()
        else:
            example_names = numpy.array([
                "Cell {}".format(number)
                for number in range(1, n_examples + 1)
            ])

        if "Gene" in data_file.ra:
            feature_names = data_file.ra["Gene"].flatten()
        else:
            feature_names = numpy.array([
                "Gene {}".format(number)
                for number in range(1, n_features + 1)
            ])

        if "BatchID" in column_attributes:
            batch_indices = column_attributes["BatchID"].flatten()

    return {
        "values": values,
        "labels": labels,
        "example names": example_names,
        "feature names": feature_names,
        "batch indices": batch_indices
    }
示例#12
0
 def load_loom_file(self,
                    partial_md5_hash,
                    file_path,
                    abs_file_path,
                    rw=False):
     """Open a loom file in 'r+' mode and register it through add_loom.

     The ``rw`` argument is accepted but not used: the connection is always
     opened with mode 'r+'. If opening raises KeyError, the error is printed,
     ``file_path`` is removed from disk and None is returned.
     """
     try:
         connection = lp.connect(abs_file_path, mode='r+')
     except KeyError as err:
         print(err)
         os.remove(file_path)
         return None
     else:
         return self.add_loom(partial_md5_hash=partial_md5_hash,
                              file_path=file_path,
                              abs_file_path=abs_file_path,
                              loom_connection=connection)
示例#13
0
def calculate_ss2_metrics_loom(loom_url):
    """Calculate metrics for a loom file.

    The file at ``loom_url`` is downloaded into a temporary directory, which
    is removed again before returning.

    Returns a dict with "expression_sum" (sum of the main matrix),
    "expression_nonzero" (number of non-zero entries) and "cell_count"
    (number of columns).

    Raises ``requests.HTTPError`` if the download does not succeed.
    """
    with tempfile.TemporaryDirectory(suffix="loom_test") as temp_dir:
        local_loom_path = os.path.join(temp_dir, os.path.basename(loom_url))
        with requests.get(loom_url, stream=True) as response:
            # Fail here rather than writing an error page to disk and
            # letting loompy choke on it.
            response.raise_for_status()
            with open(local_loom_path, "wb") as local_loom_file:
                shutil.copyfileobj(response.raw, local_loom_file)

        with loompy.connect(local_loom_path) as ds:
            matrix = ds[:, :]  # read once, used for both metrics
            cell_count = ds.shape[1]

    return {
        "expression_sum": numpy.sum(matrix),
        "expression_nonzero": numpy.count_nonzero(matrix),
        "cell_count": cell_count
    }
示例#14
0
    def test_loom(self, mock_upload_method):
        """Test the loom output.

        Runs the converter with the request tracker and ``os.remove``
        patched, then checks that the cells of ``test.loom`` and their
        non-zero expression values match ``self.direct_expression``.
        """

        args = argparse.Namespace(request_id="test_id",
                                  expression_manifest_key=EXPRESSION_MANIFEST,
                                  cell_metadata_manifest_key=CELL_MANIFEST,
                                  gene_metadata_manifest_key=GENE_MANIFEST,
                                  target_path="test.loom",
                                  format="loom",
                                  working_dir=".")

        with mock.patch("matrix.docker.matrix_converter.RequestTracker") as mock_request_tracker, \
                mock.patch("os.remove"):
            matrix_converter = MatrixConverter(args)
            matrix_converter.FS = s3fs.S3FileSystem(anon=True)

            mock_request_tracker.return_value.creation_date = "1983-10-11T000000.00Z"

            matrix_converter.run()

        # The context manager closes the loom file even when an assertion
        # fails, so a failing test does not leak the handle.
        with loompy.connect("test.loom") as test_loom:
            self.assertListEqual(test_loom.ca["CellID"].tolist(),
                                 list(self.direct_expression.keys()))

            for col, cellkey in enumerate(test_loom.ca["CellID"]):
                # Only non-zero entries are compared.
                loom_cell_expr = {
                    k: v
                    for k, v in zip(test_loom.ra["Accession"],
                                    test_loom[:, col])
                    if v != 0
                }
                direct_cell_expr = self.direct_expression[cellkey]

                self.assertListEqual(list(loom_cell_expr.keys()),
                                     list(direct_cell_expr.keys()))

                for gene in loom_cell_expr:
                    self.assertAlmostEqual(loom_cell_expr[gene],
                                           direct_cell_expr[gene],
                                           places=2)
示例#15
0
def gene_signature_wizard_main(loomfile=None, signaturefile=None):
    """Interactively add a gene signature to a loom file as a row attribute.

    The signature is written as a boolean row attribute that marks which
    entries of the ``gene`` row attribute belong to the signature.

    Parameters
    ----------
    loomfile :
        Path of the loom file to augment. Prompted for when None; the prompt
        repeats until an existing ``.loom`` file is given.
         (Default value = None)
    signaturefile :
        Headerless single-column file with the signature genes. Prompted for
        when None.
         (Default value = None)

    Returns
    -------
    None

    """
    print(loomfile)
    if loomfile is None:
        loomfile = click.prompt(
            "Loom file that you would like to augment with a gene signature: ")
        while not (os.path.isfile(loomfile) and loomfile.endswith('.loom')):
            loomfile = click.prompt(
                "Not a loom file.  Please select loom file that you would like to augment with a gene signature: "
            )
    if signaturefile is None:
        signaturefile = click.prompt(
            "Gene list that you would like to add as a gene signature (headerless file, single column): "
        )
    # genfromtxt returns a 0-d array for a one-line file, which has no len();
    # atleast_1d normalises that. unique() keeps repeated genes from being
    # mistaken for genes missing from the loom file in the count check below.
    signature = np.unique(
        np.atleast_1d(np.genfromtxt(signaturefile, dtype=str)))
    with loompy.connect(loomfile, validate=False) as loom:
        loom_genes = loom.ra['gene']
        present = np.intersect1d(signature, loom_genes)
        missing = np.setdiff1d(signature, loom_genes)
        proceed = 'y'
        if len(present) < len(signature):
            proceed = click.prompt(
                "The following genes ({} in total) in the given signature\n{}\nare not in the loom file.  Would you like to proceed with those that are ({} genes in total)?"
                .format(len(missing), ", ".join(missing), len(present)),
                type=click.Choice(['n', 'y']),
                default='y')
        if proceed == 'y':
            # Default name: file name up to its first dot.
            signature_name = click.prompt(
                "What would you like to name this signature?",
                default=signaturefile.split('/')[-1].split('.')[0])
            loom.ra[signature_name] = np.isin(loom_genes, signature)
示例#16
0
    def read_loom(filename: str, tag: str = None):
        """Read a loom file fully into memory and wrap it in a SCopeLoom.

        The expression matrix is loaded as a cells x genes dataframe, and
        the column, row and global attributes are copied into plain dicts.
        The ``MetaData`` global attribute is decoded with
        ``SCopeLoom.decompress_decode``; if that fails, it is parsed as
        plain JSON instead.
        """
        with lp.connect(filename, mode="r", validate=False) as loom:

            # Main matrix: genes x cells on disk, transposed to cells x genes.
            genes_by_cells = pd.DataFrame(loom[:, :],
                                          index=loom.ra.Gene,
                                          columns=loom.ca.CellID)
            ex_mtx = genes_by_cells.T
            # Copy the attribute managers into plain dictionaries so that
            # they remain usable after the connection is closed.
            col_attrs = dict(loom.ca.items())
            row_attrs = dict(loom.ra.items())
            global_attrs = dict(loom.attrs.items())
            try:
                decoded_meta_data = SCopeLoom.decompress_decode(
                    value=global_attrs["MetaData"])
            except Exception:
                # MetaData is uncompressed
                decoded_meta_data = json.loads(global_attrs["MetaData"])
            global_attrs["MetaData"] = decoded_meta_data

        scope_loom = SCopeLoom(
            filename=filename,
            ex_mtx=ex_mtx,
            col_attrs=col_attrs,
            row_attrs=row_attrs,
            global_attrs=global_attrs,
            tag=tag,
        )
        if "embeddings" in scope_loom.get_meta_data():
            scope_loom.convert_loom_embeddings_repr_to_internal_repr()

        # Multi-runs mode carries extra regulon settings in the meta data.
        if scope_loom.has_scenic_multi_runs_data():
            scope_loom.set_scenic_min_genes_regulon(
                min_genes_regulon=global_attrs["MetaData"]["regulonSettings"]
                ["min_genes_regulon"])
            scope_loom.set_scenic_min_regulon_gene_occurrence(
                min_regulon_gene_occurrence=global_attrs["MetaData"]
                ["regulonSettings"]["min_regulon_gene_occurrence"])

        return scope_loom
示例#17
0
 def load_loom_file(self,
                    partial_md5_hash: str,
                    file_path: str,
                    abs_file_path: str,
                    mode: str = "r"):
     """Open a loom file without validation and register it via add_loom.

     If opening raises KeyError, the error is logged, ``file_path`` is
     removed from disk and None is returned. Errors raised by ``add_loom``
     are not handled here.
     """
     try:
         connection = lp.connect(abs_file_path, mode=mode, validate=False)
     except KeyError as err:
         logger.error(err)
         os.remove(file_path)
         logger.warning(f"Deleting malformed loom {file_path}")
         return None
     else:
         return self.add_loom(
             partial_md5_hash=partial_md5_hash,
             file_path=file_path,
             abs_file_path=abs_file_path,
             loom_connection=connection,
         )
示例#18
0
文件: smfish.py 项目: sagoyal2/scVI
    def preprocess(self):
        """Load the smFISH loom file and drop cells without any counts.

        Returns a tuple ``(data, labels, cell_types, x_coord, y_coord)``.
        ``data`` is the count matrix as cells by genes; the other four are
        column vectors (one row per kept cell) taken from the ``ClusterID``,
        ``ClusterName``, ``X`` and ``Y`` column attributes.
        """
        print("Preprocessing smFISH dataset")

        def as_selected_column(values, mask):
            # One row per cell, restricted to the cells that are kept.
            values = np.array(values)
            return np.reshape(values, (values.shape[0], 1))[mask]

        # The context manager closes the file even if an attribute is missing.
        with loompy.connect(self.save_path + self.download_name) as ds:
            matrix = ds[:, :]  # read once; reused for the mask and the data
            select = matrix.sum(
                axis=0) > 0  # Take out cells that doesn't express any gene

            labels = as_selected_column(ds.ca['ClusterID'], select)
            cell_types = as_selected_column(ds.ca['ClusterName'], select)
            x_coord = as_selected_column(ds.ca['X'], select)
            y_coord = as_selected_column(ds.ca['Y'], select)

        data = matrix[:, select].T  # change matrix to cells by genes

        print("Finished preprocessing smFISH dataset")
        return data, labels, cell_types, x_coord, y_coord
def get_pct(loom_file, num_val, axis=0):
    """
    Expresses a number as a percentage of the length of a loom axis

    Args:
        loom_file (str): Path to loom file
        num_val (int): Number to calculate percentage with
        axis (int): Axis to calculate percentage with
            0: rows
            1: columns

    Returns:
        pct (float): num_val divided by the axis length, times 100

    Raises:
        ValueError: If axis is neither 0 nor 1
    """
    # Reject a bad axis before the file is touched
    if axis not in (0, 1):
        raise ValueError('Axis must be 0 or 1')
    with loompy.connect(filename=loom_file, mode='r') as ds:
        axis_length = ds.shape[axis]
        return num_val / axis_length * 100
示例#20
0
    def get_loom(self, loom_file_path: Path, mode: str = "r") -> Loom:
        """Return the loom object for a file, loading it if necessary.

        The work is done while holding the per-file lock from
        ``self.file_locks``. A loom already in ``self.active_looms`` is
        reused; if its connection can no longer be queried it is reconnected
        in the requested ``mode``. Otherwise the file is loaded through
        ``self.load_loom_file``.

        Raises:
            ValueError: If the file does not exist, or if
                ``load_loom_file`` returned None for it.
        """
        abs_loom_file_path = self.get_loom_absolute_file_path(loom_file_path)
        with self.file_locks[abs_loom_file_path]:
            if not abs_loom_file_path.exists():
                logger.error(f"The file {loom_file_path} does not exists.")
                raise ValueError(
                    f"The file located at {abs_loom_file_path} does not exist."
                )
            if abs_loom_file_path in self.active_looms:
                logger.debug("Should be preloaded")
                loom = self.active_looms[abs_loom_file_path]
                try:
                    current_mode = loom.get_connection().mode
                except AttributeError:
                    logger.error("Loom was previously closed")
                    loom.loom_connection = lp.connect(
                        abs_loom_file_path.as_posix(),
                        mode=mode,
                        validate=False)
                else:
                    logger.debug(
                        f"Current mode: {current_mode}, wanted mode {mode}"
                    )
                    if current_mode == mode:
                        logger.debug(
                            f"Returning pre-loaded loom file {loom_file_path}. Object {id(loom)}"
                        )
                        return loom
                    # NOTE(review): on a mode mismatch the loom is still
                    # returned in its current mode after this error is
                    # logged - confirm that is intended.
                    logger.error(
                        f"Mode {mode} was requested for {loom_file_path}, but mode is currently {current_mode}"
                    )
            else:
                loom = self.load_loom_file(mode=mode,
                                           file_path=loom_file_path,
                                           abs_file_path=abs_loom_file_path)
                if loom is None:
                    # Without this guard the debug message below would fail
                    # with an unrelated AttributeError on None.
                    raise ValueError(
                        f"The file located at {abs_loom_file_path} could not be loaded as a loom file."
                    )
                logger.debug(
                    f"Returning newly loaded loom file {loom_file_path}. Object {id(loom)}, mode {loom.get_connection().mode}"
                )
        return loom
示例#21
0
    def validate(self, path: str, strictness: str = "speconly") -> bool:
        """
        Validate a file for conformance to the Loom specification

        Args:
            path:           Full path to the file to be validated
            strictness:     "speconly" or "conventions"

        Returns:
            True if the file passed every check that was run, else False.
            Help URLs for failed checks are appended to ``self.errors``.

        Raises:
            ValueError: If ``self.backend`` is neither "hdf5" nor "zarr".

        Remarks:
            In "speconly" mode, conformance is assessed relative to the file format specification
            at http://linnarssonlab.org/loompy/format/. In "conventions" mode, conformance is additionally
            assessed relative to attribute name and data type conventions given at http://linnarssonlab.org/loompy/conventions/.
        """
        if self.backend == "hdf5":
            open_func = h5py.File
        elif self.backend == "zarr":
            open_func = zarr.open_group
        else:
            raise ValueError("Unsupported backend: %r" % (self.backend,))

        f = open_func(path, mode="r")
        try:
            if self.version is None:
                self.version = get_loom_spec_version(f)
            valid1 = self.validate_spec(f)
        finally:
            # Only the hdf5 handle is closed explicitly, and it is released
            # even when validation raises.
            if self.backend == "hdf5":
                f.close()
        if not valid1:
            self.errors.append(
                "For help, see http://linnarssonlab.org/loompy/format/")

        valid2 = True
        if strictness == "conventions":
            with loompy.connect(path, mode="r") as ds:
                valid2 = self.validate_conventions(ds)
                if not valid2:
                    self.errors.append(
                        "For help, see http://linnarssonlab.org/loompy/conventions/"
                    )

        return valid1 and valid2
示例#22
0
	def __init__(self, project: str, filename: str, file_path: str, callback_on_close: Callable = None, close_connection_on_exit: bool = True) -> None:
		"""
		Store the file identity and open a loom connection in 'r+' mode.

		Args:
			project:                    Stored as ``self.project``
			filename:                   Stored as ``self.filename``
			file_path:                  Path passed to ``loompy.connect``
			callback_on_close:          Optional callable; invoked with this object if the file cannot be opened
			close_connection_on_exit:   Stored as ``self.close_connection_on_exit``

		If the connection cannot be opened, a warning is logged, the callback
		(if any) is invoked, the object is marked closed and the original
		exception is re-raised.
		"""
		self.project = project
		self.filename = filename
		self.file_path = file_path
		self.close_connection_on_exit = close_connection_on_exit
		self.callback_on_close = callback_on_close
		self._closed = False
		self.ds = None
		try:
			# TODO: when loompy library is updated with a default
			# Unix timestamp for missing time fields, this should
			# be set back to 'r' for safety reasons
			self.ds = loompy.connect(file_path, 'r+')
		except Exception:
			logging.warning("Could not open loom file at %s, closing LoomExpand object", file_path)
			# self.ds is still None here: the assignment above never happened,
			# so there is no connection to close.
			if self.callback_on_close is not None:
				self.callback_on_close(self)
			self._closed = True
			raise
示例#23
0
def create_subsetted_loom_with_genemask(loom, output_loom, cellmask, genemask):
    """Deprecated. Write a row/column subset of a loom connection to a new file.

    Parameters
    ----------
    loom :
        Open loom connection to subset; it must have a '' (main) layer.
    output_loom :
        Path of the loom file to create.
    cellmask :
        Boolean mask over the columns of ``loom`` (must support
        ``.nonzero()``); its length must equal ``loom.shape[1]``.
    genemask :
        Boolean mask over the rows of ``loom`` (must support
        ``.nonzero()``); its length must equal ``loom.shape[0]``.

    Returns
    -------
    None

    Raises
    ------
    Exception
        If ``loom`` has no '' layer or a mask has the wrong length.
    """
    print("THIS FUNCTION IS DEPRECATED, USE loompy.new INSTEAD!!!")
    import loompy

    from panopticon.utilities import recover_meta
    if '' not in loom.layers.keys():
        raise Exception("Expecting '' layer, yet none found")
    rowmeta, colmeta = recover_meta(loom)
    if len(genemask) != loom.shape[0] or len(cellmask) != loom.shape[1]:
        raise Exception(
            "genemask and cellmask must be boolean masks with length equal to the number of rows and columns of loom, respectively"
        )
    # Positional indices are computed once and reused for every layer.
    gene_idx = genemask.nonzero()[0]
    cell_idx = cellmask.nonzero()[0]
    loompy.create(output_loom,
                  loom[''][gene_idx, :][:, cell_idx],
                  rowmeta[genemask].to_dict("list"),
                  colmeta[cellmask].to_dict("list"))
    with loompy.connect(output_loom) as smallerloom:
        # The main layer was written by create(); copy the remaining ones.
        for layer in loom.layers.keys():
            if layer == '':
                continue
            smallerloom[layer] = loom[layer][:, cell_idx][gene_idx, :]
示例#24
0
def check_pca_batches(loom_file, n_pca=50, batch_size=512, verbose=False):
    """
    Checks and adjusts batch size for PCA

    Args:
        loom_file (str): Path to loom file
        n_pca (int): Number of components for PCA
        batch_size (int): Size of chunks
        verbose (bool): Print logging messages

    Returns:
        batch_size (int): Updated batch size to work with PCA

    Raises:
        ValueError: If the file has fewer columns than n_pca
    """
    # Get the number of cells
    with loompy.connect(loom_file) as ds:
        num_total = ds.shape[1]
    # Check if batch_size and PCA are even reasonable
    if num_total < n_pca:
        err_msg = 'More PCA components {0} than samples {1}'.format(
            n_pca, num_total)
        if verbose:
            decomp_log.error(err_msg)
        # Raise with the message itself, not the logger object
        raise ValueError(err_msg)
    # A batch can never be smaller than the number of components
    batch_size = max(batch_size, n_pca)
    # Adjust based on expected size: shrink the batch when the remainder
    # after full batches is smaller than n_pca
    mod_total = num_total % batch_size
    adjusted_batch = mod_total < n_pca
    if adjusted_batch:
        batch_size = batch_size - n_pca + mod_total
    # If the adjustment made the batch too small, use everything at once
    if batch_size < n_pca:
        batch_size = num_total
    # Report to user
    if verbose and adjusted_batch:
        decomp_log.info(
            'Adjusted batch size to {0} for PCA'.format(batch_size))
    # Return value
    return batch_size
示例#25
0
def load_loom(filename):
    """Load the main matrix and the row/column attributes of a loom file

    From github.com/simslab/scHPF

    Parameters
    ----------
    filename: str
        file to load

    Returns
    -------
    coo : coo_matrix
        cell x gene sparse count matrix (the transpose of the loom matrix)
    genes : Dataframe
        Dataframe of gene (row) attributes.  Columns are ordered so that
        Accession and Gene come first, if those attributes are present,
        followed by the remaining columns in the order returned by
        `Index.difference`
    cells : Dataframe
        Dataframe of cell (column) attributes
    """
    import loompy

    # read everything that is needed while the connection is open
    with loompy.connect(filename) as ds:
        gene_frame = pd.DataFrame(dict(ds.ra.items()))
        cell_frame = pd.DataFrame(dict(ds.ca.items()))
        cell_by_gene = ds.sparse().T

    # put Accession and Gene in front when the file provides them
    leading = [
        name for name in ('Accession', 'Gene')
        if name in gene_frame.columns
    ]
    trailing = gene_frame.columns.difference(leading).tolist()
    gene_frame = gene_frame[leading + trailing]

    return cell_by_gene, gene_frame, cell_frame
    def test_ops(self):
        """Request a filtered loom matrix and check how many cells it holds.

        Posts a matrix request combining "=", "!=" and "in" filter
        operations under a top-level "and", waits for all requests to
        complete, downloads the resulting loom file for the human request
        and asserts that it has four columns.

        The download directory, the HTTP response and the loom connection
        are all released before the test returns.
        """
        # Filter should return four of the five test bundles
        ops_filter = {
            "op": "and",
            "value": [{
                "op": "=",
                "field":
                "library_preparation_protocol.library_construction_method.ontology",
                "value": "EFO:0008931"
            }, {
                "op": "!=",
                "field": "derived_organ_label",
                "value": "decidua"
            }, {
                "op": "in",
                "field": "dss_bundle_fqid",
                "value": INPUT_BUNDLE_IDS[self.dss_env]
            }]
        }
        self.request_ids = self._post_matrix_service_request(
            filter_=ops_filter, format_="loom")

        WaitFor(self._poll_all_requests_in_status, self.request_ids, MatrixRequestStatus.COMPLETE.value)\
            .to_return_value(True, timeout_seconds=1200)
        matrix_location = self._retrieve_matrix_location(
            self.request_ids[GenusSpecies.HUMAN.value])

        # The temporary directory is removed on exit, so the downloaded
        # matrix does not accumulate across test runs
        with tempfile.TemporaryDirectory(suffix="loom_ops_test") as temp_dir:
            local_loom_path = os.path.join(temp_dir,
                                           os.path.basename(matrix_location))
            with requests.get(matrix_location, stream=True) as response, \
                    open(local_loom_path, "wb") as local_loom_file:
                shutil.copyfileobj(response.raw, local_loom_file)

            # Close the loom file before the directory is cleaned up
            with loompy.connect(local_loom_path) as ds:
                self.assertEqual(ds.shape[1], 4)
def continuous_loom(input_file):
    """Continuous matrix attribute handler for loom files

    Supported file format may be different for each server, this function maps
    loom-specific attributes to a general data structure that can be used by
    all content testing functions regardless of file format

    Arguments:
        input_file (str): input loom file

    Returns:
        (dict): attribute handler, consistent structure regardless of file
            type. "Track" and "Position" hold the `tracks` row attribute and
            the `position` column attribute; "Value" and "FH" both hold the
            open loom connection, which the caller must close after content
            testing

    Raises:
        Any exception raised while reading the attributes is re-raised after
        the loom connection has been closed
    """

    # connects to the loom file, then remaps loom specific attributes to general
    # attribute names which are used in the content testing functions
    ds = loompy.connect(input_file)
    try:
        return {
            "Track": ds.ra.tracks,
            "Position": ds.ca.position,
            "Value": ds,
            "FH": ds # loom file handle, should be closed after content testing
        }
    except Exception:
        # the caller never receives the handle on failure, so close it here
        # instead of leaking the open file
        ds.close()
        raise
示例#28
0
    def _load_data(self, skip_row=None, skip_col=None, **kwargs):
        """Read the loom file at ``self._file_name`` and return its contents.

        The main matrix is transposed after loading, so loom columns become
        the rows of ``X`` and loom rows become its columns.

        Args:
            skip_row: optional callable taking a loom row index; rows for
                which it returns a truthy value are dropped.
            skip_col: optional callable taking a loom column index; columns
                for which it returns a truthy value are dropped.
            **kwargs: accepted but not used here.

        Returns:
            tuple: ``(attrs, X, meta_df, meta_df.index)`` where ``attrs`` are
            ContinuousVariable objects made from the kept values of the
            ``Gene`` row attribute (empty if the file has no such attribute),
            ``X`` is the filtered, transposed matrix and ``meta_df`` holds
            the kept entries of every column attribute.

        Side effects:
            Stores the keep-masks on ``self._use_rows_mask`` and
            ``self._use_cols_mask``.
        """
        with lp.connect(self._file_name) as ds:
            X = ds[:, :].T
            # after the transpose: axis 0 = loom columns, axis 1 = loom rows
            n_loom_cols, n_loom_rows = X.shape

            if skip_row is None:
                keep_rows = np.ones(n_loom_rows, dtype=bool)
            else:
                keep_rows = np.array(
                    [not skip_row(idx) for idx in range(n_loom_rows)])
            self._use_rows_mask = keep_rows

            if skip_col is None:
                keep_cols = np.ones(n_loom_cols, dtype=bool)
            else:
                keep_cols = np.array(
                    [not skip_col(idx) for idx in range(n_loom_cols)])
            self._use_cols_mask = keep_cols

            X = X[keep_cols, :][:, keep_rows]

            if hasattr(ds.ra, "Gene"):
                gene_names = ds.ra.Gene[keep_rows]
            else:
                gene_names = []
            attrs = [ContinuousVariable.make(str(name)) for name in gene_names]

            kept_col_attrs = {}
            for key in ds.ca.keys():
                kept_col_attrs[key] = ds.ca[key][keep_cols]
            meta_df = pd.DataFrame(kept_col_attrs)
        return attrs, X, meta_df, meta_df.index
示例#29
0
def calculate_ss2_metrics_loom(loom_url):
    """Calculate metrics for a loom file.

    Downloads the zip archive at ``loom_url`` into a temporary directory,
    extracts the first member whose name ends in ".loom" into that same
    directory, and computes summary metrics from its main matrix. The
    temporary directory and every opened handle are released before
    returning.

    Args:
        loom_url (str): URL of a zip archive containing a loom file

    Returns:
        dict: "expression_sum" (sum of all matrix values),
            "expression_nonzero" (number of nonzero matrix values) and
            "cell_count" (number of matrix columns)

    Raises:
        IndexError: If the archive contains no member ending in ".loom"
    """

    with tempfile.TemporaryDirectory(suffix="loom_zip_test") as temp_dir:
        local_loom_zip_path = os.path.join(temp_dir,
                                           os.path.basename(loom_url))
        with requests.get(loom_url, stream=True) as response, \
                open(local_loom_zip_path, "wb") as local_loom_zip_file:
            shutil.copyfileobj(response.raw, local_loom_zip_file)

        with zipfile.ZipFile(local_loom_zip_path) as loom_zip:
            loom_name = [
                n for n in loom_zip.namelist() if n.endswith(".loom")
            ][0]
            # Extract only the loom member, and into the temp directory
            # rather than the current working directory
            loom_path = loom_zip.extract(loom_name, path=temp_dir)

        with loompy.connect(loom_path) as ds:
            # Read the matrix once and reuse it for both metrics
            matrix = ds[:, :]
            cell_count = ds.shape[1]

        expression_sum = numpy.sum(matrix)
        expression_nonzero = numpy.count_nonzero(matrix)

    return {
        "expression_sum": expression_sum,
        "expression_nonzero": expression_nonzero,
        "cell_count": cell_count
    }
示例#30
0
    def __init__(self, loom_file_path, total_clusters=6):
        """Load the spliced, unspliced and ambiguous layers of a loom file.

        Each layer is read in full, cast to float and transposed. The three
        transposed layers are stacked, the first two axes of the stack are
        swapped, and the result is passed through ``sphere_data`` and stored
        as ``self.cells``. The "Clusters" column attribute is stored as
        ``self.clusters``. Several shapes are printed along the way.

        The loom connection is stored on ``self.ds`` and is not closed here.

        Args:
            loom_file_path: path of the loom file to open
            total_clusters: stored unchanged on ``self.total_clusters``
        """
        self.total_clusters = total_clusters
        self.ds = loompy.connect(loom_file_path)

        as_float = np.dtype(float)
        layers = self.ds.layer
        spliced = layers["spliced"][:, :].astype(as_float)
        unspliced = layers["unspliced"][:, :].astype(as_float)
        ambiguous = layers["ambiguous"][:, :].astype(as_float)

        self.spliced = np.transpose(spliced)
        self.unspliced = np.transpose(unspliced)
        self.ambig = np.transpose(ambiguous)
        self.cells = np.stack((self.spliced, self.unspliced, self.ambig))

        col_attrs = dict(self.ds.col_attrs.items())
        self.clusters = col_attrs["Clusters"][:]

        print(self.unspliced.shape)
        print(self.cells.shape)
        # move the layer axis from position 0 to position 1 before sphering
        layer_second = np.transpose(self.cells, (1, 0, 2))
        self.cells = sphere_data(layer_second)
        print(self.cells.shape)

        print("len cells", len(self.cells))
        print("shape cells[0]", self.cells[0].shape)
示例#31
0
def read_loom(
    file_path: str,
    mode_type: str = "rna",
    force_conversion=None,
) -> LoomX:
    """Read a loom file into a LoomX object.

    The file is opened read-only without validation. If its global
    attributes contain any key from GLOBAL_ATTRIBUTE_KEY_VX, reading is
    delegated to ``_read_scope_loom``; otherwise the file is rejected.

    Parameters
    ----------
    file_path
        Path of the loom file.
    mode_type
        Value of a ``ModeType`` member.
    force_conversion
        Passed on to ``_read_scope_loom``. Defaults to
        ``{"annotations": False, "metrics": False}``; a fresh dict is
        created on every call so the default is never shared between calls.

    Returns
    -------
    The object returned by ``_read_scope_loom``.

    Raises
    ------
    Exception
        If ``mode_type`` is not a valid ``ModeType`` value, or if the file
        has none of the expected global attributes.
    """
    if force_conversion is None:
        force_conversion = {"annotations": False, "metrics": False}

    try:
        _mode_type = ModeType(mode_type)
    except ValueError as err:
        # Only an invalid enum value is translated into the friendly
        # message; KeyboardInterrupt/SystemExit are no longer swallowed
        mode_types = [m.value for m in ModeType if m.value != "_"]
        raise Exception(
            f"The given mode type '{mode_type}' does not exist. Choose one of: {', '.join(mode_types)}."
        ) from err

    with lp.connect(filename=file_path, mode="r", validate=False) as loom_connection:
        if any(key in loom_connection.attrs for key in GLOBAL_ATTRIBUTE_KEY_VX):
            return _read_scope_loom(
                loom_connection=loom_connection,
                mode_type=_mode_type,
                force_conversion=force_conversion,
            )
        raise Exception(f"Unable to read the loom at {file_path}")