Example No. 1
    def _load_data(self, data_locator):
        try:
            # there is no guarantee data_locator indicates a local file.  The AnnData
            # API will only consume local file objects.  If we get a non-local object,
            # make a copy in tmp, and delete it after we load into memory.
            with data_locator.local_handle() as lh:
                # as of AnnData 0.6.19, backed mode performs initial load fast, but at the
                # cost of significantly slower access to X data.
                backed = "r" if self.server_config.adaptor__anndata_adaptor__backed else None
                self.data = anndata.read_h5ad(lh, backed=backed)

        except ValueError:
            raise DatasetAccessError(
                "File must be in the .h5ad format. Please read "
                "https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md to "
                "learn more about this format. You may be able to convert your file into this format "
                "using `cellxgene prepare`, please run `cellxgene prepare --help` for more "
                "information."
            )
        except MemoryError:
            raise DatasetAccessError("Out of memory - file is too large for available memory.")
        except Exception:
            raise DatasetAccessError(
                "File not found or is inaccessible. File must be an .h5ad object. "
                "Please check your input and try again."
            )
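
The comment above pins down the contract of DataLocator.local_handle() without showing it: hand AnnData a local path, copying remote data to a temp file that is removed after the load. A minimal sketch of that pattern, assuming a plain urllib fetch stands in for whatever transport the real DataLocator uses:

import contextlib
import os
import shutil
import tempfile
import urllib.request

@contextlib.contextmanager
def local_handle(uri):
    # Yield a local path for uri; remote content is copied to a temp file
    # that is deleted on exit, per the comment above (sketch, not the real API).
    if os.path.exists(uri):
        yield uri
        return
    tmp = tempfile.NamedTemporaryFile(suffix=".h5ad", delete=False)
    try:
        with urllib.request.urlopen(uri) as src:
            shutil.copyfileobj(src, tmp)
        tmp.close()
        yield tmp.name
    finally:
        tmp.close()
        os.unlink(tmp.name)
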
Example No. 2
def get_data_adaptor(url_dataroot=None, dataset=None):
    config = current_app.app_config
    server_config = config.server_config
    dataset_key = None

    if dataset is None:
        datapath = server_config.single_dataset__datapath
    else:
        dataroot = None
        for key, dataroot_dict in server_config.multi_dataset__dataroot.items():
            if dataroot_dict["base_url"] == url_dataroot:
                dataroot = dataroot_dict["dataroot"]
                dataset_key = key
                break

        if dataroot is None:
            raise DatasetAccessError(
                f"Invalid dataset {url_dataroot}/{dataset}")
        datapath = path_join(dataroot, dataset)
        # path_join returns a normalized path.  Therefore it is
        # sufficient to check that the datapath starts with the
        # dataroot to determine that the datapath is under the dataroot.
        if not datapath.startswith(dataroot):
            raise DatasetAccessError(
                f"Invalid dataset {url_dataroot}/{dataset}")

    if datapath is None:
        return common_rest.abort_and_log(HTTPStatus.BAD_REQUEST,
                                         "Invalid dataset NONE",
                                         loglevel=logging.INFO)

    cache_manager = current_app.matrix_data_cache_manager
    return cache_manager.data_adaptor(dataset_key, datapath, config)
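
The traversal check deserves a concrete illustration. A small standalone sketch, assuming path_join behaves like os.path.normpath(os.path.join(...)) as the comment states, and assuming POSIX paths:

import os.path

DATAROOT = "/data/cxg"

def is_under_dataroot(dataset):
    # assume path_join normalizes like normpath(join(...)), per the comment above
    datapath = os.path.normpath(os.path.join(DATAROOT, dataset))
    return datapath.startswith(DATAROOT)

print(is_under_dataroot("pbmc3k.cxg"))        # True
print(is_under_dataroot("../../etc/passwd"))  # False: normalizes to /etc/passwd

One caveat worth noting: a bare startswith comparison would also accept a sibling directory such as /data/cxg-other, so the normalization contract of the real path_join matters; the sketch only shows why normalization defeats ../ traversal.
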
Example No. 3
    def pre_load_validation(data_locator):
        if data_locator.islocal():
            # if the data locator is local, apply file system conventions and other
            # "cheap" validation checks.  If it is a URI, defer until we actually
            # fetch the data and try to read it.  Many of these tests don't make
            # sense for URIs (e.g., extension-based typing).
            if not data_locator.exists():
                raise DatasetAccessError("does not exist")
            if not data_locator.isfile():
                raise DatasetAccessError("is not a file")
Example No. 4
    def query_obs_array(self, term_name):
        var = self.open_array("obs")
        try:
            data = var.query(attrs=[term_name])[:][term_name]
        except tiledb.libtiledb.TileDBError:
            raise DatasetAccessError("query_obs")
        return data
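
For context, the obs annotations live in a TileDB array with one attribute per column, so query(attrs=[...]) restricts the read to a single column. A hypothetical direct call against such an array (the path and the "cell_type" attribute name are assumptions):

import tiledb

# Open the obs array and read one attribute column; slicing the query
# returns a dict mapping attribute names to numpy arrays.
with tiledb.open("example.cxg/obs") as obs:
    cell_types = obs.query(attrs=["cell_type"])[:]["cell_type"]
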
Example No. 5
    def get_embedding_names(self):
        with ServerTiming.time("layout.lsuri"):
            pemb = self.get_path("emb")
            embeddings = [os.path.basename(p) for (p, t) in self.lsuri(pemb) if t == "array"]
        if len(embeddings) == 0:
            raise DatasetAccessError("cxg matrix missing embeddings")
        return embeddings
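
A sketch of what the comprehension does, assuming lsuri yields (uri, object_type) pairs for the group's members; the URIs here are invented for illustration:

import os.path

# Only TileDB arrays under emb/ count as embeddings:
listing = [
    ("s3://bucket/example.cxg/emb/umap", "array"),
    ("s3://bucket/example.cxg/emb/tsne", "array"),
    ("s3://bucket/example.cxg/emb/extra", "group"),
]
embeddings = [os.path.basename(p) for (p, t) in listing if t == "array"]
print(embeddings)  # ['umap', 'tsne']
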
Example No. 6
    def use_dataset_with_error(self, matrix_cache, dirname, app_config, dataset_index):
        datapath = os.path.join(dirname, str(dataset_index) + ".cxg")
        try:
            with matrix_cache.data_adaptor(None, datapath, app_config):
                raise DatasetAccessError("something bad happened")
        except DatasetAccessError:
            # the MatrixDataCacheManager rethrows the exception, so catch and ignore
            pass
Example No. 7
    def __init__(self, location, matrix_data_type=None, app_config=None):
        """ location can be a string or DataLocator """
        region_name = None if app_config is None else app_config.server_config.data_locator__s3__region_name
        self.location = DataLocator(location, region_name=region_name)
        if not self.location.exists():
            raise DatasetAccessError("Dataset does not exist.",
                                     HTTPStatus.NOT_FOUND)

        # matrix_data_type is an enum value of type MatrixDataType
        self.matrix_data_type = matrix_data_type
        # matrix_type is a DataAdaptor type, which corresponds to the matrix_data_type
        self.matrix_type = None

        if matrix_data_type is None:
            self.matrix_data_type = self.__matrix_data_type()

        if not self.__matrix_data_type_allowed(app_config):
            raise DatasetAccessError("Dataset does not have an allowed type.")

        if self.matrix_data_type == MatrixDataType.H5AD:
            from backend.server.data_anndata.anndata_adaptor import AnndataAdaptor

            self.matrix_type = AnndataAdaptor
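
For orientation, a hypothetical end-to-end use of this loader, combining it with pre_load_validation from Example No. 11 below; the module path is a guess inferred from the AnndataAdaptor import above and may differ:

# Hypothetical usage; module path mirrors the import style shown above.
from backend.server.data_common.matrix_loader import MatrixDataLoader

loader = MatrixDataLoader("example.h5ad")
loader.pre_load_validation()  # raises DatasetAccessError for unrecognized types
print(loader.matrix_type)     # AnndataAdaptor for .h5ad input
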
Example No. 8
    def _validate_and_initialize(self):
        """
        remember, pre_load_validation() has already been called, so
        no need to repeat anything it has done.

        Load the CXG "group" metadata and cache instance values.
        Be very aware of multiple versions of the CXG object.

        CXG versions in the wild:
        * version 0, aka "no version" -- can be detected by the lack
          of a cxg_group_metadata array.
        * version 0.1 -- metadata attached to the cxg_group_metadata array.
          Same as 0, except it adds group metadata.
        """
        title = None
        about = None
        corpora_props = None
        if self.has_array("cxg_group_metadata"):
            # version >0
            gmd = self.open_array("cxg_group_metadata")
            cxg_version = gmd.meta["cxg_version"]
            # version 0.1 used a malformed/shorthand semver string.
            if cxg_version == "0.1" or cxg_version == "0.2.0":
                cxg_properties = json.loads(gmd.meta["cxg_properties"])
                title = cxg_properties.get("title", None)
                about = cxg_properties.get("about", None)
            if cxg_version == "0.2.0":
                corpora_props = json.loads(
                    gmd.meta["corpora"]) if "corpora" in gmd.meta else None
        else:
            # version 0
            cxg_version = "0.0"

        if cxg_version not in ["0.0", "0.1", "0.2.0"]:
            raise DatasetAccessError(f"cxg matrix is not valid: {self.url}")

        self.title = title
        self.about = about
        self.cxg_version = cxg_version
        self.corpora_props = corpora_props
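
To make the version sniffing concrete, here is roughly what the group metadata might hold for a version 0.2.0 CXG; all values are invented for illustration:

import json

# Invented meta contents for a version 0.2.0 cxg_group_metadata array:
gmd_meta = {
    "cxg_version": "0.2.0",
    "cxg_properties": json.dumps({"title": "PBMC 3k", "about": "https://example.org/pbmc3k"}),
}
cxg_properties = json.loads(gmd_meta["cxg_properties"])
print(cxg_properties.get("title", None))  # PBMC 3k
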
Example No. 9
    def pre_load_validation(data_locator):
        location = data_locator.uri_or_path
        if not CxgAdaptor.isvalid(location):
            logging.error(f"cxg matrix is not valid: {location}")
            raise DatasetAccessError("cxg matrix is not valid")
Example No. 10
    def open_array(self, name):
        try:
            p = self.get_path(name)
            return self.arrays[p]
        except tiledb.libtiledb.TileDBError:
            raise DatasetAccessError(name)
Example No. 11
    def pre_load_validation(self):
        if self.matrix_data_type == MatrixDataType.UNKNOWN:
            raise DatasetAccessError("Dataset does not have a recognized type: .h5ad")
        self.matrix_type.pre_load_validation(self.location)