Example #1
def get_data_adaptor(url_dataroot=None, dataset=None):
    config = current_app.app_config
    server_config = config.server_config
    dataset_key = None

    if dataset is None:
        datapath = server_config.single_dataset__datapath
    else:
        dataroot = None
        for key, dataroot_dict in server_config.multi_dataset__dataroot.items():
            if dataroot_dict["base_url"] == url_dataroot:
                dataroot = dataroot_dict["dataroot"]
                dataset_key = key
                break

        if dataroot is None:
            raise DatasetAccessError(
                f"Invalid dataset {url_dataroot}/{dataset}")
        datapath = path_join(dataroot, dataset)
        # path_join returns a normalized path.  Therefore it is
        # sufficient to check that the datapath starts with the
        # dataroot to determine that the datapath is under the dataroot.
        if not datapath.startswith(dataroot):
            raise DatasetAccessError(
                f"Invalid dataset {url_dataroot}/{dataset}")

    if datapath is None:
        return common_rest.abort_and_log(HTTPStatus.BAD_REQUEST,
                                         "Invalid dataset NONE",
                                         loglevel=logging.INFO)

    cache_manager = current_app.matrix_data_cache_manager
    return cache_manager.data_adaptor(dataset_key, datapath, config)
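A note on the containment check above: because path_join returns a normalized path, a simple prefix test against the dataroot is enough to reject "../" traversal. A minimal, self-contained sketch of the same idea (the is_under_root helper is hypothetical, not part of cellxgene):

def is_under_root(dataroot, dataset):
    from posixpath import join, normpath
    # normpath collapses "..", so a traversal attempt such as
    # "../../etc/passwd" no longer starts with the dataroot prefix.
    # Appending "/" also rejects sibling prefixes like "/database".
    candidate = normpath(join(dataroot, dataset))
    return candidate.startswith(dataroot + "/")

assert is_under_root("/data", "pbmc3k.cxg")
assert not is_under_root("/data", "../etc/passwd")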
Example #2
    def __init__(self, location, matrix_data_type=None, app_config=None):
        """ location can be a string or DataLocator """
        region_name = None if app_config is None else app_config.server_config.data_locator__s3__region_name
        self.location = DataLocator(location, region_name=region_name)
        if not self.location.exists():
            raise DatasetAccessError("Dataset does not exist.",
                                     HTTPStatus.NOT_FOUND)

        # matrix_data_type is an enum value of type MatrixDataType
        self.matrix_data_type = matrix_data_type
        # matrix_type is a DataAdaptor type, which corresponds to the matrix_data_type
        self.matrix_type = None

        if matrix_data_type is None:
            self.matrix_data_type = self.__matrix_data_type()

        if not self.__matrix_data_type_allowed(app_config):
            raise DatasetAccessError("Dataset does not have an allowed type.")

        if self.matrix_data_type == MatrixDataType.H5AD:
            from server.data_anndata.anndata_adaptor import AnndataAdaptor

            self.matrix_type = AnndataAdaptor
        elif self.matrix_data_type == MatrixDataType.CXG:
            from server.data_cxg.cxg_adaptor import CxgAdaptor

            self.matrix_type = CxgAdaptor
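The __matrix_data_type() call above is private and not shown here. A plausible reconstruction, assuming the type is inferred from the location's file extension (illustrative only, not the project's actual code):

from enum import Enum

class MatrixDataType(Enum):
    H5AD = "h5ad"
    CXG = "cxg"
    UNKNOWN = "unknown"

def infer_matrix_data_type(location):
    # Hypothetical stand-in for the loader's private __matrix_data_type():
    # choose the type by extension; UNKNOWN is later rejected by
    # pre_load_validation (see Example #12).
    path = location.rstrip("/")
    if path.endswith(".h5ad"):
        return MatrixDataType.H5AD
    if path.endswith(".cxg"):
        return MatrixDataType.CXG
    return MatrixDataType.UNKNOWN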
Example #3
    def _load_data(self, data_locator):
        try:
            # there is no guarantee data_locator indicates a local file.  The AnnData
            # API will only consume local file objects.  If we get a non-local object,
            # make a copy in tmp, and delete it after we load into memory.
            with data_locator.local_handle() as lh:
                # as of AnnData 0.6.19, backed mode performs initial load fast, but at the
                # cost of significantly slower access to X data.
                backed = "r" if self.server_config.adaptor__anndata_adaptor__backed else None
                self.data = anndata.read_h5ad(lh, backed=backed)

        except ValueError:
            raise DatasetAccessError(
                "File must be in the .h5ad format. Please read "
                "https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md to "
                "learn more about this format. You may be able to convert your file into this format "
                "using `cellxgene prepare`, please run `cellxgene prepare --help` for more "
                "information.")
        except MemoryError:
            raise DatasetAccessError(
                "Out of memory - file is too large for available memory.")
        except Exception:
            raise DatasetAccessError(
                "File not found or is inaccessible. File must be an .h5ad object. "
                "Please check your input and try again.")
Example #4
 def pre_load_validation(data_locator):
     if data_locator.islocal():
         # if data locator is local, apply file system conventions and other "cheap"
         # validation checks.  If a URI, defer until we actually fetch the data and
         # try to read it.  Many of these tests don't make sense for URIs (e.g.,
         # extension-based typing).
         if not data_locator.exists():
             raise DatasetAccessError("does not exist")
         if not data_locator.isfile():
             raise DatasetAccessError("is not a file")
Example #5
 def get_embedding_names(self):
     with ServerTiming.time("layout.lsuri"):
         pemb = self.get_path("emb")
         embeddings = [os.path.basename(p) for (p, t) in self.lsuri(pemb) if t == "array"]
     if len(embeddings) == 0:
         raise DatasetAccessError("cxg matrix missing embeddings")
     return embeddings
Example #6
 def query_obs_array(self, term_name):
     var = self.open_array("obs")
     try:
         data = var.query(attrs=[term_name])[:][term_name]
     except tiledb.libtiledb.TileDBError:
         raise DatasetAccessError("query_obs")
     return data
Example #7
    def _validate_and_initialize(self):
        """
        remember, pre_load_validation() has already been called, so
        no need to repeat anything it has done.

        Load the CXG "group" metadata and cache instance values.
        Be very aware of multiple versions of the CXG object.

        CXG versions in the wild:
        * version 0, aka "no version" -- can be detected by the lack
          of a cxg_group_metadata array.
        * version 0.1 -- metadata attached to the cxg_group_metadata array.
          Same as version 0, except it adds group metadata.
        """
        # default title/about up front so an unexpected version string
        # cannot leave them unbound below
        title = None
        about = None
        if self.has_array("cxg_group_metadata"):
            # version >0
            gmd = self.open_array("cxg_group_metadata")
            cxg_version = gmd.meta["cxg_version"]
            if cxg_version == "0.1":
                cxg_properties = json.loads(gmd.meta["cxg_properties"])
                title = cxg_properties.get("title", None)
                about = cxg_properties.get("about", None)
        else:
            # version 0
            cxg_version = "0.0"

        if cxg_version not in ["0.0", "0.1"]:
            raise DatasetAccessError(f"cxg matrix is not valid: {self.url}")

        self.title = title
        self.about = about
        self.cxg_version = cxg_version
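The same version probe can be reproduced outside the adaptor with the public tiledb API. A hypothetical standalone reader (read_cxg_properties and the URI layout are assumptions based on the snippet above):

import json
import tiledb

def read_cxg_properties(cxg_uri):
    # Version 0 ("no version") CXGs lack the cxg_group_metadata array entirely.
    gmd_uri = f"{cxg_uri}/cxg_group_metadata"
    if tiledb.object_type(gmd_uri) != "array":
        return {"cxg_version": "0.0"}
    with tiledb.open(gmd_uri) as gmd:
        props = {}
        if "cxg_properties" in gmd.meta:  # present from version 0.1 onward
            props = json.loads(gmd.meta["cxg_properties"])
        props["cxg_version"] = gmd.meta["cxg_version"]
        return props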
Example #8
    def acquire_and_open(self, app_config, dataset_config=None):
        """returns the data_adaptor if cached.  opens the data_adaptor if not.
        In either case, the a reader lock is taken.  Must call release when
        the data_adaptor is no longer needed"""
        self.data_lock.r_acquire()
        if self.data_adaptor:
            return self.data_adaptor
        self.data_lock.r_release()

        self.data_lock.w_acquire()
        # the data may have been loaded while waiting on the lock
        if not self.data_adaptor:
            try:
                self.loader.pre_load_validation()
                self.data_adaptor = self.loader.open(app_config,
                                                     dataset_config)
            except Exception as e:
                # necessary to hold the reader lock after an exception, since
                # the release will occur when the context exits.
                self.data_lock.w_demote()
                raise DatasetAccessError(str(e))

        # demote the write lock to a read lock.
        self.data_lock.w_demote()
        return self.data_adaptor
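The docstring's contract (the caller must release the reader lock) is easy to get wrong. A minimal usage sketch, assuming the cache entry exposes a matching release() method:

def with_data_adaptor(cache_entry, app_config, fn):
    # acquire_and_open returns while holding a reader lock; pairing it
    # with release() in a finally block means the lock is dropped even
    # if fn raises.
    adaptor = cache_entry.acquire_and_open(app_config)
    try:
        return fn(adaptor)
    finally:
        cache_entry.release()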
Example #9
 def use_dataset_with_error(self, matrix_cache, dirname, app_config,
                            dataset_index):
     try:
         with matrix_cache.data_adaptor(
                 None, os.path.join(dirname,
                                    str(dataset_index) + ".cxg"),
                 app_config):
             raise DatasetAccessError("something bad happened")
     except DatasetAccessError:
         # the MatrixDataCacheManager rethrows the exception, so catch and ignore
         pass
Example #10
 def _validate_data_types(self):
     # The backed API does not support interrogation of the underlying sparsity or sparse matrix type.
     # Fake it by asking for a small subarray and testing it.  NOTE: if the user has ignored our
     # anndata <= 0.7 warning, opted for the --backed option, and specified a large, sparse dataset,
     # this "small" indexing request will load the entire X array.  This is due to a bug in anndata <= 0.7
     # which will load the entire X matrix to fulfill any slicing request if X is sparse.  See the
     # user warning in _load_data().
     X0 = self.data.X[0, 0:1]
     if sparse.isspmatrix(X0) and not sparse.isspmatrix_csc(X0):
         warnings.warn(
             "Anndata data matrix is sparse, but not a CSC (columnar) matrix.  "
             "Performance may be improved by using CSC.")
     if self.data.X.dtype > np.dtype(np.float32):
         warnings.warn(
             f"Anndata data matrix is in {self.data.X.dtype} format not float32. "
             f"Precision may be truncated.")
     if self.data.X.dtype < np.float32:
         if self.data.isbacked:
             raise DatasetAccessError(
                 f"Data matrix in {self.data.X.dtype} format is not supported in backed mode."
                 " Please reload without --backed, or convert matrix to float32"
             )
         warnings.warn(
             f"Anndata data matrix is in unsupported {self.data.X.dtype} format -- will be cast to float32"
         )
         self.data.X = self.data.X.astype(np.float32)
     for ax in Axis:
         curr_axis = getattr(self.data, str(ax))
         for ann in curr_axis:
             datatype = curr_axis[ann].dtype
             downcast_map = {
                 "int64": "int32",
                 "uint32": "int32",
                 "uint64": "int32",
                 "float64": "float32",
             }
             if datatype in downcast_map:
                 warnings.warn(
                     f"Anndata annotation {ax}:{ann} is in unsupported format: {datatype}. "
                     f"Data will be downcast to {downcast_map[datatype]}.")
             if isinstance(datatype, CategoricalDtype):
                 category_num = len(curr_axis[ann].dtype.categories)
                 if category_num > 500 and category_num > self.dataset_config.presentation__max_categories:
                     warnings.warn(
                         f"{str(ax).title()} annotation '{ann}' has {category_num} categories, this may be "
                         f"cumbersome or slow to display. We recommend setting the "
                         f"--max-category-items option to 500, this will hide categorical "
                         f"annotations with more than 500 categories in the UI"
                     )
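The downcast_map above narrows 64-bit annotation columns to the 32-bit types the server works with. A toy, self-contained illustration of the same rule using plain numpy:

import numpy as np

downcast_map = {
    "int64": "int32",
    "uint32": "int32",
    "uint64": "int32",
    "float64": "float32",
}

col = np.arange(4, dtype=np.int64)
target = downcast_map.get(col.dtype.name)
if target is not None:
    col = col.astype(target)  # int64 column becomes int32
print(col.dtype)  # int32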
Example #11
File: app.py Project: lzlgboy/cellxgene
def get_data_adaptor(dataset=None):
    config = current_app.app_config

    if dataset is None:
        datapath = config.single_dataset__datapath
    else:
        datapath = path_join(config.multi_dataset__dataroot, dataset)
        # path_join returns a normalized path.  Therefore it is
        # sufficient to check that the datapath starts with the
        # dataroot to determine that the datapath is under the dataroot.
        if not datapath.startswith(config.multi_dataset__dataroot):
            raise DatasetAccessError("Invalid dataset {dataset}")

    if datapath is None:
        return common_rest.abort_and_log(HTTPStatus.BAD_REQUEST,
                                         f"Invalid dataset NONE",
                                         loglevel=logging.INFO)

    cache_manager = current_app.matrix_data_cache_manager
    return cache_manager.data_adaptor(datapath, config)
Example #12
 def pre_load_validation(self):
     if self.matrix_data_type == MatrixDataType.UNKNOWN:
         raise DatasetAccessError(
             "Dataset does not have a recognized type: .h5ad")
     self.matrix_type.pre_load_validation(self.location)
Example #13
 def pre_load_validation(data_locator):
     location = data_locator.uri_or_path
     if not CxgAdaptor.isvalid(location):
         logging.error(f"cxg matrix is not valid: {location}")
         raise DatasetAccessError("cxg matrix is not valid")
Example #14
 def open_array(self, name):
     try:
         p = self.get_path(name)
         return self.arrays[p]
     except tiledb.libtiledb.TileDBError:
         raise DatasetAccessError(name)
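self.arrays in the snippet above behaves like a lazy cache of open TileDB handles. A minimal sketch of such a cache (ArrayCache is a made-up name; only the public tiledb API is assumed):

import tiledb

class ArrayCache:
    # Open each array at most once; later lookups return the cached handle.
    def __init__(self, ctx=None):
        self._open = {}
        self._ctx = ctx

    def __getitem__(self, uri):
        if uri not in self._open:
            self._open[uri] = tiledb.open(uri, mode="r", ctx=self._ctx)
        return self._open[uri]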