예제 #1
0
    def annotation_to_fbs_matrix(self, axis, fields=None, labels=None):
        """
        Return the annotations for ``axis`` encoded with ``encode_matrix_fbs``.

        :param axis: axis to read; ``str(axis)`` names the array that is opened
        :param fields: optional field selection, handed to
            ``_annotations_field_split`` (which may raise on unknown keys)
        :param labels: optional label dataframe; it is only joined in when
            ``axis`` is ``Axis.OBS`` and the dataframe is not empty
        :return: the encoded matrix produced by ``encode_matrix_fbs``
        """
        with ServerTiming.time(f"annotations.{axis}.query"):
            array = self.open_array(str(axis))

            # may raise if fields contains unknown key
            cxg_fields, anno_fields, return_fields, index_field = self._annotations_field_split(
                axis, fields, array, labels
            )

            # None -> slice the array directly; non-empty -> query only those
            # attrs; empty -> take no data from the array at all
            if cxg_fields is None:
                raw = array[:]
            elif cxg_fields:
                raw = array.query(attrs=cxg_fields)[:]
            else:
                raw = {}
            frame = pd.DataFrame.from_dict(raw)

            if axis == Axis.OBS and labels is not None and not labels.empty:
                # anno_fields None -> join every label column;
                # non-empty -> join only the selected ones; empty -> no join
                join_everything = anno_fields is None
                if join_everything or anno_fields:
                    assert index_field
                    label_columns = labels if join_everything else labels[anno_fields]
                    frame = frame.join(label_columns, index_field)

            if return_fields:
                frame = frame[return_fields]

        with ServerTiming.time(f"annotations.{axis}.encode"):
            encoded = encode_matrix_fbs(frame, col_idx=frame.columns)

        return encoded
예제 #2
0
 def create_matrix(self, dataset_name, num_cat, max_labels):
     """
     Build an annotation dict, a DataFrame and an fbs matrix for one dataset.

     After each stage the elapsed time and the ``sys.getsizeof`` size (in mb)
     are stored in ``self.test_notes`` under the dataset / category-count /
     max-label keys.

     Returns the encoded matrix. If encoding (or recording the matrix stage)
     raises, the error is printed and ``[]`` is returned instead.
     """
     bytes_per_mb = 1024**2

     with self.elapsed_timer() as elapsed:

         def record(stage, size_mb):
             # Build the entry before resolving the nested notes dict, so the
             # timer is read before the lookup happens.
             entry = {
                 "creation_time": str(elapsed()),
                 "size": f"{size_mb} mb",
             }
             self.test_notes[dataset_name][f"num_categories_{num_cat}"][
                 f"max_label_{max_labels}"][stage] = entry

         annon_dict = self.create_annotations_dict_multi_process(
             dataset_name, num_cat, max_labels)
         dict_size = sum(map(sys.getsizeof, annon_dict.values())) / bytes_per_mb
         record("annotation_dict", dict_size)

         df = pd.DataFrame(annon_dict)
         record("data_frame", sys.getsizeof(df) / bytes_per_mb)

         try:
             matrix = encode_matrix_fbs(matrix=df,
                                        row_idx=None,
                                        col_idx=df.columns)
             record("fbs_matrix", sys.getsizeof(matrix) / bytes_per_mb)
             return matrix
         except Exception as e:
             print(f"Issue creating fbs matrix: {e}, for {dataset_name}")
             return []
예제 #3
0
    def layout_to_fbs_matrix(self, fields):
        """
        Encode the requested embeddings as a flatbuffer, using the cellxgene
        matrix fbs encoding.

        * when ``fields`` is None or empty, every name reported by
          ``get_embedding_names()`` is used
        * only the first two dimensions are returned, as columns {ename}_0 and
          {ename}_1, where {ename} is the embedding name
        * client assumes each will be individually centered & scaled
          (isotropically) to a [0, 1] range
        * filtering is not supported
        """
        if fields is None or len(fields) == 0:
            names = self.get_embedding_names()
        else:
            names = fields

        with ServerTiming.time("layout.query"):
            frames = [
                pd.DataFrame(
                    DataAdaptor.normalize_embedding(self.get_embedding_array(name, 2)),
                    columns=[f"{name}_0", f"{name}_1"],
                )
                for name in names
            ]

        with ServerTiming.time("layout.encode"):
            # No embeddings at all still yields a (empty) encoded matrix.
            df = pd.concat(frames, axis=1, copy=False) if frames else pd.DataFrame()
            fbs = encode_matrix_fbs(df, col_idx=df.columns, row_idx=None)

        return fbs
예제 #4
0
    def data_frame_to_fbs_matrix(self, filter, axis):
        """
        Retrieves data 'X' and returns in a flatbuffer Matrix.

        :param filter: dictionary with filter params
        :param axis: string obs or var
        :return: flatbuffer Matrix
        :raises ValueError: if ``axis`` is not ``Axis.VAR``
        :raises FilterError: if the filter cannot be parsed, or if it selects
            on the obs axis
        :raises ExceedsLimitError: if the number of requested columns exceeds
            the configured ``column_request_max`` limit

        Caveats:
        * currently only supports access on VAR axis
        * currently only supports filtering on VAR axis
        """
        if axis != Axis.VAR:
            raise ValueError("Only VAR dimension access is supported")

        try:
            obs_selector, var_selector = self._filter_to_mask(filter)
        except (KeyError, IndexError, TypeError, AttributeError) as err:
            # Chain explicitly so the traceback shows the parse failure as the
            # direct cause instead of an unrelated error "during handling".
            raise FilterError("Error parsing filter") from err

        if obs_selector is not None:
            raise FilterError("filtering on obs unsupported")

        # No var filter means every column of X is requested.
        num_columns = self.get_shape()[1] if var_selector is None else np.count_nonzero(var_selector)
        if self.server_config.exceeds_limit("column_request_max", num_columns):
            raise ExceedsLimitError("Requested dataframe columns exceed column request limit")

        X = self.get_X_array(obs_selector, var_selector)
        # Column index holds the positions of the selected vars (empty when unfiltered).
        col_idx = np.nonzero([] if var_selector is None else var_selector)[0]
        return encode_matrix_fbs(X, col_idx=col_idx, row_idx=None)
예제 #5
0
    def compute_embedding(self, method, obsFilter):
        """
        Compute a new embedding for the observations selected by ``obsFilter``.

        :param method: embedding method; only ``"umap"`` is accepted
        :param obsFilter: filter dictionary; must have an ``"obs"`` entry and
            must not contain a var-axis entry
        :return: tuple ``(schema, fbs)`` where ``schema`` describes the new
            embedding (name, type, dims) and ``fbs`` is the encoded layout
        :raises FilterError: if the filter contains var conditions or cannot
            be parsed
        :raises NotImplementedError: if ``method`` is not ``"umap"``
        """
        if Axis.VAR in obsFilter:
            raise FilterError(
                "Observation filters may not contain variable conditions")
        if method != "umap":
            raise NotImplementedError(
                f"re-embedding method {method} is not available.")
        try:
            shape = self.get_shape()
            obs_mask = self._axis_filter_to_mask(Axis.OBS, obsFilter["obs"],
                                                 shape[0])
        except (KeyError, IndexError) as err:
            # Chain explicitly so the original lookup failure is reported as
            # the direct cause of the FilterError.
            raise FilterError("Error parsing filter") from err
        with ServerTiming.time("layout.compute"):
            X_umap = scanpy_umap(self.data, obs_mask)
            normalized_layout = DataAdaptor.normalize_embedding(X_umap)

        # Server picks the re-embedding name, which must not collide with any
        # other embedding name generated by this backend.
        name = f"reembed:{method}_{datetime.now().isoformat(timespec='milliseconds')}"
        dims = [f"{name}_0", f"{name}_1"]
        df = pd.DataFrame(normalized_layout, columns=dims)
        fbs = encode_matrix_fbs(df, col_idx=df.columns, row_idx=None)
        schema = {"name": name, "type": "float32", "dims": dims}
        return (schema, fbs)
예제 #6
0
 def test_encode_DataFrame(self):
     """Encode a mixed-dtype DataFrame and run ``fbs_checks`` on the result."""
     n_rows = 10
     columns = {
         "a": np.zeros((n_rows,), dtype=np.float32),
         "b": np.ones((n_rows,), dtype=np.int64),
         "c": np.arange(n_rows, dtype=np.uint16),
         "d": pd.Series(list("xyzxyzaxyz"), dtype="category"),
     }
     frame = pd.DataFrame(data=columns)

     # One (container type, dtype) pair per column, in column order.
     expected_types = (
         (np.ndarray, np.float32),
         (np.ndarray, np.int32),
         (np.ndarray, np.uint32),
         (list, None),
     )
     encoded = encode_matrix_fbs(matrix=frame, row_idx=None, col_idx=frame.columns)
     self.fbs_checks(encoded, (10, 4), expected_types, ["a", "b", "c", "d"])
예제 #7
0
    def annotation_to_fbs_matrix(self, axis, fields=None, labels=None):
        """
        Return the obs or var annotations encoded with ``encode_matrix_fbs``.

        For ``Axis.OBS`` a non-empty ``labels`` dataframe is joined onto
        ``self.data.obs`` using the ``obs_names`` parameter; any other axis
        reads ``self.data.var``. A non-empty ``fields`` restricts the result
        to those columns.
        """
        if axis != Axis.OBS:
            frame = self.data.var
        elif labels is None or labels.empty:
            frame = self.data.obs
        else:
            frame = self.data.obs.join(labels, self.parameters.get("obs_names"))

        wants_subset = fields is not None and len(fields) > 0
        if wants_subset:
            frame = frame[fields]

        return encode_matrix_fbs(frame, col_idx=frame.columns)
예제 #8
0
    def test_encode_boundary(self):
        """Check that unsupported arguments to encode_matrix_fbs raise ValueError."""

        # row indexing is unsupported
        with self.assertRaises(ValueError):
            encode_matrix_fbs(matrix=pd.DataFrame(), row_idx=[])

        # matrix must be 2D: try both a 3D and a 1D array
        not_two_dimensional = (np.zeros((3, 2, 1)), np.ones((10,)))
        for bad_matrix in not_two_dimensional:
            with self.assertRaises(ValueError):
                encode_matrix_fbs(matrix=bad_matrix)
예제 #9
0
 def test_roundtrip(self):
     """Encode then decode a DataFrame and compare shape, columns and values."""
     source = pd.DataFrame(
         data={
             "a": np.zeros((10,), dtype=np.float32),
             "b": np.ones((10,), dtype=np.int64),
             "c": np.arange(10, dtype=np.uint16),
             "d": pd.Series(list("xyzxyzaxyz"), dtype="category"),
         }
     )
     encoded = encode_matrix_fbs(matrix=source, col_idx=source.columns)
     decoded = decode_matrix_fbs(encoded)

     self.assertEqual(source.shape, decoded.shape)
     self.assertEqual(set(source.columns), set(decoded.columns))
     for name in source.columns:
         self.assertTrue(name in decoded.columns)
         # Series are compared element-wise; anything else by plain equality.
         if not isinstance(source[name], pd.Series):
             self.assertEqual(source[name], decoded[name])
         else:
             self.assertTrue(np.all(source[name] == decoded[name]))
예제 #10
0
 def convert(mat, cols):
     """Encode ``mat`` with ``cols`` as column index, decode it, and return ``to_numpy()`` of the result."""
     encoded = encode_matrix_fbs(mat, col_idx=cols)
     decoded = decode_matrix_fbs(encoded)
     return decoded.to_numpy()
예제 #11
0
 def convert_to_fbs(annotation_dict):
     """Build a DataFrame from ``annotation_dict`` and encode it with its columns as the column index."""
     frame = pd.DataFrame(annotation_dict)
     column_index = frame.columns
     return encode_matrix_fbs(matrix=frame, row_idx=None, col_idx=column_index)
예제 #12
0
def make_fbs(data):
    """Wrap ``data`` in a DataFrame and return its ``encode_matrix_fbs`` encoding."""
    frame = pd.DataFrame(data)
    column_index = frame.columns
    return encode_matrix_fbs(matrix=frame, row_idx=None, col_idx=column_index)
예제 #13
0
 def test_encode_sparse(self):
     """Encode a small CSC sparse matrix and run ``fbs_checks`` on the result."""
     dense = np.array([[0, 1, 2], [3, 0, 4]])
     csc = sparse.csc_matrix(dense)
     # Same expectation for each of the three columns.
     expected_types = ((np.ndarray, np.int32),) * 3
     encoded = encode_matrix_fbs(matrix=csc, row_idx=None, col_idx=None)
     self.fbs_checks(encoded, (2, 3), expected_types, None)
예제 #14
0
 def test_encode_ndarray(self):
     """Encode a float32 ndarray and run ``fbs_checks`` on the result."""
     matrix = np.zeros((3, 2), dtype=np.float32)
     # NOTE(review): three expected entries are supplied although the matrix
     # has only two columns -- confirm against fbs_checks whether the extra
     # entry is intentional.
     expected_types = ((np.ndarray, np.float32),) * 3
     encoded = encode_matrix_fbs(matrix=matrix, row_idx=None, col_idx=None)
     self.fbs_checks(encoded, (3, 2), expected_types, None)