def create_matrix(self, dataset_name, num_cat, max_labels):
    """Build the annotation dict, its DataFrame and the fbs matrix for one run.

    After each stage the elapsed time reported by ``self.elapsed_timer()`` and
    a size estimate (in mb) are written to
    ``self.test_notes[dataset_name]["num_categories_<num_cat>"]["max_label_<max_labels>"]``
    under the keys ``annotation_dict``, ``data_frame`` and ``fbs_matrix``.

    Sizes are taken with ``sys.getsizeof``, which is a shallow measurement, so
    they are rough indicators rather than exact footprints.

    :return: the encoded matrix, or ``[]`` if encoding raised (the error is
        printed rather than propagated).
    """
    bytes_per_mb = 1024**2
    category_key = f"num_categories_{num_cat}"
    label_key = f"max_label_{max_labels}"

    with self.elapsed_timer() as elapsed:

        def record(stage, size_mb):
            # The timer is sampled at the moment each stage is recorded.
            self.test_notes[dataset_name][category_key][label_key][stage] = {
                "creation_time": str(elapsed()),
                "size": f"{size_mb} mb",
            }

        annotations = self.create_annotations_dict_multi_process(
            dataset_name, num_cat, max_labels)
        annotations_mb = sum(
            sys.getsizeof(entry) for entry in annotations.values()
        ) / bytes_per_mb
        record("annotation_dict", annotations_mb)

        frame = pd.DataFrame(annotations)
        frame_mb = sys.getsizeof(frame) / bytes_per_mb
        record("data_frame", frame_mb)

        try:
            encoded = encode_matrix_fbs(
                matrix=frame, row_idx=None, col_idx=frame.columns)
            encoded_mb = sys.getsizeof(encoded) / bytes_per_mb
            record("fbs_matrix", encoded_mb)
            return encoded
        except Exception as e:
            # Best effort: report the failure and hand back an empty result.
            print(f"Issue creating fbs matrix: {e}, for {dataset_name}")
            return []
def annotation_to_fbs_matrix(self, axis, fields=None, labels=None):
    """Encode the annotations for ``axis`` as a flatbuffer matrix.

    The array opened for ``axis`` is read (fully, or restricted to the
    requested stored fields), optionally joined with ``labels`` on the OBS
    axis, narrowed to the fields to return, and then encoded with the
    DataFrame's columns as the column index.

    :param axis: axis whose annotations are requested
    :param fields: optional field names; passed to ``_annotations_field_split``
    :param labels: optional DataFrame of extra annotations, joined only when
        ``axis`` is ``Axis.OBS`` and the frame is not empty
    :return: result of ``encode_matrix_fbs``
    """
    with ServerTiming.time(f"annotations.{axis}.query"):
        array = self.open_array(str(axis))

        # may raise if fields contains unknown key
        split = self._annotations_field_split(axis, fields, array, labels)
        cxg_fields, anno_fields, return_fields, index_field = split

        if cxg_fields is None:
            # No restriction: read every attribute of the array.
            raw = array[:]
        elif cxg_fields:
            raw = array.query(attrs=cxg_fields)[:]
        else:
            # Nothing requested from the array itself.
            raw = {}
        frame = pd.DataFrame.from_dict(raw)

        have_labels = (
            axis == Axis.OBS and labels is not None and not labels.empty
        )
        if have_labels and (anno_fields is None or anno_fields):
            assert index_field
            # None means "all label columns"; otherwise only the named ones.
            to_join = labels if anno_fields is None else labels[anno_fields]
            frame = frame.join(to_join, index_field)

        if return_fields:
            frame = frame[return_fields]

    with ServerTiming.time(f"annotations.{axis}.encode"):
        encoded = encode_matrix_fbs(frame, col_idx=frame.columns)

    return encoded
def layout_to_fbs_matrix(self, fields):
    """
    Return the requested embeddings as a flatbuffer, using the cellxgene
    matrix fbs encoding.

    * only the first two dimensions of each embedding are returned, in
      columns named {ename}_0 and {ename}_1, where {ename} is the embedding
      name.
    * the client assumes each will be individually centered & scaled
      (isotropically) to a [0, 1] range.
    * filtering is not supported.

    :param fields: embedding names to return; None or an empty sequence
        selects every name from ``get_embedding_names()``
    :return: result of ``encode_matrix_fbs``
    """
    if fields is None or len(fields) == 0:
        embedding_names = self.get_embedding_names()
    else:
        embedding_names = fields

    with ServerTiming.time("layout.query"):
        frames = [
            pd.DataFrame(
                DataAdaptor.normalize_embedding(
                    self.get_embedding_array(name, 2)),
                columns=[f"{name}_0", f"{name}_1"],
            )
            for name in embedding_names
        ]

    with ServerTiming.time("layout.encode"):
        # pd.concat rejects an empty list, so fall back to an empty frame.
        combined = (
            pd.concat(frames, axis=1, copy=False) if frames else pd.DataFrame()
        )
        encoded = encode_matrix_fbs(
            combined, col_idx=combined.columns, row_idx=None)

    return encoded
def data_frame_to_fbs_matrix(self, filter, axis):
    """
    Retrieve data 'X' and return it as a flatbuffer Matrix.

    :param filter: dictionary with filter params
    :param axis: string obs or var
    :return: flatbuffer Matrix
    :raises ValueError: if ``axis`` is not ``Axis.VAR``
    :raises FilterError: if the filter cannot be parsed or selects on obs
    :raises ExceedsLimitError: if the number of requested columns exceeds
        the ``column_request_max`` limit

    Caveats:
    * currently only supports access on VAR axis
    * currently only supports filtering on VAR axis
    """
    if axis != Axis.VAR:
        raise ValueError("Only VAR dimension access is supported")

    try:
        selectors = self._filter_to_mask(filter)
        obs_selector, var_selector = selectors
    except (KeyError, IndexError, TypeError, AttributeError):
        raise FilterError("Error parsing filter")

    if obs_selector is not None:
        raise FilterError("filtering on obs unsupported")

    # Without a var filter every column of X is requested.
    if var_selector is None:
        num_columns = self.get_shape()[1]
    else:
        num_columns = np.count_nonzero(var_selector)

    if self.server_config.exceeds_limit("column_request_max", num_columns):
        raise ExceedsLimitError(
            "Requested dataframe columns exceed column request limit")

    X = self.get_X_array(obs_selector, var_selector)

    # NOTE(review): with no var filter the column index is empty even though
    # X then holds every column - confirm this is what consumers expect.
    if var_selector is None:
        mask = []
    else:
        mask = var_selector
    col_idx = np.nonzero(mask)[0]

    return encode_matrix_fbs(X, col_idx=col_idx, row_idx=None)
def annotation_to_fbs_matrix(self, axis, fields=None, labels=None):
    """Encode the obs or var annotations of ``self.data`` as a flatbuffer matrix.

    :param axis: ``Axis.OBS`` selects ``self.data.obs``; any other value
        selects ``self.data.var``
    :param fields: optional column names; when non-empty only these columns
        are encoded
    :param labels: optional DataFrame joined onto the obs annotations (OBS
        axis only, ignored when empty) on the ``obs_names`` parameter
    :return: result of ``encode_matrix_fbs``
    """
    if axis == Axis.OBS:
        frame = self.data.obs
        if labels is not None and not labels.empty:
            frame = frame.join(labels, self.parameters.get("obs_names"))
    else:
        frame = self.data.var

    wants_subset = fields is not None and len(fields) > 0
    if wants_subset:
        frame = frame[fields]

    return encode_matrix_fbs(frame, col_idx=frame.columns)
def test_encode_DataFrame(self):
    """Encode a mixed-dtype DataFrame and check shape, column types and names."""
    column_names = ["a", "b", "c", "d"]
    frame = pd.DataFrame(
        data={
            "a": np.zeros((10,), dtype=np.float32),
            "b": np.ones((10,), dtype=np.int64),
            "c": np.arange(10, dtype=np.uint16),
            "d": pd.Series(
                ["x", "y", "z", "x", "y", "z", "a", "x", "y", "z"],
                dtype="category",
            ),
        })

    # One (container type, dtype) pair per column, in column order.
    expected_types = (
        (np.ndarray, np.float32),
        (np.ndarray, np.int32),
        (np.ndarray, np.uint32),
        (list, None),
    )

    encoded = encode_matrix_fbs(
        matrix=frame, row_idx=None, col_idx=frame.columns)
    self.fbs_checks(encoded, (10, 4), expected_types, column_names)
def test_encode_boundary(self):
    """Check that encode_matrix_fbs raises ValueError on unsupported input."""
    rejected_calls = (
        # row indexing is unsupported
        {"matrix": pd.DataFrame(), "row_idx": []},
        # matrix must be 2D
        {"matrix": np.zeros((3, 2, 1))},
        {"matrix": np.ones((10,))},
    )

    for kwargs in rejected_calls:
        with self.assertRaises(ValueError):
            encode_matrix_fbs(**kwargs)
def summarize_var(self, method, filter, query_hash):
    """Summarize the var-filtered columns of X into one column per obs row.

    :param method: summary method; only "mean" is accepted
    :param filter: dictionary with filter params (var axis only)
    :param query_hash: value used as the single column label of the result
    :return: result of ``encode_matrix_fbs`` for the single-column summary
    :raises UnsupportedSummaryMethod: if ``method`` is not "mean"
    :raises FilterError: if the filter selects on obs
    """
    if method != "mean":
        raise UnsupportedSummaryMethod("Unknown gene set summary method.")

    obs_selector, var_selector = self._filter_to_mask(filter)
    if obs_selector is not None:
        raise FilterError("filtering on obs unsupported")

    # if no filter, just return zeros.  We don't have a use case
    # for summarizing the entire X without a filter, and it would
    # potentially be quite compute / memory intensive.
    nothing_selected = (
        var_selector is None or np.count_nonzero(var_selector) == 0
    )
    if nothing_selected:
        n_rows = self.get_shape()[0]
        mean = np.zeros((n_rows, 1), dtype=np.float32)
    else:
        X = self.get_X_array(obs_selector, var_selector)
        # The sparse branch calls mean() without keepdims; the dense branch
        # asks for it so the result stays a column.
        mean = (
            X.mean(axis=1)
            if sparse.issparse(X)
            else X.mean(axis=1, keepdims=True)
        )

    return encode_matrix_fbs(
        mean, col_idx=pd.Index([query_hash]), row_idx=None)
def test_roundtrip(self):
    """Encode then decode a DataFrame and check shape, columns and values."""
    source = pd.DataFrame(
        data={
            "a": np.zeros((10,), dtype=np.float32),
            "b": np.ones((10,), dtype=np.int64),
            "c": np.arange(10, dtype=np.uint16),
            "d": pd.Series(
                ["x", "y", "z", "x", "y", "z", "a", "x", "y", "z"],
                dtype="category",
            ),
        })

    encoded = encode_matrix_fbs(matrix=source, col_idx=source.columns)
    decoded = decode_matrix_fbs(encoded)

    self.assertEqual(source.shape, decoded.shape)
    self.assertEqual(set(source.columns), set(decoded.columns))

    for name in source.columns:
        # Membership is asserted before indexing the decoded frame.
        self.assertTrue(name in decoded.columns)
        expected = source[name]
        if isinstance(expected, pd.Series):
            self.assertTrue(np.all(expected == decoded[name]))
        else:
            self.assertEqual(expected, decoded[name])
def test_encode_sparse(self):
    """Encode a 2x3 CSC matrix without indices and check shape and types."""
    dense = np.array([[0, 1, 2], [3, 0, 4]])
    matrix = sparse.csc_matrix(dense)

    # Every one of the three columns is expected as an int32 ndarray.
    expected_types = ((np.ndarray, np.int32),) * 3

    encoded = encode_matrix_fbs(matrix=matrix, row_idx=None, col_idx=None)
    self.fbs_checks(encoded, (2, 3), expected_types, None)
def test_encode_ndarray(self):
    """Encode a 3x2 float32 ndarray without indices and check shape and types."""
    matrix = np.zeros((3, 2), dtype=np.float32)

    # NOTE(review): three entries are listed although the array has two
    # columns - confirm fbs_checks tolerates the extra entry.
    expected_types = ((np.ndarray, np.float32),) * 3

    encoded = encode_matrix_fbs(matrix=matrix, row_idx=None, col_idx=None)
    self.fbs_checks(encoded, (3, 2), expected_types, None)
def convert_to_fbs(annotation_dict):
    """Wrap ``annotation_dict`` in a DataFrame and encode it as an fbs matrix.

    The frame's columns are used as the column index; no row index is passed.
    """
    frame = pd.DataFrame(annotation_dict)
    encoded = encode_matrix_fbs(
        matrix=frame,
        row_idx=None,
        col_idx=frame.columns,
    )
    return encoded
def make_fbs(data):
    """Build a DataFrame from ``data`` and return its fbs matrix encoding.

    The frame's columns are used as the column index; no row index is passed.
    """
    frame = pd.DataFrame(data)
    column_index = frame.columns
    return encode_matrix_fbs(matrix=frame, row_idx=None, col_idx=column_index)
def convert(mat, cols):
    """Round-trip ``mat`` through fbs encoding and return the result as an ndarray.

    ``cols`` is passed to the encoder as the column index; the decoded object
    is converted with its ``to_numpy()`` method.
    """
    encoded = encode_matrix_fbs(mat, col_idx=cols)
    decoded = decode_matrix_fbs(encoded)
    return decoded.to_numpy()