def test__is_matrix_sparse__partially_populated_sparse_matrix_returns_true(self):
    """A 3x4 matrix with only two non-zero cells is reported sparse at threshold 50."""
    mostly_zero = np.zeros((3, 4))
    # Populate 2 of the 12 cells; everything else stays at zero.
    mostly_zero[2, 3] = 1.0
    mostly_zero[1, 1] = 2.2

    verdict = is_matrix_sparse(mostly_zero, 50)

    self.assertTrue(verdict)
def write_anndata_x_matrix_to_cxg(self, output_cxg_directory, ctx, sparse_threshold):
    """Write ``self.anndata.X`` to the ``X`` array under ``output_cxg_directory``.

    The matrix is first tested with ``is_matrix_sparse``. If that test fails,
    a column shift encoding is requested from
    ``get_column_shift_encode_for_matrix``; when one is returned (not
    ``None``) it is stored in a sibling ``X_col_shift`` array and the matrix
    is treated as sparse after all.

    Args:
        output_cxg_directory: Directory that receives the ``X`` (and possibly
            ``X_col_shift``) arrays.
        ctx: TileDB context forwarded to the conversion helpers and to
            ``tiledb.consolidate``.
        sparse_threshold: Threshold forwarded unchanged to the sparsity and
            column-shift helpers.
    """
    x_uri = f"{output_cxg_directory}/X"
    x_values = self.anndata.X

    # Start from "no column shift"; it is only computed when the raw matrix
    # does not already qualify as sparse.
    column_offsets = None
    store_sparse = is_matrix_sparse(x_values, sparse_threshold)
    if not store_sparse:
        column_offsets = get_column_shift_encode_for_matrix(
            x_values, sparse_threshold)
        store_sparse = column_offsets is not None

    if column_offsets is not None:
        logging.info(
            "Converting matrix X as sparse matrix with column shift encoding")
        convert_ndarray_to_cxg_dense_array(
            f"{output_cxg_directory}/X_col_shift", column_offsets, ctx)

    convert_matrix_to_cxg_array(x_uri, x_values, store_sparse, ctx,
                                column_offsets)
    tiledb.consolidate(x_uri, ctx=ctx)

    # Only call vacuum when the installed tiledb module provides it.
    if hasattr(tiledb, "vacuum"):
        tiledb.vacuum(x_uri)
def test__is_matrix_sparse__partially_populated_dense_matrix_returns_false(self):
    """A 2x2 matrix with three non-zero cells is not reported sparse at threshold 50."""
    # Three of the four cells are non-zero; only [1][0] is left at zero.
    mostly_filled = np.array([
        [1.0, 2.2],
        [0.0, 3.7],
    ])

    verdict = is_matrix_sparse(mostly_filled, 50)

    self.assertFalse(verdict)
def test__is_matrix_sparse__giant_matrix_returns_false_early(self):
    """A 20000x20 all-ones matrix is rejected, and the first log line is the estimate."""
    all_ones = np.ones((20000, 20))

    with self.assertLogs(level="INFO") as captured:
        verdict = is_matrix_sparse(all_ones, 1)
        self.assertFalse(verdict)

        # Because the function returns early, the log carries the _estimate_
        # rather than the _exact_ percentage of non-zero elements in the
        # matrix.
        first_message = captured.output[0]
        self.assertIn("Percentage of non-zero elements (estimate)",
                      first_message)
def main():
    """Copy a CXG directory, re-encoding its dense ``X`` array as sparse.

    Command line:
        input               existing CXG directory to read
        output              CXG directory to create
        --overwrite         remove ``output`` first if it already exists
        --verbose / -v      accepted, but not consulted by this function
        --sparse-threshold  threshold handed to the sparsity helpers
                            (default 5.0)

    Everything except ``X`` and ``X_col_shift`` is copied from input to
    output. ``X`` is then read from the input and, if it qualifies as sparse
    (directly or via a column shift encoding), written to ``<output>/X``.

    Exits with status 1 when the input is not a directory, when input and
    output are the same directory, when the output exists and ``--overwrite``
    was not given, or when the matrix does not qualify as sparse (the freshly
    created output is removed in that last case).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input cxg directory")
    parser.add_argument("output", help="output cxg directory")
    parser.add_argument("--overwrite",
                        action="store_true",
                        help="replace output cxg directory")
    parser.add_argument("--verbose",
                        "-v",
                        action="count",
                        default=0,
                        help="verbose output")
    parser.add_argument(
        "--sparse-threshold",
        "-s",
        type=float,
        default=5.0,  # default is 5% non-zero values
        help=
        "The X array will be sparse if the percent of non-zeros falls below this value",
    )
    args = parser.parse_args()

    # Validate the input BEFORE touching the output, so that a bad input path
    # can never cost the user an existing output directory.
    if not os.path.isdir(args.input):
        print("input is not a directory", args.input)
        sys.exit(1)

    # With --overwrite, identical paths would delete the input itself.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        print("input and output must be different directories:", args.input)
        sys.exit(1)

    if os.path.exists(args.output):
        print("output dir exists:", args.output)
        if not args.overwrite:
            print("use the overwrite option to remove the output directory")
            sys.exit(1)
        print("output dir removed:", args.output)
        shutil.rmtree(args.output)

    # Copy everything but the matrix arrays, which are regenerated below.
    shutil.copytree(args.input,
                    args.output,
                    ignore=shutil.ignore_patterns("X", "X_col_shift"))

    ctx = tiledb.Ctx({
        "sm.num_reader_threads": 32,
        "sm.num_writer_threads": 32,
        "sm.consolidation.buffer_size": 1 * 1024 * 1024 * 1024,
    })

    # The new matrix belongs in the "X" entry that the copy skipped, not in
    # the root of the output directory.
    matrix_container = os.path.join(args.output, "X")

    with tiledb.DenseArray(os.path.join(args.input, "X"), mode="r",
                           ctx=ctx) as X_in:
        x_matrix_data = X_in[:, :]

        col_shift = None
        is_sparse = is_matrix_sparse(x_matrix_data, args.sparse_threshold)
        if not is_sparse:
            # Not sparse as-is; see whether a column shift encoding helps.
            col_shift = get_column_shift_encode_for_matrix(
                x_matrix_data, args.sparse_threshold)
            is_sparse = col_shift is not None

        if col_shift is not None:
            x_col_shift_name = f"{args.output}/X_col_shift"
            convert_ndarray_to_cxg_dense_array(x_col_shift_name, col_shift,
                                               ctx)
            # Consolidate the array that was just written; the X array does
            # not exist yet at this point.
            tiledb.consolidate(x_col_shift_name, ctx=ctx)

        if is_sparse:
            convert_matrix_to_cxg_array(matrix_container, x_matrix_data,
                                        is_sparse, ctx, col_shift)
            tiledb.consolidate(matrix_container, ctx=ctx)

    if not is_sparse:
        print("The array is not sparse, cleaning up, abort.")
        shutil.rmtree(args.output)
        sys.exit(1)
def test__is_matrix_sparse__zero_and_one_hundred_percent_threshold(self):
    """Check both extreme thresholds against an array with no zero entries."""
    all_nonzero = np.array([1, 2, 3])

    # Threshold 0 must yield False; threshold 100 must yield True.
    expectations = (
        (0, self.assertFalse),
        (100, self.assertTrue),
    )
    for threshold, check in expectations:
        check(is_matrix_sparse(all_nonzero, threshold))