def generate_request_hash(self) -> str:
    """
    Compute the hash that identifies this request by its input parameters.

    The MD5 digest is fed, in order, with the feature, the output format,
    every requested metadata field and finally every cell key found in the
    request's cell query results. Cell results are read slice by slice and
    each slice is dropped before the next one is loaded.

    Requires cell query results to exist; otherwise MatrixQueryResultsNotFound
    is expected to be raised (not raised by this method directly — presumably
    by the results reader; confirm against its implementation).

    :return: str Request hash
    """
    bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
    cell_manifest_key = f"s3://{bucket}/{self.request_id}/cell_metadata_manifest"
    results_reader = CellQueryResultsReader(cell_manifest_key)
    logger.info(f"Generating request hash from {cell_manifest_key}")

    digest = hashlib.md5()

    # Request parameters go in first; their order is part of the hash.
    request_params = [self.feature, self.format]
    request_params.extend(self.metadata_fields)
    for param in request_params:
        digest.update(param.encode())

    # Then the cell keys, one slice of the query results at a time.
    slice_count = len(results_reader.manifest['part_urls'])
    for slice_idx in range(slice_count):
        logger.info(f"[Slice {slice_idx}] start.")
        slice_df = results_reader.load_slice(slice_idx)
        for cell_key in slice_df.index:
            digest.update(cell_key.encode())
        logger.info(f"[Slice {slice_idx}] Hashed all {len(slice_df.index)} keys.")
        # Release the slice before the next one is loaded.
        del slice_df

    request_hash = digest.hexdigest()
    logger.info(f"Successfully generated request hash {request_hash}.")
    return request_hash
def test_load_empty_results(self, mock_parse_manifest):
    """
    A manifest that reports zero records must load as an empty (0, 0) frame.

    :param mock_parse_manifest: injected mock whose return value is used as
        the manifest (the patch decorator is outside this view).
    """
    mock_parse_manifest.return_value = {"record_count": 0}

    loaded_shape = CellQueryResultsReader("test_manifest_key").load_results().shape

    self.assertEqual(loaded_shape, (0, 0))
def test_load_slice(self, mock_open, mock_pd_read_csv):
    """
    Load slice 3 via a reader built from the functional-test manifest and
    inspect the arguments of the last call made to ``mock_pd_read_csv``.

    :param mock_open: injected mock; made to return the local manifest file.
    :param mock_pd_read_csv: injected mock, presumably standing in for
        ``pandas.read_csv`` (the patch decorator is outside this view).
    """
    fixture_path = "tests/functional/res/cell_metadata_manifest"
    with open(fixture_path) as manifest_file:
        mock_open.return_value = manifest_file
        slice_reader = CellQueryResultsReader("test_manifest_key")
        slice_reader.load_slice(3)

    # Index from the end so this works whether call_args is the 2-tuple
    # (args, kwargs) or the 3-tuple (name, args, kwargs).
    last_call = mock_pd_read_csv.call_args
    positional = last_call[-2]
    keyword = last_call[-1]

    self.assertIn("project.project_core.project_short_name", keyword["names"])
    self.assertTrue(positional[0].startswith("s3://"))
def test_load_results(self, mock_parse_manifest, mock_load_slice):
    """
    With a manifest listing three part URLs, ``load_results`` must request
    slices 0, 1 and 2 in that order.
    """
    mock_parse_manifest.return_value = {
        "columns": ["a", "b", "c"],
        "part_urls": ["A", "B", "C"],
        "record_count": 5,
    }
    # Every slice load hands back an empty frame.
    mock_load_slice.return_value = pandas.DataFrame()

    CellQueryResultsReader("test_manifest_key").load_results()

    mock_load_slice.assert_has_calls([mock.call(idx) for idx in range(3)])
def test_empty_results(self, mock_parse_manifest, mock_upload_converted_matrix,
                       mock_complete_subtask_execution, mock_complete_request,
                       mock_creation_date, mock_remove):
    """
    Run the converter against manifests reporting zero records and check that
    a zero-byte output file is produced and the request is marked complete.
    """
    output_name = "unit_test_empty_loom.loom"

    now_utc = datetime.datetime.utcnow()
    mock_creation_date.return_value = date.to_string(now_utc)

    converter = self.matrix_converter
    converter.query_results = {
        QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
        QueryType.EXPRESSION: ExpressionQueryResultsReader("test_manifest_key"),
        QueryType.FEATURE: FeatureQueryResultsReader("test_manifest_key"),
    }
    # Set only after the readers above have been built, as in the original.
    mock_parse_manifest.return_value = {"record_count": 0}
    converter.local_output_filename = output_name

    converter.run()

    self.assertEqual(os.path.getsize(output_name), 0)
    mock_complete_subtask_execution.assert_called_once_with(Subtask.CONVERTER)
    mock_complete_request.assert_called_once()

    # NOTE(review): if mock_remove patches os.remove, this call hits the mock
    # and the file stays on disk — confirm against the patch decorators.
    os.remove(output_name)
def run(self):
    """
    Convert the query results of a request to ``self.format`` and upload the
    converted file.

    Steps, in order: build the cell/expression/feature results readers from
    the manifest keys in ``self.args``, dispatch to the ``_to_<format>``
    method, upload its output to ``self.target_path``, delete the local file,
    then mark the converter subtask and the request complete (with the
    elapsed seconds since the tracker's creation date).

    :raises Exception: any failure is logged, recorded on the request tracker
        via ``log_error`` and re-raised.
    """
    try:
        LOGGER.debug(f"Beginning matrix conversion run for {self.args.request_id}")

        cell_reader = CellQueryResultsReader(self.args.cell_metadata_manifest_key)
        expression_reader = ExpressionQueryResultsReader(self.args.expression_manifest_key)
        feature_reader = FeatureQueryResultsReader(self.args.gene_metadata_manifest_key)
        self.query_results = {
            QueryType.CELL: cell_reader,
            QueryType.EXPRESSION: expression_reader,
            QueryType.FEATURE: feature_reader,
        }

        LOGGER.debug(f"Beginning conversion to {self.format}")
        # The output format selects the converter method by name.
        convert = getattr(self, f"_to_{self.format}")
        local_converted_path = convert()
        LOGGER.debug(f"Conversion to {self.format} completed")

        LOGGER.debug("Beginning upload to S3")
        self._upload_converted_matrix(local_converted_path, self.target_path)
        LOGGER.debug("Upload to S3 complete, job finished")

        os.remove(local_converted_path)

        tracker = self.request_tracker
        tracker.complete_subtask_execution(Subtask.CONVERTER)
        finished_at = date.get_datetime_now()
        started_at = date.to_datetime(self.request_tracker.creation_date)
        elapsed = finished_at - started_at
        self.request_tracker.complete_request(duration=elapsed.total_seconds())
    except Exception as e:
        error_text = str(e)
        LOGGER.info(f"Matrix Conversion failed on {self.args.request_id} with error {error_text}")
        self.request_tracker.log_error(str(e))
        raise e
def test__n_slices(self, mock_open):
    """
    With the functional-test cell manifest, the converter must report
    8 slices.

    :param mock_open: injected mock; made to return the local manifest file.
    """
    fixture_path = "tests/functional/res/cell_metadata_manifest"
    with open(fixture_path) as manifest_file:
        mock_open.return_value = manifest_file
        cell_reader = CellQueryResultsReader("test_manifest_key")
        self.matrix_converter.query_results = {QueryType.CELL: cell_reader}
        slice_count = self.matrix_converter._n_slices()

    self.assertEqual(slice_count, 8)
def test__to_csv(self, mock_parse_manifest, mock_load_cell_results,
                 mock_write_gene_dataframe, mock_make_directory,
                 mock_generate_dfs):
    """
    Convert the synthetic test data to a zipped CSV and verify that the
    expression values read back from ``expression.csv`` match the input.

    The scratch directory and the produced zip are removed in a ``finally``
    block, so a failing assertion no longer leaves artifacts behind; a
    leftover directory would otherwise make the next run fail in
    ``os.makedirs``.
    """
    results_dir = "unit_test__to_csv"
    zip_path = None
    # exist_ok: tolerate a directory left over from an interrupted run.
    os.makedirs(results_dir, exist_ok=True)
    try:
        mock_make_directory.return_value = results_dir

        test_data = self._create_test_data()
        mock_write_gene_dataframe.return_value = test_data["genes_df"]
        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in test_data["expr_dfs"])
        }
        mock_parse_manifest.return_value = expression_manifest
        mock_generate_dfs.return_value = iter(test_data["expr_dfs"])

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION: ExpressionQueryResultsReader("test_manifest_key")
        }

        # Put a genes file in place by hand (presumably because the gene
        # writer is mocked above).
        test_data["genes_df"].to_csv(os.path.join(results_dir, "genes.csv"),
                                     index_label="featurekey")

        self.matrix_converter.local_output_filename = "unit_test__to_csv.zip"
        zip_path = self.matrix_converter._to_csv()

        with zipfile.ZipFile(zip_path) as z:
            z.extractall()
        df = pandas.read_csv(os.path.join(results_dir, "expression.csv"),
                             header=0, index_col="cellkey")

        self.assertAlmostEqual(
            df.sum().sum(),
            sum(d["exprvalue"].sum() for d in test_data["expr_dfs"]),
            2)

        # Every cell has 20 genes with non-zero expression. Check first and
        # last cells to make sure that the expression matches.
        # NOTE(review): [0] / [1] on a Series indexed by cellkey rely on
        # positional fallback when the keys are not integers — confirm and
        # consider .iloc.
        self.assertAlmostEqual(
            df.sum(axis=1)[0],
            test_data["expr_dfs"][0]['exprvalue'][:20].sum(),
            2)
        self.assertAlmostEqual(
            df.sum(axis=1)[1],
            test_data["expr_dfs"][0]['exprvalue'][20:40].sum(),
            2)
        self.assertAlmostEqual(
            df.sum(axis=1).tail(1).item(),
            test_data["expr_dfs"][-1]['exprvalue'][-20:].sum(),
            2)
    finally:
        # Always clean up, including when an assertion above failed.
        shutil.rmtree(results_dir, ignore_errors=True)
        if zip_path is not None and os.path.exists(zip_path):
            os.remove(zip_path)
def test__to_loom(self, mock_parse_manifest, mock_load_gene_results,
                  mock_load_cell_results, mock_generate_dfs):
    """
    Convert the synthetic test data to a loom file and verify that the
    expression values stored in it match the input.

    The loom connection is closed and the working directory removed in a
    ``finally`` block, so the file handle is released before its directory
    is deleted and a failing assertion leaves no artifacts behind.
    """
    working_dir = "unit_test__to_loom"
    self.matrix_converter.working_dir = working_dir
    ds = None
    try:
        test_data = self._create_test_data()

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION: ExpressionQueryResultsReader("test_manifest_key"),
            QueryType.FEATURE: FeatureQueryResultsReader("test_manifest_key")
        }
        # Override the cell manifest directly so its record count matches
        # the synthetic cell frame.
        self.matrix_converter.query_results[QueryType.CELL].manifest = {
            "record_count": test_data["cells_df"].shape[0]
        }

        mock_load_gene_results.return_value = test_data["genes_df"]
        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in test_data["expr_dfs"])
        }
        mock_parse_manifest.return_value = expression_manifest
        mock_generate_dfs.return_value = iter(test_data["expr_dfs"])

        self.matrix_converter.local_output_filename = "unit_test__to_loom.loom"
        loom_path = self.matrix_converter._to_loom()

        ds = loompy.connect(loom_path)

        self.assertAlmostEqual(
            ds[:, :].sum(),
            sum(d["exprvalue"].sum() for d in test_data["expr_dfs"]),
            -1)

        # Every cell has 20 genes with non-zero expression. Check first and
        # last cells to make sure that the expression matches.
        self.assertAlmostEqual(
            ds[:, 0].sum(),
            test_data["expr_dfs"][0]['exprvalue'][:20].sum(),
            1)
        self.assertAlmostEqual(
            ds[:, 1].sum(),
            test_data["expr_dfs"][0]['exprvalue'][20:40].sum(),
            1)
        self.assertAlmostEqual(
            ds[:, -1].sum(),
            test_data["expr_dfs"][-1]['exprvalue'][-20:].sum(),
            1)
    finally:
        # Release the loom file before deleting the directory it may live in.
        if ds is not None:
            ds.close()
        shutil.rmtree(working_dir, ignore_errors=True)
def generate_request_hash(self) -> str:
    """
    Compute the hash that identifies this request by its input parameters.

    All cell query results are loaded first; the MD5 digest is then fed, in
    order, with the feature, the output format, each requested metadata field
    and each cell key (the index of the loaded results).

    Requires cell query results to exist; otherwise MatrixQueryResultsNotFound
    is expected to be raised (not raised by this method directly — presumably
    by the results reader; confirm against its implementation).

    :return: str Request hash
    """
    manifest_key = (
        f"s3://{os.environ['MATRIX_QUERY_RESULTS_BUCKET']}"
        f"/{self.request_id}/cell_metadata_manifest"
    )
    cell_keys = CellQueryResultsReader(manifest_key).load_results().index

    digest = hashlib.md5()
    for request_param in (self.feature, self.format):
        digest.update(request_param.encode())
    for metadata_field in self.metadata_fields:
        digest.update(metadata_field.encode())
    for cell_key in cell_keys:
        digest.update(cell_key.encode())

    return digest.hexdigest()
def test__generate_expression_dfs(self, mock_load_slice, mock_parse_manifest):
    """
    Feed ``_generate_expression_dfs(50)`` two uneven chunks of synthetic
    expression rows and check that every yielded frame holds at most 50
    cells, and that all cells and all expression values are seen once.
    """
    mock_parse_manifest.return_value = {
        "part_urls": ["url1"],
        "columns": ["cellkey", "featurekey", "exprvalue"],
        "record_count": 2624879,
    }
    cell_reader = CellQueryResultsReader("test_cell_manifest_key")
    expression_reader = ExpressionQueryResultsReader("test_expression_manifest_key")
    self.matrix_converter.query_results = {
        QueryType.CELL: cell_reader,
        QueryType.EXPRESSION: expression_reader,
    }

    # Fake data: 2027 cells, each with 647 expressed genes. The odd sizes
    # make sure the test hits some jagged edges.
    gene_names = itertools.cycle(["gene_" + str(g) for g in range(647)])
    cell_names = ("cell_" + str(c) for c in range(2027) for _ in range(647))
    rows = []
    for cell_name, gene_name in zip(cell_names, gene_names):
        rows.append([cell_name, gene_name, random.randrange(1, 10000)])
    full_expr_df = pandas.DataFrame(
        columns=["cellkey", "featurekey", "exprvalue"],
        data=rows)

    # load_slice splits on 1000000 rows; have it return two different chunks.
    split_at = 999615
    first_chunk = full_expr_df[:split_at]
    second_chunk = full_expr_df[split_at:]
    mock_load_slice.return_value = iter([first_chunk, second_chunk])

    # Tally the unique cells seen and the sum of expression values.
    cells_seen = 0
    expr_total = 0
    for batch_df in self.matrix_converter._generate_expression_dfs(50):
        cells_in_batch = len(set(batch_df["cellkey"]))
        self.assertLessEqual(cells_in_batch, 50)
        cells_seen += cells_in_batch
        expr_total += batch_df["exprvalue"].sum()

    # Every cell and every expression value must have been seen.
    self.assertEqual(cells_seen, 2027)
    self.assertEqual(expr_total, full_expr_df["exprvalue"].sum())
def test__to_mtx(self, mock_parse_manifest, mock_load_cell_results,
                 mock_write_gene_dataframe_10x, mock_write_gene_dataframe,
                 mock_make_directory, mock_generate_dfs):
    """
    Convert the synthetic test data to a zipped MatrixMarket file and verify
    that the expression values read back from ``matrix.mtx.gz`` match the
    input.

    The scratch directory and the produced zip are removed in a ``finally``
    block, so a failing assertion no longer leaves artifacts behind; a
    leftover directory would otherwise make the next run fail in
    ``os.makedirs``.
    """
    results_dir = "unit_test__to_mtx"
    zip_path = None
    # exist_ok: tolerate a directory left over from an interrupted run.
    os.makedirs(results_dir, exist_ok=True)
    try:
        mock_make_directory.return_value = results_dir

        test_data = self._create_test_data()
        mock_write_gene_dataframe.return_value = test_data["genes_df"]
        mock_write_gene_dataframe_10x.return_value = test_data["genes_df"]
        mock_load_cell_results.return_value = test_data["cells_df"]

        expression_manifest = {
            "record_count": sum(d.shape[0] for d in test_data["expr_dfs"])
        }
        mock_parse_manifest.return_value = expression_manifest
        mock_generate_dfs.return_value = iter(test_data["expr_dfs"])

        self.matrix_converter.query_results = {
            QueryType.CELL: CellQueryResultsReader("test_manifest_key"),
            QueryType.EXPRESSION: ExpressionQueryResultsReader("test_manifest_key")
        }

        # Put both gene files in place by hand (presumably because the gene
        # writers are mocked above).
        test_data["genes_df"].to_csv(os.path.join(results_dir, "features.tsv.gz"),
                                     index_label="featurekey",
                                     sep="\t", compression="gzip")
        test_data["genes_df"].to_csv(os.path.join(results_dir, "genes.tsv.gz"),
                                     index_label="featurekey",
                                     sep="\t", compression="gzip")

        self.matrix_converter.local_output_filename = "unit_test__to_mtx.zip"
        zip_path = self.matrix_converter._to_mtx()

        with zipfile.ZipFile(zip_path) as z:
            z.extractall()
        matrix = scipy.io.mmread(os.path.join(results_dir, "matrix.mtx.gz")).todense()

        self.assertAlmostEqual(
            matrix.sum(),
            sum(d["exprvalue"].sum() for d in test_data["expr_dfs"]),
            2)

        # Every cell has 20 genes with non-zero expression. Check first and
        # last cells to make sure that the expression matches.
        self.assertAlmostEqual(
            matrix[:, 0].sum(),
            test_data["expr_dfs"][0]['exprvalue'][:20].sum(),
            2)
        self.assertAlmostEqual(
            matrix[:, 1].sum(),
            test_data["expr_dfs"][0]['exprvalue'][20:40].sum(),
            2)
        self.assertAlmostEqual(
            matrix[:, -1].sum(),
            test_data["expr_dfs"][-1]['exprvalue'][-20:].sum(),
            2)
    finally:
        # Always clean up, including when an assertion above failed.
        shutil.rmtree(results_dir, ignore_errors=True)
        if zip_path is not None and os.path.exists(zip_path):
            os.remove(zip_path)