def test_get_identifier_changes(self):
    """
    Check the identifier-level changes extracted from two fold-change tables.

    Both tables contain P1 and P2, while P3 and P4 each occur in only one
    of them. Every one of the four identifiers must be present in the result
    and must hold exactly two values.
    """
    fc_tables = (
        "Identifier\tlogFC\tadj.P.Val\nP1\t0.01\t1\nP2\t0.02\t1\nP3\t0.03\t1\n",
        "Identifier\tlogFC\tadj.P.Val\nP1\t0.01\t1\nP2\t0.02\t1\nP4\t0.03\t1\n",
    )
    all_identifiers = {"P1", "P2", "P3", "P4"}
    identifier_fcs = [util.string_to_array(table) for table in fc_tables]

    identifier_changes = result_converter._get_identifier_changes(identifier_fcs, all_identifiers)

    self.assertIsNotNone(identifier_changes)
    self.assertEqual(4, len(identifier_changes))

    for identifier in all_identifiers:
        # assertIn names the missing identifier in the failure output
        self.assertIn(identifier, identifier_changes)
        self.assertEqual(2, len(identifier_changes[identifier]))
def test_get_pathway_changes(self):
    """
    Check the pathway-level changes extracted from two pathway tables.

    The tables share P1, P2 and P4, while P3 and P5 each occur in only one
    of them. With a p-value cut-off of 0.05, every one of the five pathways
    must be present in the result and must hold exactly two values.
    """
    pathway_tables = (
        "Pathway\tDirection\tFDR\nP1\tUp\t0.02\nP2\tDown\t0.01\nP3\tUp\t0.07\nP4\tDown\t0.09\n",
        "Pathway\tDirection\tFDR\nP1\tDown\t0.02\nP2\tUp\t0.01\nP5\tDown\t0.07\nP4\tUp\t0.09\n",
    )
    pathway_fcs = [util.string_to_array(table) for table in pathway_tables]
    all_pathways = {"P1", "P2", "P3", "P4", "P5"}

    pathway_changes = result_converter._get_pathway_changes(pathway_fcs, all_pathways, 0.05)

    self.assertIsNotNone(pathway_changes)
    self.assertEqual(5, len(pathway_changes))

    for pathway in all_pathways:
        # assertIn names the missing pathway in the failure output
        self.assertIn(pathway, pathway_changes)
        self.assertEqual(2, len(pathway_changes[pathway]))
def test_get_pathway_p_values(self):
    """
    Check the per-pathway p-values derived from two pathway tables.

    All five pathways must be reported. The expectations on the "p" entry
    are: P1 (0.02 in both tables, same direction) ends up below 0.02,
    P2 (0.01 in both tables, opposite direction) ends up above 0.01 and
    P3 (only present in the first table) keeps its 0.07.
    """
    pathway_tables = (
        "Pathway\tDirection\tPValue\nP1\tUp\t0.02\nP2\tDown\t0.01\nP3\tUp\t0.07\nP4\tDown\t0.09\n",
        "Pathway\tDirection\tPValue\nP1\tUp\t0.02\nP2\tUp\t0.01\nP5\tDown\t0.07\nP4\tUp\t0.09\n",
    )
    pathway_fcs = [util.string_to_array(table) for table in pathway_tables]

    pathway_p = result_converter._get_pathway_p_values(pathway_fcs)

    self.assertIsNotNone(pathway_p)
    self.assertEqual(5, len(pathway_p))

    for pathway in {"P1", "P2", "P3", "P4", "P5"}:
        # assertIn names the missing pathway in the failure output
        self.assertIn(pathway, pathway_p)

    self.assertGreater(0.02, pathway_p["P1"]["p"])
    self.assertLess(0.01, pathway_p["P2"]["p"])
    self.assertEqual(0.07, pathway_p["P3"]["p"])
def test_ssgsea(self):
    """
    Run the GSVA analyser on the class-level test request and check the
    resulting pathway table.

    The table must have five columns (pathway id, name and three samples),
    contain 143 pathways and report the expected values for the pathways
    R-HSA-1280218 and R-HSA-392499.
    """
    json_obj = json.loads(self.test_json)
    request = create_analysis_input_object(json_obj)
    dataset = request.datasets[0]
    dataset.df = util.string_to_array(dataset.data)

    # get the mappings
    mappings = util.map_identifiers({"MITF", "CD19", "MS4A1"})

    gene_set = self._get_gene_set()
    gene_id_colname = dataset.df.dtype.names[0]
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set,
        identifier_mapping=mappings,
        identifiers=dataset.df[:][gene_id_colname].tolist())

    analyser = ReactomeGSVARAnalyser()
    result = analyser.analyse_request(
        request=request,
        gene_set_mappings={dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # test the result
    self.assertEqual(1, len(result))
    self.assertIsNotNone(result[0].pathways)
    self.assertIsNotNone(result[0].fold_changes)

    # test the actual result
    reader = csv.DictReader(result[0].pathways.split("\n"), delimiter="\t")
    self.assertEqual(5, len(reader.fieldnames))

    for required_field in ("Pathway", "Name", "Sample.1", "Sample.2", "Sample.3"):
        # assertIn names the missing column in the failure output
        self.assertIn(required_field, reader.fieldnames)

    # test the pathways
    found_pathways = 0
    found_p1 = False
    found_p2 = False

    for pathway in reader:
        found_pathways += 1

        if pathway["Pathway"] == "R-HSA-1280218":
            found_p1 = True
            self.assertEqual("0.0", pathway["Sample.1"].strip())
            self.assertEqual("0.02880908", pathway["Sample.2"].strip())
            self.assertEqual("0.02880908", pathway["Sample.3"].strip())

        if pathway["Pathway"] == "R-HSA-392499":
            found_p2 = True
            self.assertEqual(-0.5, float(pathway["Sample.1"]))
            self.assertEqual(-0.5, float(pathway["Sample.2"]))
            self.assertEqual(-0.5, float(pathway["Sample.3"]))

    self.assertEqual(143, found_pathways)
    self.assertTrue(found_p1)
    self.assertTrue(found_p2)
def testOneLineConversion(self):
    """
    Convert a matrix that consists of a header and a single data row.

    The converted array is expected to be 0-dimensional and to expose the
    gene name through its first field.
    :return:
    """
    single_row_matrix = "\\tSample 1\\tSample2\\nGene 1\\t10\\t20\\n"

    converted = util.string_to_array(single_row_matrix)
    first_field = converted.dtype.names[0]

    self.assertEqual(0, converted.ndim)
    self.assertEqual("Gene 1", converted[first_field].item())
def testConversion(self):
    """
    Convert a matrix that consists of a header and three data rows.

    The converted array is expected to be 1-dimensional and to expose the
    gene names of the first two rows through its first field.
    :return:
    """
    matrix = "\\tSample 1\\tSample2\\nGene 1\\t10\\t20\\nGene 2\\t10\\t30\\nGene 3\\t10\\t30\\n"

    converted = util.string_to_array(matrix)
    first_field = converted.dtype.names[0]

    self.assertEqual(1, converted.ndim)
    self.assertEqual("Gene 1", converted[0][first_field])
    self.assertEqual("Gene 2", converted[1][first_field])
def _get_identifier_zscores(result: AnalysisResult) -> dict:
    """
    Extract the expression values for every identifier and return them as a
    dict with the identifier as key and the expression values across all
    datasets as value (list). Expression values are z-score normalised.

    For every dataset, the values of each row (all columns except the
    "Identifier" column) are z-score normalised and appended to the list of
    that row's identifier. Identifiers that do not occur in a dataset receive
    one 0 per value column of that dataset instead.

    :param result: An AnalysisResult object
    :return: A dict with the identifier as key and the z-scores as value
    :raises ConversionException: If a dataset has no fold-change data
    """
    # imported locally so that the function does not rely on a module-level import
    import numpy

    # get the observed proteins
    all_identifiers = set()
    identifier_fcs = []

    for dataset in result.results:
        if not dataset.fold_changes:
            raise ConversionException(
                "Fold-change data missing in dataset '{}'".format(
                    dataset.name))

        # a table with a single data row is converted into a 0-dimensional
        # array which cannot be iterated - promote it to one dimension
        identifier_fc = numpy.atleast_1d(
            util.string_to_array(dataset.fold_changes))
        identifier_fcs.append(identifier_fc)

        identifier_index = identifier_fc.dtype.names.index("Identifier")
        all_identifiers.update(row[identifier_index] for row in identifier_fc)

    # Use the z-score across the genes as identifier expression values
    z_scores_per_identifier = {identifier: [] for identifier in all_identifiers}

    for identifier_fc in identifier_fcs:
        # locate the identifier column by name, as done when collecting the
        # identifiers above, instead of assuming it is the first column
        identifier_index = identifier_fc.dtype.names.index("Identifier")
        n_samples = len(identifier_fc.dtype.names) - 1
        processed_identifiers = set()

        for identifier_row in identifier_fc:
            identifier = identifier_row[identifier_index]

            # all columns except the identifier column hold the values
            row_values = identifier_row.tolist()
            expression_values = (row_values[:identifier_index]
                                 + row_values[identifier_index + 1:])

            z_scores = zscore(expression_values)
            z_scores_per_identifier[identifier] += z_scores.tolist()
            processed_identifiers.add(identifier)

        # add the missing values
        for identifier in all_identifiers - processed_identifiers:
            z_scores_per_identifier[identifier] += [0] * n_samples

    return z_scores_per_identifier
def test_analysis(self):
    """
    Run the R based analyser on the filtered class-level test request and
    check the shape of the resulting pathway table.

    The pathway result must consist of 233 lines and provide all required
    columns.
    """
    json_obj = json.loads(self.test_json)
    request = create_analysis_input_object(json_obj)
    dataset = request.datasets[0]
    dataset.df = util.string_to_array(dataset.data)

    # get the mappings
    mappings = util.map_identifiers({
        "MITF", "CD19", "MS4A1", "SDC1", "CD38", "EGFR", "IL10", "IL6",
        "GRB2", "GAB1", "SHC1"
    })

    # filter the dataset
    dataset.df = ReactomeAnalysisWorker._filter_dataset(
        dataset.df, mappings, dataset.design, 1)

    gene_set = self._get_gene_set()
    gene_id_colname = dataset.df.dtype.names[0]
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set,
        identifier_mapping=mappings,
        identifiers=dataset.df[:][gene_id_colname].tolist())

    analyser = ReactomeRAnalyser()
    result = analyser.analyse_request(
        request=request,
        gene_set_mappings={dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # test the result
    self.assertEqual(1, len(result))
    self.assertIsNotNone(result[0].pathways)

    result_lines = result[0].pathways.split("\n")
    self.assertEqual(233, len(result_lines))

    reader = csv.DictReader(result_lines, delimiter="\t")
    required_fields = ("Pathway", "Name", "Direction", "FDR", "PValue",
                       "NGenes")

    for field in required_fields:
        # assertIn additionally shows the columns that are present
        self.assertIn(field, reader.fieldnames, "Missing field " + field)
def test_pathway_string(self):
    """
    Run the GSVA analyser with the "pathways" parameter set to two pathway
    ids and make sure that the pathway table only contains two entries.
    """
    json_obj = json.loads(self.test_json)

    # add the parameters
    json_obj["parameters"] = [
        {"name": "pathways", "value": "R-HSA-1280218,R-HSA-392499"},
        {"name": "create_reactome_visualization", "value": "False"},
    ]

    request = create_analysis_input_object(json_obj)
    first_dataset = request.datasets[0]
    first_dataset.df = util.string_to_array(first_dataset.data)

    # get the mappings
    mappings = util.map_identifiers({"MITF", "CD19", "MS4A1"})

    gene_set = self._get_gene_set()
    gene_id_colname = first_dataset.df.dtype.names[0]
    gene_ids = first_dataset.df[:][gene_id_colname].tolist()
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set, identifier_mapping=mappings, identifiers=gene_ids)

    result = ReactomeGSVARAnalyser().analyse_request(
        request=request,
        gene_set_mappings={first_dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # test the result
    self.assertEqual(1, len(result))
    first_result = result[0]
    self.assertIsNotNone(first_result.pathways)
    self.assertIsNotNone(first_result.fold_changes)

    # test the actual result
    reader = csv.DictReader(first_result.pathways.split("\n"), delimiter="\t")
    self.assertEqual(5, len(reader.fieldnames))

    # there should only be two entries
    n_entries = sum(1 for _ in reader)
    self.assertEqual(2, n_entries)
def convert_string_data(str_data: str, result_queue: multiprocessing.Queue) -> None:
    """
    Launch this function in a new process to convert a string encoded
    data object to an array.

    The converted array is put on the result queue. If the first column
    (the gene names) is not a unicode string column, it is cast to one
    first. A util.ConversionException raised during the conversion is put
    on the queue instead of the array; any other exception is not caught
    here.

    :param str_data: The data matrix as a tab-delimited string
    :param result_queue: A queue object where the result will be stored or an exception
    """
    try:
        result_data = util.string_to_array(str_data)

        # change the gene names to string (not the case for NCBI gene ids)
        # dtype.kind is used instead of the dtype's string form since the
        # latter starts with a byte-order character
        if result_data.dtype[0].kind != "U":
            column_types = result_data.dtype.descr
            # NOTE(review): '<U15' limits the converted names to 15
            # characters - confirm that this is sufficient for all inputs
            column_types[0] = (column_types[0][0], '<U15')
            result_data = result_data.astype(column_types)

        result_queue.put(result_data)
    # Mark the analysis as failed if the conversion caused an error.
    except util.ConversionException as e:
        result_queue.put(e)
def test_heartbeat(self):
    """
    Make sure the analyser invokes the registered heartbeat callback.

    After the analysis finished, the last recorded heartbeat must be more
    recent than the time taken (minus one second) before the analysis
    was started.
    """
    json_obj = json.loads(self.test_json)
    json_obj["parameters"].append({"name": "max_missing_values", "value": "1"})

    # remove the patient since this coefficient cannot be estimated
    json_obj["datasets"][0]["design"].pop("patient")

    request = create_analysis_input_object(json_obj)
    first_dataset = request.datasets[0]
    first_dataset.df = util.string_to_array(first_dataset.data)

    # get the mappings
    mappings = util.map_identifiers({"MITF", "CD19", "MS4A1"})

    # filter the dataset
    first_dataset.df = ReactomeAnalysisWorker._filter_dataset(
        first_dataset.df, mappings, first_dataset.design, 1)

    gene_set = self._get_gene_set()
    gene_id_colname = first_dataset.df.dtype.names[0]
    gene_ids = first_dataset.df[:][gene_id_colname].tolist()
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set, identifier_mapping=mappings, identifiers=gene_ids)

    analyser = ReactomeRAnalyser()
    analyser.set_heartbeat_callback(self.update_heartbeat)

    start_time = int(time.time()) - 1

    # only the heartbeat side effect is of interest, not the result
    analyser.analyse_request(
        request=request,
        gene_set_mappings={first_dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # make sure the heartbeat was updated
    self.assertGreater(self.last_heartbeat, start_time)
def _get_gsva_pathway_expression(result: AnalysisResult) -> dict:
    """
    Extracts the GSVA expression values per pathway and returns the
    concatenated values across all datasets as a dict with the pathway id
    as key and the expression values in a list as value.

    Pathways that do not occur in a dataset receive one 0 per sample
    column of that dataset instead.

    :param result: An AnalysisResult object
    :return: A dict with the pathway id as key and expression values as value
    """
    # imported locally so that the function does not rely on a module-level import
    import numpy

    # get all observed pathways
    all_pathways = set()
    pathway_fcs = []

    for dataset in result.results:
        # a table with a single data row is converted into a 0-dimensional
        # array which cannot be iterated - promote it to one dimension
        pathway_fc = numpy.atleast_1d(util.string_to_array(dataset.pathways))
        pathway_fcs.append(pathway_fc)

        pathway_column = pathway_fc.dtype.names.index("Pathway")
        all_pathways.update(row[pathway_column] for row in pathway_fc)

    # Use the GSVA score as pathway expression value of all experiments
    gsva_expr_per_pathway = {pathway_id: [] for pathway_id in all_pathways}

    for pathway_fc in pathway_fcs:
        # -2 since the pathway id + name column should be ignored
        n_samples = len(pathway_fc.dtype.names) - 2
        processed_pathways = set()

        for pathway_row in pathway_fc:
            # the pathway id and the name are expected in the first two
            # columns, everything after them is treated as a sample value
            pathway_id = pathway_row[0]
            gsva_expr_per_pathway[pathway_id] += pathway_row.tolist()[2:]
            processed_pathways.add(pathway_id)

        # add missing values
        for pathway_id in all_pathways - processed_pathways:
            gsva_expr_per_pathway[pathway_id] += [0] * n_samples

    return gsva_expr_per_pathway
def test_parameter_passing(self):
    """
    Run the R based analyser with an additional "max_missing_values"
    parameter and check the complete result.

    Checked are the number of parameters of the request, the layout of the
    pathway table, the pathways found at selected lines, the regulation
    direction of every pathway and the fold change reported for MITF.
    """
    json_obj = json.loads(self.test_json)
    json_obj["parameters"].append({
        "name": "max_missing_values",
        "value": "1"
    })

    # remove the patient since this coefficient cannot be estimated
    json_obj["datasets"][0]["design"].pop("patient")

    request = create_analysis_input_object(json_obj)
    dataset = request.datasets[0]
    dataset.df = util.string_to_array(dataset.data)

    self.assertEqual(3, len(request.parameters))
    # default values inserted automatically
    self.assertEqual(6, len(request.parameter_dict))
    self.assertIn("max_missing_values", request.parameter_dict)

    # get the mappings
    mappings = util.map_identifiers({"MITF", "CD19", "MS4A1"})

    # filter the dataset
    dataset.df = ReactomeAnalysisWorker._filter_dataset(
        dataset.df, mappings, dataset.design, 1)

    gene_set = self._get_gene_set()
    gene_id_colname = dataset.df.dtype.names[0]
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set,
        identifier_mapping=mappings,
        identifiers=dataset.df[:][gene_id_colname].tolist())

    analyser = ReactomeRAnalyser()
    result = analyser.analyse_request(
        request=request,
        gene_set_mappings={dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # test the result
    self.assertEqual(1, len(result))
    self.assertIsNotNone(result[0].pathways)

    result_lines = result[0].pathways.split("\n")
    self.assertEqual(24, len(result_lines))

    reader = csv.DictReader(result_lines, delimiter="\t")
    required_fields = ("Pathway", "Name", "Direction", "FDR", "PValue",
                       "NGenes")

    for field in required_fields:
        self.assertIn(field, reader.fieldnames, "Missing field " + field)

    # pathways that are expected at specific lines of the result
    expected_pathway_at_line = {
        2: "R-HSA-392499",
        6: "R-HSA-3232118",
        15: "R-HSA-162582",
        24: "R-HSA-6811558",
    }

    # pathways that are expected to be down-regulated, all others are
    # expected to be up-regulated
    down_regulated_pathways = ("R-HSA-392499", "R-HSA-597592",
                               "R-HSA-2990846", "R-HSA-3108232",
                               "R-HSA-3232118")

    for row in reader:
        if reader.line_num in expected_pathway_at_line:
            self.assertEqual(expected_pathway_at_line[reader.line_num],
                             row["Pathway"])

        if row["Pathway"] in down_regulated_pathways:
            self.assertEqual("Down", row["Direction"])
            self.assertLess(
                float(row["av_foldchange"]), 0,
                "Incorrect regulation for " + row["Pathway"])
        else:
            self.assertEqual("Up", row["Direction"])
            self.assertGreater(float(row["av_foldchange"]), 0)

    # test the FC result
    self.assertIsNotNone(result[0].fold_changes)

    fc_lines = result[0].fold_changes.split("\n")
    self.assertEqual(4, len(fc_lines))

    fc_reader = csv.DictReader(fc_lines, delimiter="\t")

    for field in ("logFC", "Identifier"):
        self.assertIn(field, fc_reader.fieldnames,
                      "Missing FC field " + field)

    mitf_found = False

    for row in fc_reader:
        if row["Identifier"] == "MITF":
            self.assertAlmostEqual(4.53, float(row["logFC"]), delta=0.01)
            mitf_found = True

    self.assertTrue(mitf_found, "Failed to find MITF in FC data")
def _convert_gsa_result(
        result: AnalysisResult,
        reactome_blueprint: dict,
        min_p: float = 0.05,
        use_p: bool = False,
        excluded_pathways: list = None) -> dict:
    """
    Adds the data of the passed AnalysisResult to the passed
    reactome_blueprint.

    The blueprint is deep-copied first, the passed dict is therefore not
    modified.

    :param result: The AnalysisResult to draw the data from
    :param reactome_blueprint: The result retrieved from the REACTOME ORA
                               analysis
    :param min_p: The minimum p-value in order to consider a pathway as
                  significantly regulated.
    :param use_p: If set, p-values instead of fold-changes / pathway
                  direction are used as "expression values"
    :param excluded_pathways: An optional list of excluded pathways. If set,
                              pathways present in this list will be marked as
                              excluded and will not trigger an exception if no
                              expression data is available. Defaults to an
                              empty list.
    :return: The adapted reactome result as a dict.
    :raises ConversionException: If a dataset has no fold-change data or if
                                 expression values, regulation information or
                                 p-values are missing for an entry of the
                                 blueprint.
    """
    # create the default per call instead of sharing one mutable default
    # object between all calls
    if excluded_pathways is None:
        excluded_pathways = []

    reactome_blueprint = copy.deepcopy(reactome_blueprint)

    # get all observed pathways
    all_pathways = set()
    pathway_fcs = []

    for dataset in result.results:
        pathway_fc = util.string_to_array(dataset.pathways)
        pathway_fcs.append(pathway_fc)

        pathway_column = pathway_fc.dtype.names.index("Pathway")
        all_pathways.update(row[pathway_column] for row in pathway_fc)

    # get the pathway-level changes
    pathway_expr = _get_pathway_changes(pathway_fcs,
                                        all_pathways=all_pathways,
                                        min_p=min_p,
                                        return_p=use_p)
    pathway_p = _get_pathway_p_values(pathway_fcs)

    # get the observed proteins
    all_identifiers = set()
    identifier_fcs = []

    for dataset in result.results:
        if not dataset.fold_changes:
            raise ConversionException(
                "Fold-change data missing in dataset '{}'".format(
                    dataset.name))

        identifier_fc = util.string_to_array(dataset.fold_changes)
        identifier_fcs.append(identifier_fc)

        identifier_column = identifier_fc.dtype.names.index("Identifier")
        all_identifiers.update(
            row[identifier_column] for row in identifier_fc)

    # get the gene-/protein-level changes
    identifier_expr = _get_identifier_changes(identifier_fcs,
                                              all_identifiers,
                                              return_p=use_p)

    # set the type
    reactome_blueprint["summary"][
        "type"] = "GSA_STATISTICS" if use_p else "GSA_REGULATION"
    reactome_blueprint["summary"]["sampleName"] = "Multi-sample analysis"

    # add the dataset names as column names
    expression_summary = reactome_blueprint["expressionSummary"]
    expression_summary["columnNames"] = [
        dataset.name for dataset in result.results
    ]
    expression_summary["min"] = 0 if use_p else -2
    expression_summary["max"] = 1 if use_p else 2

    # number of values used for pathways without expression data
    n_datasets = len(result.results)

    # populate the pathway data
    for pathway in reactome_blueprint["pathways"]:
        pathway_id = pathway["stId"]
        pathway_data = pathway["data"]

        # add the entity-level expression values
        for identifier_level in ("entities", "interactors"):
            for entity in pathway_data[identifier_level]:
                org_id = entity["id"]

                if org_id not in identifier_expr:
                    raise ConversionException(
                        "Missing expression values for " + org_id)

                entity["exp"] = identifier_expr[org_id]

        # update the statistics
        for statistics in pathway_data["statistics"]:
            # set the pathway p-value to 1 if the pathway was excluded
            if _ignore_pathway(pathway, excluded_pathways=excluded_pathways):
                statistics["entitiesPValue"] = 1
                statistics["entitiesFDR"] = 1
                # every entry receives its own list so that a later change
                # of one entry cannot affect the others
                statistics["exp"] = [0] * n_datasets
            else:
                if pathway_id not in pathway_expr:
                    raise ConversionException(
                        "Missing pathway regulation information for '" +
                        pathway_id + "'")

                if pathway_id not in pathway_p:
                    raise ConversionException(
                        "Missing p-value for pathway '" + pathway_id + "'")

                statistics["entitiesPValue"] = pathway_p[pathway_id]["p"]
                statistics["entitiesFDR"] = pathway_p[pathway_id]["fdr"]
                statistics["exp"] = pathway_expr[pathway_id]

    # populate the "not found" data
    for not_found in reactome_blueprint["notFound"]:
        identifier = not_found["id"]

        if identifier not in identifier_expr:
            raise ConversionException("Missing expression data for " +
                                      identifier)

        # add the expression data
        not_found["exp"] = identifier_expr[identifier]

    return reactome_blueprint
def test_no_design(self):
    """
    Run the GSVA analyser on a request whose dataset does not define an
    experimental design.

    The pathway table must have five columns, contain 143 pathways and
    report the expected values for the pathways R-HSA-1280218 and
    R-HSA-392499.
    """
    # NOTE(review): the backslash at the end of the "data" line continues
    # the string literal, the leading whitespace of the following line is
    # therefore part of the data string - confirm the intended whitespace
    test_json = """
    {
        "analysisId": "test_01",
        "datasets": [
            {
                "data": "\\tSample 1\\tSample2\\tSample 3\\nCD19\\t10\\t20\\t2\\nMS4A1\\t10\\t20\\t2\\n\
                MITF\\t10\\t0\\t0\\n",
                "name": "First experiment",
                "type": "rnaseq_counts"
            }
        ],
        "methodName": "ssgsea"
    }
    """

    json_obj = json.loads(test_json)
    request = create_analysis_input_object(json_obj)

    # check the request before it is accessed for the first time
    self.assertIsNotNone(request)

    dataset = request.datasets[0]
    dataset.df = util.string_to_array(dataset.data)

    # get the mappings
    mappings = util.map_identifiers({"MITF", "CD19", "MS4A1"})

    gene_set = self._get_gene_set()
    gene_id_colname = dataset.df.dtype.names[0]
    gene_set_mapping = GeneSetMapping.create_mapping(
        gene_set,
        identifier_mapping=mappings,
        identifiers=dataset.df[:][gene_id_colname].tolist())

    analyser = ReactomeGSVARAnalyser()
    result = analyser.analyse_request(
        request=request,
        gene_set_mappings={dataset.name: gene_set_mapping},
        identifier_mappings=mappings,
        gene_set=gene_set)

    # test the result
    self.assertEqual(1, len(result))
    self.assertIsNotNone(result[0].pathways)
    self.assertIsNotNone(result[0].fold_changes)

    # test the actual result
    reader = csv.DictReader(result[0].pathways.split("\n"), delimiter="\t")
    self.assertEqual(5, len(reader.fieldnames))

    for required_field in ("Pathway", "Sample_1", "Sample2", "Sample_3"):
        # assertIn additionally shows the columns that are present
        self.assertIn(required_field, reader.fieldnames,
                      "Missing required field " + required_field)

    # test the pathways
    found_pathways = 0

    for pathway in reader:
        found_pathways += 1

        if pathway["Pathway"] == "R-HSA-1280218":
            self.assertEqual("0.0", pathway["Sample_1"].strip())
            self.assertEqual("0.02880908", pathway["Sample2"].strip())
            self.assertEqual("0.02880908", pathway["Sample_3"].strip())

        if pathway["Pathway"] == "R-HSA-392499":
            self.assertEqual(-0.5, float(pathway["Sample_1"]))
            self.assertEqual(-0.5, float(pathway["Sample2"]))
            self.assertEqual(-0.5, float(pathway["Sample_3"]))

    self.assertEqual(143, found_pathways)