def test_graph_equality(turtlish, graph):
    """
    Check that a headless turtle document and a graph hold the same triples.

    The ``@prefix`` header is generated from the namespaces bound on a fresh
    RDFGraph and prepended to ``turtlish`` before it is parsed.

    :param turtlish: String of triples in turtle format without prefix header
    :param graph: Graph object to test against
    :return: Boolean, True if graphs contain same set of triples
    """
    turtle_graph = RDFGraph()
    turtle_graph.bind_all_namespaces()
    prefixes = "\n".join(
        "@prefix {}: <{}> .".format(prefix, uri)
        for prefix, uri in turtle_graph.namespace_manager.namespaces()
    )
    # Keep the generated header and the caller's text on separate lines so
    # the result does not depend on turtlish starting with whitespace.
    turtle_string = prefixes + "\n" + turtlish
    turtle_graph.parse(io.StringIO(turtle_string), format="turtle")

    turtle_triples = set(turtle_graph)
    ref_triples = set(graph)
    equality = turtle_triples == ref_triples
    if not equality:
        # Sorted differences make the log output stable between runs.
        LOG.warning(
            "Triples do not match\n"
            "\tLeft hand difference: %s\n"
            "\tRight hand difference: %s",
            sorted(turtle_triples - ref_triples),
            sorted(ref_triples - turtle_triples))
    return equality
def test_graph_equality(self, turtlish, graph):
    """
    Check that a headless turtle document and a graph hold the same triples.

    The ``@prefix`` header is generated from the namespaces bound on a fresh
    RDFGraph and prepended to ``turtlish`` before it is parsed.

    :param turtlish: String of triples in turtle format without prefix header
    :param graph: Graph object to test against
    :return: Boolean, True if graphs contain same set of triples
    """
    turtle_graph = RDFGraph()
    turtle_graph.bind_all_namespaces()
    prefixes = "\n".join(
        "@prefix {}: <{}> .".format(prefix, uri)
        for prefix, uri in turtle_graph.namespace_manager.namespaces()
    )
    # Keep the generated header and the caller's text on separate lines so
    # the result does not depend on turtlish starting with whitespace.
    turtle_string = prefixes + "\n" + turtlish
    turtle_graph.parse(io.StringIO(turtle_string), format="turtle")

    turtle_triples = set(turtle_graph)
    ref_triples = set(graph)
    equality = turtle_triples == ref_triples
    if not equality:
        # Pass the differences as logging arguments so they are only
        # rendered when the record is actually emitted; the resulting text
        # is the same as the previous str.format() version.
        logger.warning(
            "Triples do not match\n"
            "Left hand difference: %s\n"
            "Right hand difference:%s",
            turtle_triples - ref_triples,
            ref_triples - turtle_triples)
    return equality
def test_parse(self):
    """
    Run the ClinVar parser on every record in RCVS and compare its output
    with the matching reference turtle file.

    For each record the parser is invoked with a mocked command line, the
    resulting n-triples file is loaded, a dot rendering of it is written,
    and the graph is compared with the reference triples.
    """
    for rcv in RCVS:
        output_nt = rcv + '.nt'
        input_xml = rcv + '.xml.gz'
        reference_ttl = TTL_PATH + rcv + '.ttl'
        with self.subTest(rcv=rcv):
            mock_args = [
                "test_clinvar.py",
                "--inputdir", XML_PATH,
                "--filename", input_xml,
                "--mapfile", MAP_FILE,
                "--destination", NT_PATH,
                "--output", output_nt
            ]
            # Limit the argv override to the parser call; a started but
            # never stopped patch would leak into every later test.
            with patch('sys.argv', mock_args):
                clinvar_parse()

            query_graph = RDFGraph()
            query_graph.bind_all_namespaces()
            query_graph.parse(NT_PATH + output_nt, format='nt')

            # Read the file verbatim: joining readlines() with "\n" would
            # double every line break, including those inside literals.
            with open(reference_ttl, 'r') as ref_fh:
                ref_graph = ref_fh.read()

            # NOTE(review): the message says "Reference graph" but the
            # value logged is the parser output (query_graph).
            LOG.debug(
                "Reference graph: %s",
                query_graph.serialize(format="turtle").decode("utf-8"))

            # Convert output from ClinVar parse to dot then png
            dot_file_path = DOT_PATH + rcv + ".dot"
            with open(dot_file_path, 'w') as dot_file:
                rdf2dot(query_graph, dot_file)

            self.assertTrue(
                TestUtils.test_graph_equality(ref_graph, query_graph))
def test_snp_trait_association(self):
    """
    Check the triples produced by _add_variant_trait_association.

    The association is added for the fixture SNP and the source graph is
    then queried with a SPARQL pattern that must bind ?snp exactly once.
    :return:
    """
    source = self.source
    fixture = self.test_data

    efo_graph = RDFGraph()
    logger.info("Loading EFO ontology in separate rdf graph")
    efo_graph.parse(source.files['efo']['url'], format='xml')
    efo_graph.bind_all_namespaces()
    logger.info("Finished loading EFO ontology")

    snp_curie, _snp_type = source._get_curie_and_type_from_id(
        fixture['snp_label'])

    description = source._make_description(
        fixture['trait'],
        fixture['init_sample_desc'],
        fixture['replicated_sample_desc'],
        fixture['platform'],
        fixture['pvalue'])

    source._add_variant_trait_association(
        snp_curie, fixture['trait_uri'], efo_graph,
        fixture['pubmed'], description)

    query_template = """
        SELECT ?snp
        WHERE {{
            <https://monarchinitiative.org/MONARCH_b46cdf48950cb00d> a OBAN:association ;
                dc:description "{}" ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0003949 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            <https://monarchinitiative.org/MONARCH_70a05d8eb1c3d4b0> a OBAN:association ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0006995 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            EFO:0003949 a owl:Class ;
                rdfs:label "eye color"^^xsd:string ;
                rdfs:subClassOf UPHENO:0001001 .

            ?snp OBO:RO_0002326 EFO:0003949, EFO:0006995 .

            PMID:25918132 a OBO:IAO_0000013 .
        }}
    """
    rows = list(source.graph.query(query_template.format(description)))

    # The query must pass and return exactly one row holding the SNP node
    snp_node = URIRef(source.graph._getNode("dbSNP:rs1491921"))
    self.assertEqual(rows, [(snp_node, )])
def test_snp_trait_association(self):
    """
    Check the triples produced by _add_variant_trait_association.

    Starting from an empty source graph, the association for the fixture
    SNP is added and the whole graph is compared with the expected triples.
    :return:
    """
    # assertEqual reports the actual triple count when the graph is not
    # empty, which a bare assertTrue(... == 0) cannot do.
    self.assertEqual(len(list(self.source.graph)), 0)

    efo_ontology = RDFGraph()
    LOG.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(self.source.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    LOG.info("Finished loading EFO ontology")

    # Only the curie is needed here; the variant type is discarded.
    variant_curie, _ = self.source._get_curie_and_type_from_id(
        self.test_data['snp_label'])

    description = self.source._make_description(
        self.test_data['trait'],
        self.test_data['init_sample_desc'],
        self.test_data['replicated_sample_desc'],
        self.test_data['platform'],
        self.test_data['pvalue'])

    self.source._add_variant_trait_association(
        variant_curie, self.test_data['trait_uri'], efo_ontology,
        self.test_data['pubmed'], description)

    # Expected graph content; {0} is the generated description and is
    # used by both associations.
    triples = """
        MONARCH:bffc7a930c08cc8fe931 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0003949 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        MONARCH:bff9b97458d67ed7f517 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0006995 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        EFO:0003949 a owl:Class ;
            rdfs:label "eye color"^^xsd:string ;
            rdfs:subClassOf UPHENO:0001001 .

        dbSNP:rs1491921 RO:0003304 EFO:0003949, EFO:0006995 .

        PMID:25918132 a OBO:IAO_0000013 .
    """.format(description)

    self.assertTrue(
        self.test_util.test_graph_equality(triples, self.source.graph))
def test_snp_trait_association(self):
    """
    Verify the graph pattern written by _add_variant_trait_association.

    After adding the association for the fixture SNP, a SPARQL query over
    the source graph has to yield a single row containing that SNP.
    :return:
    """
    data = self.test_data

    ontology = RDFGraph()
    logger.info("Loading EFO ontology in separate rdf graph")
    ontology.parse(self.source.files['efo']['url'], format='xml')
    ontology.bind_all_namespaces()
    logger.info("Finished loading EFO ontology")

    curie_and_type = self.source._get_curie_and_type_from_id(
        data['snp_label'])
    curie, _kind = curie_and_type

    description_args = (
        data['trait'],
        data['init_sample_desc'],
        data['replicated_sample_desc'],
        data['platform'],
        data['pvalue'],
    )
    description = self.source._make_description(*description_args)

    self.source._add_variant_trait_association(
        curie, data['trait_uri'], ontology, data['pubmed'], description)

    sparql = """
        SELECT ?snp
        WHERE {{
            <https://monarchinitiative.org/MONARCH_b46cdf48950cb00d> a OBAN:association ;
                dc:description "{}" ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0003949 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            <https://monarchinitiative.org/MONARCH_70a05d8eb1c3d4b0> a OBAN:association ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0006995 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            EFO:0003949 a owl:Class ;
                rdfs:label "eye color"^^xsd:string ;
                rdfs:subClassOf UPHENO:0001001 .

            ?snp OBO:RO_0002326 EFO:0003949, EFO:0006995 .

            PMID:25918132 a OBO:IAO_0000013 .
        }}
    """.format(description)

    # One row, holding the node for the fixture SNP, is expected back
    found = list(self.source.graph.query(sparql))
    wanted = [(URIRef(self.source.graph._getNode("dbSNP:rs1491921")),)]
    self.assertEqual(found, wanted)
def test_graph_equality(turtlish, graph):
    """
    Check that a headless turtle document and a graph hold the same triples.

    ``turtlish`` is first tried as the path of an existing file; if it is
    not one, a string argument is used as the turtle text itself. The
    ``@prefix`` header is generated from the namespaces bound on a fresh
    RDFGraph. ``graph`` is passed through
    ``TestUtils.remove_ontology_axioms`` before the comparison.

    :param turtlish: file path or string of triples in turtle format
        without prefix header
    :param graph: Graph object to test against
    :return: Boolean, True if graphs contain same set of triples
    :raises ValueError: if turtlish is neither an existing file path nor a
        string
    """
    # Path() rejects anything that is not str / path-like with a TypeError;
    # translate that into the ValueError this function advertises.
    try:
        candidate = Path(turtlish)
    except TypeError:
        raise ValueError("turtlish must be filepath or string") from None

    # A long turtle document is not a usable file name and can make the
    # stat call itself fail; that simply means "not a file".
    try:
        is_file = candidate.is_file()
    except (OSError, ValueError):
        is_file = False

    if is_file:
        # Errors while reading a file that does exist are real failures
        # and propagate instead of being reinterpreted as turtle text.
        headless_ttl = candidate.read_text()
    elif isinstance(turtlish, str):
        headless_ttl = turtlish
    else:
        raise ValueError("turtlish must be filepath or string")

    turtle_graph = RDFGraph()
    turtle_graph.bind_all_namespaces()
    prefixes = "\n".join(
        "@prefix {}: <{}> .".format(prefix, uri)
        for prefix, uri in turtle_graph.namespace_manager.namespaces()
    )
    # Keep the generated header and the document on separate lines.
    turtle_string = prefixes + "\n" + headless_ttl
    turtle_graph.parse(io.StringIO(turtle_string), format="turtle")

    TestUtils.remove_ontology_axioms(graph)

    turtle_triples = set(turtle_graph)
    ref_triples = set(graph)
    equality = turtle_triples == ref_triples
    if not equality:
        LOG.warning(
            "Triples do not match\n"
            "\tLeft hand difference: %s\n"
            "\tRight hand difference: %s",
            sorted(turtle_triples - ref_triples),
            sorted(ref_triples - turtle_triples))
    return equality
def main():
    """
    Load hp.owl, build the term tree below HP:0000118 with hpo_to_tree and
    write the results to 'hpo-tree.json' and 'hpo-terms.tsv'.
    """
    root = "HP:0000118"
    hpo_terms = OrderedDict()

    hpo = RDFGraph()
    hpo.parse("http://purl.obolibrary.org/obo/hp.owl", format='xml')
    hpo.bind_all_namespaces()
    hpo.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")

    # hpo_to_tree fills both the nested tree and the flat term table
    tree = {root: {}}
    hpo_to_tree(root, hpo_terms, hpo, tree, [])

    with open('hpo-tree.json', 'w') as tree_fh:
        json.dump(tree, tree_fh)

    row_format = "{0}\t{1}\t{2}\t{3}\n"
    with open('hpo-terms.tsv', 'w') as terms_fh:
        for term_id, term in hpo_terms.items():
            lay_labels = "|".join(term['lay_person'])
            terms_fh.write(row_format.format(
                term_id, term['label'], lay_labels, term['parents']))
def main():
    """
    Parse the HPO ontology, collect the terms reachable from HP:0000118 via
    hpo_to_tree, and dump the tree as JSON and the terms as a TSV file.
    """
    ontology = RDFGraph()
    root_id = "HP:0000118"
    terms = OrderedDict()

    ontology.parse("http://purl.obolibrary.org/obo/hp.owl", format='xml')
    ontology.bind_all_namespaces()
    ontology.bind(
        "oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")

    term_tree = {root_id: {}}
    visited_path = []
    hpo_to_tree(root_id, terms, ontology, term_tree, visited_path)

    with open('hpo-tree.json', 'w') as outfile:
        json.dump(term_tree, outfile)

    with open('hpo-terms.tsv', 'w') as outfile:
        for curie, record in terms.items():
            # columns: id, label, pipe-joined lay terms, parents
            columns = (
                curie,
                record['label'],
                "|".join(record['lay_person']),
                record['parents'],
            )
            outfile.write("{0}\t{1}\t{2}\t{3}\n".format(*columns))
def process_catalog(self, limit=None):
    """
    Read the tab-separated catalog file and add its content to the graph.

    The SO ontology is loaded into a separate graph and the mondo JSON file
    is read up front. Each data row whose column count matches the
    configured column list is then turned into SNP or haplotype triples
    plus a variant-trait association. Rows with the wrong number of
    columns, or without a strongest SNP risk allele, are logged and
    skipped.

    :param limit: if not None, processing stops once the csv reader's line
        number exceeds this value
    :return: None
    """
    src_key = 'catalog'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing Data from %s", raw)

    so_ontology = RDFGraph(False, "SO")
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    mondo_file = '/'.join((self.rawdir, self.files['mondo']['file']))
    with open(mondo_file, 'r') as mondo_fh:
        mondo_data = json.load(mondo_fh)

    col = self.files[src_key]['columns']

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        row = next(reader)
        # The header check is still performed; as before, its return value
        # does not influence processing.
        self.check_fileheader(col, row)

        for row in reader:
            if len(col) != len(row):
                # Report the number of columns found; passing the row list
                # itself to %i made the logging call fail to format.
                LOG.error(
                    'BadRow: %i has %i columns', reader.line_num, len(row))
                continue

            # Columns that are present in the file but not used here:
            # DATE ADDED TO CATALOG, FIRST AUTHOR, DATE, JOURNAL, LINK,
            # STUDY, REGION, REPORTED GENE(S), UPSTREAM_GENE_DISTANCE,
            # DOWNSTREAM_GENE_DISTANCE, SNPS, INTERGENIC, PVALUE_MLOG,
            # P-VALUE (TEXT), OR or BETA, 95% CI (TEXT), CNV,
            # STUDY ACCESSION, GENOTYPING TECHNOLOGY
            pubmed_num = row[col.index('PUBMEDID')].strip()
            disease_or_trait = row[col.index('DISEASE/TRAIT')].strip()
            initial_sample_description = row[
                col.index('INITIAL SAMPLE SIZE')].strip()
            replicate_sample_description = row[
                col.index('REPLICATION SAMPLE SIZE')].strip()
            chrom_num = row[col.index('CHR_ID')].strip()
            chrom_pos = row[col.index('CHR_POS')].strip()
            mapped_gene = row[col.index('MAPPED_GENE')].strip()
            upstream_gene_num = row[col.index('UPSTREAM_GENE_ID')].strip()
            downstream_gene_num = row[
                col.index('DOWNSTREAM_GENE_ID')].strip()
            snp_gene_nums = row[col.index('SNP_GENE_IDS')].strip()
            strongest_snp_risk_allele = row[
                col.index('STRONGEST SNP-RISK ALLELE')].strip()
            merged = row[col.index('MERGED')].strip()
            snp_id_current = row[col.index('SNP_ID_CURRENT')].strip()
            context = row[col.index('CONTEXT')].strip()
            risk_allele_frequency = row[
                col.index('RISK ALLELE FREQUENCY')].strip()
            pvalue = row[col.index('P-VALUE')].strip()
            platform_with_snps_passing_qc = row[
                col.index('PLATFORM [SNPS PASSING QC]')].strip()
            mapped_trait = row[col.index('MAPPED_TRAIT')].strip()
            mapped_trait_uri = row[col.index('MAPPED_TRAIT_URI')].strip()

            if self.test_mode:
                continue

            # Example row:
            # 06-May-2015 25917933
            # Zai CC 20-Nov-2014 J Psychiatr Res http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals NA
            # 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
            # rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970

            # Only the part before the first space is kept.
            # note: that these will no longer pattern match other instances
            snp_id_current = snp_id_current.split(' ')[0]

            variant_curie, variant_type = self._get_curie_and_type_from_id(
                strongest_snp_risk_allele)

            if strongest_snp_risk_allele == '':
                LOG.debug(
                    "No strongest SNP risk allele for %s:\n%s",
                    pubmed_num, row)
                # still consider adding in the EFO terms
                # for what the study measured?
                continue

            # Curies starting with '_' get the allele text as their label.
            # The allele is always a string here, so the former
            # "is not None" test was redundant.
            if variant_curie is not None and variant_curie[0] == '_':
                self.graph.addTriple(
                    variant_curie, self.globaltt['label'],
                    strongest_snp_risk_allele, object_is_literal=True)

            if variant_type == 'snp':
                self._add_snp_to_graph(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency)
                self._add_deprecated_snp(
                    variant_curie, snp_id_current, merged, chrom_num,
                    chrom_pos)
                self._add_snp_gene_relation(
                    variant_curie, snp_gene_nums, upstream_gene_num,
                    downstream_gene_num)
            elif variant_type == 'haplotype':
                self._process_haplotype(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency, mapped_gene,
                    so_ontology)
            elif variant_type is None and snp_id_current != '':
                LOG.warning(
                    "There's a snp id we can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            description = self._make_description(
                disease_or_trait, initial_sample_description,
                replicate_sample_description,
                platform_with_snps_passing_qc, pvalue)

            self._add_variant_trait_association(
                variant_curie, mapped_trait_uri, mapped_trait, mondo_data,
                pubmed_num, description)

            if not self.test_mode and (
                    limit is not None and reader.line_num > limit):
                break

    # TODO loop through the location hash,
    # and make all snps at that location equivalent
    for loc in self.id_location_map:
        snp_ids = self.id_location_map[loc]
        if len(snp_ids) > 1:
            LOG.info("%s has >1 snp id: %s", loc, snp_ids)
def process_catalog(self, limit=None):
    """
    Read the tab-separated catalog file and add its content to the graph.

    The EFO and SO ontologies are loaded into separate graphs first. Each
    non-empty data row with as many columns as the header is then turned
    into SNP or haplotype triples plus a variant-trait association. Rows
    with a different column count, without a strongest SNP risk allele, or
    with an id that cannot be typed are logged and skipped.

    :param limit: if not None, processing stops once the csv reader's line
        number exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    LOG.info("Processing Data from %s", raw)

    efo_ontology = RDFGraph(False, "EFO")
    LOG.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(self.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    LOG.info("Finished loading EFO ontology")

    so_ontology = RDFGraph(False, "SO")
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        header = next(filereader, None)  # the header row
        # An empty file has no header; treat it as zero columns instead of
        # failing on len(None). The loop below then has nothing to read.
        header_len = len(header) if header is not None else 0
        LOG.info('header length:\t %i', header_len)

        for row in filereader:
            if not row:
                continue
            if header_len != len(row):
                # Log the column count (not the row itself, which %i cannot
                # format) and skip the row rather than letting the unpack
                # below raise ValueError.
                LOG.error(
                    'BadRow: %i has %i columns',
                    filereader.line_num, len(row))
                continue

            (date_added_to_catalog,
             pubmed_num,
             first_author,
             pub_date,
             journal,
             link,
             study_name,
             disease_or_trait,
             initial_sample_description,
             replicate_sample_description,
             region,
             chrom_num,
             chrom_pos,
             reported_gene_nums,
             mapped_gene,
             upstream_gene_num,
             downstream_gene_num,
             snp_gene_nums,
             upstream_gene_distance,
             downstream_gene_distance,
             strongest_snp_risk_allele,
             snps,
             merged,
             snp_id_current,
             context,
             intergenic_flag,
             risk_allele_frequency,
             pvalue,
             pvalue_mlog,
             pvalue_text,
             or_or_beta,
             confidence_interval_95,
             platform_with_snps_passing_qc,
             cnv_flag,
             mapped_trait,
             mapped_trait_uri,
             study_accession,
             genotyping_technology
             ) = row

            if self.test_mode:
                continue

            # Example row:
            # 06-May-2015 25917933
            # Zai CC 20-Nov-2014 J Psychiatr Res http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals NA
            # 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
            # rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970

            variant_curie, variant_type = self._get_curie_and_type_from_id(
                strongest_snp_risk_allele)

            if strongest_snp_risk_allele.strip() == '':
                LOG.debug(
                    "No strongest SNP risk allele for %s:\n%s",
                    pubmed_num, row)
                # still consider adding in the EFO terms
                # for what the study measured?
                continue

            if variant_type == 'snp':
                self._add_snp_to_graph(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency)
                self._add_deprecated_snp(
                    variant_curie, snp_id_current, merged, chrom_num,
                    chrom_pos)
                self._add_snp_gene_relation(
                    variant_curie, snp_gene_nums, upstream_gene_num,
                    downstream_gene_num)
            elif variant_type == 'haplotype':
                self._process_haplotype(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency, mapped_gene,
                    so_ontology)
            elif variant_type is None:
                LOG.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            description = self._make_description(
                disease_or_trait, initial_sample_description,
                replicate_sample_description,
                platform_with_snps_passing_qc, pvalue)

            self._add_variant_trait_association(
                variant_curie, mapped_trait_uri, efo_ontology,
                pubmed_num, description)

            if not self.test_mode and (
                    limit is not None and filereader.line_num > limit):
                break

    # TODO loop through the location hash,
    # and make all snps at that location equivalent
    for loc in self.id_location_map:
        snp_ids = self.id_location_map[loc]
        if len(snp_ids) > 1:
            LOG.info("%s has >1 snp id: %s", loc, snp_ids)
def process_catalog(self, limit=None):
    """
    Read the tab-separated catalog file and add its content to the graph.

    The EFO and SO ontologies are loaded into separate graphs first. Each
    non-empty data row with as many columns as the header is then turned
    into SNP or haplotype triples plus a variant-trait association. Rows
    with a different column count, without a strongest SNP risk allele, or
    with an id that cannot be typed are logged and skipped.

    :param limit: if not None, processing stops once the count of non-empty
        data rows exceeds this value
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    logger.info("Processing Data from %s", raw)

    efo_ontology = RDFGraph()
    logger.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(self.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    logger.info("Finished loading EFO ontology")

    so_ontology = RDFGraph()
    logger.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    logger.info("Finished loading SO ontology")

    line_counter = 0

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        header = next(filereader, None)  # the header row
        # An empty file has no header; treat it as zero columns instead of
        # failing on len(None). The loop below then has nothing to read.
        header_len = len(header) if header is not None else 0
        logger.info('header length:\t %i', header_len)

        for row in filereader:
            if not row:
                continue
            line_counter += 1
            if header_len != len(row):
                # Log the column count (not the row itself, which %i cannot
                # format) and actually skip the row; the former "pass" fell
                # through to the unpack below, which then raised ValueError.
                logger.error(
                    'BadRow: %i has %i columns', line_counter, len(row))
                continue

            (date_added_to_catalog,
             pubmed_num,
             first_author,
             pub_date,
             journal,
             link,
             study_name,
             disease_or_trait,
             initial_sample_description,
             replicate_sample_description,
             region,
             chrom_num,
             chrom_pos,
             reported_gene_nums,
             mapped_gene,
             upstream_gene_num,
             downstream_gene_num,
             snp_gene_nums,
             upstream_gene_distance,
             downstream_gene_distance,
             strongest_snp_risk_allele,
             snps,
             merged,
             snp_id_current,
             context,
             intergenic_flag,
             risk_allele_frequency,
             pvalue,
             pvalue_mlog,
             pvalue_text,
             or_or_beta,
             confidence_interval_95,
             platform_with_snps_passing_qc,
             cnv_flag,
             mapped_trait,
             mapped_trait_uri,
             study_accession) = row

            if self.testMode:
                continue

            # Example row:
            # 06-May-2015 25917933 Zai CC 20-Nov-2014 J Psychiatr Res http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals NA
            # 10p11.22 10 32704340 C10orf68, CCDC7, ITGB1 CCDC7
            # rs7079041-A rs7079041 0 7079041 intron 0 2E-6 5.698970

            variant_curie, variant_type = self._get_curie_and_type_from_id(
                strongest_snp_risk_allele)

            if strongest_snp_risk_allele.strip() == '':
                logger.debug(
                    "No strongest SNP risk allele for %s:\n%s",
                    pubmed_num, row)
                # still consider adding in the EFO terms
                # for what the study measured?
                continue

            if variant_type == 'snp':
                self._add_snp_to_graph(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency)
                self._add_deprecated_snp(
                    variant_curie, snp_id_current, merged, chrom_num,
                    chrom_pos)
                self._add_snp_gene_relation(
                    variant_curie, snp_gene_nums, upstream_gene_num,
                    downstream_gene_num)
            elif variant_type == 'haplotype':
                self._process_haplotype(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency, mapped_gene,
                    so_ontology)
            elif variant_type is None:
                logger.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            description = self._make_description(
                disease_or_trait, initial_sample_description,
                replicate_sample_description,
                platform_with_snps_passing_qc, pvalue)

            self._add_variant_trait_association(
                variant_curie, mapped_trait_uri, efo_ontology,
                pubmed_num, description)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

    # TODO loop through the location hash,
    # and make all snps at that location equivalent
    for loc in self.id_location_map:
        snp_ids = self.id_location_map[loc]
        if len(snp_ids) > 1:
            logger.info("%s has >1 snp id: %s", loc, snp_ids)
def test_snp_model(self):
    """
    Check the graph pattern written by _process_haplotype().

    The method under test is called as
    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency, mapped_gene,
        so_ontology)
    and the source graph is then queried with a SPARQL pattern that must
    bind ?snp exactly once.
    """
    source = self.source
    fixture = self.test_data

    haplotype_curie, _haplotype_type = source._get_curie_and_type_from_id(
        fixture['snp_label'])

    so_graph = RDFGraph()
    logger.info("Loading SO ontology in separate rdf graph")
    so_graph.parse(source.files['so']['url'], format='xml')
    so_graph.bind_all_namespaces()
    logger.info("Finished loading SO ontology")

    fixture_keys = (
        'snp_label', 'chrom_num', 'chrom_pos',
        'context', 'allele_freq', 'mapped_gene',
    )
    source._process_haplotype(
        haplotype_curie, *[fixture[key] for key in fixture_keys], so_graph)

    query = """
        SELECT ?snp
        WHERE {
            :haplotype_bcb627b1f64039b0 a OBO:GENO_0000871 ;
                rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
                OBO:GENO_0000382 ?snp,
                    dbSNP:rs3758171,
                    dbSNP:rs3824344,
                    dbSNP:rs7020413 ;
                OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 .

            ?snp a OBO:SO_0000694,
                    OBO:SO_0001627 ;
                rdfs:label "rs1329573-?" ;
                faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
                OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 .

            dbSNP:rs3758171 a OBO:SO_0000694,
                    OBO:SO_0001627 ;
                rdfs:label "rs3758171-?" ;
                faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
                OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 .

            dbSNP:rs3824344 a OBO:SO_0000694,
                    OBO:SO_0001627 ;
                rdfs:label "rs3824344-?" ;
                faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
                OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 .

            dbSNP:rs7020413 a OBO:SO_0000694,
                    OBO:SO_0001627 ;
                rdfs:label "rs7020413-?" ;
                faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
                OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
                OBO:RO_0002162 OBO:NCBITaxon_9606 .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
                faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
                faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
                faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
                faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
                faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
                faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
                faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
                faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
                faldo:position 36997420 ;
                faldo:reference OBO:CHR_GRCh38chr9 .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
                faldo:position 36998996 ;
                faldo:reference OBO:CHR_GRCh38chr9 .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
                faldo:position 37000690 ;
                faldo:reference OBO:CHR_GRCh38chr9 .

            <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
                faldo:position 37002118 ;
                faldo:reference OBO:CHR_GRCh38chr9 .
        }
    """
    rows = list(source.graph.query(query))

    # The query must pass and return exactly one row holding the SNP node
    snp_node = URIRef(source.graph._getNode("dbSNP:rs1329573"))
    self.assertEqual(rows, [(snp_node, )])
def test_snp_model(self):
    """
    Test output model of _process_haplotype()

    Starting from an empty source graph, the method under test is called as
    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency, mapped_gene,
        so_ontology)
    and the whole graph is compared with the expected triples.
    """
    # assertEqual reports the actual triple count when the graph is not
    # empty, which a bare assertTrue(... == 0) cannot do.
    self.assertEqual(len(list(self.source.graph)), 0)

    # Only the curie is needed here; the variant type is discarded.
    variant_curie, _ = self.source._get_curie_and_type_from_id(
        self.test_data['snp_label'])

    so_ontology = RDFGraph()
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.source.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    self.source._process_haplotype(
        variant_curie, self.test_data['snp_label'],
        self.test_data['chrom_num'], self.test_data['chrom_pos'],
        self.test_data['context'], self.test_data['allele_freq'],
        self.test_data['mapped_gene'], so_ontology)

    triples = """
        :haplotype_bb627b1f64039b0f751a a OBO:GENO_0000871 ;
            rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
            OBO:GENO_0000382 dbSNP:rs1329573,
                dbSNP:rs3758171,
                dbSNP:rs3824344,
                dbSNP:rs7020413 ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs1329573 a OBO:SO_0000694,
                SO:0001627 ;
            rdfs:label "rs1329573-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3758171 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3758171-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3824344 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3824344-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs7020413 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs7020413-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
            faldo:position 36997420 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
            faldo:position 36998996 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
            faldo:position 37000690 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
            faldo:position 37002118 ;
            faldo:reference OBO:CHR_GRCh38chr9 .
    """

    # NOTE(review): an earlier remark here said the comparison "does not
    # seem to acknowledge these constant triples" - confirm whether this
    # assertion is currently expected to pass.
    self.assertTrue(
        self.test_util.test_graph_equality(triples, self.source.graph))
def test_snp_model(self):
    """
    Check the triples produced by _process_haplotype() against a turtle reference.

    The source graph is required to be empty on entry.  The SNP label held in
    ``self.test_data`` is resolved to a curie, the SO ontology is parsed into
    a separate graph, and the method under test is invoked as

    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency,
        mapped_gene, so_ontology)

    The source graph is then compared with the expected triples through
    ``self.test_util.test_graph_equality``.
    """
    # assertEqual reports the actual triple count when the precondition
    # fails; assertTrue(... == 0) would only say "False is not true".
    self.assertEqual(len(list(self.source.graph)), 0)

    # Only the curie is used below; the second returned value is discarded.
    variant_curie, _ = self.source._get_curie_and_type_from_id(
        self.test_data['snp_label'])

    so_ontology = RDFGraph()
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.source.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    self.source._process_haplotype(
        variant_curie,
        self.test_data['snp_label'],
        self.test_data['chrom_num'],
        self.test_data['chrom_pos'],
        self.test_data['context'],
        self.test_data['allele_freq'],
        self.test_data['mapped_gene'],
        so_ontology)

    # Expected graph in turtle, without a prefix header (the comparison
    # helper is handed this string together with the graph under test).
    triples = """ :haplotype_bb627b1f64039b0f751a a SO:0001024 ; rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ; GENO:0000382 dbSNP:rs1329573, dbSNP:rs3758171, dbSNP:rs3824344, dbSNP:rs7020413 ; GENO:0000418 HGNC:8619 ; RO:0002162 NCBITaxon:9606 . dbSNP:rs1329573 a SO:0000694, SO:0001627 ; rdfs:label "rs1329573-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/b3fad5df82cdfb283329> ; GENO:0000418 HGNC:8619 ; RO:0002162 NCBITaxon:9606 . dbSNP:rs3758171 a SO:0000694, SO:0001627 ; rdfs:label "rs3758171-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/b25a2da36647bdd71be3> ; GENO:0000418 HGNC:8619 ; RO:0002162 NCBITaxon:9606 . dbSNP:rs3824344 a SO:0000694, OBO:SO_0001627 ; rdfs:label "rs3824344-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/b096a3e94e32fe23374a> ; GENO:0000418 HGNC:8619 ; RO:0002162 NCBITaxon:9606 . dbSNP:rs7020413 a SO:0000694, SO:0001627 ; rdfs:label "rs7020413-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/bbb252d9b6cd02e9880a> ; GENO:0000418 HGNC:8619 ; RO:0002162 NCBITaxon:9606 . 
<https://monarchinitiative.org/.well-known/genid/b25a2da36647bdd71be3> a faldo:Region ; rdfs:label "GRCh38chr9-36997420-36997420-Region"; faldo:begin <https://monarchinitiative.org/.well-known/genid/b21985847fe0774084eb> ; faldo:end <https://monarchinitiative.org/.well-known/genid/b21985847fe0774084eb> . <https://monarchinitiative.org/.well-known/genid/b3fad5df82cdfb283329> a faldo:Region ; rdfs:label "GRCh38chr9-36998996-36998996-Region"; faldo:begin <https://monarchinitiative.org/.well-known/genid/b55051762f8d5a3dbeb5> ; faldo:end <https://monarchinitiative.org/.well-known/genid/b55051762f8d5a3dbeb5> . <https://monarchinitiative.org/.well-known/genid/b096a3e94e32fe23374a> a faldo:Region ; rdfs:label "GRCh38chr9-37000690-37000690-Region"; faldo:begin <https://monarchinitiative.org/.well-known/genid/b5d61dbc7958a979d046> ; faldo:end <https://monarchinitiative.org/.well-known/genid/b5d61dbc7958a979d046> . <https://monarchinitiative.org/.well-known/genid/bbb252d9b6cd02e9880a> a faldo:Region ; rdfs:label "GRCh38chr9-37002118-37002118-Region"; faldo:begin <https://monarchinitiative.org/.well-known/genid/bb870c3d7606a3e0fc3c> ; faldo:end <https://monarchinitiative.org/.well-known/genid/bb870c3d7606a3e0fc3c> . <https://monarchinitiative.org/.well-known/genid/b21985847fe0774084eb> a faldo:Position ; rdfs:label "GRCh38chr9-36997420"; faldo:position 36997420 ; faldo:reference OBO:CHR_GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/b55051762f8d5a3dbeb5> a faldo:Position ; rdfs:label "GRCh38chr9-36998996"; faldo:position 36998996 ; faldo:reference CHR:GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/b5d61dbc7958a979d046> a faldo:Position ; rdfs:label "GRCh38chr9-37000690"; faldo:position 37000690 ; faldo:reference CHR:GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/bb870c3d7606a3e0fc3c> a faldo:Position ; rdfs:label "GRCh38chr9-37002118"; faldo:position 37002118 ; faldo:reference CHR:GRCh38chr9 . 
"""

    # dbg
    LOG.debug(
        "Reference graph: %s",
        self.source.graph.serialize(format="turtle").decode("utf-8"))

    self.assertTrue(
        self.test_util.test_graph_equality(triples, self.source.graph))
def test_snp_model(self):
    """
    Check the graph written by _process_haplotype() with a SPARQL query.

    The method under test is called as

    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency,
        mapped_gene, so_ontology)

    using the fixture values in ``self.test_data`` and a freshly parsed SO
    ontology graph.  The query selects ``?snp`` and the test passes when
    the result is exactly one row holding the node for dbSNP:rs1329573.
    """
    data = self.test_data

    (variant_curie, variant_type) = self.source._get_curie_and_type_from_id(
        data['snp_label'])

    # The SO ontology lives in its own graph, separate from the source graph.
    so_ontology = RDFGraph()
    logger.info("Loading SO ontology in separate rdf graph")
    so_location = self.source.files['so']['url']
    so_ontology.parse(so_location, format='xml')
    so_ontology.bind_all_namespaces()
    logger.info("Finished loading SO ontology")

    self.source._process_haplotype(
        variant_curie,
        data['snp_label'],
        data['chrom_num'],
        data['chrom_pos'],
        data['context'],
        data['allele_freq'],
        data['mapped_gene'],
        so_ontology,
    )

    query_text = """ SELECT ?snp WHERE { :haplotype_bcb627b1f64039b0 a OBO:GENO_0000871 ; rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ; OBO:GENO_0000382 ?snp, dbSNP:rs3758171, dbSNP:rs3824344, dbSNP:rs7020413 ; OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ; OBO:RO_0002162 OBO:NCBITaxon_9606 . ?snp a OBO:SO_0000694, OBO:SO_0001627 ; rdfs:label "rs1329573-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ; OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ; OBO:RO_0002162 OBO:NCBITaxon_9606 . dbSNP:rs3758171 a OBO:SO_0000694, OBO:SO_0001627 ; rdfs:label "rs3758171-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ; OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ; OBO:RO_0002162 OBO:NCBITaxon_9606 . dbSNP:rs3824344 a OBO:SO_0000694, OBO:SO_0001627 ; rdfs:label "rs3824344-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ; OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ; OBO:RO_0002162 OBO:NCBITaxon_9606 . 
dbSNP:rs7020413 a OBO:SO_0000694, OBO:SO_0001627 ; rdfs:label "rs7020413-?" ; faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ; OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ; OBO:RO_0002162 OBO:NCBITaxon_9606 . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ; faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ; faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ; faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ; faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ; faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ; faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ; faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ; faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ; faldo:position 36997420 ; faldo:reference OBO:CHR_GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ; faldo:position 36998996 ; faldo:reference OBO:CHR_GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ; faldo:position 37000690 ; faldo:reference OBO:CHR_GRCh38chr9 . <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ; faldo:position 37002118 ; faldo:reference OBO:CHR_GRCh38chr9 . 
} """

    # The query has to run cleanly and bind ?snp exactly once.
    matched_rows = list(self.source.graph.query(query_text))
    wanted_snp = URIRef(self.source.graph._getNode("dbSNP:rs1329573"))
    self.assertEqual(matched_rows, [(wanted_snp,)])