def test_graph_equality(self, turtlish, graph):
    """
    Compare a turtle snippet against a graph, triple for triple.

    A scratch RDFGraph is created, bind_all_namespaces() is called on it,
    and an ``@prefix`` line is generated for every namespace its
    namespace manager reports.  That header plus ``turtlish`` is parsed
    into the scratch graph and the two sets of triples are compared.
    When they differ, both set differences are logged as a warning.

    :param turtlish: String of triples in turtle format without prefix header
    :param graph: Graph object to test against
    :return: Boolean, True if graphs contain same set of triples
    """
    turtle_graph = RDFGraph()
    turtle_graph.bind_all_namespaces()
    prefixes = "\n".join(
        "@prefix {}: <{}> .".format(n[0], n[1])
        for n in turtle_graph.namespace_manager.namespaces()
    )
    # Separate header and body explicitly so the last @prefix directive is
    # never glued to the first token of a snippet that lacks a leading
    # newline.
    turtle_string = prefixes + "\n" + turtlish
    mock_file = io.StringIO(turtle_string)
    turtle_graph.parse(mock_file, format="turtle")

    # Iterating a graph yields its triples; no intermediate list is needed.
    turtle_triples = set(turtle_graph)
    ref_triples = set(graph)
    equality = turtle_triples == ref_triples
    if not equality:
        logger.warning(
            "Triples do not match\n"
            "Left hand difference: %s\n"
            "Right hand difference:%s",
            turtle_triples - ref_triples,
            ref_triples - turtle_triples)
    return equality
def test_snp_trait_association(self):
    """
    Check the triples written by _add_variant_trait_association().

    The EFO ontology is parsed (as RDF/XML) from
    self.source.files['efo']['url'] into its own graph, the association
    is added to the source graph, and a SPARQL SELECT over the source
    graph is expected to return exactly one row: dbSNP:rs1491921.
    :return:
    """
    data = self.test_data

    efo_ontology = RDFGraph()
    logger.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(self.source.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    logger.info("Finished loading EFO ontology")

    variant_curie, variant_type = self.source._get_curie_and_type_from_id(
        data['snp_label'])

    description = self.source._make_description(
        data['trait'],
        data['init_sample_desc'],
        data['replicated_sample_desc'],
        data['platform'],
        data['pvalue'])

    self.source._add_variant_trait_association(
        variant_curie, data['trait_uri'], efo_ontology,
        data['pubmed'], description)

    query_template = """
        SELECT ?snp
        WHERE {{
            <https://monarchinitiative.org/MONARCH_b46cdf48950cb00d> a OBAN:association ;
                dc:description "{}" ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0003949 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            <https://monarchinitiative.org/MONARCH_70a05d8eb1c3d4b0> a OBAN:association ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0006995 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            EFO:0003949 a owl:Class ;
                rdfs:label "eye color"^^xsd:string ;
                rdfs:subClassOf UPHENO:0001001 .

            ?snp OBO:RO_0002326 EFO:0003949, EFO:0006995 .

            PMID:25918132 a OBO:IAO_0000013 .
        }}
    """
    sparql_query = query_template.format(description)
    sparql_output = self.source.graph.query(sparql_query)

    # The query must succeed and bind ?snp exactly once.
    rows = list(sparql_output)
    snp_node = URIRef(self.source.graph._getNode("dbSNP:rs1491921"))
    self.assertEqual(rows, [(snp_node,)])
def __init__(self, identifier, title, url, description=None,
             license_url=None, data_rights=None, graph_type=None,
             file_handle=None):
    """
    Create the dataset graph and write its basic metadata triples.

    :param identifier: dataset id; stored as ':' + identifier and also
        written as the dct:identifier literal
    :param title: literal written as dct:title
    :param url: object written as foaf:page
    :param description: optional text handed to model.addDescription()
    :param license_url: optional object written as dct:license
    :param data_rights: optional literal written as dct:rights
    :param graph_type: None (RDFGraph(None, identifier)),
        'streamed_graph' (StreamedGraph) or 'rdf_graph' (RDFGraph())
    :param file_handle: passed to StreamedGraph for 'streamed_graph'
    """
    if graph_type is None:
        self.graph = RDFGraph(None, identifier)
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(True, file_handle=file_handle)
    elif graph_type == 'rdf_graph':
        self.graph = RDFGraph()

    self.model = Model(self.graph)
    dataset_curie = ':' + identifier
    self.identifier = dataset_curie
    self.version = None
    self.date_issued = None

    # The date_accessed value is later used as a literal of properties
    # such as dct:issued, which needs to conform to xsd:dateTime format.
    # TODO ... we need to have a talk about typed literals and SPARQL
    self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
    self.citation = set()
    self.license = license_url

    self.model.addType(dataset_curie, 'dctypes:Dataset')
    self.graph.addTriple(dataset_curie, 'dct:title', title, True)
    self.graph.addTriple(
        dataset_curie, 'dct:identifier', identifier, object_is_literal=True)
    self.graph.addTriple(dataset_curie, 'foaf:page', url)
    # maybe in the future add the logo here:
    # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

    # TODO add the licence info
    # FIXME:Temporarily making this in IF statement,
    # can revert after all current resources are updated.
    if license_url is None:
        logger.debug('No license provided.')
    else:
        self.graph.addTriple(dataset_curie, 'dct:license', license_url)

    if data_rights is None:
        logger.debug('No rights provided.')
    else:
        self.graph.addTriple(
            dataset_curie, 'dct:rights', data_rights, object_is_literal=True)

    if description is not None:
        self.model.addDescription(dataset_curie, description)
def test_parse(self):
    """
    Run self.source.parse() on a fresh graph and compare the resulting
    triples with the expected turtle below.
    """
    self.source.graph = RDFGraph(True)  # Reset graph
    graph_is_empty = len(list(self.source.graph)) == 0
    self.assertTrue(graph_is_empty)

    self.source.parse()

    expected_turtle = """
        UNII:46U771ERWK RO:0002606 SNOMED:386761002 ;
            rdfs:subClassOf CHEBI:23367 .
        SNOMED:386761002 rdfs:label "Local anesthesia" ;
            rdfs:subClassOf MONDO:0000001 .
        UNII:46U771ERWK biolink:category biolink:ChemicalSubstance .
        SNOMED:386761002 biolink:category biolink:Disease .
    """

    # dbg
    serialized = self.source.graph.serialize(format="turtle").decode("utf-8")
    logger.debug("Reference graph: %s", serialized)

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, self.source.graph)
    self.assertTrue(graphs_match)
def testSGDParser(self):
    """
    Feed self.test_set_1 to SGD.make_association() and require the graph
    to hold exactly the triples listed below.
    """
    sgd = SGD('rdf_graph', True)
    sgd.graph = RDFGraph(True)

    record = self.test_set_1
    sgd.make_association(record)
    description = sgd._make_description(record)

    turtle_template = """
        :MONARCH_ba748c98c0f167739128 a OBAN:association ;
            OBO:RO_0002558 OBO:APO_0000020 ;
            dc:description "{0}";
            dc:source PMID:21715656 ;
            OBAN:association_has_object MONARCH:APO_0000309APO_0000245 ;
            OBAN:association_has_predicate OBO:RO_0002200 ;
            OBAN:association_has_subject SGD:S000007268 .

        SGD:S000007268 rdfs:label "ATP6" ;
            RO:0002200 MONARCH:APO_0000309APO_0000245 .

        APO:0000020 rdfs:label "classical genetics" .

        PMID:21715656 a OBO:IAO_0000311 ;
            owl:sameAs SGD_REF:S000145858 .

        MONARCH:APO_0000309APO_0000245 rdfs:label "respiratory growth:decreased rate" ;
            rdfs:subClassOf UPHENO:0001001 .
    """
    triples = turtle_template.format(description)

    # test exact contents of graph
    graphs_match = self.test_util.test_graph_equality(triples, sgd.graph)
    self.assertTrue(graphs_match)
def test_patient_phenotype_model(self):
    """
    Functional test for _parse_patient_phenotypes(): two mocked
    tab-separated input lines are parsed and the graph is compared with
    the expected turtle.
    """
    udp = UDP('rdf_graph', True)
    udp.graph = RDFGraph(True)

    # the graph must start out empty
    starts_empty = len(list(udp.graph)) == 0
    self.assertTrue(starts_empty)

    mock_data = MagicMock()
    mock_data.__iter__.return_value = iter([
        'patient_1\tHP:000001\tyes',
        'patient_1\tHP:000002\tno',
    ])
    mock_file = mock_open(mock=mock_data)

    udp._parse_patient_phenotypes(mock_file)

    expected_turtle = """
        :patient_1 a foaf:Person ;
            rdfs:label "patient_1" ;
            RO:0002200 DOID:4, HP:000001 .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, udp.graph)
    self.assertTrue(graphs_match)
def test_evidence_model(self):
    """
    Functional test for _add_evidence(): items 23..25 of self.test_set_1
    are used as p-value, percentage change and effect size.
    """
    impc = IMPC('rdf_graph', True)
    impc.graph = RDFGraph(True)  # Reset graph

    # The graph must start out empty
    starts_empty = len(list(impc.graph)) == 0
    self.assertTrue(starts_empty)

    p_value, percentage_change, effect_size = self.test_set_1[23:26]

    impc._add_evidence(
        self.assoc_curie, self.eco_id, p_value, percentage_change,
        effect_size, self.study_curie)

    expected_turtle = """
        :MONARCH_test_association SEPIO:0000007 <https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> .

        <https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> a ECO:0000015 ;
            SEPIO:0000084 <https://monarchinitiative.org/.well-known/genid/b89ee584330837c9>,
                <https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> ;
            SEPIO:0000085 <https://monarchinitiative.org/.well-known/genid/study> .

        <https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> a OBI:0000175 ;
            RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
            STATO:0000129 1.637023e-10 .

        <https://monarchinitiative.org/.well-known/genid/b89ee584330837c9> a STATO:0000085 ;
            RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
            STATO:0000129 "8.885439E-007" .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, impc.graph)
    self.assertTrue(graphs_match)
def test_gene_to_disease(self):
    """
    Process 'orph-no-variant.xml' with _process_diseasegene() and compare
    the resulting graph with the expected turtle.
    """
    orphanet = self.orphanet
    orphanet.graph = RDFGraph()  # Reset graph
    orphanet.files['disease-gene']['file'] = 'orph-no-variant.xml'

    orphanet._process_diseasegene(limit=None)

    serialized = orphanet.graph.serialize(format="turtle").decode("utf-8")
    logger.debug("Reference graph: %s", serialized)

    expected_triples = """
        MONARCH:b64684a0ea6ae59fdb09 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object Orphanet:938475 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject Orphanet:268061 .

        Orphanet:268061 a owl:Class ;
            rdfs:label "KS1" ;
            RO:0003304 Orphanet:938475 ;
            dc:description "kinesin family member 7" ;
            oboInOwl:hasExactSynonym "KAS1" ;
            rdfs:subClassOf SO:0001217 .

        Orphanet:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_triples, orphanet.graph)
    self.assertTrue(graphs_match)
def test_assertion_model(self):
    """
    Functional test for _add_assertion_provenance()
    """
    impc = IMPC('rdf_graph', True)
    impc.graph = RDFGraph(True)

    starts_empty = len(list(impc.graph)) == 0
    self.assertTrue(starts_empty)

    impc._add_assertion_provenance(self.assoc_curie, self.evidence_curie)

    expected_turtle = """
        MONARCH:test_association SEPIO:0000015 <https://monarchinitiative.org/.well-known/genid/bff92df374a88496> .

        <https://monarchinitiative.org/.well-known/genid/bff92df374a88496> a SEPIO:0000001 ;
            SEPIO:0000018 <https://www.mousephenotype.org/> ;
            SEPIO:0000111 <https://monarchinitiative.org/.well-known/genid/evidence> .

        <https://www.mousephenotype.org/> a foaf:organization ;
            rdfs:label "International Mouse Phenotyping Consortium" .
    """

    # dbg
    serialized = impc.graph.serialize(format="turtle").decode("utf-8")
    logger.info("Assertion graph:\n %s\n", serialized)

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, impc.graph)
    self.assertTrue(graphs_match)
def setUp(self):
    """
    Build a GWASCatalog source with a fresh graph (all namespaces bound)
    and a single-SNP record in self.test_data.
    """
    self.test_util = TestUtils()

    source = GWASCatalog('rdf_graph', True)
    source.graph = RDFGraph(True)  # Reset graph
    source.graph.bind_all_namespaces()
    self.source = source

    self.test_data = {
        'snp_label': 'rs1491921-C',
        'chrom_num': '5',
        'chrom_pos': '21259029',
        'context': 'intergenic_variant',
        'allele_freq': '0.013',
        'trait': 'Diisocyanate-induced asthma',
        'trait_uri': 'http://www.ebi.ac.uk/efo/EFO_0006995, http://www.ebi.ac.uk/efo/EFO_0003949',
        'pvalue': '0.0000007',
        'merged': '0',
        'snp_id_current': '1491921',
        'mapped_gene': 'LOC102723561 - GUSBP1',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc': '74 European ancestry cases, 824 European ancestry controls',
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [1556551]',
        'pubmed': '25918132',
    }
def testFakeDataSet1(self):
    """
    Process self.test_set_1 with StringDB._process_protein_links() and
    expect a single gene-to-gene interaction triple.

    The protein-to-gene map is fetched through
    Ensembl.fetch_protein_gene_map('9606'); every value is then replaced,
    in place, by a one-element list holding the 'ENSEMBL:'-prefixed id.
    """
    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph(True)
    self.assertEqual(len(string_db.graph), 0)

    ensembl = Ensembl('rdf_graph', True)
    prot_map = ensembl.fetch_protein_gene_map('9606')

    # Rewrite the values with a plain loop: a comprehension run only for
    # its side effects would build a throwaway list of None.  Only
    # existing keys are reassigned, so the dict size does not change
    # while it is being iterated.
    for protein, gene in prot_map.items():
        prot_map[protein] = ['ENSEMBL:' + gene]

    print("Finished fetching ENSP IDs, fetched {} proteins".format(len(prot_map)))

    dataframe = pd.DataFrame(data=self.test_set_1, columns=self.columns)

    string_db._process_protein_links(dataframe, prot_map, '9606')

    # g1 <interacts with> g2
    triples = """
        ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
    """

    self.assertTrue(self.test_util.test_graph_equality(triples, string_db.graph))
def readGraphFromTurtleFile(self, f):
    """
    Simple parsing test: load the given turtle file into a new RDFGraph
    and assert that the graph's length is greater than zero.

    :param f: turtle file to parse
    :return:
    """
    import os

    parsed_graph = RDFGraph()
    abs_path = os.path.abspath(f)
    logger.info("Testing reading turtle file from %s", abs_path)

    parsed_graph.parse(f, format="turtle")
    logger.info('Found %s graph nodes in %s', len(parsed_graph), abs_path)

    failure_msg = "No nodes found in " + abs_path
    self.assertTrue(len(parsed_graph) > 0, failure_msg)
def testEnsemblReactomeParser(self):
    """
    Add one gene-to-pathway association built from self.test_set_1 via
    _add_component_pathway_association() and compare the graph with the
    expected turtle.
    """
    reactome = Reactome('rdf_graph', True)
    reactome.graph = RDFGraph(True)

    starts_empty = len(list(reactome.graph)) == 0
    self.assertTrue(starts_empty)

    eco_map = Reactome.get_eco_map(Reactome.map_files['eco_map'])

    # pathway_iri and species_name are part of the record but are not
    # passed on to the method under test.
    (gene, pathway_id, pathway_iri, pathway_label,
     go_ecode, species_name) = self.test_set_1

    reactome._add_component_pathway_association(
        eco_map, gene, 'ENSEMBL', pathway_id, 'REACT',
        pathway_label, go_ecode)

    expected_turtle = """
        ENSEMBL:ENSBTAP00000013354 RO:0002331 REACT:R-BTA-3000480 .

        :MONARCH_b582c188b7ec20016206 a OBAN:association ;
            OBO:RO_0002558 ECO:0000501 ;
            OBAN:association_has_object REACT:R-BTA-3000480 ;
            OBAN:association_has_predicate RO:0002331 ;
            OBAN:association_has_subject ENSEMBL:ENSBTAP00000013354 .

        REACT:R-BTA-3000480 a owl:Class ;
            rdfs:label "Scavenging by Class A Receptors" ;
            rdfs:subClassOf GO:0009987,
                PW:0000001 .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, reactome.graph)
    self.assertTrue(graphs_match)
def test_gene_to_disease(self):
    """
    Process 'orph-no-variant.xml' with _process_diseasegene() and compare
    the resulting graph with the expected turtle.
    """
    src = self.orphanet
    src.graph = RDFGraph()  # Reset graph
    src.files['disease-gene']['file'] = 'orph-no-variant.xml'

    src._process_diseasegene(limit=None)

    as_turtle = src.graph.serialize(format="turtle").decode("utf-8")
    LOG.debug("Reference graph: %s", as_turtle)

    expected_triples = """
        MONARCH:bdbeb077e365ddedda20 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object ORPHA:938475 ;
            OBAN:association_has_predicate RO:0004015 ;
            OBAN:association_has_subject ORPHA:268061 .

        ORPHA:268061 RO:0004015 ORPHA:938475 ;
            oboInOwl:hasExactSynonym "KAS1" .

        ORPHA:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    same = self.test_util.test_graph_equality(expected_triples, src.graph)
    self.assertTrue(same)
def test_germline_lof_variant_to_disease(self):
    """
    Process 'orph-germline-lof.xml' with _process_diseasegene() and
    compare the resulting graph with the expected turtle.
    """
    self.orphanet.graph = RDFGraph()  # Reset graph
    self.orphanet.files['disease-gene']['file'] = 'orph-germline-lof.xml'

    self.orphanet._process_diseasegene(limit=None)

    logger.debug(
        "Reference graph: %s",
        self.orphanet.graph.serialize(format="turtle").decode("utf-8"))

    # The association's first statement needs the `a` (rdf:type)
    # predicate.  Without it `OBAN:association` sits in predicate
    # position with no object and the snippet is not valid turtle.
    expected_triples = """
        MONARCH:b53dada0eb229a75e705 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object Orphanet:938475 ;
            OBAN:association_has_predicate RO:0003303 ;
            OBAN:association_has_subject <https://monarchinitiative.org/.well-known/genid/ba0884fb61004110> .

        Orphanet:268061 a owl:Class ;
            rdfs:label "KS1" ;
            dc:description "kinesin family member 7" ;
            oboInOwl:hasExactSynonym "KAS1" ;
            rdfs:subClassOf SO:0001217 .

        <https://monarchinitiative.org/.well-known/genid/ba0884fb61004110> a GENO:0000002 ;
            rdfs:label "germline loss of function variant of KS1" ;
            GENO:0000418 Orphanet:268061 ;
            RO:0003303 Orphanet:938475 ;
            :MONARCH_anonymous true ;
            :has_cell_origin GENO:0000900 ;
            :has_functional_consequence SO:0002054 .

        Orphanet:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    self.assertTrue(
        self.test_util.test_graph_equality(expected_triples, self.orphanet.graph))
def test_germline_variant_to_disease(self):
    """
    Process 'orph-germline.xml' with _process_diseasegene() and compare
    the resulting graph with the expected turtle.
    """
    src = self.orphanet
    src.graph = RDFGraph()  # Reset graph
    src.files['disease-gene']['file'] = 'orph-germline.xml'

    src._process_diseasegene(limit=None)

    as_turtle = src.graph.serialize(format="turtle").decode("utf-8")
    LOG.debug("Reference graph: %s", as_turtle)

    expected_triples = """
        MONARCH:ba2ac5d2153c70e2bb98 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object ORPHA:938475 ;
            OBAN:association_has_predicate RO:0004013 ;
            OBAN:association_has_subject HGNC:30497 .

        ENSEMBL:ENSG00000166813 a owl:Class .

        HGNC:30497 a owl:Class ;
            RO:0004013 ORPHA:938475 ;
            oboInOwl:hasExactSynonym "KAS1" ;
            owl:equivalentClass ENSEMBL:ENSG00000166813,
                ORPHA:268061 .

        ORPHA:268061 a owl:Class .

        ORPHA:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    same = self.test_util.test_graph_equality(expected_triples, src.graph)
    self.assertTrue(same)
def setUp(self):
    """
    Build a GWASCatalog source with a fresh graph and a multi-SNP
    (haplotype) record in self.test_data.
    """
    self.test_util = TestUtils()

    source = GWASCatalog('rdf_graph', True)
    source.graph = RDFGraph(True)
    self.source = source

    self.test_data = {
        'snp_label': 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?',
        'chrom_num': '9;9;9;9',
        'chrom_pos': '36998996;37002118;37000690;36997420',
        'context': 'intron_variant; intron_variant; intron_variant; intron_variant',
        'allele_freq': 'NR',
        'trait': 'Intelligence',
        'trait_uri': 'http://www.ebi.ac.uk/efo/EFO_0004337',
        'pvalue': '0.00000004',
        'merged': '0',
        'snp_id_current': '',
        'mapped_gene': 'PAX5; PAX5; PAX5; PAX5',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc': '656 European ancestry individuals from ADHD families',
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [795637]',
        'pubmed': '22449649',
    }
def readGraphFromTurtleFile(self, f):
    """
    Parse the turtle file ``f`` into a new RDFGraph and assert that the
    graph is not empty.  A simple parsing test.

    :param f: turtle file to parse
    :return:
    """
    import os

    turtle_graph = RDFGraph()
    location = os.path.abspath(f)

    logger.info("Testing reading turtle file from %s", location)
    turtle_graph.parse(f, format="turtle")
    logger.info('Found %s graph nodes in %s', len(turtle_graph), location)

    has_content = len(turtle_graph) > 0
    self.assertTrue(has_content, "No nodes found in " + location)
class GeneralGraphTestCase(unittest.TestCase):
    """
    Base test case offering a fresh RDFGraph, the curie map, and helper
    checks for turtle files.
    """

    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()

    def tearDown(self):
        self.graph = None

    def test_curieprefixes(self):
        """
        This will ensure that we can create identifiers for all of the
        defined curie prefixes using the graph's _getNode() method
        :return:
        """
        # build one test id per curie prefix and resolve it to a node
        for prefix in self.curie_map:
            testid = prefix + ':testme'
            node = self.graph._getNode(testid)
            msg = "prefix \"" + prefix + "\" has an error...can't create graph node"
            self.assertTrue(node is not None, msg)

    def readGraphFromTurtleFile(self, f):
        """
        This will read the specified file into a graph.
        A simple parsing test.
        :param f: turtle file to parse
        :return:
        """
        import os
        vg = RDFGraph()
        p = os.path.abspath(f)
        logger.info("Testing reading turtle file from %s", p)
        vg.parse(f, format="turtle")
        logger.info('Found %s graph nodes in %s', len(vg), p)
        self.assertTrue(len(vg) > 0, "No nodes found in " + p)

    def readGraphIntoOWL(self, f):
        """
        test if the ttl can be parsed by owlparser
        this expects owltools to be accessible from commandline
        :param f: file of ttl
        :return:
        """
        import subprocess

        # subprocess.call returns the exit status instead of raising on a
        # non-zero code (as check_call does), so a failing owltools run
        # is actually logged and then reported through the assertion.
        status = subprocess.call(["owltools", f], stderr=subprocess.STDOUT)
        # returns zero is success!
        if status != 0:
            logger.error(
                'finished verifying with owltools with status %s', status)
        self.assertTrue(status == 0)
def test_provenance_model(self):
    """
    Functional test for _add_study_provenance(); its arguments are taken
    from slices [2:4], [12:19] and [26:28] of self.test_set_1.
    """
    impc = IMPC('rdf_graph', True)
    impc.graph = RDFGraph(True)

    starts_empty = len(list(impc.graph)) == 0
    self.assertTrue(starts_empty)

    record = self.test_set_1
    phenotyping_center, colony = record[2:4]
    (project_fullname, pipeline_name, pipeline_stable_id,
     procedure_stable_id, procedure_name, parameter_stable_id,
     parameter_name) = record[12:19]
    statistical_method, resource_name = record[26:28]

    impc._add_study_provenance(
        phenotyping_center, colony, project_fullname, pipeline_name,
        pipeline_stable_id, procedure_stable_id, procedure_name,
        parameter_stable_id, parameter_name, statistical_method,
        resource_name)

    # dbg
    LOG.info(
        "Provenance graph as turtle:\n%s\n",
        impc.graph.serialize(format="turtle").decode("utf-8"))

    expected_turtle = """
<https://monarchinitiative.org/.well-known/genid/bdd05a8ca155ddaf415e> a OBI:0000471 ;
    BFO:0000051 OBO:STATO_0000076,
        <https://www.mousephenotype.org/impress/protocol/175/15> ;
    BFO:0000050 IMPRESS-procedure:15 ,
        <http://www.sanger.ac.uk/science/data/mouse-genomes-project> ;
    SEPIO:0000114 <https://www.mousephenotype.org/impress/parameterontologies/1867/91> ;
    SEPIO:0000017 <http://www.sanger.ac.uk/> .

<https://monarchinitiative.org/.well-known/genid/b0b26361b8687b5ad9ef> a owl:NamedIndividual ;
    rdfs:label "MEFW" .

<http://www.sanger.ac.uk/> a foaf:organization ;
    rdfs:label "WTSI" .

<http://www.sanger.ac.uk/science/data/mouse-genomes-project> a VIVO:Project ;
    rdfs:label "Wellcome Trust Sanger Institute Mouse Genetics Project" .

<https://www.mousephenotype.org/impress/parameterontologies/1867/91> a owl:NamedIndividual ;
    rdfs:label "Number of ribs right (X-ray)" .

IMPRESS-procedure:15 a owl:NamedIndividual ;
    rdfs:label "MGP Select Pipeline" .

<https://www.mousephenotype.org/impress/protocol/175/15> a owl:NamedIndividual ;
    rdfs:label "X-ray" .
"""

    # dbg
    LOG.debug(
        "Reference graph: %s",
        impc.graph.serialize(format="turtle").decode("utf-8"))

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, impc.graph)
    self.assertTrue(graphs_match)
def test_parse(self):
    """
    For every record id in RCVS, run clinvar_parse() with a mocked
    command line, load the n-triples it wrote into a graph, and compare
    that graph with the reference turtle file for the record.

    A dot rendering of each parsed graph is also written under DOT_PATH.
    """
    for rcv in RCVS:
        output_nt = rcv + '.nt'
        input_xml = rcv + '.xml.gz'
        reference_ttl = TTL_PATH + rcv + '.ttl'
        with self.subTest(rcv=rcv):

            mock_args = [
                "test_clinvar.py",
                "--inputdir", XML_PATH,
                "--filename", input_xml,
                "--mapfile", MAP_FILE,
                "--destination", NT_PATH,
                "--output", output_nt
            ]

            # Use the patch as a context manager so sys.argv is restored
            # as soon as the parser returns (or raises).  A bare
            # patch(...).start() that is never stopped leaks the mocked
            # argv into the following subtests and tests.
            with patch('sys.argv', mock_args):
                clinvar_parse()

            query_graph = RDFGraph()
            query_graph.bind_all_namespaces()
            query_graph.parse(NT_PATH + output_nt, format='nt')

            with open(reference_ttl, 'r') as ref_fh:
                ref_graph = "\n".join(ref_fh.readlines())

            # debug
            LOG.debug(
                "Reference graph: %s",
                query_graph.serialize(format="turtle").decode("utf-8"))

            # Convert output from ClinVar parse to dot then png
            dot_file_path = DOT_PATH + rcv + ".dot"
            with open(dot_file_path, 'w') as dot_file:
                rdf2dot(query_graph, dot_file)

            self.assertTrue(
                TestUtils.test_graph_equality(ref_graph, query_graph))
def setUp(self):
    """
    Build a CTD source with a fresh graph and one input row in
    self.test_row.
    """
    self.test_util = TestUtils()

    source = CTD('rdf_graph', True)
    source.graph = RDFGraph(True)
    self.source = source

    self.test_row = [
        'Nicotine',
        'D009538',
        '',
        'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
        'OMIM:188890',
        'therapeutic',
        '',
        '',
        '',
        '12345|56789',
    ]
def test_variant_model(self):
    """
    Functional test for _parse_patient_variants(): one mocked
    tab-separated input line is parsed and the graph is compared with the
    expected turtle.
    """
    udp = UDP('rdf_graph', True)
    udp.graph = RDFGraph(True)

    # the graph must start out empty
    starts_empty = len(list(udp.graph)) == 0
    self.assertTrue(starts_empty)

    fields = [
        'patient_1', 'family_1', '1', 'HG19', '155230432', 'G', 'A',
        'Maternal', 'Biallelic', 'Non-synonymous;DOWNSTREAM', 'CLK2',
        '', '', '', '', '', '', '',
        'Compound heterozygous', 'Heterozygous', '', '0.002747253', '',
    ]
    input_line = "\t".join(fields)

    mock_data = MagicMock()
    mock_data.__iter__.return_value = iter([input_line])
    mock_file = mock_open(mock=mock_data)

    udp._parse_patient_variants(mock_file)

    expected_turtle = """
        :patient_1 GENO:0000222 <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a> .

        <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f> a SO:0001059 ;
            rdfs:label "hg19chr1(CLK2):g.155230432G>A" ;
            GENO:0000418 HGNC:2069 ;
            RO:0002162 NCBITaxon:9606 ;
            owl:sameAs dbSNP:rs11557757 .

        <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a> a GENO:0000000 ;
            rdfs:label "patient_1 genotype" ;
            GENO:0000382 <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f> .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, udp.graph)
    self.assertTrue(graphs_match)
def test_unmapped_disease_assoc_type(self):
    """
    Processing 'orph-no-mapping.xml', whose gene-disease type is not
    mapped in translationtable/orphanet.yaml, must raise a ValueError.
    """
    orphanet = self.orphanet
    orphanet.graph = RDFGraph()  # Reset graph
    orphanet.files['disease-gene']['file'] = 'orph-no-mapping.xml'

    with self.assertRaises(ValueError):
        orphanet._process_diseasegene(limit=None)
def test_graph_equality(turtlish, graph):
    """
    Compare a turtle snippet against a graph, triple for triple.

    A scratch RDFGraph is created, bind_all_namespaces() is called on it,
    and an ``@prefix`` line is generated for every namespace its
    namespace manager reports.  That header plus ``turtlish`` is parsed
    into the scratch graph and the two sets of triples are compared.
    When they differ, both set differences are logged (sorted) as a
    warning.

    :param turtlish: String of triples in turtle format without prefix header
    :param graph: Graph object to test against
    :return: Boolean, True if graphs contain same set of triples
    """
    turtle_graph = RDFGraph()
    turtle_graph.bind_all_namespaces()
    prefixes = "\n".join(
        "@prefix {}: <{}> .".format(n[0], n[1])
        for n in turtle_graph.namespace_manager.namespaces()
    )
    # Separate header and body explicitly so the last @prefix directive is
    # never glued to the first token of a snippet that lacks a leading
    # newline.
    turtle_string = prefixes + "\n" + turtlish
    mock_file = io.StringIO(turtle_string)
    turtle_graph.parse(mock_file, format="turtle")

    # Iterating a graph yields its triples; no intermediate list is needed.
    turtle_triples = set(turtle_graph)
    ref_triples = set(graph)
    equality = turtle_triples == ref_triples
    if not equality:
        LOG.warning(
            "Triples do not match\n"
            "\tLeft hand difference: %s\n"
            "\tRight hand difference: %s",
            sorted(turtle_triples - ref_triples),
            sorted(ref_triples - turtle_triples))
    return equality
def test_snp_trait_association(self):
    """
    Test _add_variant_trait_association().

    The EFO ontology is parsed (as RDF/XML) from
    self.source.files['efo']['url'] into a separate graph; after the
    association is added, a SPARQL SELECT over the source graph must
    return exactly one row holding dbSNP:rs1491921.
    :return:
    """
    source = self.source
    record = self.test_data

    efo_ontology = RDFGraph()
    logger.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(source.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    logger.info("Finished loading EFO ontology")

    curie_and_type = source._get_curie_and_type_from_id(record['snp_label'])
    variant_curie, variant_type = curie_and_type

    description = source._make_description(
        record['trait'], record['init_sample_desc'],
        record['replicated_sample_desc'],
        record['platform'], record['pvalue'])

    source._add_variant_trait_association(
        variant_curie, record['trait_uri'], efo_ontology,
        record['pubmed'], description)

    sparql_query = """
        SELECT ?snp
        WHERE {{
            <https://monarchinitiative.org/MONARCH_b46cdf48950cb00d> a OBAN:association ;
                dc:description "{}" ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0003949 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            <https://monarchinitiative.org/MONARCH_70a05d8eb1c3d4b0> a OBAN:association ;
                OBO:RO_0002558 OBO:ECO_0000213 ;
                dc:source PMID:25918132 ;
                OBAN:association_has_object EFO:0006995 ;
                OBAN:association_has_predicate OBO:RO_0002326 ;
                OBAN:association_has_subject ?snp .

            EFO:0003949 a owl:Class ;
                rdfs:label "eye color"^^xsd:string ;
                rdfs:subClassOf UPHENO:0001001 .

            ?snp OBO:RO_0002326 EFO:0003949, EFO:0006995 .

            PMID:25918132 a OBO:IAO_0000013 .
        }}
    """.format(description)

    # Test that query passes and returns one row
    results = list(source.graph.query(sparql_query))
    expected_row = (URIRef(source.graph._getNode("dbSNP:rs1491921")), )
    self.assertEqual(results, [expected_row])
def test_snp_trait_association(self):
    """
    Test _add_variant_trait_association().

    Starting from an empty source graph, the EFO ontology is parsed from
    self.source.files['efo']['url'] into a separate graph, the
    association is added, and the source graph is compared with the
    expected turtle.
    :return:
    """
    source = self.source
    record = self.test_data

    starts_empty = len(list(source.graph)) == 0
    self.assertTrue(starts_empty)

    efo_ontology = RDFGraph()
    LOG.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(source.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    LOG.info("Finished loading EFO ontology")

    variant_curie, variant_type = source._get_curie_and_type_from_id(
        record['snp_label'])

    description = source._make_description(
        record['trait'], record['init_sample_desc'],
        record['replicated_sample_desc'],
        record['platform'], record['pvalue'])

    source._add_variant_trait_association(
        variant_curie, record['trait_uri'], efo_ontology,
        record['pubmed'], description)

    turtle_template = """
        MONARCH:bffc7a930c08cc8fe931 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0003949 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        MONARCH:bff9b97458d67ed7f517 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0006995 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        EFO:0003949 a owl:Class ;
            rdfs:label "eye color"^^xsd:string ;
            rdfs:subClassOf UPHENO:0001001 .

        dbSNP:rs1491921 RO:0003304 EFO:0003949, EFO:0006995 .

        PMID:25918132 a OBO:IAO_0000013 .
    """
    triples = turtle_template.format(description)

    graphs_match = self.test_util.test_graph_equality(triples, source.graph)
    self.assertTrue(graphs_match)
def testFakeDataSet2(self):
    """
    Dataset contains a deprecated protein ID that we expect to be
    filtered out by ensembl biomart.
    We test that processing it leaves the graph empty.
    :return:
    """
    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph()
    self.assertEqual(len(string_db.graph), 0)

    links = pd.DataFrame(data=self.test_set_2, columns=self.columns)
    string_db._process_protein_links(links, self.protein_list, 9606)

    triple_count = len(string_db.graph)
    self.assertEqual(triple_count, 0)
def test_sex_specificity_model(self):
    """
    Run _process_evidence_view() on a fresh graph and compare the result
    with the expected turtle.
    """
    mgi = self.mgi
    mgi.graph = RDFGraph(True)  # Reset graph

    mgi._process_evidence_view(limit=None)

    as_turtle = mgi.graph.serialize(format="turtle").decode("utf-8")
    logger.debug("Reference graph: %s", as_turtle)

    expected_triples = """
        :association RO:0002558 ECO:0000006 ;
            dc:source J:74619 ;
            :has_sex_specificity PATO:0000384 .

        J:74619 a IAO:0000310 .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_triples, mgi.graph)
    self.assertTrue(graphs_match)
def setUp(self):
    """
    Create a Model over a new RDFGraph, a CurieUtil built from the curie
    map, and the curies/IRIs/literals used to build test triples.
    """
    self.model = Model(RDFGraph())
    self.cutil = CurieUtil(curie_map.get())
    to_uri = self.cutil.get_uri

    # stuff to make test triples
    self.test_cat_subj_curie = "MGI:1234"
    self.test_cat_subj = to_uri("MGI:1234")
    self.test_cat_default_pred = to_uri("biolink:category")
    self.test_named_indiv = to_uri("owl:NamedIndividual")
    self.test_label_pred = to_uri("rdfs:label")
    self.test_label = "some label"

    self.test_comment_IRI = to_uri("rdfs:comment")
    self.test_comment = 'bonus eruptus'
def test_germline_variant_to_disease(self):
    """
    Process 'orph-germline.xml' with _process_diseasegene() and compare
    the resulting graph with the expected turtle.
    """
    orphanet = self.orphanet
    orphanet.graph = RDFGraph()  # Reset graph
    orphanet.files['disease-gene']['file'] = 'orph-germline.xml'

    orphanet._process_diseasegene(limit=None)

    serialized = orphanet.graph.serialize(format="turtle").decode("utf-8")
    logger.debug("Reference graph: %s", serialized)

    expected_triples = """
        MONARCH:b2cd4dfacc21d0e28c39 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object Orphanet:938475 ;
            OBAN:association_has_predicate RO:0003303 ;
            OBAN:association_has_subject <https://monarchinitiative.org/.well-known/genid/b56f798350412a34> .

        ENSEMBL:ENSG00000166813 a owl:Class .

        HGNC:30497 a owl:Class .

        Orphanet:268061 a owl:Class ;
            rdfs:label "KS1" ;
            dc:description "kinesin family member 7" ;
            oboInOwl:hasExactSynonym "KAS1" ;
            rdfs:subClassOf OBO:SO_0001217 ;
            owl:equivalentClass ENSEMBL:ENSG00000166813,
                HGNC:30497 .

        <https://monarchinitiative.org/.well-known/genid/b56f798350412a34> a GENO:0000002 ;
            rdfs:label "germline variant of KS1" ;
            GENO:0000418 Orphanet:268061 ;
            RO:0003303 Orphanet:938475 ;
            :MONARCH_anonymous true ;
            :has_cell_origin GENO:0000900 .

        Orphanet:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_triples, orphanet.graph)
    self.assertTrue(graphs_match)
def test_germline_variant_to_disease(self):
    """
    Process 'orph-germline.xml' with _process_diseasegene() and compare
    the resulting graph with the expected turtle.
    """
    src = self.orphanet
    src.graph = RDFGraph()  # Reset graph
    src.files['disease-gene']['file'] = 'orph-germline.xml'

    src._process_diseasegene(limit=None)

    as_turtle = src.graph.serialize(format="turtle").decode("utf-8")
    LOG.debug("Reference graph: %s", as_turtle)

    expected_triples = """
        MONARCH:b40e89f44906ccededb6 a OBAN:association ;
            RO:0002558 ECO:0000322 ;
            OBAN:association_has_object ORPHA:938475 ;
            OBAN:association_has_predicate RO:0003303 ;
            OBAN:association_has_subject <https://monarchinitiative.org/.well-known/genid/bc50c3aece4f4f161d4d> .

        ENSEMBL:ENSG00000166813 a owl:Class .

        HGNC:30497 a owl:Class .

        HGNC:30497 a owl:Class ;
            rdfs:label "KS1" ;
            oboInOwl:hasExactSynonym "KAS1" ;
            rdfs:subClassOf OBO:SO_0001217 ;
            owl:equivalentClass ENSEMBL:ENSG00000166813,
                ORPHA:268061 .

        <https://monarchinitiative.org/.well-known/genid/bc50c3aece4f4f161d4d> a GENO:0000002 ;
            rdfs:label "germline variant of KS1" ;
            GENO:0000418 HGNC:30497;
            RO:0003303 ORPHA:938475 ;
            :MONARCH_anonymous true ;
            :has_cell_origin GENO:0000900 .

        ORPHA:938475 a owl:Class ;
            rdfs:label "too much unit testing disorder" .
    """

    same = self.test_util.test_graph_equality(expected_triples, src.graph)
    self.assertTrue(same)
def testFakeDataSet1(self):
    """
    Process self.test_set_1 with StringDB._process_protein_links() and
    expect a single gene-to-gene interaction triple.

    The protein-to-gene map comes from Ensembl.fetch_protein_gene_map(9606);
    each value is rewritten in place with an 'ENSEMBL:' prefix.
    """
    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph(True)
    self.assertEqual(len(string_db.graph), 0)

    ensembl = Ensembl('rdf_graph', True)
    prot_map = ensembl.fetch_protein_gene_map(9606)
    for protein_id in prot_map.keys():
        gene_id = prot_map[protein_id]
        prot_map[protein_id] = "ENSEMBL:{}".format(gene_id)

    protein_count = len(prot_map.keys())
    print("Finished fetching ENSP IDs, fetched {} proteins".format(
        protein_count))

    links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
    string_db._process_protein_links(links, prot_map, 9606)

    expected_turtle = """
        ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
    """

    graphs_match = self.test_util.test_graph_equality(
        expected_turtle, string_db.graph)
    self.assertTrue(graphs_match)
def main():
    """
    Build the HPO tree below HP:0000118 and write it out.

    hp.owl is parsed from purl.obolibrary.org into an RDFGraph,
    hpo_to_tree() fills the tree and the term table, and the results are
    written to 'hpo-tree.json' (the tree as JSON) and 'hpo-terms.tsv'
    (one tab-separated row per term: id, label, '|'-joined lay_person
    values, parents).
    """
    root = "HP:0000118"

    hpo = RDFGraph()
    hpo.parse("http://purl.obolibrary.org/obo/hp.owl", format='xml')
    hpo.bind_all_namespaces()
    hpo.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")

    hpo_terms = OrderedDict()
    tree = {root: {}}
    path = []

    hpo_to_tree(root, hpo_terms, hpo, tree, path)

    with open('hpo-tree.json', 'w') as json_out:
        json.dump(tree, json_out)

    row_template = "{0}\t{1}\t{2}\t{3}\n"
    with open('hpo-terms.tsv', 'w') as tsv_out:
        for term_id, term in hpo_terms.items():
            lay_names = "|".join(term['lay_person'])
            tsv_out.write(row_template.format(
                term_id, term['label'], lay_names, term['parents']))
def main():
    """
    Build an HPO term tree rooted at HP:0000118.

    Parses hp.owl into an RDFGraph, hands it to hpo_to_tree(), and then
    serialises the two results: the tree as JSON ('hpo-tree.json') and the
    term table as TSV ('hpo-terms.tsv': id, label, '|'-joined lay_person
    entries, parents).
    """
    root = "HP:0000118"
    path = []
    tree = {root: {}}
    hpo_terms = OrderedDict()

    hpo = RDFGraph()
    hpo.parse("http://purl.obolibrary.org/obo/hp.owl", format='xml')
    hpo.bind_all_namespaces()
    hpo.bind("oboInOwl", "http://www.geneontology.org/formats/oboInOwl#")

    # NOTE(review): tree and hpo_terms look like in/out arguments here --
    # confirm against hpo_to_tree
    hpo_to_tree(root, hpo_terms, hpo, tree, path)

    with open('hpo-tree.json', 'w') as outfile:
        json.dump(tree, outfile)

    with open('hpo-terms.tsv', 'w') as outfile:
        outfile.writelines(
            "{0}\t{1}\t{2}\t{3}\n".format(
                key,
                value['label'],
                "|".join(value['lay_person']),
                value['parents'])
            for key, value in hpo_terms.items())
for n in wd_ontology.nodes(): proteins = wd.canned_query('disease2protein', n) anns = [a for p in proteins for a in aset.annotations(p)] if len(anns) > 0: print("{} {}".format(n, wd_ontology.label(n))) for a in anns: outfile.write("{}\t{}\n".format(a, go.label(a))) # Endpoints SCIGRAPH_ONTOLOGY = 'https://scigraph-ontology-dev.monarchinitiative.org/scigraph/' SCIGRAPH_DATA = 'https://scigraph-data-dev.monarchinitiative.org/scigraph/' GOLR_URL = 'https://solr.monarchinitiative.org/solr/golr/select' # Get mondo subset sickle_cell_anemia = 'OMIM:603903' disease_graph = RDFGraph() sg = SciGraph(SCIGRAPH_ONTOLOGY) parent_graph = sg.neighbors(sickle_cell_anemia, {'depth':25, 'direction': 'OUTGOING', 'relationshipType': 'subClassOf'}) child_graph = sg.neighbors(sickle_cell_anemia, {'depth':2, 'direction': 'INCOMING', 'relationshipType': 'subClassOf'}) eq_graph = sg.neighbors(sickle_cell_anemia, {'depth':10, 'relationshipType': 'equivalentClass'}) def add_triples_from_bbop(bbop_graph, rdf_graph): for e in bbop_graph.edges: if not re.search(r':', e.predicate): if e.predicate == 'subClassOf': e.predicate = 'rdfs:subClassOf' elif e.predicate == 'equivalentClass':
def test_snp_model(self):
    """
    Test output model of _process_haplotype()

    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency, mapped_gene,
        so_ontology)

    The SPARQL query below describes the whole expected model with the
    first SNP left as the ?snp variable; it must bind to exactly one
    node, dbSNP:rs1329573.
    """
    snp_label = self.test_data['snp_label']
    variant_curie, _ = self.source._get_curie_and_type_from_id(snp_label)

    # The SO ontology lives in its own graph, separate from source.graph
    so_ontology = RDFGraph()
    logger.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.source.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    logger.info("Finished loading SO ontology")

    self.source._process_haplotype(
        variant_curie,
        snp_label,
        self.test_data['chrom_num'],
        self.test_data['chrom_pos'],
        self.test_data['context'],
        self.test_data['allele_freq'],
        self.test_data['mapped_gene'],
        so_ontology)

    sparql_query = """
    SELECT ?snp
    WHERE {
        :haplotype_bcb627b1f64039b0 a OBO:GENO_0000871 ;
            rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
            OBO:GENO_0000382 ?snp,
                dbSNP:rs3758171,
                dbSNP:rs3824344,
                dbSNP:rs7020413 ;
            OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        ?snp a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs1329573-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
            OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3758171 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3758171-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
            OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3824344 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3824344-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
            OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs7020413 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs7020413-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
            OBO:GENO_0000418 <http://identifiers.org/hgnc/HGNC:8619> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
            faldo:position 36997420 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
            faldo:position 36998996 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
            faldo:position 37000690 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
            faldo:position 37002118 ;
            faldo:reference OBO:CHR_GRCh38chr9 .
    }
    """

    # Test that query passes and returns one row
    results = list(self.source.graph.query(sparql_query))
    expected_snp = URIRef(self.source.graph._getNode("dbSNP:rs1329573"))
    self.assertEqual(results, [(expected_snp,)])
def __init__(
        self,
        identifier,  # name? should be Archive url via Source
        title,
        url,
        ingest_desc=None,
        license_url=None,
        data_rights=None,
        graph_type='rdf_graph',  # rdf_graph, streamed_graph
        file_handle=None):
    """
    Create the dataset's graph and write its basic metadata triples.

    :param identifier: id of the dataset; subject of every triple added
        here and also the fallback title
    :param title: human readable title; when None the identifier is used
    :param url: added as foaf:page when not None
    :param ingest_desc: optional description added through the model
    :param license_url: added as dcterms:license when not None
    :param data_rights: added as a dcterms:rights literal when not None
    :param graph_type: None or 'rdf_graph' for an RDFGraph,
        'streamed_graph' for a StreamedGraph
    :param file_handle: passed to StreamedGraph; unused otherwise
    :raises ValueError: if graph_type is not one of the supported values
    """
    if graph_type is None:
        self.graph = RDFGraph(None, identifier)
    elif graph_type == 'streamed_graph':
        self.graph = StreamedGraph(True, identifier, file_handle=file_handle)
    elif graph_type == 'rdf_graph':
        self.graph = RDFGraph(True, identifier)
    else:
        # Previously this fell through with self.graph unset and failed
        # with an AttributeError on the next line.
        raise ValueError(
            "Unsupported graph_type: {!r}".format(graph_type))

    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    # TODO: move hard coded curies to translation table calls
    self.identifier = identifier
    self.title = identifier if title is None else title
    self.version = None
    self.date_issued = None

    # The date_accessed value is later used as a literal of properties
    # such as dcterms:issued, which needs to conform to xsd:dateTime format.
    # TODO ... we need to have a talk about typed literals and SPARQL
    self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

    self.citation = set()
    self.license_url = license_url
    self.model.addType(self.identifier, 'dctypes:Dataset')
    # Use self.title (not the raw argument) so that a None title falls
    # back to the identifier in the graph as well as on the instance.
    self.graph.addTriple(
        self.identifier, 'dcterms:title', self.title, True)
    self.graph.addTriple(
        self.identifier, 'dcterms:identifier', identifier, True)
    if url is not None:
        self.graph.addTriple(self.identifier, 'foaf:page', url)
    # maybe in the future add the logo here:
    # schemaorg:logo  <uri>
    # TODO add the license info
    # FIXME:Temporarily making this in IF statement,
    #  can revert after all current resources are updated.
    if license_url is not None:
        self.graph.addTriple(
            self.identifier, 'dcterms:license', license_url)
    else:
        LOG.debug('No license provided.')
    if data_rights is not None:
        self.graph.addTriple(
            self.identifier, 'dcterms:rights',
            data_rights, object_is_literal=True)
    else:
        LOG.debug('No rights provided.')

    if ingest_desc is not None:
        self.model.addDescription(self.identifier, ingest_desc)
def test_snp_model(self):
    """
    Test output model of _process_haplotype()

    self._process_haplotype(
        variant_curie, strongest_snp_risk_allele, chrom_num,
        chrom_pos, context, risk_allele_frequency, mapped_gene,
        so_ontology)

    The source graph must be empty beforehand and, afterwards, must hold
    exactly the triples listed in the turtle block below.
    """
    self.assertTrue(not list(self.source.graph))

    snp_label = self.test_data['snp_label']
    variant_curie, _ = self.source._get_curie_and_type_from_id(snp_label)

    # The SO ontology lives in its own graph, separate from source.graph
    so_ontology = RDFGraph()
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.source.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    self.source._process_haplotype(
        variant_curie,
        snp_label,
        self.test_data['chrom_num'],
        self.test_data['chrom_pos'],
        self.test_data['context'],
        self.test_data['allele_freq'],
        self.test_data['mapped_gene'],
        so_ontology)

    triples = """
    :haplotype_bb627b1f64039b0f751a a OBO:GENO_0000871 ;
        rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
        OBO:GENO_0000382 dbSNP:rs1329573,
            dbSNP:rs3758171,
            dbSNP:rs3824344,
            dbSNP:rs7020413 ;
        OBO:SO_0001627 HGNC:8619 ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 .

    dbSNP:rs1329573 a OBO:SO_0000694,
            SO:0001627 ;
        rdfs:label "rs1329573-?" ;
        faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
        OBO:SO_0001627 HGNC:8619 ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 .

    dbSNP:rs3758171 a OBO:SO_0000694,
            OBO:SO_0001627 ;
        rdfs:label "rs3758171-?" ;
        faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
        OBO:SO_0001627 HGNC:8619 ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 .

    dbSNP:rs3824344 a OBO:SO_0000694,
            OBO:SO_0001627 ;
        rdfs:label "rs3824344-?" ;
        faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
        OBO:SO_0001627 HGNC:8619 ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 .

    dbSNP:rs7020413 a OBO:SO_0000694,
            OBO:SO_0001627 ;
        rdfs:label "rs7020413-?" ;
        faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
        OBO:SO_0001627 HGNC:8619 ;
        OBO:RO_0002162 OBO:NCBITaxon_9606 .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
        faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
        faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
        faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
        faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
        faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
        faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
        faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
        faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
        faldo:position 36997420 ;
        faldo:reference OBO:CHR_GRCh38chr9 .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
        faldo:position 36998996 ;
        faldo:reference OBO:CHR_GRCh38chr9 .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
        faldo:position 37000690 ;
        faldo:reference OBO:CHR_GRCh38chr9 .

    <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
        faldo:position 37002118 ;
        faldo:reference OBO:CHR_GRCh38chr9 .
    """

    # dbg
    # LOG.debug(
    #    "Reference graph: %s",
    #    self.source.graph.serialize(format="turtle").decode("utf-8"))

    # Does not seem to acknowledge these constant triples
    graphs_match = self.test_util.test_graph_equality(
        triples, self.source.graph)
    self.assertTrue(graphs_match)
class GeneralGraphTestCase(unittest.TestCase):
    """
    Base graph test case: holds a fresh RDFGraph and the curie map, checks
    that every curie prefix can be turned into a graph node, and offers
    helpers to validate turtle files.
    """

    def setUp(self):
        self.graph = RDFGraph()
        self.curie_map = curie_map.get()

    def tearDown(self):
        self.graph = None

    def test_curieprefixes(self):
        """
        This will ensure that we can create identifiers for all of the
        defined curie prefixes using the graph's _getnode() method

        :return:
        """
        # make one test id per curie prefix and resolve it to a node
        for prefix in self.curie_map:
            testid = prefix + ':testme'
            node = self.graph._getnode(testid)
            msg = "prefix \"" + prefix + \
                "\" has an error...can't create graph node"
            self.assertIsNotNone(node, msg)

    def readGraphFromTurtleFile(self, f):
        """
        This will read the specified file into a graph.
        A simple parsing test.

        :param f: path of the turtle file to parse
        :return:
        """
        import os
        vg = RDFGraph()
        p = os.path.abspath(f)
        logger.info("Testing reading turtle file from %s", p)
        vg.parse(f, format="turtle")
        logger.info('Found %s graph nodes in %s', len(vg), p)
        self.assertGreater(len(vg), 0, "No nodes found in " + p)

    def readGraphIntoOWL(self, f):
        """
        test if the ttl can be parsed by owlparser
        this expects owltools to be accessible from commandline

        :param f: file of ttl
        :return:
        """
        import subprocess

        # subprocess.call returns the exit status instead of raising on a
        # non-zero one (as check_call does), so the failure is logged and
        # then reported as a test failure by the assertion below.
        status = subprocess.call(["owltools", f], stderr=subprocess.STDOUT)
        # returns zero is success!
        if status != 0:
            logger.error(
                'finished verifying with owltools with status %s', status)
        self.assertEqual(status, 0)
def setUp(self):
    """Create the RDFGraph and curie map stored on the test instance."""
    empty_graph = RDFGraph()
    self.graph = empty_graph
    self.curie_map = curie_map.get()
def process_catalog(self, limit=None):
    """
    Read the tab-delimited catalog file and add each usable row to the
    graph: the variant (snp or haplotype) first, then its trait
    association.

    The EFO and SO ontologies are first parsed into their own graphs from
    the urls in self.files; they are handed to the helpers that build the
    associations and haplotypes.

    Rows are skipped when self.test_mode is set, when the strongest SNP
    risk allele is blank, or when the variant type cannot be determined.
    A row whose column count differs from the header is logged as an
    error; unpacking still requires exactly the columns listed below and
    raises ValueError otherwise.

    :param limit: stop once the reader's line number exceeds this value;
        None means no limit
    :return:
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    LOG.info("Processing Data from %s", raw)

    efo_ontology = RDFGraph(False, "EFO")
    LOG.info("Loading EFO ontology in separate rdf graph")
    efo_ontology.parse(self.files['efo']['url'], format='xml')
    efo_ontology.bind_all_namespaces()
    LOG.info("Finished loading EFO ontology")

    so_ontology = RDFGraph(False, "SO")
    LOG.info("Loading SO ontology in separate rdf graph")
    so_ontology.parse(self.files['so']['url'], format='xml')
    so_ontology.bind_all_namespaces()
    LOG.info("Finished loading SO ontology")

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        header = next(filereader, None)  # the header row
        if header is None:
            # Empty file: there are no rows to iterate either
            LOG.warning("No header row found in %s", raw)
            header_len = 0
        else:
            header_len = len(header)
        LOG.info('header length:\t %i', header_len)

        for row in filereader:
            if not row:
                continue

            if header_len != len(row):
                # report the column count, not the row itself
                LOG.error(
                    'BadRow: %i has %i columns',
                    filereader.line_num, len(row))

            (date_added_to_catalog,
             pubmed_num,
             first_author,
             pub_date,
             journal,
             link,
             study_name,
             disease_or_trait,
             initial_sample_description,
             replicate_sample_description,
             region,
             chrom_num,
             chrom_pos,
             reported_gene_nums,
             mapped_gene,
             upstream_gene_num,
             downstream_gene_num,
             snp_gene_nums,
             upstream_gene_distance,
             downstream_gene_distance,
             strongest_snp_risk_allele,
             snps,
             merged,
             snp_id_current,
             context,
             intergenic_flag,
             risk_allele_frequency,
             pvalue,
             pvalue_mlog,
             pvalue_text,
             or_or_beta,
             confidence_interval_95,
             platform_with_snps_passing_qc,
             cnv_flag,
             mapped_trait,
             mapped_trait_uri,
             study_accession,
             GENOTYPING_TECHNOLOGY
             ) = row

            if self.test_mode:
                continue

            # 06-May-2015	25917933
            # Zai CC	20-Nov-2014	J Psychiatr Res	http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals	NA
            # 10p11.22	10	32704340	C10orf68, CCDC7, ITGB1	CCDC7
            # rs7079041-A	rs7079041	0	7079041	intron	0		2E-6	5.698970

            variant_curie, variant_type = self._get_curie_and_type_from_id(
                strongest_snp_risk_allele)

            if strongest_snp_risk_allele.strip() == '':
                LOG.debug(
                    "No strongest SNP risk allele for %s:\n%s",
                    pubmed_num, str(row))
                # still consider adding in the EFO terms
                # for what the study measured?
                continue

            if variant_type == 'snp':
                self._add_snp_to_graph(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency)

                self._add_deprecated_snp(
                    variant_curie, snp_id_current, merged,
                    chrom_num, chrom_pos)

                self._add_snp_gene_relation(
                    variant_curie, snp_gene_nums, upstream_gene_num,
                    downstream_gene_num)
            elif variant_type == 'haplotype':
                self._process_haplotype(
                    variant_curie, strongest_snp_risk_allele, chrom_num,
                    chrom_pos, context, risk_allele_frequency,
                    mapped_gene, so_ontology)
            elif variant_type is None:
                LOG.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            description = self._make_description(
                disease_or_trait, initial_sample_description,
                replicate_sample_description,
                platform_with_snps_passing_qc, pvalue)

            self._add_variant_trait_association(
                variant_curie, mapped_trait_uri, efo_ontology,
                pubmed_num, description)

            # test_mode rows never get this far (see the continue above),
            # so only the limit itself needs checking here
            if limit is not None and filereader.line_num > limit:
                break

    # TODO loop through the location hash,
    # and make all snps at that location equivalent
    for location, snp_ids in self.id_location_map.items():
        if len(snp_ids) > 1:
            LOG.info("%s has >1 snp id: %s", location, str(snp_ids))
class Dataset:
    """
     this will produce the metadata about a dataset
     following the example laid out here:
     http://htmlpreview.github.io/?
     https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
     (mind the wrap)

    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        Create the dataset's graph and write its basic metadata triples.

        :param identifier: bare id; stored on the instance prefixed
            with ':'
        :param title: added as a dct:title literal
        :param url: added as foaf:page
        :param description: optional description added through the model
        :param license_url: added as dct:license when not None
        :param data_rights: added as a dct:rights literal when not None
        :param graph_type: None or 'rdf_graph' for an RDFGraph,
            'streamed_graph' for a StreamedGraph
        :param file_handle: passed to StreamedGraph; unused otherwise
        :raises ValueError: if graph_type is not one of the supported
            values
        """
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        else:
            # Previously this fell through with self.graph unset and
            # failed with an AttributeError on the next line.
            raise ValueError(
                "Unsupported graph_type: {!r}".format(graph_type))

        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The date_accessed value is later used as a literal of properties
        # such as dct:issued, which needs to conform to xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier',
            identifier, object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights',
                data_rights, object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
            should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py             99:
        dipper/sources/BioGrid.py        100:
        dipper/sources/MGI.py            255:
        dipper/sources/EOM.py             93:
        dipper/sources/Coriell.py        200:
        dipper/sources/MMRRC.py           77:

        # TODO set as deprecated

        Records the issue date when one is given, then sets the version
        from version_id when given, otherwise from the date. When both
        arguments are None an error is logged and nothing is changed.

        :param date_issued:
        :param version_id:
        :return:

        """
        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # The version is registered exactly once; the earlier flow called
        # set_version_by_num twice when only version_id was supplied.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            logger.info("set version to %s", self.version)
            self.set_version_by_date(date_issued)

        logger.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """
        Store the issue date and add it as a dct:issued literal.

        :param date_issued:
        :return:
        """
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued,
            object_is_literal=True)
        logger.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """

        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")

        logger.info("setting version by date")
        self.set_version_by_num(d)

    def set_version_by_num(self, version_num):
        """
        Set the version node (dataset identifier + version_num) and link
        it to the dataset. Unless version_num is the access timestamp
        itself, a second node named after that timestamp is added as a
        version of the first.

        :param version_num: string appended to the dataset identifier
        :return:
        """
        self.version = self.identifier + version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(self.version, 'pav:version', version_num,
                             object_is_literal=True)

        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf',
                self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version',
                self.date_accessed, object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued',
                self.date_accessed, object_is_literal=True,
                literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """Add url as the dataset's dcat:accessURL."""
        self.graph.addTriple(self.identifier, 'dcat:accessURL',
                             url, is_object_literal)

    def getGraph(self):
        """Return the graph holding the dataset metadata."""
        return self.graph

    def set_license(self, license):
        """Replace the stored license; no triple is added."""
        self.license = license

    def get_license(self):
        """Return the stored license."""
        return self.license

    def set_citation(self, citation_id):
        """Add citation_id to the set of citations; no triple is added."""
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)