def setUp(self):
    """Prepare the graph-comparison helper and one sample record.

    The record is a plain dict keyed by column title ('Allele', 'SGDID', ...)
    and is stored on the test case as ``test_set_1``.
    """
    sample_record = {
        'Allele': 'atp6-L183R (L183R)',
        'Chemical': 'glycerol',
        'Condition':
            'elevated temperature (35 deg C)|nonfermentable carbon source',
        'Details': (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants'
        ),
        'Experiment Type': 'classical genetics',
        'Feature Name': 'Q0085',
        'Feature Type': 'ORF',
        'Gene Name': 'ATP6',
        'Mutant Type': 'reduction of function',
        'Phenotype': 'respiratory growth: decreased rate',
        'Reference': 'PMID: 21715656|SGD_REF: S000145858',
        'Reporter': ' ',
        'SGDID': 'S000007268',
        'Strain Background': 'Other',
    }
    self.test_util = TestUtils()
    self.test_set_1 = sample_record
def setUp(self):
    """Create an Orphanet source that reads raw files from the test resources.

    ``rawdir`` is pointed at ``resources/orphanet`` next to this test module.
    """
    here = os.path.dirname(__file__)
    self.test_util = TestUtils()
    self.orphanet = Orphanet('rdf_graph', True)
    self.orphanet.rawdir = os.path.join(here, 'resources/orphanet')
def setUp(self):
    """Create a GWASCatalog source with a fresh graph and one sample row.

    The sample row (``test_data``) describes a single SNP; every value is a
    string, as in the dict literal below.
    """
    self.test_util = TestUtils()

    catalog = GWASCatalog('rdf_graph', True)
    catalog.graph = RDFGraph(True)  # Reset graph
    catalog.graph.bind_all_namespaces()
    self.source = catalog

    self.test_data = {
        'snp_label': 'rs1491921-C',
        'chrom_num': '5',
        'chrom_pos': '21259029',
        'context': 'intergenic_variant',
        'allele_freq': '0.013',
        'trait': 'Diisocyanate-induced asthma',
        'trait_uri': (
            'http://www.ebi.ac.uk/efo/EFO_0006995, '
            'http://www.ebi.ac.uk/efo/EFO_0003949'
        ),
        'pvalue': '0.0000007',
        'merged': '0',
        'snp_id_current': '1491921',
        'mapped_gene': 'LOC102723561 - GUSBP1',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc': (
            '74 European ancestry cases, 824 European ancestry controls'
        ),
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [1556551]',
        'pubmed': '25918132',
    }
def setUp(self):
    """Create a GWASCatalog source with a fresh graph and a multi-SNP row.

    Several fields of ``test_data`` hold ';'-separated lists (four SNP
    labels, four chromosome numbers/positions, ...).
    """
    self.test_util = TestUtils()

    catalog = GWASCatalog('rdf_graph', True)
    catalog.graph = RDFGraph(True)
    self.source = catalog

    self.test_data = {
        'snp_label': 'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?',
        'chrom_num': '9;9;9;9',
        'chrom_pos': '36998996;37002118;37000690;36997420',
        'context': (
            'intron_variant; intron_variant; intron_variant; intron_variant'
        ),
        'allele_freq': 'NR',
        'trait': 'Intelligence',
        'trait_uri': 'http://www.ebi.ac.uk/efo/EFO_0004337',
        'pvalue': '0.00000004',
        'merged': '0',
        'snp_id_current': '',
        'mapped_gene': 'PAX5; PAX5; PAX5; PAX5',
        'snp_gene_nums': '',
        'upstream_gene_num': '107986179',
        'downstream_gene_num': '107986180',
        'init_sample_desc': (
            '656 European ancestry individuals from ADHD families'
        ),
        'replicated_sample_desc': 'NA',
        'platform': 'Illumina [795637]',
        'pubmed': '22449649',
    }
def setUp(self):
    """Prepare the graph-comparison helper and one six-field sample tuple.

    ``test_set_1`` holds, in order: a protein id, a pathway id, the pathway
    URL, the pathway label, an evidence code and a species name.
    """
    self.test_util = TestUtils()
    self.test_set_1 = (
        'ENSBTAP00000013354',
        'R-BTA-3000480',
        'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
        'Scavenging by Class A Receptors',
        'IEA',
        'Bos taurus',
    )
def setUp(self):
    """Create a CTD source with a fresh graph and one ten-column sample row."""
    self.test_util = TestUtils()

    ctd_source = CTD('rdf_graph', True)
    ctd_source.graph = RDFGraph(True)
    self.source = ctd_source

    self.test_row = [
        'Nicotine',
        'D009538',
        '',
        'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
        'OMIM:188890',
        'therapeutic',
        '',
        '',
        '',
        '12345|56789',
    ]
def setUp(self):
    """Create a MyChem source pre-loaded with the first test record.

    The JSON fixture at ``TESTDATA`` stands in for ``source.fetch()``: its
    first element is appended to both ``drugbank_targets`` and
    ``drugcentral_interactors``.
    """
    self.test_util = TestUtils()
    self.source = MyChem('rdf_graph', True)
    # Replaces source.fetch()
    # The context manager closes the file even when json.load() raises.
    with open(TESTDATA, 'r') as data_fh:
        self.test_data = json.load(data_fh)
    self.source.drugbank_targets.append(self.test_data[0])
    self.source.drugcentral_interactors.append(self.test_data[0])
def setUp(self):
    """Create an Orphanet source wired to the bundled test resources.

    Loads ``resources/test_terms.yaml`` through the source's
    ``open_and_parse_yaml`` and points ``rawdir`` at ``resources/orphanet``.
    """
    test_dir = os.path.dirname(__file__)
    terms_path = os.path.join(test_dir, './resources/test_terms.yaml')

    self.test_util = TestUtils()
    self.orphanet = Orphanet('rdf_graph', True)
    # Override so tests don't break when we update terms
    # NOTE(review): the parsed terms are stored on the test case
    # (self.globaltt), not on self.orphanet -- confirm this is intended.
    self.globaltt = self.orphanet.open_and_parse_yaml(terms_path)
    self.orphanet.rawdir = os.path.join(test_dir, 'resources/orphanet')
def test_therapeutic_relationship(self):
    """Check that the loaded test graph contains the expected association.

    Loads the source's turtle test graph, runs a SPARQL query for OBAN
    associations and asserts that the row
    ``[assoc, disease, relation, chemical]`` built from the hard-coded
    identifiers below is among the results.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.models.Model import Model

    # Make testutils object and load ttl
    test_query = TestUtils(self.source.graph)
    test_query.load_testgraph_from_turtle(self.source)
    graph = self.source.graph
    model = Model(graph)

    # Expected structure
    # TODO can this be unified OBAN and the Annot models
    # to be automatically generated?
    sparql_query = (
        ' SELECT ?assoc ?disease ?rel ?chemical '
        'WHERE { '
        '?assoc a OBAN:association ; '
        'OBAN:association_has_object ?disease ; '
        'OBAN:association_has_predicate ?rel ; '
        'OBAN:association_has_subject ?chemical .} '
    )

    # SPARQL variables to check
    chem_id = 'MESH:D009538'
    chem_uri = graph._getNode(chem_id)
    disease_id = 'OMIM:188890'
    disease_uri = graph._getNode(disease_id)
    rel_id = model.object_properties['substance_that_treats']
    rel_uri = graph._getNode(rel_id)

    assoc = G2PAssoc(graph, self.source.name, chem_id, disease_id, rel_id)
    assoc_id = assoc.make_g2p_id()
    assoc_uri = self.source.graph._getNode(assoc_id)

    # One of the expected outputs from query
    expected_output = [assoc_uri, disease_uri, rel_uri, chem_uri]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    # assertIn reports both operands itself when the membership test fails.
    self.assertIn(
        expected_output, sparql_output,
        "did not find expected association: " + str(expected_output) +
        " found " + str(len(sparql_output)) + " others:\n" +
        str(sparql_output))

    logger.info("Test query data finished.")
def setUp(self):
    """Create an MGI source that reads its raw files from the test resources.

    _process_evidence_view uses self.rawdir to find the evidence file, so
    the default is overridden here to point at our test file.  The file
    name must match what that method expects - evidence_view.
    """
    resource_dir = os.path.join(os.path.dirname(__file__), 'resources/mgi')

    self.test_util = TestUtils()
    self.mgi = MGI('rdf_graph', True)
    self.mgi.rawdir = resource_dir
    # Pre-register annotation key '6901981' under the id ':association'.
    self.mgi.idhash['annot']['6901981'] = ':association'
class EvidenceTestCase(unittest.TestCase):
    """Exercise MGI._process_evidence_view against a bundled evidence file."""

    def setUp(self):
        """Create an MGI source that reads from the test resources.

        _process_evidence_view uses self.rawdir to find the evidence file,
        so the default is overridden here to point at our test file.  The
        file name must match what that method expects - evidence_view.
        """
        resource_dir = os.path.join(
            os.path.dirname(__file__), 'resources/mgi')

        self.test_util = TestUtils()
        self.mgi = MGI('rdf_graph', True)
        self.mgi.rawdir = resource_dir
        # Pre-register annotation key '6901981' under ':association', the
        # subject used in the expected triples below.
        self.mgi.idhash['annot']['6901981'] = ':association'

    def tearDown(self):
        """Drop the reference to the MGI source."""
        self.mgi = None

    def test_sex_specificity_model(self):
        """The evidence view must yield exactly the expected triples."""
        self.mgi.graph = RDFGraph(True)  # Reset graph
        self.mgi._process_evidence_view(limit=None)

        serialized = self.mgi.graph.serialize(format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", serialized)

        expected_triples = (
            ' :association RO:0002558 ECO:0000006 ; '
            'dc:source J:74619 ; '
            ':has_sex_specificity PATO:0000384 . '
            'J:74619 a IAO:0000310 . '
        )

        graphs_match = self.test_util.test_graph_equality(
            expected_triples, self.mgi.graph)
        self.assertTrue(graphs_match)
def test_parse(self):
    """Run the ClinVar parser on each entry of RCVS and compare graphs.

    For every record the parser is invoked with a fake command line, its
    N-Triples output is loaded into a fresh graph, a dot file is written,
    and the graph is compared against the record's reference turtle file.
    """
    for rcv in RCVS:
        output_nt = rcv + '.nt'
        input_xml = rcv + '.xml.gz'
        reference_ttl = TTL_PATH + rcv + '.ttl'
        with self.subTest(rcv=rcv):
            mock_args = [
                "test_clinvar.py",
                "--inputdir", XML_PATH,
                "--filename", input_xml,
                "--mapfile", MAP_FILE,
                "--destination", NT_PATH,
                "--output", output_nt
            ]

            # Patch sys.argv only while the parser runs; a bare
            # patch(...).start() is never stopped and would leave the fake
            # command line in place for every later test.
            with patch('sys.argv', mock_args):
                clinvar_parse()

            query_graph = RDFGraph()
            query_graph.bind_all_namespaces()
            query_graph.parse(NT_PATH + output_nt, format='nt')

            with open(reference_ttl, 'r') as ref_fh:
                ref_graph = "\n".join(ref_fh.readlines())

            # debug
            LOG.debug(
                "Reference graph: %s",
                query_graph.serialize(format="turtle").decode("utf-8"))

            # Convert output from ClinVar parse to dot then png
            dot_file_path = DOT_PATH + rcv + ".dot"
            with open(dot_file_path, 'w') as dot_file:
                rdf2dot(query_graph, dot_file)

            self.assertTrue(
                TestUtils.test_graph_equality(ref_graph, query_graph))
def setUp(self):
    """Prepare identifiers and one 28-field sample row for the tests.

    Besides the row (``test_set_1``) this stores the association and ECO
    curies, fixed blank-node curies for study and evidence, and the
    association IRI expanded through the project's curie map.
    """
    self.test_util = TestUtils()
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    self.test_set_1 = (
        'MGI:1920145',
        'Setd5',
        'WTSI',
        'MEFW',
        'male',
        'heterozygote',
        'MGI:4432631',
        'Setd5<tm1a(EUCOMM)Wtsi>',
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',
        'C57BL/6N',
        'MGP',
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',
        'MGP_001',
        'MGP_XRY_001',
        'X-ray',
        'IMPC_XRY_008_001',
        'Number of ribs right',
        'MP:0005390',
        'skeleton phenotype',
        'MP:0000480',
        'increased rib number',
        '1.637023E-010',
        '',
        '8.885439E-007',
        'Wilcoxon rank sum test with continuity correction',
        'IMPC',
    )

    # Generate test curies, these are otherwise generated
    # within _add_evidence() and _add_study_provenance()
    # these blank nodes are hardcoded as NOT Skolemized ...
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRIs for testing sparql output
    expander = CurieUtil(curie_map.get())
    assoc_uri_text = expander.get_uri(self.assoc_curie)
    self.assoc_iri = URIRef(assoc_uri_text)
def test_parse(self):
    """
    Runs parse() on self.gwascatalog and outputs a dot file for each
    entry in VARIANTS

    This is less of a unit test and more for viewing the output of
    an entire run on a single variant; dot files can be converted to
    images using scripts/dot-to-svg.sh
    """
    for variant in VARIANTS:
        with self.subTest(variant_id=variant):
            self.tearDownAndSetUp()
            self.gwascatalog.rawdir = RAW_PATH + '/' + variant
            self.gwascatalog.parse()
            parsed_graph = self.gwascatalog.graph

            with open(DOT_PATH + variant + ".dot", 'w') as dot_file:
                rdf2dot(parsed_graph, dot_file)

            # debug
            turtle_text = parsed_graph.serialize(
                format="turtle").decode("utf-8")
            LOG.debug("Reference graph: %s", turtle_text)

            reference_ttl = TTL_PATH + variant + '.ttl'
            graphs_match = TestUtils.test_graph_equality(
                reference_ttl, parsed_graph)
            self.assertTrue(graphs_match)
class TestMyChemParser(unittest.TestCase):
    """Run MyChem.parse() on a JSON fixture and check the resulting graph."""

    def setUp(self):
        """Create a MyChem source pre-loaded with the first test record."""
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)
        # Replaces source.fetch()
        # The context manager closes the file even when json.load() raises.
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)
        self.source.drugbank_targets.append(self.test_data[0])
        self.source.drugcentral_interactors.append(self.test_data[0])

    def tearDown(self):
        """Drop the reference to the MyChem source."""
        self.source = None

    def test_parse(self):
        """parse() on the fixture must yield exactly the expected triples."""
        self.source.graph = RDFGraph(True)  # Reset graph
        # assertEqual reports the actual triple count if the graph is not
        # empty, which a bare assertTrue(... == 0) does not.
        self.assertEqual(len(list(self.source.graph)), 0)

        self.source.parse()
        triples = (
            ' UNII:46U771ERWK RO:0002606 SNOMED:386761002 ; '
            'rdfs:subClassOf CHEBI:23367 . '
            'SNOMED:386761002 rdfs:label "Local anesthesia" ; '
            'rdfs:subClassOf DOID:4 . '
        )

        # dbg
        logger.debug(
            "Reference graph: %s",
            self.source.graph.serialize(format="turtle").decode("utf-8"))

        self.assertTrue(
            self.test_util.test_graph_equality(triples, self.source.graph))
def test_therapeutic_relationship(self):
    """Check that the loaded test graph contains the expected annotation.

    Loads the source's turtle test graph, queries it for annotations with
    evidence OBO:ECO_0000033 and predicate OBO:RO_0002606, and asserts that
    the row ``[assoc, pubmed, disease, chemical]`` built from the
    hard-coded identifiers below is among the results.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.utils.GraphUtils import GraphUtils
    from dipper import curie_map

    # Make testutils object and load ttl
    test_query = TestUtils(self.source.graph)
    test_query.load_testgraph_from_turtle(self.source)

    # Expected structure
    # TODO can this be unified OBAN and the Annot models
    # to be automatically generated?
    sparql_query = (
        ' SELECT ?assoc ?pubmed ?disease ?chemical '
        'WHERE { '
        '?assoc a Annotation: ; '
        'dc:evidence OBO:ECO_0000033 ; '
        'dc:source ?pubmed ; '
        ':hasObject ?disease ; '
        ':hasPredicate OBO:RO_0002606 ; '
        ':hasSubject ?chemical .} '
    )

    # SPARQL variables to check
    gu = GraphUtils(curie_map.get())
    chem_id = 'MESH:D009538'
    chem_uri = gu.getNode(chem_id)
    disease_id = 'OMIM:188890'
    disease_uri = gu.getNode(disease_id)
    eco = 'ECO:0000033'
    rel_id = gu.object_properties['substance_that_treats']
    pubmed_id = 'PMID:16785264'
    pubmed_uri = gu.getNode(pubmed_id)

    # consider replacing with make_ctd_chem_disease_assoc_id()
    assoc_id = self.source.make_association_id(
        'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
    assoc_uri = gu.getNode(assoc_id)

    # One of the expected outputs from query
    expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    # assertIn reports both operands itself when the membership test fails.
    self.assertIn(
        expected_output, sparql_output,
        "did not find expected association: " + assoc_id +
        " found: " + pprint.pformat(sparql_output))

    logger.info("Test query data finished.")
def test_therapeutic_relationship(self):
    """Check that the CTD graph contains the expected annotation.

    Queries ``self.ctd.graph`` for annotations with evidence
    OBO:ECO_0000033 and predicate OBO:RO_0002606, and asserts that the row
    ``[assoc, pubmed, disease, chemical]`` built from the hard-coded
    identifiers below is among the results.
    """
    from dipper.utils.TestUtils import TestUtils
    from dipper.utils.GraphUtils import GraphUtils

    # Make testutils object and load bindings
    test_query = TestUtils(self.ctd.graph)
    self.ctd.load_bindings()

    # Expected structure
    sparql_query = (
        ' SELECT ?assoc ?pubmed ?disease ?chemical '
        'WHERE { '
        '?assoc a Annotation: ; '
        'dc:evidence OBO:ECO_0000033 ; '
        'dc:source ?pubmed ; '
        ':hasObject ?disease ; '
        ':hasPredicate OBO:RO_0002606 ; '
        ':hasSubject ?chemical .} '
    )

    # SPARQL variables to check
    gu = GraphUtils(curie_map.get())
    chem_id = 'MESH:D009538'
    chem_uri = gu.getNode(chem_id)
    disease_id = 'OMIM:188890'
    disease_uri = gu.getNode(disease_id)
    pubmed_id = 'PMID:16785264'
    pubmed_uri = gu.getNode(pubmed_id)
    rel_id = gu.object_properties['substance_that_treats']
    eco = 'ECO:0000033'

    # TODO PYLINT make_association_id() does not exist in CTD
    # there is "_make_association()" with a different sig
    assoc_id = self.ctd.make_association_id(
        'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
    assoc_uri = gu.getNode(assoc_id)

    # Expected output from query
    expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    # assertIn shows the missing row and the actual results on failure,
    # where assertTrue(x in y) would only say "False is not true".
    self.assertIn(expected_output, sparql_output)

    logger.info("Test finished.")
def test_classes_indiv_properties(self):
    """
    Given the sample input set up on this test case, the graph must contain:
    A CGD:DiseaseID is an OWL Class
    A CGD:DiseaseID is a subclass of DOID:4
    A CGD:Disease rdfs:label "Adenocarcinoma"
    A CGD:DiseaseInstance is an individual of CGD:DiseaseID
    A CGD:DiseaseInstance rdfs:label
        "Adenocarcinoma with response {1} to therapy"
    A CGD:DrugID is an OWL Class
    A CGD:DrugID is a subclass of CHEBI:23888
    A CGD:DrugID rdfs:label "5FU-based adjuvant therapy"
    A CGD:RelationID is an object property
    PMID:12345 is a IAO:0000013 (journal article)
    """
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    test_env = TestUtils(self.cgd.graph)
    self.cgd.load_bindings()

    # Doubled braces are literal braces once str.format() has run.
    query_template = (
        ' SELECT ?disease ?diseaseInd ?diseaseQual ?drug ?source '
        'WHERE {{ '
        '?disease a owl:Class ; '
        'rdfs:subClassOf DOID:4 ; '
        'rdfs:label "{0}" . '
        '?diseaseInd a ?disease ; '
        'rdfs:label "{1}" ; '
        'BFO:0000159 ?diseaseQual . '
        '?drug a owl:Class ; '
        'rdfs:subClassOf CHEBI:23888 ; '
        'rdfs:label "{2}" . '
        '<{3}> a owl:ObjectProperty . '
        '?source a IAO:0000013 . '
        '}} '
    )
    sparql_query = query_template.format(
        self.disease_label,
        self.disease_instance_label,
        self.drug_label,
        self.relationship_uri)

    # Expected Results
    expected_row = [
        self.disease_uri,
        self.disease_ind_uri,
        self.disease_quality_uri,
        self.drug_uri,
        self.source_uri,
    ]
    expected_results = [expected_row]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def setUp(self):
    """Prepare the graph-comparison helper and one nested sample record.

    ``test_set_1`` is a dict with nested 'evidence', 'object', 'relation'
    and 'subject' sub-dicts plus the raw tab-separated 'source_line'.
    """
    evidence = {
        'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
        'type': 'IED',
        'with_support_from': [],
    }
    subject = {
        'fullname': 'endothelin receptor type A',
        'id': 'RGD:2535',
        'label': 'Ednra',
        'synonyms': [],
        'taxon': {'id': 'NCBITaxon:10116'},
        'type': 'gene',
    }
    source_line = (
        'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
        'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
        '20061026\tRGD\t\t\n'
    )

    self.test_util = TestUtils()
    self.test_set_1 = {
        'aspect': 'N',
        'date': '2006-10-26',
        'evidence': evidence,
        'negated': False,
        'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
        'provided_by': 'RGD',
        'qualifiers': [],
        'relation': {'id': None},
        'source_line': source_line,
        'subject': subject,
        'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
    }
def test_associations(self):
    """
    Given the sample input set up on this test case, the graph must contain:
    CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance
    A CGD:AssociationID OBO:RO_0002558
        Traceable Author Statement (ECO:0000033)
    A CGD:AssociationID dc:source PMID:20498393
    A CGD:AssociationID has_environment CGD:DrugID
    A CGD:AssociationID OBAN:association_has_subject CGD:VariantID
    A CGD:AssociationID OBAN:association_has_object_property has_phenotype
    A CGD:AssociationID OBAN:association_has_object CGD:DiseaseInstance
    """
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    cu = CurieUtil(self.curie_map)
    test_env = TestUtils(self.cgd.graph)
    self.cgd.load_bindings()

    evidence = 'OBO:ECO_0000033'
    evidence_uri = URIRef(cu.get_uri(evidence))

    # Doubled braces are literal braces once str.format() has run.
    query_template = (
        ' SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence '
        'WHERE {{ '
        '?variant OBO:RO_0002200 ?diseaseInd . '
        '?vdannot a OBAN:association ; '
        'OBO:RO_0002558 ?evidence ; '
        'dc:source ?source ; '
        '<{0}> ?drug ; '
        'OBAN:association_has_object ?diseaseInd ; '
        'OBAN:association_has_object_property OBO:RO_0002200 ; '
        'OBAN:association_has_subject ?variant . '
        '}} '
    )
    sparql_query = query_template.format(self.relationship_uri)

    # Expected Results
    expected_row = [
        self.disease_ind_uri,
        self.variant_uri,
        self.drug_uri,
        self.vd_annot_uri,
        self.source_uri,
        evidence_uri,
    ]
    expected_results = [expected_row]

    # Query graph
    sparql_output = test_env.query_graph(sparql_query)

    self.assertEqual(expected_results, sparql_output)
def test_therapeutic_relationship(self):
    """Check that the CTD graph contains the expected annotation.

    Queries ``self.ctd.graph`` for annotations with evidence
    OBO:ECO_0000033 and predicate OBO:RO_0002606, and asserts that the row
    ``[assoc, pubmed, disease, chemical]`` built from the hard-coded
    identifiers below is among the results.
    """
    # The GraphUtils import that used to sit here was never used.
    from dipper.utils.TestUtils import TestUtils

    # Make testutils object and load bindings
    test_query = TestUtils(self.ctd.graph)

    # Expected structure
    sparql_query = (
        ' SELECT ?assoc ?pubmed ?disease ?chemical '
        'WHERE { '
        '?assoc a Annotation: ; '
        'dc:evidence OBO:ECO_0000033 ; '
        'dc:source ?pubmed ; '
        ':hasObject ?disease ; '
        ':hasPredicate OBO:RO_0002606 ; '
        ':hasSubject ?chemical .} '
    )

    # SPARQL variables to check
    chem_id = 'MESH:D009538'
    chem_uri = self.graph._getNode(chem_id)
    disease_id = 'OMIM:188890'
    disease_uri = self.graph._getNode(disease_id)
    pubmed_id = 'PMID:16785264'
    pubmed_uri = self.graph._getNode(pubmed_id)
    rel_id = self.model.object_properties['substance_that_treats']
    eco = 'ECO:0000033'

    # TODO PYLINT make_association_id() does not exist in CTD
    # there is "_make_association()" with a different sig
    assoc_id = self.ctd.make_association_id(
        'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
    assoc_uri = self.graph._getNode(assoc_id)

    # Expected output from query
    expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

    # Query graph
    sparql_output = test_query.query_graph(sparql_query)

    # assertIn shows the missing row and the actual results on failure,
    # where assertTrue(x in y) would only say "False is not true".
    self.assertIn(expected_output, sparql_output)

    logger.info("Test finished.")
class SGDTestCase(unittest.TestCase):
    """Build one SGD association from a sample record and check the graph."""

    def setUp(self):
        """Prepare the graph-comparison helper and one sample record."""
        sample_record = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': (
                'similar results obtained with atp6-L247R, and atp6-W136R, '
                'all '
                'corresponding to human NARP syndrome mutants'
            ),
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }
        self.test_util = TestUtils()
        self.test_set_1 = sample_record

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testSGDParser(self):
        """make_association() on the sample record yields the expected graph."""
        sgd = SGD('rdf_graph', True)
        sgd.graph = RDFGraph(True)

        record = self.test_set_1
        sgd.make_association(record)

        # The description text is produced by the source itself and spliced
        # into the expected triples below.
        description = sgd._make_description(record)
        triples_template = (
            ' :MONARCH_ba748c98c0f167739128 a OBAN:association ; '
            'OBO:RO_0002558 OBO:APO_0000020 ; '
            'dc:description "{0}"; '
            'dc:source PMID:21715656 ; '
            'OBAN:association_has_object MONARCH:APO_0000309APO_0000245 ; '
            'OBAN:association_has_predicate OBO:RO_0002200 ; '
            'OBAN:association_has_subject SGD:S000007268 . '
            'SGD:S000007268 rdfs:label "ATP6" ; '
            'RO:0002200 MONARCH:APO_0000309APO_0000245 . '
            'APO:0000020 rdfs:label "classical genetics" . '
            'PMID:21715656 a OBO:IAO_0000311 ; '
            'owl:sameAs SGD_REF:S000145858 . '
            'MONARCH:APO_0000309APO_0000245 rdfs:label '
            '"respiratory growth:decreased rate" ; '
            'rdfs:subClassOf UPHENO:0001001 . '
        )
        triples = triples_template.format(description)

        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(triples, sgd.graph)
        self.assertTrue(graphs_match)
class SGDTestCase(unittest.TestCase):
    """Build one SGD association from a sample record and check the graph."""

    def setUp(self):
        """Prepare the graph-comparison helper and one sample record."""
        sample_record = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': (
                'similar results obtained with atp6-L247R, and atp6-W136R, '
                'all '
                'corresponding to human NARP syndrome mutants'
            ),
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }
        self.test_util = TestUtils()
        self.test_set_1 = sample_record

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testSGDParser(self):
        """make_association() on the sample record yields the expected graph."""
        sgd = SGD('rdf_graph', True)
        sgd.graph = RDFGraph(True)

        record = self.test_set_1
        sgd.make_association(record)

        # The description text is produced by the source itself and spliced
        # into the expected triples below.
        description = sgd._make_description(record)
        triples_template = (
            ' :MONARCH_95158d413dd73476 a OBAN:association ; '
            'OBO:RO_0002558 OBO:APO_0000020 ; '
            'dc:description "{0}"; '
            'dc:source PMID:21715656 ; '
            'OBAN:association_has_object '
            'MONARCH:OBO_APO_0000309OBO_APO_0000245 ; '
            'OBAN:association_has_predicate OBO:RO_0002200 ; '
            'OBAN:association_has_subject SGD:S000007268 . '
            'SGD:S000007268 rdfs:label "ATP6" ; '
            'RO:0002200 MONARCH:OBO_APO_0000309OBO_APO_0000245 . '
            'APO:0000020 rdfs:label "classical genetics" . '
            'PMID:21715656 a OBO:IAO_0000311 ; '
            'owl:sameAs SGD_REF:S000145858 . '
            'MONARCH:OBO_APO_0000309OBO_APO_0000245 rdfs:label '
            '"respiratory growth:decreased rate" ; '
            'rdfs:subClassOf UPHENO:0001001 . '
        )
        triples = triples_template.format(description)

        # test exact contents of graph
        graphs_match = self.test_util.test_graph_equality(triples, sgd.graph)
        self.assertTrue(graphs_match)
class RGDTestCase(unittest.TestCase):
    """Build one RGD association from a sample record and check the graph."""

    def setUp(self):
        """Prepare the graph-comparison helper and one nested sample record."""
        self.test_util = TestUtils()
        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': {
                'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
                'type': 'IED',
                'with_support_from': []
            },
            'negated': False,
            'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line':
                'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
                'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
                '20061026\tRGD\t\t\n',
            'subject': {
                'fullname': 'endothelin receptor type A',
                'id': 'RGD:2535',
                'label': 'Ednra',
                'synonyms': [],
                'taxon': {'id': 'NCBITaxon:10116'},
                'type': 'gene'
            },
            'subject_extensions': [{'filler': '\n', 'property': 'isoform'}]
        }

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testRGDParser(self):
        """make_association() on the sample record yields the expected graph."""
        rgd = RGD('rdf_graph', True)
        rgd.graph = RDFGraph(True)
        # assertEqual reports the actual triple count if the graph is not
        # empty, which a bare assertTrue(... == 0) does not.
        self.assertEqual(len(list(rgd.graph)), 0)

        rgd.make_association(record=self.test_set_1)

        triples = (
            ' :MONARCH_b4650e8c3d865f11a1a5 a OBAN:association ; '
            'RO:0002558 ECO:0005611 ; '
            'dc:source RGDRef:1581841 ; '
            'OBAN:association_has_object OBO:MP_0003340 ; '
            'OBAN:association_has_predicate OBO:RO_0002200 ; '
            'OBAN:association_has_subject RGD:2535 ; '
            'pav:createdOn "2006-10-26" . '
            'RGD:2535 OBO:RO_0002200 MP:0003340 . '
            'RGDRef:1581841 a IAO:0000311 ; '
            'owl:sameAs PMID:12799311 . '
        )

        # dbg
        logger.debug(
            "Reference graph: %s",
            rgd.graph.serialize(format="turtle").decode("utf-8"))

        self.assertTrue(
            self.test_util.test_graph_equality(triples, rgd.graph))
def setUp(self):
    """Prepare two one-row interaction sets, column names and a protein map.

    Both rows share the same eight numeric scores and the same first
    protein; only the second protein id differs.
    """
    shared_scores = [0, 0, 0, 0, 300, 0, 150, 800]
    first_protein = '9606.ENSP00000000233'

    self.test_util = TestUtils()

    # Test set with two proteins from same species
    self.test_set_1 = [
        [first_protein, '9606.ENSP00000003084'] + shared_scores]

    # Test set with deprecated protein id
    self.test_set_2 = [
        [first_protein, '9606.ENSP00000006101'] + shared_scores]

    self.columns = [
        'protein1',
        'protein2',
        'neighborhood',
        'fusion',
        'cooccurence',
        'coexpression',
        'experimental',
        'database',
        'textmining',
        'combined_score',
    ]

    # NOTE(review): fetch_protein_gene_map presumably contacts a remote
    # service, which would make this fixture network-dependent -- confirm.
    self.protein_list = Ensembl(
        'rdf_graph', True).fetch_protein_gene_map('9606')
class CTDTestCase(unittest.TestCase):
    """Process one CTD interaction row and check the resulting graph."""

    def setUp(self):
        """Create a CTD source with a fresh graph and one sample row."""
        self.test_util = TestUtils()
        self.source = CTD('rdf_graph', True)
        self.source.graph = RDFGraph(True)
        self.test_row = [
            'Nicotine', 'D009538', '', 'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
            'OMIM:188890', 'therapeutic', '', '', '', '12345|56789'
        ]

    def tearDown(self):
        """Drop the reference to the CTD source."""
        self.source = None

    def test_therapeutic_relationship(self):
        """_process_interactions() on the row yields the expected graph."""
        # test that graph is empty; assertEqual reports the actual triple
        # count on failure, which assertTrue(... == 0) does not.
        self.assertEqual(len(list(self.source.graph)), 0)

        self.source._process_interactions(self.test_row)

        triples = (
            ' :MONARCH_b6c289df47cb72653f79 a OBAN:association ; '
            'RO:0002558 ECO:0000033 ; '
            'dcterms:source PMID:12345, PMID:56789 ; '
            'OBAN:association_has_object OMIM:188890 ; '
            'OBAN:association_has_predicate RO:0002606 ; '
            'OBAN:association_has_subject MESH:D009538 . '
            'MESH:D009538 a owl:Class ; '
            'rdfs:label "Nicotine" ; '
            'biolink:category biolink:ChemicalSubstance ; '
            'RO:0002606 OMIM:188890 . '
            'PMID:12345 a IAO:0000013 . '
            'PMID:56789 a IAO:0000013 . '
            'OMIM:188890 a owl:Class ; '
            'biolink:category biolink:DiseaseOrPhenotypicFeature . '
        )

        # test exact contents of graph
        self.assertTrue(
            self.test_util.test_graph_equality(triples, self.source.graph))
class CTDTestCase(unittest.TestCase):
    """Process one CTD interaction row and check the resulting graph."""

    def setUp(self):
        """Create a CTD source with a fresh graph and one sample row."""
        self.test_util = TestUtils()
        self.source = CTD('rdf_graph', True)
        self.source.graph = RDFGraph(True)
        self.test_row = [
            'Nicotine', 'D009538', '', 'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
            'OMIM:188890', 'therapeutic', '', '', '', '12345|56789'
        ]

    def tearDown(self):
        """Drop the reference to the CTD source."""
        self.source = None

    def test_therapeutic_relationship(self):
        """_process_interactions() on the row yields the expected graph."""
        # test that graph is empty; assertEqual reports the actual triple
        # count on failure, which assertTrue(... == 0) does not.
        self.assertEqual(len(list(self.source.graph)), 0)

        self.source._process_interactions(self.test_row)

        triples = (
            ' :MONARCH_b6c289df47cb72653f79 a OBAN:association ; '
            'RO:0002558 ECO:0000033 ; '
            'dc:source PMID:12345, PMID:56789 ; '
            'OBAN:association_has_object OMIM:188890 ; '
            'OBAN:association_has_predicate RO:0002606 ; '
            'OBAN:association_has_subject MESH:D009538 . '
            'MESH:D009538 a owl:Class ; '
            'rdfs:label "Nicotine" ; '
            'RO:0002606 OMIM:188890 . '
            'PMID:12345 a IAO:0000013 . '
            'PMID:56789 a IAO:0000013 . '
            'OMIM:188890 a owl:Class . '
        )

        # test exact contents of graph
        self.assertTrue(
            self.test_util.test_graph_equality(triples, self.source.graph))
class ReactomeTestCase(unittest.TestCase):
    """Add one component-pathway association and check the graph."""

    def setUp(self):
        """Prepare the helper, one sample tuple and an evidence-code map."""
        self.test_util = TestUtils()
        self.test_set_1 = (
            'ENSBTAP00000013354', 'R-BTA-3000480',
            'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
            'Scavenging by Class A Receptors', 'IEA', 'Bos taurus')
        # Maps the evidence code carried in the tuple to its ECO curie.
        self.gaf_eco = {"IEA": "ECO:0000501"}

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testEnsemblReactomeParser(self):
        """_add_component_pathway_association() yields the expected graph."""
        reactome = Reactome('rdf_graph', True)
        reactome.graph = RDFGraph(True)
        # assertEqual reports the actual triple count if the graph is not
        # empty, which a bare assertTrue(... == 0) does not.
        self.assertEqual(len(list(reactome.graph)), 0)

        # The pathway IRI and species name are not used by this test.
        gene, pathway_id, _, pathway_label, go_ecode, _ = self.test_set_1

        reactome._add_component_pathway_association(
            'ENSEMBL:' + gene, 'REACT:' + pathway_id, pathway_label,
            self.gaf_eco[go_ecode])

        triples = (
            ' ENSEMBL:ENSBTAP00000013354 RO:0002331 REACT:R-BTA-3000480 . '
            ':MONARCH_b582c188b7ec20016206 a OBAN:association ; '
            'OBO:RO_0002558 ECO:0000501 ; '
            'OBAN:association_has_object REACT:R-BTA-3000480 ; '
            'OBAN:association_has_predicate RO:0002331 ; '
            'OBAN:association_has_subject ENSEMBL:ENSBTAP00000013354 . '
            'REACT:R-BTA-3000480 a owl:Class ; '
            'rdfs:label "Scavenging by Class A Receptors" ; '
            'rdfs:subClassOf GO:0009987, PW:0000001 . '
        )

        self.assertTrue(
            self.test_util.test_graph_equality(triples, reactome.graph))
def test_gene_xref(self):
    """
    test FlyBase._process_gene_xref()

    For every entry in ALLELES the fixture is rebuilt, the raw directory is
    pointed at that allele's folder, and the resulting graph is compared
    with the allele's gene_xref.ttl reference.
    """
    for allele in ALLELES:
        with self.subTest(allele_id=allele):
            self.tearDownAndSetUp()
            self.flybase.rawdir = RAW_PATH + '/' + allele
            self.flybase._process_gene_xref(limit=None)
            produced_graph = self.flybase.graph

            turtle_text = produced_graph.serialize(
                format="turtle").decode("utf-8")
            LOG.debug("Reference graph: %s", turtle_text)

            reference_ttl = TTL_PATH + allele + '/' + 'gene_xref.ttl'
            graphs_match = TestUtils.test_graph_equality(
                reference_ttl, produced_graph)
            self.assertTrue(graphs_match)
def setUp(self):
    """Prepare the graph-comparison helper and one sample record.

    The record is a plain dict keyed by column title ('Allele', 'SGDID', ...)
    and is stored on the test case as ``test_set_1``.
    """
    details_text = (
        'similar results obtained with atp6-L247R, and atp6-W136R, all '
        'corresponding to human NARP syndrome mutants'
    )
    condition_text = (
        'elevated temperature (35 deg C)|nonfermentable carbon source'
    )

    self.test_util = TestUtils()
    self.test_set_1 = {
        'Allele': 'atp6-L183R (L183R)',
        'Chemical': 'glycerol',
        'Condition': condition_text,
        'Details': details_text,
        'Experiment Type': 'classical genetics',
        'Feature Name': 'Q0085',
        'Feature Type': 'ORF',
        'Gene Name': 'ATP6',
        'Mutant Type': 'reduction of function',
        'Phenotype': 'respiratory growth: decreased rate',
        'Reference': 'PMID: 21715656|SGD_REF: S000145858',
        'Reporter': ' ',
        'SGDID': 'S000007268',
        'Strain Background': 'Other',
    }
class ReactomeTestCase(unittest.TestCase):
    """Add one component-pathway association and check the graph."""

    def setUp(self):
        """Prepare the graph-comparison helper and one sample tuple."""
        self.test_util = TestUtils()
        self.test_set_1 = (
            'ENSBTAP00000013354', 'R-BTA-3000480',
            'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
            'Scavenging by Class A Receptors', 'IEA', 'Bos taurus')

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testEnsemblReactomeParser(self):
        """_add_component_pathway_association() yields the expected graph."""
        reactome = Reactome('rdf_graph', True)
        reactome.graph = RDFGraph(True)
        # assertEqual reports the actual triple count if the graph is not
        # empty, which a bare assertTrue(... == 0) does not.
        self.assertEqual(len(list(reactome.graph)), 0)

        eco_map = Reactome.get_eco_map(Reactome.map_files['eco_map'])

        # The pathway IRI and species name are not used by this test.
        gene, pathway_id, _, pathway_label, go_ecode, _ = self.test_set_1
        reactome._add_component_pathway_association(
            eco_map, gene, 'ENSEMBL', pathway_id, 'REACT', pathway_label,
            go_ecode)

        triples = (
            ' ENSEMBL:ENSBTAP00000013354 RO:0002331 REACT:R-BTA-3000480 . '
            ':MONARCH_b582c188b7ec20016206 a OBAN:association ; '
            'OBO:RO_0002558 ECO:0000501 ; '
            'OBAN:association_has_object REACT:R-BTA-3000480 ; '
            'OBAN:association_has_predicate RO:0002331 ; '
            'OBAN:association_has_subject ENSEMBL:ENSBTAP00000013354 . '
            'REACT:R-BTA-3000480 a owl:Class ; '
            'rdfs:label "Scavenging by Class A Receptors" ; '
            'rdfs:subClassOf GO:0009987, PW:0000001 . '
        )

        self.assertTrue(
            self.test_util.test_graph_equality(triples, reactome.graph))
class TestMyChemParser(unittest.TestCase):
    """Run MyChem.parse() on a JSON fixture and check the resulting graph."""

    def setUp(self):
        """Create a MyChem source pre-loaded with the first test record."""
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)
        # Replaces source.fetch()
        # The context manager closes the file even when json.load() raises.
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)
        self.source.drugbank_targets.append(self.test_data[0])
        self.source.drugcentral_interactors.append(self.test_data[0])

    def tearDown(self):
        """Drop the reference to the MyChem source."""
        self.source = None

    def test_parse(self):
        """parse() on the fixture must yield exactly the expected triples."""
        self.source.graph = RDFGraph(True)  # Reset graph
        # assertEqual reports the actual triple count if the graph is not
        # empty, which a bare assertTrue(... == 0) does not.
        self.assertEqual(len(list(self.source.graph)), 0)

        self.source.parse()
        triples = (
            ' UNII:46U771ERWK RO:0002606 SNOMED:386761002 ; '
            'rdfs:subClassOf CHEBI:23367 . '
            'SNOMED:386761002 rdfs:label "Local anesthesia" ; '
            'rdfs:subClassOf DOID:4 . '
        )

        # dbg
        logger.debug(
            "Reference graph: %s",
            self.source.graph.serialize(format="turtle").decode("utf-8"))

        self.assertTrue(
            self.test_util.test_graph_equality(triples, self.source.graph))
class EvidenceTestCase(unittest.TestCase):
    """Exercise MGI._process_evidence_view against a bundled evidence file."""

    def setUp(self):
        """Create an MGI source that reads from the test resources.

        _process_evidence_view uses self.rawdir to find the evidence file,
        so the default is overridden here to point at our test file.  The
        file name must match what that method expects - evidence_view.
        """
        this_dir = os.path.dirname(__file__)

        self.test_util = TestUtils()
        self.mgi = MGI('rdf_graph', True)
        self.mgi.rawdir = os.path.join(this_dir, 'resources/mgi')
        # Pre-register annotation key '6901981' under ':association', the
        # subject used in the expected triples below.
        self.mgi.idhash['annot']['6901981'] = ':association'

    def tearDown(self):
        """Drop the reference to the MGI source."""
        self.mgi = None

    def test_sex_specificity_model(self):
        """The evidence view must yield exactly the expected triples."""
        self.mgi.graph = RDFGraph(True)  # Reset graph
        self.mgi._process_evidence_view(limit=None)

        turtle_text = self.mgi.graph.serialize(
            format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", turtle_text)

        expected_triples = (
            ' :association RO:0002558 ECO:0000006 ; '
            'dc:source J:74619 ; '
            ':has_sex_specificity PATO:0000384 . '
            'J:74619 a IAO:0000310 . '
        )

        is_equal = self.test_util.test_graph_equality(
            expected_triples, self.mgi.graph)
        self.assertTrue(is_equal)
def setUp(self):
    """Prepare the graph-comparison helper and one nested sample record.

    ``test_set_1`` is a dict with nested 'evidence', 'object', 'relation'
    and 'subject' sub-dicts plus the raw tab-separated 'source_line'.
    """
    taxon_curie = 'NCBITaxon:10116'
    raw_line = (
        'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
        'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
        '20061026\tRGD\t\t\n'
    )

    self.test_util = TestUtils()
    self.test_set_1 = {
        'aspect': 'N',
        'date': '2006-10-26',
        'evidence': {
            'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
            'type': 'IED',
            'with_support_from': [],
        },
        'negated': False,
        'object': {'id': 'MP:0003340', 'taxon': taxon_curie},
        'provided_by': 'RGD',
        'qualifiers': [],
        'relation': {'id': None},
        'source_line': raw_line,
        'subject': {
            'fullname': 'endothelin receptor type A',
            'id': 'RGD:2535',
            'label': 'Ednra',
            'synonyms': [],
            'taxon': {'id': taxon_curie},
            'type': 'gene',
        },
        'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
    }
def main():
    """
    Command-line entry point: fetch, test, parse and write each source.

    Parses the command-line options, optionally runs a SPARQL query
    against the requested sources and exits, otherwise iterates over the
    comma separated ``--sources`` list.  For every source the class named
    in ``source_to_class_map`` is imported from ``dipper.sources.<name>``,
    instantiated, and then (depending on the flags) fetched, tested,
    parsed and written in the requested serialization.

    Exits with status 0 after a ``--query`` run and with status 1 when an
    unsupported ``--dest_fmt`` is given.  An unknown source name raises
    KeyError.
    """
    # Local import keeps this function self-contained; sys is only needed
    # for explicit exit codes.
    import sys

    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',  # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                    # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha',  # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfinslim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD',
        'mychem': 'MyChem'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int, help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify', help='ignore the verification step',
        action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet', help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)

    # TODO this should live in a global data file
    # and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
             ' comma delimited\n'
             'Implemented taxa per source\n'
             'NCBIGene: 9606,10090,7955\n'
             'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
             'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
             'UCSCBands: 9606\n'
             'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")
    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)
    parser.add_argument(
        '--version', '-v', help='version of source', type=str)

    args = parser.parse_args()

    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # Source classes that accept a tax_ids constructor argument
    taxa_supported = [  # these are not taxa
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl', 'ntriples', 'nt', 'nquads', 'nq',
        'rdfxml', 'xml', 'notation3', 'n3', 'raw']
    # Long/alternate spellings and the canonical name they are rewritten to
    format_aliases = {
        'ttl': 'turtle',
        'ntriples': 'nt',
        'nq': 'nquads',
        'xml': 'rdfxml',
        'notation3': 'n3',
    }

    if args.quiet:
        logging.getLogger().setLevel(logging.WARNING)
    elif args.debug:
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        logging.getLogger().setLevel(logging.INFO)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # The map stores class *names* (strings); calling the looked-up
            # value, as was done before, raises TypeError.
            mysource = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(mysource)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, mysource)

            # NOTE(review): the class itself (not an instance) is handed to
            # TestUtils here, as in the original code -- confirm that is
            # what these helpers expect.
            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        sys.exit(0)

    # run initial tests
    # (test_suite is not defined in this function; it has to be provided
    # at module level)
    if not (args.no_verify or args.skip_tests):
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in formats_supported:
        args.dest_fmt = format_aliases.get(args.dest_fmt, args.dest_fmt)
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # A rejected option is a failure: report it through the exit status
        sys.exit(1)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring elapsed intervals.
    timer = time.perf_counter

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)
        mysource = None

        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)

        if args.parse_only is False:
            start_fetch = timer()
            mysource.fetch(args.force)
            end_fetch = timer()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if not (args.no_verify or args.skip_tests):
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if args.test_only is False and args.fetch_only is False:
            start_parse = timer()
            mysource.parse(args.limit)
            end_parse = timer()
            logger.info("Parsing time: %d sec", end_parse-start_parse)

            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = timer()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(
                    mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = timer()
                logger.info("Property axioms added: %d sec",
                            end_axiom_exp-start_axiom_exp)

            start_write = timer()
            mysource.write(fmt=args.dest_fmt)
            end_write = timer()
            logger.info("Writing time: %d sec", end_write-start_write)

            # if args.no_verify is not True:
            #    status = mysource.verify()
            #    if status is not True:
            #        logger.error(
            #            'Source %s did not pass verification tests.', source)
            #        exit(1)
            # else:
            #    logger.info('skipping verification step')
            logger.info('***** Finished with %s *****', source)

    # load configuration parameters
    # for example, keys

    logger.info("All done.")
class TestGwasHaplotypeModel(unittest.TestCase):
    """
    Check the RDF model built for a multi-SNP (haplotype) row
    of sample GWAS catalog data
    """

    def setUp(self):
        """Create a GWASCatalog source, a fresh graph and one sample row."""
        self.test_util = TestUtils()
        self.source = GWASCatalog('rdf_graph', True)
        self.source.graph = RDFGraph(True)
        self.test_data = dict([
            ('snp_label',
             'rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?'),
            ('chrom_num', '9;9;9;9'),
            ('chrom_pos', '36998996;37002118;37000690;36997420'),
            ('context',
             'intron_variant; intron_variant; intron_variant; intron_variant'),
            ('allele_freq', 'NR'),
            ('trait', 'Intelligence'),
            ('trait_uri', 'http://www.ebi.ac.uk/efo/EFO_0004337'),
            ('pvalue', '0.00000004'),
            ('merged', '0'),
            ('snp_id_current', ''),
            ('mapped_gene', 'PAX5; PAX5; PAX5; PAX5'),
            ('snp_gene_nums', ''),
            ('upstream_gene_num', '107986179'),
            ('downstream_gene_num', '107986180'),
            ('init_sample_desc',
             '656 European ancestry individuals from ADHD families'),
            ('replicated_sample_desc', 'NA'),
            ('platform', 'Illumina [795637]'),
            ('pubmed', '22449649'),
        ])

    def tearDown(self):
        """Release the source created in setUp."""
        self.source = None

    def test_snp_model(self):
        """
        Compare the graph written by _process_haplotype() with reference
        triples.  The call has the shape
        self._process_haplotype(
            variant_curie, strongest_snp_risk_allele,
            chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology)
        """
        row = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        # only the curie is needed below; the variant type is discarded
        variant_curie, _variant_type = \
            self.source._get_curie_and_type_from_id(row['snp_label'])

        so_url = self.source.files['so']['url']
        so_ontology = RDFGraph()
        LOG.info("Loading SO ontology in separate rdf graph")
        so_ontology.parse(so_url, format='xml')
        so_ontology.bind_all_namespaces()
        LOG.info("Finished loading SO ontology")

        self.source._process_haplotype(
            variant_curie,
            row['snp_label'],
            row['chrom_num'],
            row['chrom_pos'],
            row['context'],
            row['allele_freq'],
            row['mapped_gene'],
            so_ontology)

        triples = """
        :haplotype_bb627b1f64039b0f751a a OBO:GENO_0000871 ;
            rdfs:label "rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?" ;
            OBO:GENO_0000382 dbSNP:rs1329573,
                dbSNP:rs3758171,
                dbSNP:rs3824344,
                dbSNP:rs7020413 ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs1329573 a OBO:SO_0000694,
                SO:0001627 ;
            rdfs:label "rs1329573-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3758171 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3758171-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs3824344 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs3824344-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        dbSNP:rs7020413 a OBO:SO_0000694,
                OBO:SO_0001627 ;
            rdfs:label "rs7020413-?" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> ;
            OBO:SO_0001627 HGNC:8619 ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420-36997420-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996-36998996-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690-37000690-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118-37002118-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36997420> a faldo:Position ;
            faldo:position 36997420 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-36998996> a faldo:Position ;
            faldo:position 36998996 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37000690> a faldo:Position ;
            faldo:position 37000690 ;
            faldo:reference OBO:CHR_GRCh38chr9 .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr9-37002118> a faldo:Position ;
            faldo:position 37002118 ;
            faldo:reference OBO:CHR_GRCh38chr9 .
        """

        # dbg
        # LOG.debug(
        #    "Reference graph: %s",
        #    self.source.graph.serialize(format="turtle").decode("utf-8"))

        # Does not seem to acknowledge these constant triples
        graphs_match = self.test_util.test_graph_equality(
            triples, self.source.graph)
        self.assertTrue(graphs_match)
class RGDTestCase(unittest.TestCase):
    """Check the association model the RGD source builds from one record."""

    def setUp(self):
        """Create the comparison helper and one pre-parsed RGD record."""
        self.test_util = TestUtils()

        evidence = {
            'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
            'type': 'IED',
            'with_support_from': [],
        }
        phenotype = {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'}
        raw_line = (
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n'
        )
        gene = {
            'fullname': 'endothelin receptor type A',
            'id': 'RGD:2535',
            'label': 'Ednra',
            'synonyms': [],
            'taxon': {'id': 'NCBITaxon:10116'},
            'type': 'gene',
        }

        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': evidence,
            'negated': False,
            'object': phenotype,
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line': raw_line,
            'subject': gene,
            'subject_extensions': [{'filler': '\n', 'property': 'isoform'}],
        }

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testRGDParser(self):
        """Feed the record to make_association() and compare the graph."""
        rgd = RGD('rdf_graph', True)
        rgd.graph = RDFGraph(True)
        self.assertTrue(len(list(rgd.graph)) == 0)

        rgd.make_association(record=self.test_set_1)

        triples = """
        :MONARCH_b4650e8c3d865f11a1a5 a OBAN:association ;
            RO:0002558 ECO:0005611 ;
            dcterms:source RGDRef:1581841 ;
            OBAN:association_has_object OBO:MP_0003340 ;
            OBAN:association_has_predicate OBO:RO_0002200 ;
            OBAN:association_has_subject RGD:2535 ;
            pav:createdOn "2006-10-26" .

        RGD:2535 OBO:RO_0002200 MP:0003340 .

        RGDRef:1581841 a IAO:0000311 ;
            owl:sameAs PMID:12799311 .
        """

        # dbg
        produced_turtle = rgd.graph.serialize(format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", produced_turtle)

        graphs_match = self.test_util.test_graph_equality(triples, rgd.graph)
        self.assertTrue(graphs_match)
class EvidenceProvenanceTestCase(unittest.TestCase):
    """Check the evidence, provenance and assertion models of IMPC."""

    def setUp(self):
        """
        Prepare one IMPC result row plus the curies the tests pass around.

        test_set_1 holds the 28 columns of a single row; the trailing
        comment on each value gives its column number and header name.
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        self.test_set_1 = (
            'MGI:1920145',               # 01 marker_accession_id
            'Setd5',                     # 02 marker_symbol
            'WTSI',                      # 03 phenotyping_center
            'MEFW',                      # 04 colony_raw
            'male',                      # 05 sex
            'heterozygote',              # 06 zygosity
            'MGI:4432631',               # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',   # 08 allele_symbol
            # 09 allele_name
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',               # 10 strain_accession_id
            'C57BL/6N',                  # 11 strain_name
            'MGP',                       # 12 project_name
            # 13 project_fullname
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',       # 14 pipeline_name
            'MGP_001',                   # 15 pipeline_stable_id
            'MGP_XRY_001',               # 16 procedure_stable_id
            'X-ray',                     # 17 procedure_name
            'IMPC_XRY_008_001',          # 18 parameter_stable_id
            'Number of ribs right',      # 19 parameter_name
            'MP:0005390',                # 20 top_level_mp_term_id
            'skeleton phenotype',        # 21 top_level_mp_term_name
            'MP:0000480',                # 22 mp_term_id
            'increased rib number',      # 23 mp_term_name
            '1.637023E-010',             # 24 p_value
            '',                          # 25 percentage_change
            '8.885439E-007',             # 26 effect_size
            # 27 statistical_method
            'Wilcoxon rank sum test with continuity correction',
            'IMPC'                       # 28 resource_name
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_util = CurieUtil(curie_map.get())
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)  # Reset graph

        # Test graph is empty
        self.assertTrue(len(list(impc.graph)) == 0)

        # columns 24-26 of the sample row
        p_value, percentage_change, effect_size = self.test_set_1[23:26]

        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change,
            effect_size, self.study_curie)

        triples = """
        :MONARCH_test_association SEPIO:0000007 <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> .

        <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> a ECO:0000015 ;
            SEPIO:0000084 <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888>,
                <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> ;
            SEPIO:0000085 <https://monarchinitiative.org/.well-known/genid/study> .

        <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> a OBI:0000175 ;
            RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
            STATO:0000129 1.637023e-10 .

        <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888> a STATO:0000085 ;
            RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ;
            STATO:0000129 "8.885439E-007" .
        """

        graphs_match = self.test_util.test_graph_equality(triples, impc.graph)
        self.assertTrue(graphs_match)

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertTrue(len(list(impc.graph)) == 0)

        sample = self.test_set_1
        phenotyping_center, colony = sample[2:4]
        # project_fullname is unpacked only to keep the slice aligned
        (project_name, _project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = sample[11:19]
        statistical_method, resource_name = sample[26:28]

        impc._add_study_provenance(
            phenotyping_center, colony, project_name, pipeline_name,
            pipeline_stable_id, procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name, statistical_method,
            resource_name)

        # dbg
        LOG.info(
            "Provenance graph as turtle:\n%s\n",
            impc.graph.serialize(format="turtle").decode("utf-8"))

        triples = """
        <https://monarchinitiative.org/.well-known/genid/b0b26361b8687b5ad9ef> a owl:NamedIndividual ;
            rdfs:label "MEFW" .

        <https://monarchinitiative.org/.well-known/genid/b6f14f763c8d0629360e> a OBI:0000471 ;
            BFO:0000050 <http://www.sanger.ac.uk/science/data/mouse-genomes-project>,
                IMPC-pipe:MGP_001 ;
            BFO:0000051 STATO:0000076,
                IMPC-proc:MGP_XRY_001 ;
            SEPIO:0000017 <http://www.sanger.ac.uk/> ;
            SEPIO:0000114 <https://www.mousephenotype.org/impress/OntologyInfo?action=list&procID=MGP_XRY_001#IMPC_XRY_008_001> .

        <http://www.sanger.ac.uk/> a foaf:organization ;
            rdfs:label "WTSI" .

        <http://www.sanger.ac.uk/science/data/mouse-genomes-project> a VIVO:Project ;
            rdfs:label "MGP" .

        <https://www.mousephenotype.org/impress/OntologyInfo?action=list&procID=MGP_XRY_001#IMPC_XRY_008_001> a owl:NamedIndividual ;
            rdfs:label "Number of ribs right (X-ray)" .

        IMPC-pipe:MGP_001 a owl:NamedIndividual ;
            rdfs:label "MGP Select Pipeline" .

        IMPC-proc:MGP_XRY_001 a owl:NamedIndividual ;
            rdfs:label "X-ray" .
        """

        # dbg
        LOG.info(
            "Reference graph: %s",
            impc.graph.serialize(format="turtle").decode("utf-8"))

        graphs_match = self.test_util.test_graph_equality(triples, impc.graph)
        self.assertTrue(graphs_match)

    def test_assertion_model(self):
        """
        Functional test for _add_assertion_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)
        self.assertTrue(len(list(impc.graph)) == 0)

        impc._add_assertion_provenance(self.assoc_curie, self.evidence_curie)

        triples = """
        MONARCH:test_association SEPIO:0000015 <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> .

        <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> a SEPIO:0000001 ;
            SEPIO:0000018 <https://www.mousephenotype.org/> ;
            SEPIO:0000111 <https://monarchinitiative.org/.well-known/genid/evidence> .

        <https://www.mousephenotype.org/> a foaf:organization ;
            rdfs:label "International Mouse Phenotyping Consortium" .
        """

        # dbg
        LOG.info(
            "Assertion graph:\n %s\n",
            impc.graph.serialize(format="turtle").decode("utf-8"))

        graphs_match = self.test_util.test_graph_equality(triples, impc.graph)
        self.assertTrue(graphs_match)

    @unittest.skip("Timeouts on travis")
    def test_random_data_set(self):
        """
        Take one row of the downloaded dataset and run it through the
        evidence and provenance functions to test the output.
        The line number is hardcoded, but theoretically any line
        should work.
        """
        line_to_test = 1129
        impc = IMPC('rdf_graph', False)  # Not Skolem
        self.test_set_N = []

        # fetch file
        # impc.fetch(True)
        file_path = '/'.join((impc.rawdir, impc.files['all']['file']))
        with gzip.open(file_path, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # rows are counted from 1
            for count, row in enumerate(filereader, start=1):
                if count < line_to_test:
                    continue
                if count == line_to_test:
                    self.test_set_N = row
                    continue
                LOG.info("stopped at line:\t%s\n", count)
                break

        # Some DRY violation with the above tests
        sample = self.test_set_N
        phenotyping_center, colony = sample[2:4]
        (project_name, _project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = sample[11:19]
        statistical_method, resource_name = sample[26:28]
        p_value, percentage_change, effect_size = sample[23:26]

        # adding evidence
        impc._add_evidence(
            self.assoc_curie, self.eco_id, p_value, percentage_change,
            effect_size, self.study_curie)

        # adding study
        impc._add_study_provenance(
            phenotyping_center, colony, project_name, pipeline_name,
            pipeline_stable_id, procedure_stable_id, procedure_name,
            parameter_stable_id, parameter_name, statistical_method,
            resource_name, line_to_test)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of and has_part links to individuals
        # which results in ambiguity = hard to test

        # dbg
        LOG.info(
            "Row %i graph as ntriples:\n%s\n",
            line_to_test,
            impc.graph.serialize(format="ntriples").decode("utf-8"))

        sparql_query = """
            SELECT *
            WHERE {
                ?assoc SEPIO:0000007 ?evidenceline .
                ?evidenceline a ECO:0000015 ;
                    SEPIO:0000085 _:study .
                ?study a OBI:0000471 ;
                    SEPIO:0000114 ?param ;
                    SEPIO:0000017 ?agent .
            }
        """
        sparql_output = impc.graph.query(sparql_query)

        LOG.info(
            "Test that query for row %i passes and returns one row",
            int(line_to_test))
        # print("Sparql Output: %s\n", list(sparql_output) )
        # it is an array with one list with five vars in it
        result_rows = list(sparql_output)
        self.assertEqual(len(result_rows), 1)

    def tearDown(self):
        """Nothing to clean up."""
        return
def setUp(self):
    """
    Prepare one IMPC result row plus the curies the tests pass around.

    test_set_1 holds the 28 columns of a single row; the trailing
    comment on each value gives its column number and header name.
    """
    self.test_util = TestUtils()
    self.assoc_curie = 'MONARCH:test_association'
    self.eco_id = 'ECO:0000015'

    self.test_set_1 = (
        'MGI:1920145',               # 01 marker_accession_id
        'Setd5',                     # 02 marker_symbol
        'WTSI',                      # 03 phenotyping_center
        'MEFW',                      # 04 colony_raw
        'male',                      # 05 sex
        'heterozygote',              # 06 zygosity
        'MGI:4432631',               # 07 allele_accession_id
        'Setd5<tm1a(EUCOMM)Wtsi>',   # 08 allele_symbol
        # 09 allele_name
        'targeted mutation 1a, Wellcome Trust Sanger Institute',
        'MGI:2159965',               # 10 strain_accession_id
        'C57BL/6N',                  # 11 strain_name
        'MGP',                       # 12 project_name
        # 13 project_fullname
        'Wellcome Trust Sanger Institute Mouse Genetics Project',
        'MGP Select Pipeline',       # 14 pipeline_name
        'MGP_001',                   # 15 pipeline_stable_id
        'MGP_XRY_001',               # 16 procedure_stable_id
        'X-ray',                     # 17 procedure_name
        'IMPC_XRY_008_001',          # 18 parameter_stable_id
        'Number of ribs right',      # 19 parameter_name
        'MP:0005390',                # 20 top_level_mp_term_id
        'skeleton phenotype',        # 21 top_level_mp_term_name
        'MP:0000480',                # 22 mp_term_id
        'increased rib number',      # 23 mp_term_name
        '1.637023E-010',             # 24 p_value
        '',                          # 25 percentage_change
        '8.885439E-007',             # 26 effect_size
        # 27 statistical_method
        'Wilcoxon rank sum test with continuity correction',
        'IMPC'                       # 28 resource_name
    )

    # Generate test curies, these are otherwise generated
    # within _add_evidence() and _add_study_provenance()
    # these blank nodes are hardcoded as NOT Skolemized ...
    self.study_curie = "_:study"
    self.evidence_curie = "_:evidence"

    # IRIs for testing sparql output
    curie_util = CurieUtil(curie_map.get())
    self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))
class StringTestFakeData(unittest.TestCase):
    """Run StringDB protein-link processing on small hand-made rows."""

    def setUp(self):
        """Create two one-row data sets and fetch the protein-gene map."""
        self.test_util = TestUtils()

        # Test set with two proteins from same species
        self.test_set_1 = [[
            '9606.ENSP00000000233', '9606.ENSP00000003084',
            0, 0, 0, 0, 300, 0, 150, 800]]

        # Test set with deprecated protein id
        self.test_set_2 = [[
            '9606.ENSP00000000233', '9606.ENSP00000006101',
            0, 0, 0, 0, 300, 0, 150, 800]]

        self.columns = [
            'protein1', 'protein2', 'neighborhood', 'fusion', 'cooccurence',
            'coexpression', 'experimental', 'database', 'textmining',
            'combined_score']

        self.protein_list = Ensembl(
            'rdf_graph', True).fetch_protein_gene_map('9606')

    def tearDown(self):
        """Nothing to clean up."""
        return

    def testFakeDataSet1(self):
        """Two mapped proteins must yield one gene-to-gene triple."""
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph(True)
        self.assertEqual(len(string_db.graph), 0)

        prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map('9606')
        # prefix every gene id in place
        for gene_ids in prot_map.values():
            for idx, gene in enumerate(gene_ids):
                gene_ids[idx] = "ENSEMBL:{}".format(gene)

        fetched_msg = "Finished fetching ENSP IDs, fetched {} proteins"
        print(fetched_msg.format(len(prot_map.keys())))

        dataframe = pd.DataFrame(data=self.test_set_1, columns=self.columns)
        string_db._process_protein_links(dataframe, prot_map, '9606')

        # g1 <interacts with> g2
        triples = """
        ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 .
        """

        graphs_match = self.test_util.test_graph_equality(
            triples, string_db.graph)
        self.assertTrue(graphs_match)

    def testFakeDataSet2(self):
        """
        The data set contains a deprecated protein ID that we expect to be
        filtered out by ensembl biomart
        We test that this returns an empty graph
        :return:
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph()
        self.assertEqual(len(string_db.graph), 0)

        dataframe = pd.DataFrame(data=self.test_set_2, columns=self.columns)
        string_db._process_protein_links(
            dataframe, self.protein_list, '9606')

        self.assertEqual(len(string_db.graph), 0)
class TestGwasSNPModel(unittest.TestCase):
    """
    Check the RDF model of a SNP to trait association
    built from one row of sample GWAS catalog data
    """

    def setUp(self):
        """Create a GWASCatalog source, a fresh graph and one sample row."""
        self.test_util = TestUtils()
        self.source = GWASCatalog('rdf_graph', True)
        self.source.graph = RDFGraph(True)  # Reset graph
        self.source.graph.bind_all_namespaces()
        self.test_data = dict([
            ('snp_label', 'rs1491921-C'),
            ('chrom_num', '5'),
            ('chrom_pos', '21259029'),
            ('context', 'intergenic_variant'),
            ('allele_freq', '0.013'),
            ('trait', 'Diisocyanate-induced asthma'),
            ('trait_uri',
             'http://www.ebi.ac.uk/efo/EFO_0006995, '
             'http://www.ebi.ac.uk/efo/EFO_0003949'),
            ('pvalue', '0.0000007'),
            ('merged', '0'),
            ('snp_id_current', '1491921'),
            ('mapped_gene', 'LOC102723561 - GUSBP1'),
            ('snp_gene_nums', ''),
            ('upstream_gene_num', '107986179'),
            ('downstream_gene_num', '107986180'),
            ('init_sample_desc',
             '74 European ancestry cases, 824 European ancestry controls'),
            ('replicated_sample_desc', 'NA'),
            ('platform', 'Illumina [1556551]'),
            ('pubmed', '25918132'),
        ])

    def tearDown(self):
        """Drop the references held on the test case."""
        self.source = None
        self.efo_ontology = None

    def test_snp_type_resolution(self):
        """
        Given the label: rs1491921-C
        return dbSNP:rs1491921, snp
        """
        self.assertTrue(len(list(self.source.graph)) == 0)

        resolved = self.source._get_curie_and_type_from_id(
            self.test_data['snp_label'])
        variant_curie, variant_type = resolved

        self.assertEqual(variant_curie, "dbSNP:rs1491921")
        self.assertEqual(variant_type, 'snp')

    def test_snp_model(self):
        """
        Test output model of _add_snp_to_graph()
        """
        row = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        variant_curie, _variant_type = \
            self.source._get_curie_and_type_from_id(row['snp_label'])

        self.source._add_snp_to_graph(
            variant_curie,
            row['snp_label'],
            row['chrom_num'],
            row['chrom_pos'],
            row['context'],
            row['allele_freq'])

        triples = """
        dbSNP:rs1491921 a OBO:SO_0000694,
                OBO:SO_0001628 ;
            rdfs:label "rs1491921-C" ;
            faldo:location <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029-21259029-Region> ;
            OBO:RO_0002162 OBO:NCBITaxon_9606 ;
            dc:description "0.013 [risk allele frequency]" .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029-21259029-Region> a faldo:Region ;
            faldo:begin <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> ;
            faldo:end <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> .

        <https://monarchinitiative.org/.well-known/genid/GRCh38chr5-21259029> a faldo:Position ;
            faldo:position 21259029 ;
            faldo:reference OBO:CHR_GRCh38chr5 .
        """

        # To debug
        # print(self.source.graph.serialize(format="turtle").decode("utf-8"))
        # self.assertTrue(False)

        # dbg
        # LOG.debug(
        #    "Reference graph: %s",
        #    self.source.graph.serialize(format="turtle").decode("utf-8"))

        graphs_match = self.test_util.test_graph_equality(
            triples, self.source.graph)
        self.assertTrue(graphs_match)

    def test_snp_gene_relation(self):
        """
        test the _add_snp_gene_relation function
        :return:
        """
        row = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        variant_curie, _variant_type = \
            self.source._get_curie_and_type_from_id(row['snp_label'])

        self.source._add_snp_gene_relation(
            variant_curie,
            row['snp_gene_nums'],
            row['upstream_gene_num'],
            row['downstream_gene_num'])

        triples = """
        dbSNP:rs1491921 OBO:RO_0002528 NCBIGene:107986180 ;
            OBO:RO_0002529 NCBIGene:107986179 .
        """

        graphs_match = self.test_util.test_graph_equality(
            triples, self.source.graph)
        self.assertTrue(graphs_match)

    def test_deprecated_snp(self):
        """
        test the _add_deprecated_snp
        :return:
        """
        row = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        # fake data
        snp_id_current = '12345'
        merged = '1'

        variant_curie, _variant_type = \
            self.source._get_curie_and_type_from_id(row['snp_label'])

        self.source._add_deprecated_snp(
            variant_curie, snp_id_current, merged,
            row['chrom_num'], row['chrom_pos'])

        triples = """
        dbSNP:rs1491921 a owl:NamedIndividual ;
            OBO:IAO_0100001 dbSNP:rs12345 ;
            owl:deprecated true .

        dbSNP:rs12345 MONARCH:cliqueLeader true .
        """

        graphs_match = self.test_util.test_graph_equality(
            triples, self.source.graph)
        self.assertTrue(graphs_match)

    def test_snp_trait_association(self):
        """
        test the _add_variant_trait_association
        :return:
        """
        row = self.test_data
        self.assertTrue(len(list(self.source.graph)) == 0)

        efo_url = self.source.files['efo']['url']
        efo_ontology = RDFGraph()
        LOG.info("Loading EFO ontology in separate rdf graph")
        efo_ontology.parse(efo_url, format='xml')
        efo_ontology.bind_all_namespaces()
        LOG.info("Finished loading EFO ontology")

        variant_curie, _variant_type = \
            self.source._get_curie_and_type_from_id(row['snp_label'])

        description = self.source._make_description(
            row['trait'],
            row['init_sample_desc'],
            row['replicated_sample_desc'],
            row['platform'],
            row['pvalue'])

        self.source._add_variant_trait_association(
            variant_curie,
            row['trait_uri'],
            efo_ontology,
            row['pubmed'],
            description)

        # {0} is filled with the description generated above
        triples = """
        MONARCH:bffc7a930c08cc8fe931 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0003949 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        MONARCH:bff9b97458d67ed7f517 a OBAN:association ;
            dc:description "{0}" ;
            OBO:RO_0002558 OBO:ECO_0000213 ;
            dc:source PMID:25918132 ;
            OBAN:association_has_object EFO:0006995 ;
            OBAN:association_has_predicate RO:0003304 ;
            OBAN:association_has_subject dbSNP:rs1491921 .

        EFO:0003949 a owl:Class ;
            rdfs:label "eye color"^^xsd:string ;
            rdfs:subClassOf UPHENO:0001001 .

        dbSNP:rs1491921 RO:0003304 EFO:0003949,
                EFO:0006995 .

        PMID:25918132 a OBO:IAO_0000013 .
        """.format(description)

        # dbg
        # LOG.debug(
        #    "Reference graph: %s",
        #    self.source.graph.serialize(format="turtle").decode("utf-8"))

        graphs_match = self.test_util.test_graph_equality(
            triples, self.source.graph)
        self.assertTrue(graphs_match)
class UDPTestCase(unittest.TestCase):
    """
    Test UDP parser: rsid lookups through _get_rs_id() and the graphs
    produced by the patient phenotype and patient variant parsers.
    """

    def setUp(self):
        self.test_util = TestUtils()

    def tearDown(self):
        pass

    @staticmethod
    def _as_mock_file(lines):
        """
        Build the mock_open object that is handed to the UDP parsers,
        backed by a MagicMock that iterates over ``lines``.
        """
        handle = MagicMock()
        handle.__iter__.return_value = iter(lines)
        return mock_open(mock=handle)

    def test_dbsnp_indel_resolution(self):
        """
        unit test for _get_rs_id()

        One rsid may list several different insertion sequences; check
        that an indel given with one of them still resolves.  Row used:
        15  51766637  374313651  in-del  -/A/AA/AAA/AAAA/CAAA/TAAA
        """
        parser = UDP('rdf_graph', True)
        rsid_lookup = parser._parse_rs_map_file(parser.map_files['dbsnp_map'])
        indel = dict(
            build='hg19',
            chromosome='chr15',
            reference_allele='-',
            variant_allele='AAAA',
            position='51766637',
        )
        self.assertEqual(
            parser._get_rs_id(indel, rsid_lookup, 'indel'), '374313651')

    def test_dbsnp_snp_mapping(self):
        """
        unit test for _get_rs_id()

        Check that a plain snp found in the dbsnp map resolves to its rsid
        """
        parser = UDP('rdf_graph', True)
        rsid_lookup = parser._parse_rs_map_file(parser.map_files['dbsnp_map'])
        snp = dict(
            build='hg19',
            chromosome='chr15',
            reference_allele='A',
            variant_allele='C',
            position='54624219',
        )
        self.assertEqual(
            parser._get_rs_id(snp, rsid_lookup, 'snp'), '755532609')

    def test_patient_phenotype_model(self):
        """
        functional test for _parse_patient_phenotypes()
        """
        parser = UDP('rdf_graph', True)
        parser.graph = RDFGraph(True)

        # the freshly assigned graph must not hold any triples yet
        triple_count = len(list(parser.graph))
        self.assertTrue(triple_count == 0)

        phenotype_rows = [
            'patient_1\tHP:000001\tyes',
            'patient_1\tHP:000002\tno',
        ]
        parser._parse_patient_phenotypes(self._as_mock_file(phenotype_rows))

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected = (
            ' :patient_1 a foaf:Person ;'
            ' rdfs:label "patient_1" ;'
            ' RO:0002200 DOID:4, HP:000001 . \n'
        )
        graphs_match = self.test_util.test_graph_equality(
            expected, parser.graph)
        self.assertTrue(graphs_match)

    def test_variant_model(self):
        """
        functional test for _parse_patient_variants()
        """
        parser = UDP('rdf_graph', True)
        parser.graph = RDFGraph(True)

        # the freshly assigned graph must not hold any triples yet
        triple_count = len(list(parser.graph))
        self.assertTrue(triple_count == 0)

        fields = [
            'patient_1', 'family_1', '1', 'HG19', '155230432', 'G', 'A',
            'Maternal', 'Biallelic', 'Non-synonymous;DOWNSTREAM', 'CLK2',
            '', '', '', '', '', '', '',
            'Compound heterozygous', 'Heterozygous', '', '0.002747253', '',
        ]
        variant_row = "\t".join(fields)
        parser._parse_patient_variants(self._as_mock_file([variant_row]))

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected = (
            ' :patient_1 GENO:0000222'
            ' <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a> .'
            ' <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f>'
            ' a SO:0001059 ;'
            ' rdfs:label "hg19chr1(CLK2):g.155230432G>A" ;'
            ' GENO:0000418 HGNC:2069 ;'
            ' RO:0002162 NCBITaxon:9606 ;'
            ' owl:sameAs dbSNP:rs11557757 .'
            ' <https://monarchinitiative.org/.well-known/genid/ba5f377fc8c95d4a6d7a>'
            ' a GENO:0000000 ;'
            ' rdfs:label "patient_1 genotype" ;'
            ' GENO:0000382'
            ' <https://monarchinitiative.org/.well-known/genid/b41e8da0787b45e24c4f> . '
        )
        graphs_match = self.test_util.test_graph_equality(
            expected, parser.graph)
        self.assertTrue(graphs_match)
def main():
    """
    Command line entry point: fetch, test, parse and write each source
    named with ``-s/--sources``.

    Behaviour driven by the parsed arguments:

    * ``--query``: for every requested source, check the SPARQL query
      syntax and load the source's turtle through TestUtils, then print
      the query result and exit with status 0.
    * otherwise, unless ``--no_verify`` is given, the module level
      ``test_suite`` is run once up front and each source's own test suite
      is run before it is parsed.
    * ``--parse_only`` skips fetching; ``--fetch_only`` and ``--test_only``
      skip the parse/write step.
    * ``-t/--taxon`` ids are only handed to the sources listed in
      ``taxa_supported``.

    An unknown source name raises KeyError from the source lookup.
    """
    # function-scope import: only needed to terminate the --query branch
    import sys

    source_to_class_map = {
        'hpoa': HPOAnnotations,  # ~3 min
        'zfin': ZFIN,
        'omim': OMIM,  # full file takes ~15 min, due to required throttling
        'biogrid': BioGrid,  # interactions file takes <10 minutes
        'mgi': MGI,
        'impc': IMPC,
        # this takes a very long time,
        # ~1hr to map 7 species-worth of associations
        'panther': Panther,
        'ncbigene': NCBIGene,  # takes about 4 minutes to process 2 species
        'ucscbands': UCSCBands,
        'ctd': CTD,
        'genereviews': GeneReviews,
        'eom': EOM,  # Takes about 5 seconds.
        'coriell': Coriell,
        'clinvar': ClinVar,
        'monochrom': Monochrom,
        'kegg': KEGG,
        'animalqtldb': AnimalQTLdb,
        'ensembl': Ensembl,
        'hgnc': HGNC,
        'orphanet': Orphanet
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion'
                    ' Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int, help='limit number of rows')
    # (the original statement ended in a stray comma, which wrapped the
    # call in a discarded one-element tuple)
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument(
        '-f', '--force', action='store_true',
        help='force re-download of files')
    parser.add_argument(
        '--no_verify', help='ignore the verification step',
        action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet', help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    # BNodes can't be visualized in Protege,
    # so you can materialize them for testing purposes with this flag
    parser.add_argument(
        '-nb', '--no_bnodes',
        help="convert blank nodes into identified nodes",
        action="store_true")

    # TODO this preconfiguration should probably live in the conf.json,
    # and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers, comma delimited\n'
             'Implemented taxa per source\n'
             'NCBIGene: 9606,10090,7955\n'
             'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
             'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
             'UCSCBands: 9606')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    args = parser.parse_args()

    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(taxon) for taxon in args.taxon.split(',')]

    # only these source classes are constructed with the taxon filter
    taxa_supported = [Panther, NCBIGene, BioGrid, UCSCBands]

    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.no_bnodes:
        logger.info("Will materialize all BNodes into BASE space")

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            mysource = source_to_class_map[source]()
            test_query.check_query_syntax(args.query, mysource)
            test_query.load_graph_from_turtle(mysource)

        print(test_query.query_graph(args.query, True))
        # sys.exit rather than the exit() helper, which the site module
        # only installs for interactive use
        sys.exit(0)

    # run initial tests
    if not args.no_verify:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]
        if src in taxa_supported:
            mysource = src(tax_ids)
        else:
            mysource = src()

        if not args.parse_only:
            mysource.fetch(args.force)

        mysource.settestonly(args.test_only)
        mysource.setnobnodes(args.no_bnodes)

        # run tests first
        if not args.no_verify:
            suite = mysource.getTestSuite()
            if suite is None:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            mysource.parse(args.limit)
            mysource.write(format='turtle')

        # if args.no_verify is not True:
        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error('Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)

    # load configuration parameters
    # for example, keys

    logger.info("All done.")
class EvidenceProvenanceTestCase(unittest.TestCase):
    """
    Functional tests for the evidence, study provenance and assertion
    provenance triples written by the IMPC source.
    """

    def setUp(self):
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One row of the IMPC file; each value is tagged with its
        # (1-based) column number and header name.
        self.test_set_1 = (
            'MGI:1920145',                # 01 marker_accession_id
            'Setd5',                      # 02 marker_symbol
            'WTSI',                       # 03 phenotyping_center
            'MEFW',                       # 04 colony_raw
            'male',                       # 05 sex
            'heterozygote',               # 06 zygosity
            'MGI:4432631',                # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',    # 08 allele_symbol
            # 09 allele_name
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',                # 10 strain_accession_id
            'C57BL/6N',                   # 11 strain_name
            'MGP',                        # 12 project_name
            # 13 project_fullname
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',        # 14 pipeline_name
            'MGP_001',                    # 15 pipeline_stable_id
            'MGP_XRY_001',                # 16 procedure_stable_id
            'X-ray',                      # 17 procedure_name
            'IMPC_XRY_008_001',           # 18 parameter_stable_id
            'Number of ribs right',       # 19 parameter_name
            'MP:0005390',                 # 20 top_level_mp_term_id
            'skeleton phenotype',         # 21 top_level_mp_term_name
            'MP:0000480',                 # 22 mp_term_id
            'increased rib number',       # 23 mp_term_name
            '1.637023E-010',              # 24 p_value
            '',                           # 25 percentage_change
            '8.885439E-007',              # 26 effect_size
            # 27 statistical_method
            'Wilcoxon rank sum test with continuity correction',
            'IMPC',                       # 28 resource_name
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        self.assoc_iri = URIRef(
            CurieUtil(curie_map.get()).get_uri(self.assoc_curie))

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        source = IMPC('rdf_graph', True)
        source.graph = RDFGraph(True)  # Reset graph

        # Test graph is empty
        triple_count = len(list(source.graph))
        self.assertTrue(triple_count == 0)

        (pval, pct_change, effect) = self.test_set_1[23:26]
        source._add_evidence(
            self.assoc_curie, self.eco_id, pval, pct_change, effect,
            self.study_curie)

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected = (
            ' :MONARCH_test_association SEPIO:0000007'
            ' <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38> .'
            ' <https://monarchinitiative.org/.well-known/genid/b97a98087df7a99d8a38>'
            ' a ECO:0000015 ;'
            ' SEPIO:0000084'
            ' <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888>,'
            ' <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956> ;'
            ' SEPIO:0000085'
            ' <https://monarchinitiative.org/.well-known/genid/study> .'
            ' <https://monarchinitiative.org/.well-known/genid/b216606de82749b03956>'
            ' a OBI:0000175 ;'
            ' RO:0002353'
            ' <https://monarchinitiative.org/.well-known/genid/study> ;'
            ' STATO:0000129 1.637023e-10 .'
            ' <https://monarchinitiative.org/.well-known/genid/b41ad2bfd375c9de8888>'
            ' a STATO:0000085 ;'
            ' RO:0002353'
            ' <https://monarchinitiative.org/.well-known/genid/study> ;'
            ' STATO:0000129 "8.885439E-007" . \n'
        )
        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        source = IMPC('rdf_graph', True)
        source.graph = RDFGraph(True)
        triple_count = len(list(source.graph))
        self.assertTrue(triple_count == 0)

        (center, colony_raw) = self.test_set_1[2:4]
        (project, pipeline, pipeline_id, procedure_id, procedure,
         parameter_id, parameter) = self.test_set_1[12:19]
        (stat_method, resource) = self.test_set_1[26:28]

        source._add_study_provenance(
            center, colony_raw, project, pipeline, pipeline_id,
            procedure_id, procedure, parameter_id, parameter,
            stat_method, resource, 0)

        # dbg
        turtle_dump = source.graph.serialize(format="turtle").decode("utf-8")
        logger.info("Provenance graph as turtle:\n%s\n", turtle_dump)

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected = (
            ' <https://monarchinitiative.org/.well-known/genid/bdd05a8ca155ddaf415e>'
            ' a OBI:0000471 ;'
            ' BFO:0000051 OBO:STATO_0000076,'
            ' <https://www.mousephenotype.org/impress/protocol/175/15> ;'
            ' BFO:0000050 IMPRESS-procedure:15 ,'
            ' <http://www.sanger.ac.uk/science/data/mouse-genomes-project> ;'
            ' SEPIO:0000114'
            ' <https://www.mousephenotype.org/impress/parameterontologies/1867/91> ;'
            ' SEPIO:0000017 <http://www.sanger.ac.uk/> .'
            ' <https://monarchinitiative.org/.well-known/genid/b0b26361b8687b5ad9ef>'
            ' a owl:NamedIndividual ;'
            ' rdfs:label "MEFW" .'
            ' <http://www.sanger.ac.uk/> a foaf:organization ;'
            ' rdfs:label "WTSI" .'
            ' <http://www.sanger.ac.uk/science/data/mouse-genomes-project>'
            ' a VIVO:Project ;'
            ' rdfs:label "Wellcome Trust Sanger Institute Mouse Genetics Project" .'
            ' <https://www.mousephenotype.org/impress/parameterontologies/1867/91>'
            ' a owl:NamedIndividual ;'
            ' rdfs:label "Number of ribs right (X-ray)" .'
            ' IMPRESS-procedure:15 a owl:NamedIndividual ;'
            ' rdfs:label "MGP Select Pipeline" .'
            ' <https://www.mousephenotype.org/impress/protocol/175/15>'
            ' a owl:NamedIndividual ;'
            ' rdfs:label "X-ray" . \n'
        )

        # dbg
        turtle_dump = source.graph.serialize(format="turtle").decode("utf-8")
        logger.debug("Reference graph: %s", turtle_dump)

        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)

    def test_assertion_model(self):
        """
        Functional test for _add_assertion_provenance()
        """
        source = IMPC('rdf_graph', True)
        source.graph = RDFGraph(True)
        triple_count = len(list(source.graph))
        self.assertTrue(triple_count == 0)

        source._add_assertion_provenance(
            self.assoc_curie, self.evidence_curie)

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected = (
            ' MONARCH:test_association SEPIO:0000015'
            ' <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805> .'
            ' <https://monarchinitiative.org/.well-known/genid/bf92df374a884963e805>'
            ' a SEPIO:0000001 ;'
            ' SEPIO:0000018 <https://www.mousephenotype.org/> ;'
            ' SEPIO:0000111'
            ' <https://monarchinitiative.org/.well-known/genid/evidence> .'
            ' <https://www.mousephenotype.org/> a foaf:organization ;'
            ' rdfs:label "International Mouse Phenotyping Consortium" . '
        )

        # dbg
        turtle_dump = source.graph.serialize(format="turtle").decode("utf-8")
        logger.info("Assertion graph:\n %s\n", turtle_dump)

        graphs_match = self.test_util.test_graph_equality(
            expected, source.graph)
        self.assertTrue(graphs_match)

    def test_random_data_set(self):
        """
        Read one hardcoded row out of the already downloaded IMPC file
        (the fetch() call is commented out) and push it through the
        evidence and study provenance functions, then check the result
        with a SPARQL query.

        The row number is fixed here, but any row should work in theory.
        """
        line_to_test = 1129
        source = IMPC('rdf_graph', False)  # Not Skolem
        self.test_set_N = []

        # fetch file
        # source.fetch(True)
        file_path = '/'.join((source.rawdir, source.files['all']['file']))
        with gzip.open(file_path, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            # rows are counted from 1; keep the wanted row and stop at the
            # first row after it
            for row_number, row in enumerate(reader, start=1):
                if row_number == line_to_test:
                    self.test_set_N = row
                elif row_number > line_to_test:
                    logger.info("stopped at line:\t%s\n", row_number)
                    break

        # Some DRY violation with the above tests
        (center, colony_raw) = self.test_set_N[2:4]
        (project, pipeline, pipeline_id, procedure_id, procedure,
         parameter_id, parameter) = self.test_set_N[12:19]
        (stat_method, resource) = self.test_set_N[26:28]
        (pval, pct_change, effect) = self.test_set_N[23:26]

        # adding evidence
        source._add_evidence(
            self.assoc_curie, self.eco_id, pval, pct_change, effect,
            self.study_curie)

        # adding study
        source._add_study_provenance(
            center, colony_raw, project, pipeline, pipeline_id,
            procedure_id, procedure, parameter_id, parameter,
            stat_method, resource, line_to_test)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of and has_part links to individuals
        # which results in ambiguity = hard to test

        # dbg
        ntriples_dump = source.graph.serialize(
            format="ntriples").decode("utf-8")
        logger.info(
            "Row %i graph as ntriples:\n%s\n", line_to_test, ntriples_dump)

        sparql_query = (
            ' SELECT * WHERE {'
            ' ?assoc SEPIO:0000007 ?evidenceline .'
            ' ?evidenceline a ECO:0000015 ;'
            ' SEPIO:0000085 _:study .'
            ' ?study a OBI:0000471 ;'
            ' SEPIO:0000114 ?param ;'
            ' SEPIO:0000017 ?agent .'
            ' } '
        )
        sparql_output = source.graph.query(sparql_query)

        logger.info(
            "Test that query for row %i passes and returns one row",
            int(line_to_test))
        # print("Sparql Output: %s\n", list(sparql_output) )
        # it is an array with one list with five vars in it
        result_rows = list(sparql_output)
        self.assertEqual(len(result_rows), 1)

    def tearDown(self):
        pass
class StringTestFakeData(unittest.TestCase):
    """
    Runs StringDB._process_protein_links() over small hand-made protein
    link tables, using the protein to gene map fetched through Ensembl.
    """

    def setUp(self):
        self.test_util = TestUtils()

        # Test set with two proteins from same species
        self.test_set_1 = [[
            '9606.ENSP00000000233', '9606.ENSP00000003084',
            0, 0, 0, 0, 300, 0, 150, 800,
        ]]

        # Test set with deprecated protein id
        self.test_set_2 = [[
            '9606.ENSP00000000233', '9606.ENSP00000006101',
            0, 0, 0, 0, 300, 0, 150, 800,
        ]]

        self.columns = [
            'protein1',
            'protein2',
            'neighborhood',
            'fusion',
            'cooccurence',
            'coexpression',
            'experimental',
            'database',
            'textmining',
            'combined_score',
        ]

        self.protein_list = \
            Ensembl('rdf_graph', True).fetch_protein_gene_map(9606)

    def tearDown(self):
        pass

    def testFakeDataSet1(self):
        """
        Two proteins of the same species are expected to yield a single
        gene to gene triple.
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph(True)
        self.assertEqual(len(string_db.graph), 0)

        prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map(9606)
        # prefix every mapped gene id in place
        for protein_id, gene_id in prot_map.items():
            prot_map[protein_id] = "ENSEMBL:{}".format(gene_id)

        print("Finished fetching ENSP IDs, fetched {} proteins".format(
            len(prot_map)))

        links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
        string_db._process_protein_links(links, prot_map, 9606)

        expected = (
            ' ENSEMBL:ENSG00000001626 RO:0002434'
            ' ENSEMBL:ENSG00000004059 . '
        )
        graphs_match = self.test_util.test_graph_equality(
            expected, string_db.graph)
        self.assertTrue(graphs_match)

    def testFakeDataSet2(self):
        """
        The data set holds a deprecated protein ID which we expect to be
        filtered out by ensembl biomart, so the graph must stay empty.
        """
        string_db = StringDB('rdf_graph', True)
        string_db.graph = RDFGraph()
        self.assertEqual(len(string_db.graph), 0)

        links = pd.DataFrame(data=self.test_set_2, columns=self.columns)
        string_db._process_protein_links(links, self.protein_list, 9606)

        self.assertEqual(len(string_db.graph), 0)
def main():
    """
    Command line entry point: import, fetch, test, parse and write each
    source named with ``-s/--sources``.

    Source classes are looked up by name in ``source_to_class_map`` and
    imported from ``dipper.sources.<ClassName>``.

    Behaviour driven by the parsed arguments:

    * ``--query``: for every requested source, check the SPARQL query
      syntax and load the source's turtle through TestUtils, then print
      the query result and exit with status 0.
    * ``--dest_fmt``: serialization format, defaulting to ``turtle``.
      Accepted aliases are normalised; an unsupported value is logged and
      the process exits with status 1.
    * unless ``--no_verify`` or ``--skip_tests`` is given, the module level
      ``test_suite`` is run once up front and each source's own test suite
      is run before it is parsed.
    * ``--parse_only`` skips fetching; ``--fetch_only`` and ``--test_only``
      skip the parse/write step.
    * ``-t/--taxon`` ids are only handed to the sources listed in
      ``taxa_supported``.

    An unknown source name raises KeyError from the source lookup.
    """
    # function-scope import: only needed for the sys.exit() calls below
    import sys

    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',  # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                   # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha', # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfin-slim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int, help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument(
        '-f', '--force', action='store_true',
        help='force re-download of files')
    parser.add_argument(
        '--no_verify', help='ignore the verification step',
        action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet', help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing",
        action="store_true", default=False)

    # TODO this should live in a global data file
    # and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
             ' comma delimited\n'
             'Implemented taxa per source\n'
             'NCBIGene: 9606,10090,7955\n'
             'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
             'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
             'UCSCBands: 9606\n'
             'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")
    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)
    parser.add_argument(
        '--version', '-v', help='version of source', type=str)

    args = parser.parse_args()

    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # names of the source classes that accept a tax_ids argument
    taxa_supported = [  # these are not taxa
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    formats_supported = [
        'turtle', 'ttl',
        'ntriples', 'nt',
        'nquads', 'nq',
        'rdfxml', 'xml',
        'notation3', 'n3',
        'raw']
    # accepted spellings that are rewritten before being passed to write()
    format_aliases = {
        'ttl': 'turtle',
        'ntriples': 'nt',
        'nq': 'nquads',
        'xml': 'rdfxml',
        'notation3': 'n3',
    }

    if args.quiet:
        logging.basicConfig(level=logging.ERROR)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    if args.query is not None:
        test_query = TestUtils()
        for source in args.sources.split(','):
            source = source.lower()
            # The map holds class *names*; the original called the string
            # here, which raises "TypeError: 'str' object is not callable".
            class_name = source_to_class_map[source]

            # import source lib
            module = "dipper.sources.{0}".format(class_name)
            imported_module = importlib.import_module(module)
            source_class = getattr(imported_module, class_name)

            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        sys.exit(0)

    run_tests = not (args.no_verify or args.skip_tests)

    # run initial tests
    if run_tests:
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in formats_supported:
        args.dest_fmt = format_aliases.get(args.dest_fmt, args.dest_fmt)
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # an invalid argument is a failure: report it through a non-zero
        # exit status (the original exited with 0)
        sys.exit(1)

    # iterate through all the sources
    for source in args.sources.split(','):
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]

        # import source lib
        module = "dipper.sources.{0}".format(src)
        imported_module = importlib.import_module(module)
        source_class = getattr(imported_module, src)

        # arg factory
        source_args = dict(
            graph_type=args.graph
        )
        source_args['are_bnodes_skolemized'] = not args.use_bnodes
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)

        # Timings use time.perf_counter(): time.clock() was removed in
        # Python 3.8.
        if not args.parse_only:
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch - start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if run_tests:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse - start_parse)

            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(
                    mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info(
                    "Property axioms added: %d sec",
                    end_axiom_exp - start_axiom_exp)

            start_write = time.perf_counter()
            mysource.write(fmt=args.dest_fmt)
            end_write = time.perf_counter()
            logger.info("Writing time: %d sec", end_write - start_write)
        # if args.no_verify is not True:
        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error(
        #            'Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)

    # load configuration parameters
    # for example, keys

    logger.info("All done.")
def setUp(self):
    """Create the TestUtils helper that the tests reach via self.test_util."""
    self.test_util = TestUtils()
class GeneVariantDiseaseTest(unittest.TestCase):
    """
    Runs Orphanet._process_diseasegene() over the small XML fixtures in
    resources/orphanet and compares the resulting graph with the expected
    triples.
    """

    def setUp(self):
        self.test_util = TestUtils()
        self.orphanet = Orphanet('rdf_graph', True)
        # read the fixtures that sit next to this test module
        here = os.path.dirname(__file__)
        self.orphanet.rawdir = os.path.join(here, 'resources/orphanet')

    def tearDown(self):
        self.orphanet = None

    def _process_fixture(self, xml_file):
        """
        Start from a fresh graph, process the disease-gene fixture
        ``xml_file`` and log the resulting graph at debug level.
        """
        self.orphanet.graph = RDFGraph()  # Reset graph
        self.orphanet.files['disease-gene']['file'] = xml_file
        self.orphanet._process_diseasegene(limit=None)
        turtle_dump = self.orphanet.graph.serialize(
            format="turtle").decode("utf-8")
        LOG.debug("Reference graph: %s", turtle_dump)

    def _assert_graph_is(self, expected_triples):
        """Assert that the orphanet graph equals ``expected_triples``."""
        graphs_match = self.test_util.test_graph_equality(
            expected_triples, self.orphanet.graph)
        self.assertTrue(graphs_match)

    def test_germline_variant_to_disease(self):
        """Graph produced for the orph-germline.xml fixture."""
        self._process_fixture('orph-germline.xml')

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected_triples = (
            ' MONARCH:ba2ac5d2153c70e2bb98 a OBAN:association ;'
            ' RO:0002558 ECO:0000322 ;'
            ' OBAN:association_has_object ORPHA:938475 ;'
            ' OBAN:association_has_predicate RO:0004013 ;'
            ' OBAN:association_has_subject HGNC:30497 .'
            ' ENSEMBL:ENSG00000166813 a owl:Class .'
            ' HGNC:30497 a owl:Class ;'
            ' RO:0004013 ORPHA:938475 ;'
            ' oboInOwl:hasExactSynonym "KAS1" ;'
            ' owl:equivalentClass ENSEMBL:ENSG00000166813, ORPHA:268061 .'
            ' ORPHA:268061 a owl:Class .'
            ' ORPHA:938475 a owl:Class ;'
            ' rdfs:label "too much unit testing disorder" .'
            ' ENSEMBL:ENSG00000166813 biolink:category biolink:Gene .'
            ' ECO:0000322 biolink:category biolink:EvidenceType .'
            ' HGNC:30497 biolink:category biolink:Genotype .'
            ' HGNC:30497 biolink:category biolink:Gene .'
            ' ORPHA:268061 biolink:category biolink:Gene .'
            ' ORPHA:938475 biolink:category biolink:Disease .'
            ' MONARCH:ba2ac5d2153c70e2bb98 biolink:category'
            ' biolink:Association . \n'
        )
        self._assert_graph_is(expected_triples)

    def test_germline_lof_variant_to_disease(self):
        """Graph produced for the orph-germline-lof.xml fixture."""
        self._process_fixture('orph-germline-lof.xml')

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected_triples = (
            ' MONARCH:b9ad1b0c562ad4db3f1e a OBAN:association ;'
            ' RO:0002558 ECO:0000322 ;'
            ' OBAN:association_has_object ORPHA:938475 ;'
            ' OBAN:association_has_predicate RO:0004012 ;'
            ' OBAN:association_has_subject ORPHA:268061 .'
            ' ORPHA:268061 RO:0004012 ORPHA:938475 ;'
            ' oboInOwl:hasExactSynonym "KAS1" .'
            ' ORPHA:938475 a owl:Class ;'
            ' rdfs:label "too much unit testing disorder" .'
            ' ECO:0000322 biolink:category biolink:EvidenceType .'
            ' ORPHA:268061 biolink:category biolink:Gene .'
            ' ORPHA:268061 biolink:category biolink:Genotype .'
            ' ORPHA:938475 biolink:category biolink:Disease .'
            ' MONARCH:b9ad1b0c562ad4db3f1e biolink:category'
            ' biolink:Association . '
        )
        self._assert_graph_is(expected_triples)

    def test_gene_to_disease(self):
        """Graph produced for the orph-no-variant.xml fixture."""
        self._process_fixture('orph-no-variant.xml')

        # Expected graph as Turtle; the adjacent literals concatenate to a
        # single whitespace separated string.
        expected_triples = (
            ' MONARCH:bdbeb077e365ddedda20 a OBAN:association ;'
            ' RO:0002558 ECO:0000322 ;'
            ' OBAN:association_has_object ORPHA:938475 ;'
            ' OBAN:association_has_predicate RO:0004015 ;'
            ' OBAN:association_has_subject ORPHA:268061 .'
            ' ORPHA:268061 RO:0004015 ORPHA:938475 ;'
            ' oboInOwl:hasExactSynonym "KAS1" .'
            ' ORPHA:938475 a owl:Class ;'
            ' rdfs:label "too much unit testing disorder" .'
            ' ECO:0000322 biolink:category biolink:EvidenceType .'
            ' ORPHA:268061 biolink:category biolink:Gene .'
            ' ORPHA:268061 biolink:category biolink:Genotype . \n'
            'ORPHA:938475 biolink:category biolink:Disease .'
            ' MONARCH:bdbeb077e365ddedda20 biolink:category'
            ' biolink:Association . '
        )
        self._assert_graph_is(expected_triples)

    def test_unmapped_disease_assoc_type(self):
        """
        Processing the orph-no-mapping.xml fixture is expected to raise a
        KeyError.
        NOTE(review): presumably because its gene disease type is not
        mapped in translationtable/orphanet.yaml - confirm.
        """
        self.orphanet.graph = RDFGraph()  # Reset graph
        self.orphanet.files['disease-gene']['file'] = 'orph-no-mapping.xml'

        with self.assertRaises(KeyError):
            self.orphanet._process_diseasegene(limit=None)
class EvidenceProvenanceTestCase(unittest.TestCase):
    """
    Functional tests for the IMPC evidence and provenance helpers:
    _add_evidence(), _add_study_provenance() and _add_assertion_provenance()
    """

    def setUp(self):
        """
        Build the fixtures shared by the tests: a graph comparison utility,
        an association curie, an ECO id, one hardcoded row of IMPC data and
        fixed blank-node curies for the study and the evidence line
        """
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # One row of IMPC data; the tests below read it by position:
        #   [2:4]   phenotyping_center, colony
        #   [12:19] project_fullname, pipeline_name, pipeline_stable_id,
        #           procedure_stable_id, procedure_name,
        #           parameter_stable_id, parameter_name
        #   [23:26] p_value, percentage_change, effect_size
        #   [26:28] statistical_method, resource_name
        self.test_set_1 = (
            'MGI:1920145', 'Setd5', 'WTSI', 'MEFW', 'male', 'heterozygote',
            'MGI:4432631', 'Setd5<tm1a(EUCOMM)Wtsi>',
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965', 'C57BL/6N', 'MGP',
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline', 'MGP_001', 'MGP_XRY_001', 'X-ray',
            'IMPC_XRY_008_001', 'Number of ribs right', 'MP:0005390',
            'skeleton phenotype', 'MP:0000480', 'increased rib number',
            '1.637023E-010', '', '8.885439E-007',
            'Wilcoxon rank sum test with continuity correction', 'IMPC')

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_dict = curie_map.get()
        curie_util = CurieUtil(curie_dict)
        self.assoc_iri = URIRef(curie_util.get_uri(self.assoc_curie))

        return

    @staticmethod
    def _turtle_text(graph):
        """
        Return *graph* serialized as Turtle text.

        ``serialize()`` returns ``bytes`` on older rdflib releases and ``str``
        on rdflib >= 6, so only decode when bytes are actually returned.
        """
        serialized = graph.serialize(format="turtle")
        if isinstance(serialized, bytes):
            return serialized.decode("utf-8")
        return serialized

    def test_evidence_model(self):
        """
        Functional test for _add_evidence()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)  # Reset graph

        # Test graph is empty
        self.assertEqual(len(list(impc.graph)), 0)

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        (p_value, percentage_change, effect_size) = self.test_set_1[23:26]

        impc._add_evidence(self.assoc_curie, self.eco_id, impc_map, p_value,
                           percentage_change, effect_size, self.study_curie)

        # Adjacent literals are concatenated; the resulting string is the
        # same Turtle document the parser has always been given.
        triples = (
            ' :MONARCH_test_association SEPIO:0000007 '
            '<https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> . \n'
            '<https://monarchinitiative.org/.well-known/genid/b097a98087df7a99> '
            'a ECO:0000015 ; '
            'SEPIO:0000084 '
            '<https://monarchinitiative.org/.well-known/genid/b89ee584330837c9>, '
            '<https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> ; '
            'SEPIO:0000085 '
            '<https://monarchinitiative.org/.well-known/genid/study> . '
            '<https://monarchinitiative.org/.well-known/genid/bc0eeccdea27a1d8> '
            'a OBI:0000175 ; '
            'RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ; '
            'STATO:0000129 1.637023e-10 . '
            '<https://monarchinitiative.org/.well-known/genid/b89ee584330837c9> '
            'a STATO:0000085 ; '
            'RO:0002353 <https://monarchinitiative.org/.well-known/genid/study> ; '
            'STATO:0000129 "8.885439E-007" . '
        )

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_provenance_model(self):
        """
        Functional test for _add_study_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)

        # Test graph is empty
        self.assertEqual(len(list(impc.graph)), 0)

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])
        impress_map = json.loads(
            impc.fetch_from_url(
                impc.map_files['impress_map']).read().decode('utf-8'))

        (phenotyping_center, colony) = self.test_set_1[2:4]
        (project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = self.test_set_1[12:19]
        (statistical_method, resource_name) = self.test_set_1[26:28]

        impc._add_study_provenance(
            impc_map, impress_map, phenotyping_center, colony,
            project_fullname, pipeline_name, pipeline_stable_id,
            procedure_stable_id, procedure_name, parameter_stable_id,
            parameter_name, statistical_method, resource_name)

        triples = (
            ' <https://monarchinitiative.org/.well-known/genid/bbdd05a8ca155dda> '
            'a OBI:0000471 ; '
            'BFO:0000051 OBO:STATO_0000076, '
            '<https://www.mousephenotype.org/impress/protocol/175/15> ; '
            'BFO:0000050 IMPRESS-procedure:15 , '
            '<http://www.sanger.ac.uk/science/data/mouse-genomes-project> ; '
            'SEPIO:0000114 '
            '<https://www.mousephenotype.org/impress/parameterontologies/1867/91> ; '
            'SEPIO:0000017 \n'
            '<http://www.sanger.ac.uk/> . '
            '<https://monarchinitiative.org/.well-known/genid/bc0b26361b8687b5> '
            'a owl:NamedIndividual ; '
            'rdfs:label "MEFW" . '
            '<http://www.sanger.ac.uk/> a foaf:organization ; '
            'rdfs:label "WTSI" . '
            '<http://www.sanger.ac.uk/science/data/mouse-genomes-project> '
            'a VIVO:Project ; '
            'rdfs:label "Wellcome Trust Sanger Institute Mouse Genetics Project" . '
            '<https://www.mousephenotype.org/impress/parameterontologies/1867/91> '
            'a owl:NamedIndividual ; '
            'rdfs:label "Number of ribs right (X-ray)" . '
            'IMPRESS-procedure:15 a owl:NamedIndividual ; '
            'rdfs:label "MGP Select Pipeline" . '
            '<https://www.mousephenotype.org/impress/protocol/175/15> '
            'a owl:NamedIndividual ; '
            'rdfs:label "X-ray" . '
        )

        # dbg
        logger.debug("Reference graph: %s", self._turtle_text(impc.graph))

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_assertion_model(self):
        """
        Functional test for _add_assertion_provenance()
        """
        impc = IMPC('rdf_graph', True)
        impc.graph = RDFGraph(True)

        # Test graph is empty
        self.assertEqual(len(list(impc.graph)), 0)

        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        impc._add_assertion_provenance(
            self.assoc_curie, self.evidence_curie, impc_map)

        triples = (
            ' MONARCH:test_association SEPIO:0000015 '
            '<https://monarchinitiative.org/.well-known/genid/bcb2c00a5c2f9c43> . '
            '<https://monarchinitiative.org/.well-known/genid/bcb2c00a5c2f9c43> '
            'a SEPIO:0000001 ; '
            'SEPIO:0000018 <http://www.mousephenotype.org/> ; '
            'SEPIO:0000111 '
            '<https://monarchinitiative.org/.well-known/genid/evidence> . '
            '<http://www.mousephenotype.org/> a foaf:organization ; '
            'rdfs:label "International Mouse Phenotyping Consortium" . \n'
        )

        # dbg
        logger.debug("Reference graph: %s", self._turtle_text(impc.graph))

        self.assertTrue(self.test_util.test_graph_equality(
            triples, impc.graph))

    def test_random_data_set(self):
        """
        Download dataset using fetch(), then take a row of data and
        run through evidence and provenance functions to test the output

        Line of data is hardcoded, but theoretically should work on any line
        """
        line_to_test = 1129
        impc = IMPC('rdf_graph', False)  # Not Skolem

        impress_map = json.loads(
            impc.fetch_from_url(
                impc.map_files['impress_map']).read().decode('utf-8'))
        impc_map = impc.open_and_parse_yaml(impc.map_files['impc_map'])

        # fetch file
        impc.fetch(True)
        file_path = '/'.join((impc.rawdir, impc.files['all']['file']))

        with gzip.open(file_path, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for count, row in enumerate(filereader, start=1):
                if count == line_to_test:
                    break
            else:
                # Without this guard a short file would silently test its
                # last row, and an empty file would leave `row` unbound.
                self.fail(
                    "%s has fewer than %d rows" % (file_path, line_to_test))

        self.test_set_1 = row

        # Some DRY violation with the above tests
        (phenotyping_center, colony) = row[2:4]
        (project_fullname, pipeline_name, pipeline_stable_id,
         procedure_stable_id, procedure_name, parameter_stable_id,
         parameter_name) = row[12:19]
        (statistical_method, resource_name) = row[26:28]
        (p_value, percentage_change, effect_size) = row[23:26]

        impc._add_evidence(self.assoc_curie, self.eco_id, impc_map, p_value,
                           percentage_change, effect_size, self.study_curie)

        impc._add_study_provenance(
            impc_map, impress_map, phenotyping_center, colony,
            project_fullname, pipeline_name, pipeline_stable_id,
            procedure_stable_id, procedure_name, parameter_stable_id,
            parameter_name, statistical_method, resource_name)

        # Note that this doesn't test much since we're dealing with
        # multiple part_of and has_part links to individuals
        # which results in ambiguity = hard to test
        sparql_query = (
            ' SELECT * WHERE { '
            '?assoc SEPIO:0000007 ?evidenceline . '
            '?evidenceline a ECO:0000015 ; '
            'SEPIO:0000085 _:study . '
            '?study a OBI:0000471 ; '
            'SEPIO:0000114 ?param ; '
            'SEPIO:0000017 ?agent . \n'
            '} '
        )

        sparql_output = impc.graph.query(sparql_query)

        # Test that query passes and returns one row
        self.assertEqual(len(list(sparql_output)), 1)

    def tearDown(self):
        """Nothing to clean up; every test builds its own IMPC instance"""
        return