Пример #1
0
 def setUp(self):
     """Create a GWASCatalog source with a fresh graph and a multi-SNP row.

     The fixture row lists four SNPs; their per-SNP values share single
     ';'-separated fields.
     """
     self.test_util = TestUtils()
     self.source = GWASCatalog('rdf_graph', True)
     self.source.graph = RDFGraph(True)

     self.test_data = dict(
         snp_label='rs1329573-?; rs7020413-?; rs3824344-?; rs3758171-?',
         chrom_num='9;9;9;9',
         chrom_pos='36998996;37002118;37000690;36997420',
         context=(
             'intron_variant; intron_variant; intron_variant; intron_variant'),
         allele_freq='NR',
         trait='Intelligence',
         trait_uri='http://www.ebi.ac.uk/efo/EFO_0004337',
         pvalue='0.00000004',
         merged='0',
         snp_id_current='',
         mapped_gene='PAX5; PAX5; PAX5; PAX5',
         snp_gene_nums='',
         upstream_gene_num='107986179',
         downstream_gene_num='107986180',
         init_sample_desc=(
             '656 European ancestry individuals from ADHD families'),
         replicated_sample_desc='NA',
         platform='Illumina [795637]',
         pubmed='22449649',
     )
Пример #2
0
 def setUp(self):
     """Create an Orphanet source whose rawdir is the local test resources."""
     self.test_util = TestUtils()
     self.orphanet = Orphanet('rdf_graph', True)

     # Resolve the fixture directory relative to this test module.
     test_dir = os.path.dirname(__file__)
     self.orphanet.rawdir = os.path.join(test_dir, 'resources/orphanet')
Пример #3
0
 def setUp(self):
     """Prepare the test helper and a single six-field fixture row."""
     self.test_util = TestUtils()
     self.test_set_1 = (
         'ENSBTAP00000013354',
         'R-BTA-3000480',
         'http://www.reactome.org/PathwayBrowser/#/R-BTA-3000480',
         'Scavenging by Class A Receptors',
         'IEA',
         'Bos taurus',
     )
Пример #4
0
    def setUp(self):
        """Prepare the test helper and one fixture row keyed by column name."""
        self.test_util = TestUtils()

        # Long free-text value, assembled separately to keep the dict compact.
        details = (
            'similar results obtained with atp6-L247R, and atp6-W136R, all '
            'corresponding to human NARP syndrome mutants')

        self.test_set_1 = {
            'Allele': 'atp6-L183R (L183R)',
            'Chemical': 'glycerol',
            'Condition':
                'elevated temperature (35 deg C)|nonfermentable carbon source',
            'Details': details,
            'Experiment Type': 'classical genetics',
            'Feature Name': 'Q0085',
            'Feature Type': 'ORF',
            'Gene Name': 'ATP6',
            'Mutant Type': 'reduction of function',
            'Phenotype': 'respiratory growth: decreased rate',
            'Reference': 'PMID: 21715656|SGD_REF: S000145858',
            'Reporter': ' ',
            'SGDID': 'S000007268',
            'Strain Background': 'Other',
        }
Пример #5
0
 def setUp(self):
     """Create a GWASCatalog source with a reset graph and a single-SNP row.

     The graph is replaced with a new RDFGraph and all namespaces are
     bound on it before the fixture row is defined.
     """
     self.test_util = TestUtils()
     self.source = GWASCatalog('rdf_graph', True)
     self.source.graph = RDFGraph(True)  # Reset graph
     self.source.graph.bind_all_namespaces()

     self.test_data = dict(
         snp_label='rs1491921-C',
         chrom_num='5',
         chrom_pos='21259029',
         context='intergenic_variant',
         allele_freq='0.013',
         trait='Diisocyanate-induced asthma',
         trait_uri=(
             'http://www.ebi.ac.uk/efo/EFO_0006995, http://www.ebi.ac.uk/efo/EFO_0003949'),
         pvalue='0.0000007',
         merged='0',
         snp_id_current='1491921',
         mapped_gene='LOC102723561 - GUSBP1',
         snp_gene_nums='',
         upstream_gene_num='107986179',
         downstream_gene_num='107986180',
         init_sample_desc=(
             '74 European ancestry cases, 824 European ancestry controls'),
         replicated_sample_desc='NA',
         platform='Illumina [1556551]',
         pubmed='25918132',
     )
Пример #6
0
 def setUp(self):
     """Create a CTD source with a fresh graph and one ten-column fixture row."""
     self.test_util = TestUtils()
     self.source = CTD('rdf_graph', True)
     self.source.graph = RDFGraph(True)
     self.test_row = [
         'Nicotine',
         'D009538',
         '',
         'TOBACCO ADDICTION, SUSCEPTIBILITY TO',
         'OMIM:188890',
         'therapeutic',
         '',
         '',
         '',
         '12345|56789',
     ]
Пример #7
0
    def setUp(self):
        """Create a MyChem source seeded from the JSON fixture file.

        The fixture at TESTDATA is loaded in place of source.fetch(), and
        its first record is appended to both the drugbank_targets and the
        drugcentral_interactors lists of the source.
        """
        self.test_util = TestUtils()
        self.source = MyChem('rdf_graph', True)

        # Replaces source.fetch()
        # The context manager closes the file even if json.load() raises.
        with open(TESTDATA, 'r') as data_fh:
            self.test_data = json.load(data_fh)

        first_record = self.test_data[0]
        self.source.drugbank_targets.append(first_record)
        self.source.drugcentral_interactors.append(first_record)
Пример #8
0
 def setUp(self):
     """Create an Orphanet source using the bundled test terms and raw data."""
     self.test_util = TestUtils()
     self.orphanet = Orphanet('rdf_graph', True)

     test_dir = os.path.dirname(__file__)

     # Override so tests don't break when we update terms
     # NOTE(review): the parsed table is stored on the test case
     # (self.globaltt), not on self.orphanet - confirm this is intended.
     terms_path = os.path.join(test_dir, './resources/test_terms.yaml')
     self.globaltt = self.orphanet.open_and_parse_yaml(terms_path)

     self.orphanet.rawdir = os.path.join(test_dir, 'resources/orphanet')
Пример #9
0
    def test_therapeutic_relationship(self):
        """Check the loaded test graph for the expected OBAN association.

        Loads the source's test graph from turtle, queries every
        OBAN:association for its object, predicate and subject, and
        asserts that the row built from MESH:D009538, OMIM:188890 and
        the 'substance_that_treats' property is among the results.
        """
        from dipper.utils.TestUtils import TestUtils
        from dipper.models.Model import Model

        # Make testutils object and load ttl
        test_query = TestUtils(self.source.graph)
        test_query.load_testgraph_from_turtle(self.source)
        # Read the graph only after loading the test data.
        graph = self.source.graph
        model = Model(graph)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models
        # to be automatically generated?
        sparql_query = """
                       SELECT ?assoc ?disease ?rel ?chemical
                       WHERE {
                           ?assoc a OBAN:association ;
                           OBAN:association_has_object ?disease ;
                           OBAN:association_has_predicate ?rel ;
                           OBAN:association_has_subject ?chemical .}
                       """

        # SPARQL variables to check
        chem_id = 'MESH:D009538'
        chem_uri = graph._getNode(chem_id)

        disease_id = 'OMIM:188890'
        disease_uri = graph._getNode(disease_id)

        rel_id = model.object_properties['substance_that_treats']
        rel_uri = graph._getNode(rel_id)
        # TODO unused
        # pubmed_id = 'PMID:16785264'
        # pubmed_uri = gu.getNode(pubmed_id)
        # eco = 'ECO:0000033'

        assoc_id = G2PAssoc(
            graph, self.source.name, chem_id, disease_id, rel_id
        ).make_g2p_id()
        assoc_uri = self.source.graph._getNode(assoc_id)

        # One of the expected outputs from query
        expected_output = [assoc_uri, disease_uri, rel_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        failure_message = "".join([
            "did not find expected association: ",
            str(expected_output),
            " found ",
            str(len(sparql_output)),
            " others:\n",
            str(sparql_output),
        ])
        self.assertTrue(expected_output in sparql_output, failure_message)

        logger.info("Test query data finished.")
Пример #10
0
 def setUp(self):
     """
     Point the MGI source at the bundled test resources.

     _process_evidence_view locates the evidence file through
     self.rawdir, so the default is overridden here to point at
     our test file. The file name must match the one used in
     that method - evidence_view.
     """
     self.test_util = TestUtils()
     self.mgi = MGI('rdf_graph', True)

     resource_dir = os.path.join(os.path.dirname(__file__), 'resources/mgi')
     self.mgi.rawdir = resource_dir

     self.mgi.idhash['annot']['6901981'] = ':association'
Пример #11
0
    def test_classes_indiv_properties(self):
        """
        Verify the classes, individuals and properties in the CGD graph.

        Given the sample input, the graph is expected to contain:
        A CGD:DiseaseID is an OWL Class
        A CGD:DiseaseID is a subclass of DOID:4
        A CGD:Disease rdfs:label "Adenocarcinoma"
        A CGD:DiseaseInstance is an individual of CGD:DiseaseID
        A CGD:DiseaseInstance rdfs:label "Adenocarcinoma with response {1} to therapy"
        A CGD:DrugID is an OWL Class
        A CGD:DrugID is a subclass of CHEBI:23888
        A CGD:DrugID rdfs:label "5FU-based adjuvant therapy"
        A CGD:RelationID is an object property
        PMID:12345 is a IAO:0000013 (journal article)
        """
        from dipper.utils.TestUtils import TestUtils

        # Make testutils object and load bindings
        test_env = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()

        # Doubled braces are literal braces once format() is applied.
        query_template = """
                       SELECT ?disease ?diseaseInd ?diseaseQual ?drug ?source
                       WHERE {{
                           ?disease a owl:Class ;
                               rdfs:subClassOf DOID:4 ;
                               rdfs:label "{0}" .
                           ?diseaseInd a ?disease ;
                               rdfs:label "{1}" ;
                               BFO:0000159 ?diseaseQual .
                           ?drug a owl:Class ;
                               rdfs:subClassOf CHEBI:23888 ;
                               rdfs:label "{2}" .
                           <{3}> a owl:ObjectProperty .
                           ?source a IAO:0000013 .
                       }}
                       """
        sparql_query = query_template.format(
            self.disease_label,
            self.disease_instance_label,
            self.drug_label,
            self.relationship_uri)

        # Expected Results
        expected_row = [
            self.disease_uri,
            self.disease_ind_uri,
            self.disease_quality_uri,
            self.drug_uri,
            self.source_uri,
        ]
        expected_results = [expected_row]

        # Query graph
        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Пример #12
0
    def setUp(self):
        """Prepare the test helper and one nested annotation fixture."""
        self.test_util = TestUtils()

        # Raw tab-separated line kept alongside the structured fields.
        source_line = (
            'RGD\t2535\tEdnra\t\tMP:0003340\tRGD:1581841|PMID:12799311\t'
            'IED\t\tN\tendothelin receptor type A\t\tgene\ttaxon:10116\t'
            '20061026\tRGD\t\t\n')

        self.test_set_1 = {
            'aspect': 'N',
            'date': '2006-10-26',
            'evidence': {
                'has_supporting_reference': ['RGD:1581841', 'PMID:12799311'],
                'type': 'IED',
                'with_support_from': [],
            },
            'negated': False,
            'object': {'id': 'MP:0003340', 'taxon': 'NCBITaxon:10116'},
            'provided_by': 'RGD',
            'qualifiers': [],
            'relation': {'id': None},
            'source_line': source_line,
            'subject': {
                'fullname': 'endothelin receptor type A',
                'id': 'RGD:2535',
                'label': 'Ednra',
                'synonyms': [],
                'taxon': {'id': 'NCBITaxon:10116'},
                'type': 'gene',
            },
            'subject_extensions': [
                {'filler': '\n', 'property': 'isoform'},
            ],
        }
Пример #13
0
    def test_associations(self):
        """
        Verify the variant-disease association in the CGD graph.

        Given the sample input, the graph is expected to contain:
        CGD:VariantID has_phenotype(RO:0002200) CGD:DiseaseInstance

        A CGD:AssociationID OBO:RO_0002558 Traceable Author Statement (ECO:0000033)
        A CGD:AssociationID dc:source PMID:20498393
        A CGD:AssociationID has_environment CGD:DrugID
        A CGD:AssociationID OBAN:association_has_subject CGD:VariantID
        A CGD:AssociationID OBAN:association_has_object_property has_phenotype
        A CGD:AssociationID OBAN:association_has_object CGD:DiseaseInstance
        """
        from dipper.utils.TestUtils import TestUtils

        # Make testutils object and load bindings
        cu = CurieUtil(self.curie_map)
        test_env = TestUtils(self.cgd.graph)
        self.cgd.load_bindings()

        evidence = 'OBO:ECO_0000033'
        evidence_uri = URIRef(cu.get_uri(evidence))

        # Doubled braces are literal braces once format() is applied.
        query_template = """
                       SELECT ?diseaseInd ?variant ?drug ?vdannot ?source ?evidence
                       WHERE {{
                           ?variant OBO:RO_0002200 ?diseaseInd .

                           ?vdannot a OBAN:association ;
                               OBO:RO_0002558 ?evidence ;
                               dc:source ?source ;
                               <{0}> ?drug ;
                               OBAN:association_has_object ?diseaseInd ;
                               OBAN:association_has_object_property OBO:RO_0002200 ;
                               OBAN:association_has_subject ?variant .
                       }}
                       """
        sparql_query = query_template.format(self.relationship_uri)

        # Expected Results
        expected_row = [
            self.disease_ind_uri,
            self.variant_uri,
            self.drug_uri,
            self.vd_annot_uri,
            self.source_uri,
            evidence_uri,
        ]
        expected_results = [expected_row]

        # Query graph
        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Пример #14
0
    def test_therapeutic_relationship(self):
        """Check the CTD graph for the expected therapeutic annotation.

        Queries for annotations with evidence OBO:ECO_0000033 and predicate
        OBO:RO_0002606, then asserts that the row built from MESH:D009538,
        OMIM:188890 and PMID:16785264 is among the results.
        """
        from dipper.utils.TestUtils import TestUtils

        # Make testutils object and load bindings
        test_query = TestUtils(self.ctd.graph)

        # Expected structure
        sparql_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # SPARQL variables to check
        chem_id = 'MESH:D009538'
        chem_uri = self.graph._getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = self.graph._getNode(disease_id)
        pubmed_id = 'PMID:16785264'
        pubmed_uri = self.graph._getNode(pubmed_id)
        rel_id = self.model.object_properties['substance_that_treats']
        eco = 'ECO:0000033'
        # TODO PYLINT  make_association_id() does not exist in CTD
        # there is "_make_association()" with a different sig

        assoc_id = self.ctd.make_association_id('ctd', chem_id, rel_id,
                                                disease_id, eco, pubmed_id)
        assoc_uri = self.graph._getNode(assoc_id)

        # Expected output from query
        expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(expected_output, sparql_output)

        logger.info("Test finished.")
Пример #15
0
    def setUp(self):
        """Prepare two single-row test sets, their column names and a protein map.

        The protein list is obtained from an Ensembl source via
        fetch_protein_gene_map('9606').
        """
        self.test_util = TestUtils()

        # Both rows carry the same eight numeric columns after the two ids.
        shared_scores = [0, 0, 0, 0, 300, 0, 150, 800]

        # Test set with two proteins from same species
        self.test_set_1 = [
            ['9606.ENSP00000000233', '9606.ENSP00000003084'] + shared_scores]

        # Test set with deprecated protein id
        self.test_set_2 = [
            ['9606.ENSP00000000233', '9606.ENSP00000006101'] + shared_scores]

        self.columns = [
            'protein1',
            'protein2',
            'neighborhood',
            'fusion',
            'cooccurence',
            'coexpression',
            'experimental',
            'database',
            'textmining',
            'combined_score',
        ]

        self.protein_list = Ensembl(
            'rdf_graph', True).fetch_protein_gene_map('9606')
Пример #16
0
    def test_amino_acid_position_region_model(self):
        """
        Test modelling of amino acid positions.

        Uses test data set 1 and add_variant_info_to_graph().
        The following triples are expected:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 741
        CGD:BothStrandPositionID faldo:reference UniProtID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Make testutils object and load bindings
        test_env = TestUtils(self.cgd.graph)
        cu = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Name the first eleven columns of the first fixture row.
        leading_columns = self.test_set_1[0][0:11]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = leading_columns

        position = 741
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        uniprot_curie = "UniProtKB:Q99062#Q99062-1"
        uniprot_id = "Q99062#Q99062-1"

        region_id = ":_{0}{1}{2}Region".format(
            position, position, uniprot_curie)
        both_strand_id = ":_{0}-{1}".format(uniprot_id, position)

        region_uri = URIRef(cu.get_uri(region_id))
        both_strand_uri = URIRef(cu.get_uri(both_strand_id))
        uniprot_uri = URIRef(cu.get_uri(uniprot_curie))

        # Doubled braces are literal braces once format() is applied.
        query_template = """
                       SELECT ?region ?bsPosition ?protein
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?protein .
                       }}
                       """
        sparql_query = query_template.format(position)

        # Expected Results
        expected_results = [[region_uri, both_strand_uri, uniprot_uri]]

        # Query graph
        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Пример #17
0
    def setUp(self):
        """Prepare the association curie, one 28-column row and test IRIs."""
        self.test_util = TestUtils()
        self.assoc_curie = 'MONARCH:test_association'
        self.eco_id = 'ECO:0000015'

        # Each value is tagged with its column number and header name.
        self.test_set_1 = (
            'MGI:1920145',              # 01 marker_accession_id
            'Setd5',                    # 02 marker_symbol
            'WTSI',                     # 03 phenotyping_center
            'MEFW',                     # 04 colony_raw
            'male',                     # 05 sex
            'heterozygote',             # 06 zygosity
            'MGI:4432631',              # 07 allele_accession_id
            'Setd5<tm1a(EUCOMM)Wtsi>',  # 08 allele_symbol
            # 09 allele_name
            'targeted mutation 1a, Wellcome Trust Sanger Institute',
            'MGI:2159965',              # 10 strain_accession_id
            'C57BL/6N',                 # 11 strain_name
            'MGP',                      # 12 project_name
            # 13 project_fullname
            'Wellcome Trust Sanger Institute Mouse Genetics Project',
            'MGP Select Pipeline',      # 14 pipeline_name
            'MGP_001',                  # 15 pipeline_stable_id
            'MGP_XRY_001',              # 16 procedure_stable_id
            'X-ray',                    # 17 procedure_name
            'IMPC_XRY_008_001',         # 18 parameter_stable_id
            'Number of ribs right',     # 19 parameter_name
            'MP:0005390',               # 20 top_level_mp_term_id
            'skeleton phenotype',       # 21 top_level_mp_term_name
            'MP:0000480',               # 22 mp_term_id
            'increased rib number',     # 23 mp_term_name
            '1.637023E-010',            # 24 p_value
            '',                         # 25 percentage_change
            '8.885439E-007',            # 26 effect_size
            # 27 statistical_method
            'Wilcoxon rank sum test with continuity correction',
            'IMPC',                     # 28 resource_name
        )

        # Generate test curies, these are otherwise generated
        # within _add_evidence() and _add_study_provenance()
        # these blank nodes are hardcoded as NOT Skolemized  ...
        self.study_curie = "_:study"
        self.evidence_curie = "_:evidence"

        # IRIs for testing sparql output
        curie_util = CurieUtil(curie_map.get())
        assoc_uri_text = curie_util.get_uri(self.assoc_curie)
        self.assoc_iri = URIRef(assoc_uri_text)
Пример #18
0
    def test_variant_position_region_model(self):
        """
        Test modelling of variant positions on a transcript.

        Uses test data set 2 and add_variant_info_to_graph().
        The following triples are expected:

        CGD:RegionID is an instance of faldo:Region
        CGD:RegionID faldo:begin BothStrandPositionID
        CGD:RegionID faldo:end BothStrandPositionID

        CGD:BothStrandPositionID is an instance of faldo:BothStrandPosition
        CGD:BothStrandPositionID is an instance of faldo:Position
        CGD:BothStrandPositionID faldo:position 944
        CGD:BothStrandPositionID faldo:reference CGD:TranscriptID
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Make testutils object and load bindings
        test_env = TestUtils(self.cgd.graph)
        cu = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        # Name every column of the first fixture row.
        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        transcript_curie = self.cgd._make_transcript_curie(transcript_id)
        ccds_id = "35166.1"
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        region_id = ":_{0}Region".format(transcript_curie)
        both_strand_id = ":_{0}-{1}".format(ccds_id, bp_pos)

        region_uri = URIRef(cu.get_uri(region_id))
        both_strand_uri = URIRef(cu.get_uri(both_strand_id))
        ccds_uri = URIRef(cu.get_uri(transcript_curie))

        # Doubled braces are literal braces once format() is applied.
        query_template = """
                       SELECT ?region ?bsPosition ?transcript
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?bsPosition ;
                               faldo:end ?bsPosition .

                           ?bsPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?transcript .
                       }}
                       """
        sparql_query = query_template.format(bp_pos)

        # Expected Results
        expected_results = [[region_uri, both_strand_uri, ccds_uri]]

        # Query graph
        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Пример #19
0
    def test_genome_build_chromosome_model(self):
        """
        Test modelling of genome, builds, and chromosomes.

        Uses test data set 2 and add_variant_info_to_graph(), then
        queries the graph for the genome class, the chromosome class,
        the build and the chromosome-on-build, and compares the single
        expected row with the query output.
        """
        from dipper.utils.TestUtils import TestUtils
        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Make testutils object and load bindings
        test_env = TestUtils(self.cgd.graph)
        cu = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        genome = ":9606genome"
        genome_label = "Human genome"
        chromosome = "CHR:9606chr9"
        chromosome_label = "chr9 (Human)"
        build_curie = "UCSC:hg19"
        build_label = "hg19"
        chrom_on_build = ":MONARCH_hg19chr9"
        chrom_build_label = "chr9 (hg19)"

        genome_uri = URIRef(cu.get_uri(genome))
        chromosome_uri = URIRef(cu.get_uri(chromosome))
        build_uri = URIRef(cu.get_uri(build_curie))
        chrom_on_build_uri = URIRef(cu.get_uri(chrom_on_build))

        # An earlier, stricter form of this query also required the genome
        # class to carry OBO:RO_0002162 / OBO:RO_0002351 links, the chromosome
        # class to carry OBO:RO_0002350 back to the genome, and the build to
        # be rdfs:subClassOf the genome. It used to live here as an unused
        # string literal; only the query that is actually run is kept.
        sparql_query = """
                       SELECT ?genome ?chromosome ?build ?chromOnBuild
                       WHERE {{
                           ?genome a owl:Class ;
                               rdfs:label "{0}" ;
                               rdfs:subClassOf OBO:SO_0001026 .

                           ?chromosome a owl:Class ;
                               rdfs:label "{1}" ;
                               rdfs:subClassOf OBO:SO_0000340 .

                           ?build a OBO:SO_0001505 ;
                               a ?genome ;
                               rdfs:label "{2}" ;
                               OBO:RO_0002162 OBO:NCBITaxon_9606 ;
                               OBO:RO_0002351 ?chromOnBuild .

                           ?chromOnBuild a ?chromosome ;
                               a OBO:SO_0000340 ;
                               rdfs:label "{3}" ;
                               OBO:RO_0002350 ?build .
                       }}
                       """.format(genome_label, chromosome_label, build_label,
                                  chrom_build_label)

        # Expected Results
        expected_results = [[
            genome_uri, chromosome_uri, build_uri, chrom_on_build_uri
        ]]

        # Query graph
        sparql_output = test_env.query_graph(sparql_query)

        self.assertEqual(expected_results, sparql_output)
Пример #20
0
def main():
    """
    Command-line entry point for Dipper.

    Parses the command-line arguments and configures logging, then does one
    of two things:

    * with ``--query``: for every requested source, imports the source
      class, passes it to ``TestUtils.check_query_syntax()`` and
      ``TestUtils.load_graph_from_turtle()``, then prints the query result
      and exits with status 0;
    * otherwise: for every requested source in turn, instantiates the
      source class, fetches it (unless ``--parse_only``), runs its test
      suite (unless ``--no_verify``/``--skip_tests``), and, unless
      ``--test_only`` or ``--fetch_only`` is set, parses it and, for the
      ``rdf_graph`` graph type, adds property axioms and writes it out in
      the requested serialization format.

    Exits with status 1 when an unknown source name or an unsupported
    ``--dest_fmt`` value is supplied.
    """
    # Function-scope import: only needed here for explicit exit codes.
    import sys

    # TODO this should be generated by looking in the dipper/sources directory
    # or read from a sources/dataset/config yaml or dir of yamls
    source_to_class_map = {
        # 'facebase_alpha': 'FaceBase_alpha',
        'hpoa': 'HPOAnnotations',   # ~3 min
        'zfin': 'ZFIN',
        'omim': 'OMIM',  # full file takes ~15 min, due to required throttling
        'biogrid': 'BioGrid',  # interactions file takes <10 minutes
        'mgi': 'MGI',
        'impc': 'IMPC',
        # Panther takes ~1hr to map 7 species-worth of associations
        'panther': 'Panther',
        'oma': 'OMA',
        'ncbigene': 'NCBIGene',  # takes about 4 minutes to process 2 species
        'ucscbands': 'UCSCBands',
        'ctd': 'CTD',
        'genereviews': 'GeneReviews',
        'eom': 'EOM',  # Takes about 5 seconds.
        'coriell': 'Coriell',
        # 'clinvar': 'ClinVar',                   # takes ~ half hour
        # 'clinvarxml_alpha': 'ClinVarXML_alpha', # takes ~ five minutes
        'monochrom': 'Monochrom',
        'kegg': 'KEGG',
        'animalqtldb': 'AnimalQTLdb',
        'ensembl': 'Ensembl',
        'hgnc': 'HGNC',
        'orphanet': 'Orphanet',
        'omia': 'OMIA',
        'flybase': 'FlyBase',
        'mmrrc': 'MMRRC',
        'wormbase': 'WormBase',
        'mpd': 'MPD',
        'gwascatalog': 'GWASCatalog',
        'monarch': 'Monarch',
        'go': 'GeneOntology',
        'reactome': 'Reactome',
        'udp': 'UDP',
        'mgi-slim': 'MGISlim',
        'zfin-slim': 'ZFINSlim',
        'bgee': 'Bgee',
        'mydrug': 'MyDrug',
        'stringdb': 'StringDB',
        'rgd': 'RGD',
        'sgd': 'SGD'
    }

    logger = logging.getLogger(__name__)

    parser = argparse.ArgumentParser(
        description='Dipper: Data Ingestion Pipeline for SciGraph',
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        '-g', '--graph', type=str, default="rdf_graph",
        help='graph type: rdf_graph, streamed_graph')
    parser.add_argument(
        '-s', '--sources', type=str, required=True,
        help='comma separated list of sources')
    parser.add_argument(
        '-l', '--limit', type=int,
        help='limit number of rows')
    parser.add_argument(
        '--parse_only', action='store_true',
        help='parse files without writing')
    parser.add_argument(
        '--fetch_only', action='store_true',
        help='fetch sources without parsing')
    parser.add_argument('-f', '--force', action='store_true',
                        help='force re-download of files')
    parser.add_argument(
        '--no_verify',
        help='ignore the verification step', action='store_true')
    parser.add_argument('--query', help='enter in a sparql query', type=str)
    parser.add_argument(
        '-q', '--quiet',
        help='turn off info logging', action="store_true")
    parser.add_argument(
        '--debug', help='turn on debug logging', action="store_true")
    parser.add_argument(
        '--skip_tests', help='skip any testing', action="store_true")

    # Blank Nodes can't be visualized in Protege, default to Skolemizing them
    parser.add_argument(
        '-b', '--use_bnodes',
        help="use blank nodes instead of skolemizing", action="store_true",
        default=False)

    # TODO this should live in a global data file
    #   and the same filter be applied to all sources
    parser.add_argument(
        '-t', '--taxon', type=str,
        help='Add a taxon constraint on a source. Enter 1+ NCBITaxon numbers,'
        ' comma delimited\n'
        'Implemented taxa per source\n'
        'NCBIGene: 9606,10090,7955\n'
        'Panther: 9606,10090,10116,7227,7955,6239,8355\n'
        'BioGrid: 9606,10090,10116,7227,7955,6239,8355\n'
        'UCSCBands: 9606\n'
        'GO: 9606,10090,10116,7227,7955,6239,9615,9823,9031,9913')
    parser.add_argument(
        '-o', '--test_only',
        help='only process and output the pre-configured test subset',
        action="store_true")

    parser.add_argument(
        '--dest_fmt',
        help='serialization format: [turtle], nt, nquads, rdfxml, n3, raw',
        type=str)

    parser.add_argument(
        '--version', '-v',
        help='version of source',
        type=str)

    args = parser.parse_args()
    tax_ids = None
    if args.taxon is not None:
        tax_ids = [int(t) for t in args.taxon.split(',')]

    # Source classes (not taxa) that accept a ``tax_ids`` constraint
    taxa_supported = [
        'Panther', 'NCBIGene', 'BioGrid', 'UCSCBands', 'GeneOntology',
        'Bgee', 'Ensembl', 'StringDB', 'OMA']

    # Every accepted --dest_fmt spelling mapped to the name handed to write()
    format_aliases = {
        'turtle': 'turtle', 'ttl': 'turtle',
        'ntriples': 'nt', 'nt': 'nt',
        'nquads': 'nquads', 'nq': 'nquads',
        'rdfxml': 'rdfxml', 'xml': 'rdfxml',
        'notation3': 'n3', 'n3': 'n3',
        'raw': 'raw'}

    if args.quiet:
        log_level = logging.ERROR
    elif args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(level=log_level)

    if not args.use_bnodes:
        logger.info("Will Skolemize Blank Nodes")

    # Validate every requested source up front so that a typo fails fast
    # with a readable message instead of a KeyError part-way through a run.
    requested_sources = [name.strip() for name in args.sources.split(',')]
    unknown_sources = [
        name for name in requested_sources
        if name.lower() not in source_to_class_map]
    if unknown_sources:
        logger.error(
            "Unknown source(s): %s; valid sources are: %s",
            ', '.join(repr(name) for name in unknown_sources),
            ', '.join(sorted(source_to_class_map)))
        sys.exit(1)

    if args.query is not None:
        test_query = TestUtils()
        for source in requested_sources:
            # The map holds class *names* (strings); they must be imported,
            # not called.
            class_name = source_to_class_map[source.lower()]
            source_class = _load_source_class(class_name)

            test_query.check_query_syntax(args.query, source_class)
            test_query.load_graph_from_turtle(source_class)

        print(test_query.query_graph(args.query, True))
        sys.exit(0)

    run_tests = not (args.no_verify or args.skip_tests)

    # run initial tests
    if run_tests:
        # NOTE(review): ``test_suite`` is not defined in this function; it is
        # presumably a module-level name - confirm.
        unittest.TextTestRunner(verbosity=2).run(test_suite)

    # set serializer
    if args.dest_fmt is None:
        args.dest_fmt = 'turtle'
    elif args.dest_fmt in format_aliases:
        args.dest_fmt = format_aliases[args.dest_fmt]
    else:
        logger.error(
            "You have specified an invalid serializer: %s", args.dest_fmt)
        # A rejected argument is a failure, so report a non-zero status.
        sys.exit(1)

    # iterate through all the sources
    for source in requested_sources:
        logger.info("\n******* %s *******", source)
        source = source.lower()
        src = source_to_class_map[source]
        source_class = _load_source_class(src)

        # arg factory
        source_args = {
            'graph_type': args.graph,
            'are_bnodes_skolemized': not args.use_bnodes,
        }
        if src in taxa_supported:
            source_args['tax_ids'] = tax_ids
        if args.version:
            source_args['version'] = args.version

        mysource = source_class(**source_args)
        if not args.parse_only:
            # time.clock() was removed in Python 3.8; perf_counter() is the
            # documented replacement for measuring elapsed time.
            start_fetch = time.perf_counter()
            mysource.fetch(args.force)
            end_fetch = time.perf_counter()
            logger.info("Fetching time: %d sec", end_fetch-start_fetch)

        mysource.settestonly(args.test_only)

        # run tests first
        if run_tests:
            suite = mysource.getTestSuite()
            if suite is None:
                logger.warning(
                    "No tests configured for this source: %s", source)
            else:
                unittest.TextTestRunner(verbosity=2).run(suite)
        else:
            logger.info("Skipping Tests for source: %s", source)

        if not args.test_only and not args.fetch_only:
            start_parse = time.perf_counter()
            mysource.parse(args.limit)
            end_parse = time.perf_counter()
            logger.info("Parsing time: %d sec", end_parse-start_parse)
            if args.graph == 'rdf_graph':
                logger.info("Found %d nodes", len(mysource.graph))

                # Add property axioms
                start_axiom_exp = time.perf_counter()
                logger.info("Adding property axioms")

                properties = GraphUtils.get_properties_from_graph(
                    mysource.graph)
                GraphUtils.add_property_axioms(mysource.graph, properties)
                end_axiom_exp = time.perf_counter()
                logger.info("Property axioms added: %d sec",
                            end_axiom_exp-start_axiom_exp)

                start_write = time.perf_counter()
                mysource.write(fmt=args.dest_fmt)
                end_write = time.perf_counter()
                logger.info("Writing time: %d sec", end_write-start_write)
        # if args.no_verify is not True:

        #    status = mysource.verify()
        #    if status is not True:
        #        logger.error(
        #            'Source %s did not pass verification tests.', source)
        #        exit(1)
        # else:
        #    logger.info('skipping verification step')
        logger.info('***** Finished with %s *****', source)
    # load configuration parameters
    # for example, keys

    logger.info("All done.")


def _load_source_class(class_name):
    """Import ``dipper.sources.<class_name>`` and return its same-named class."""
    module = importlib.import_module("dipper.sources.{0}".format(class_name))
    return getattr(module, class_name)
Пример #21
0
    def test_chromosome_position_model(self):
        """
        Check the genomic position model written for test data set 2.

        Runs add_variant_info_to_graph() on the data set, then queries the
        graph for a faldo:Region whose begin and end faldo:Position nodes
        carry the row's genome start/end coordinates and reference the same
        chromosome. Exactly one result row with the expected region, start,
        end and chromosome URIs must come back.
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Wrap the populated graph for querying, then load the bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        # The variant identifier is built but does not appear in the
        # expected row of this test.
        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        # CURIEs in SELECT order: region, start, end, chromosome
        expected_curies = (
            ":_{0}{1}Region-{2}-{3}".format(genome_build, chromosome,
                                            genome_pos_start,
                                            genome_pos_end),
            ":_hg19chr9-{0}".format(genome_pos_start),
            ":_hg19chr9-{0}".format(genome_pos_end),
            ":MONARCH_hg19chr9",
        )
        expected_rows = [[
            URIRef(curie_util.get_uri(curie)) for curie in expected_curies
        ]]

        query_template = """
                       SELECT ?region ?startPosition ?endPosition ?chromosome
                       WHERE {{
                           ?region a faldo:Region ;
                               faldo:begin ?startPosition ;
                               faldo:end ?endPosition .

                           ?startPosition a faldo:Position ;
                               faldo:position {0} ;
                               faldo:reference ?chromosome .

                           ?endPosition a faldo:Position ;
                               faldo:position {1} ;
                               faldo:reference ?chromosome .
                       }}
                       """
        sparql_query = query_template.format(genome_pos_start, genome_pos_end)

        # Run the query and compare against the single expected row
        actual_rows = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_rows, actual_rows)
Пример #22
0
    def test_missense_variant_protein_model(self):
        """
        Check the model of a missense variant with protein information only.

        Runs add_variant_info_to_graph() on test data set 1 and queries for
        a variant that:

        * is an instance of OBO:SO_0001059 and OBO:SO_0001583
        * has the variant label of the first row as its rdfs:label
        * points to a gene via OBO:GENO_0000408
        * has a faldo:location region
        * has OBO:GENO_reference_amino_acid "Q"
        * has OBO:GENO_results_in_amino_acid_change "X"
        * points via RO:0002205 to a transcript that is an instance of
          OBO:SO_0000233 and is labelled with the row's transcript id

        Exactly one row (variant, gene, region, transcript) is expected.
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_1)

        # Wrap the populated graph for querying, then load the bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        leading_fields = self.test_set_1[0][0:11]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = leading_fields

        gene_id = self.cgd.gene_map[transcript_gene]

        # Fixed values expected for this test record
        ref_amino_acid, altered_amino_acid = "Q", "X"
        position = 741
        uniprot_curie = "UniProtKB:Q99062#Q99062-1"
        transcript = "CCDS:413.1"

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))
        region_id = ":_{0}{1}{2}Region".format(position, position,
                                               uniprot_curie)

        def to_uri(curie):
            # Expand a CURIE with the test's curie map and wrap it as a URIRef
            return URIRef(curie_util.get_uri(curie))

        variant_uri = to_uri(variant_id)
        transcript_uri = to_uri(transcript)
        gene_uri = to_uri(gene_id)
        region_uri = to_uri(region_id)

        query_template = """
                       SELECT ?variant ?gene ?region ?transcript
                       WHERE {{
                           ?variant a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               rdfs:label "{0}" ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?region ;
                               OBO:GENO_reference_amino_acid "{1}" ;
                               OBO:GENO_results_in_amino_acid_change "{2}" ;
                               RO:0002205 ?transcript .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{3}" .
                       }}
                       """
        sparql_query = query_template.format(
            variant_label, ref_amino_acid, altered_amino_acid, transcript_id)

        # One row, in SELECT order
        expected_rows = [
            [variant_uri, gene_uri, region_uri, transcript_uri],
        ]

        actual_rows = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_rows, actual_rows)
Пример #23
0
 def setUp(self):
     """Store a new TestUtils instance on ``self.test_util``."""
     self.test_util = TestUtils()
Пример #24
0
    def test_missense_variant_cdna_model(self):
        """
        Check the model of a missense variant that has cDNA information.

        Runs add_variant_info_to_graph() on test data set 2 and queries for
        a single node (bound to ?cosmic) that:

        * is an instance of OBO:SO_0001059 and OBO:SO_0001583
        * points to a gene via OBO:GENO_0000408
        * has three faldo:location regions (amino acid, cDNA, chromosomal)
        * has OBO:GENO_reference_amino_acid "T" and
          OBO:GENO_results_in_amino_acid_change "I"
        * has OBO:GENO_reference_nucleotide / OBO:GENO_altered_nucleotide
          equal to the row's reference and variant bases
        * is owl:sameAs a dbSNP node labelled with the row's dbSNP id
        * points via RO:0002205 to a transcript

        The transcript must be an instance of OBO:SO_0000233, carry the
        row's transcript id as label, and point via OBO:RO_0002513 to two
        OBO:SO_0000104 nodes labelled "P00519-1" and "NP_005148.2", the
        latter being owl:sameAs the former. Each region's faldo:begin
        coordinate must carry the matching faldo:position (amino acid
        position, cDNA base position, genome start).

        Exactly one result row is expected.
        """
        from dipper.utils.TestUtils import TestUtils

        self.cgd.add_variant_info_to_graph(self.test_set_2)

        # Wrap the populated graph for querying, then load the bindings
        graph_tester = TestUtils(self.cgd.graph)
        curie_util = CurieUtil(self.curie_map)
        self.cgd.load_bindings()

        first_row = self.test_set_2[0]
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna, cosmic_id,
         db_snp_id, genome_pos_start, genome_pos_end, ref_base, variant_base,
         primary_transcript_exons, primary_transcript_variant_sub_types,
         variant_type, chromosome, genome_build, build_version,
         build_date) = first_row

        gene_id = self.cgd.gene_map[transcript_gene]

        # Fixed values expected for this test record
        ref_amino_acid, altered_amino_acid = "T", "I"
        position = 315
        uniprot_id = "P00519#P00519-1"
        uniprot_curie = "UniProtKB:P00519#P00519-1"
        transcript_curie = "CCDS:35166.1"
        ccds_id = "35166.1"
        chromosome_curie = "hg19chr9"

        variant_id = self.cgd.make_cgd_id('variant{0}'.format(variant_key))

        # (key, CURIE) pairs; the variant entry is resolved to a URI like
        # the others even though it is not part of the expected row.
        curies = [
            ('variant', variant_id),
            ('transcript', transcript_curie),
            ('gene', gene_id),
            ('db_snp', "dbSNP:121913459"),
            ('cosmic', "COSMIC:12560"),
            ('uniprot', uniprot_curie),
            ('refseq', "NCBIProtein:NP_005148.2"),
            ('aa_region', ":_{0}{1}{2}Region".format(position, position,
                                                     uniprot_curie)),
            ('cdna_region', ":_{0}Region".format(transcript_curie)),
            ('chr_region', ":_{0}{1}Region-{2}-{3}".format(
                genome_build, chromosome, genome_pos_start, genome_pos_end)),
            ('aa_coord', ":_{0}-{1}".format(uniprot_id, position)),
            ('cdna_coord', ":_{0}-{1}".format(ccds_id, bp_pos)),
            ('chr_coord', ":_{0}-{1}".format(chromosome_curie,
                                             genome_pos_start)),
        ]
        uris = {}
        for key, curie in curies:
            uris[key] = URIRef(curie_util.get_uri(curie))

        query_template = """
                       SELECT ?cosmic ?gene ?aaRegion ?cdnaRegion ?chrRegion
                              ?dbSNP ?transcript ?uniprot ?refseq
                              ?aaCoord ?cdnaCoord ?chrCoord
                       WHERE {{
                           ?cosmic a OBO:SO_0001059;
                               a OBO:SO_0001583 ;
                               OBO:GENO_0000408 ?gene ;
                               faldo:location ?aaRegion ;
                               faldo:location ?cdnaRegion ;
                               faldo:location ?chrRegion ;
                               OBO:GENO_reference_amino_acid "{0}" ;
                               OBO:GENO_reference_nucleotide "{1}" ;
                               OBO:GENO_altered_nucleotide "{2}" ;
                               OBO:GENO_results_in_amino_acid_change "{3}" ;
                               owl:sameAs ?dbSNP ;
                               RO:0002205 ?transcript .

                           ?cosmic owl:sameAs ?dbSNP .

                           ?transcript a OBO:SO_0000233 ;
                               rdfs:label "{4}" ;
                               OBO:RO_0002513 ?uniprot ;
                               OBO:RO_0002513 ?refseq .

                           ?uniprot a OBO:SO_0000104 ;
                               rdfs:label "P00519-1" .

                           ?refseq a OBO:SO_0000104 ;
                               rdfs:label "NP_005148.2" .

                           ?refseq owl:sameAs ?uniprot .

                           ?aaRegion faldo:begin ?aaCoord .
                           ?cdnaRegion faldo:begin ?cdnaCoord .
                           ?chrRegion faldo:begin ?chrCoord .

                           ?aaCoord faldo:position {5} .
                           ?cdnaCoord faldo:position {6} .
                           ?chrCoord faldo:position {7} .

                           ?dbSNP rdfs:label "{8}" .
                       }}
                       """
        sparql_query = query_template.format(
            ref_amino_acid, ref_base, variant_base, altered_amino_acid,
            transcript_id, position, bp_pos, genome_pos_start, db_snp_id)

        # One row, columns in SELECT order
        select_order = (
            'cosmic', 'gene', 'aa_region', 'cdna_region', 'chr_region',
            'db_snp', 'transcript', 'uniprot', 'refseq',
            'aa_coord', 'cdna_coord', 'chr_coord')
        expected_rows = [[uris[key] for key in select_order]]

        actual_rows = graph_tester.query_graph(sparql_query)

        self.assertEqual(expected_rows, actual_rows)