class DatasetTestCase(unittest.TestCase):
    """
    For testing metadata emitted by the Dataset class.

    Dataset creates a graph describing the metadata associated with the
    dataset in question, which should follow the HCLS specification for
    dataset descriptions:
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/
    """

    @classmethod
    def setUpClass(cls):
        cls.curie_map = curiemap.get()

        # parameters passed to code, to be returned in graph
        cls.monarch_archive_curie_prefix = "MonarchArchive"
        cls.identifier = "fakeingest"
        cls.ingest_description = "some ingest description"
        cls.ingest_url = "http://fakeingest.com"
        cls.ingest_title = "this ingest title"
        cls.ingest_logo_url = "logo.png"
        cls.license_url = "https://choosealicense.com/licenses/mit/"
        cls.license_url_default = \
            "https://project-open-data.cio.gov/unknown-license/"
        cls.data_rights = "https://www.gnu.org/licenses/gpl-3.0.html"
        cls.distribution_type = "ttl"

        # parse test graph once, to test triples counts/statistics below
        cls.test_ttl = "tests/resources/fakeingest/test_graph_simple.ttl"
        cls.test_graph = RDFGraph()
        cls.test_graph.parse(cls.test_ttl, format="turtle")

        # expected things:
        cls.expected_curie_prefix = "MonarchArchive"
        cls.timestamp_date = datetime.today().strftime("%Y%m%d")

        cls.base_cito = 'http://purl.org/spar/cito/'
        cls.base_dcat = 'http://www.w3.org/ns/dcat#'
        cls.base_dcterms = 'http://purl.org/dc/terms/'
        cls.base_dctypes = 'http://purl.org/dc/dcmitype/'
        cls.base_pav = 'http://purl.org/pav/'
        cls.base_rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        cls.base_rdfs = 'http://www.w3.org/2000/01/rdf-schema#'
        cls.base_schema = 'http://schema.org/'
        cls.base_void = 'http://rdfs.org/ns/void#'
        cls.base_owl = 'http://www.w3.org/2002/07/owl#'
        cls.base_logo_url = \
            "https://github.com/monarch-initiative/monarch-ui/blob/master/" \
            "public/img/sources/"

        # expected summary level IRI
        cls.summary_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            "#" + cls.identifier)

        # expected version level IRI
        cls.data_release_version = "19700101"
        cls.version_level_IRI = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            cls.data_release_version + "/" + "#" + cls.identifier)
        cls.version_level_IRI_default_version = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            cls.timestamp_date + "/" + "#" + cls.identifier)

        # expected distribution level IRI (for ttl resource)
        cls.distribution_level_IRI_ttl = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            cls.data_release_version + "/rdf/" +
            cls.identifier + "." + cls.distribution_type)
        cls.distribution_level_IRI_ttl_default_version = URIRef(
            cls.curie_map.get(cls.expected_curie_prefix) +
            cls.timestamp_date + "/rdf/" +
            cls.identifier + "." + cls.distribution_type)

        # set expected IRIs for predicates and other things
        cls.iri_rdf_type = URIRef(cls.base_rdf + "type")
        cls.iri_title = URIRef(cls.base_dcterms + "title")
        cls.iri_dataset = URIRef(cls.base_dctypes + "Dataset")
        cls.iri_description = URIRef(cls.base_dcterms + "description")
        cls.iri_publisher = URIRef(cls.base_dcterms + "Publisher")
        cls.iri_source = URIRef(cls.base_dcterms + "source")
        cls.iri_logo = URIRef(cls.base_schema + "logo")
        cls.iri_mi_org = URIRef("https://monarchinitiative.org/")
        cls.iri_created = URIRef(cls.base_dcterms + "created")
        cls.iri_version = URIRef(cls.base_pav + "version")
        cls.iri_retrieved_on = URIRef(cls.base_pav + "retrievedOn")
        cls.iri_creator = URIRef(cls.base_dcterms + "creator")
        cls.iri_is_version_of = URIRef(cls.base_dcterms + "isVersionOf")
        cls.iri_distribution = URIRef(cls.base_dcat + "Distribution")
        cls.iri_created_with = URIRef(cls.base_pav + "createdWith")
        cls.iri_format = URIRef(cls.base_dcterms + "format")
        cls.iri_download_url = URIRef(cls.base_dcterms + "downloadURL")
        cls.iri_license = URIRef(cls.base_dcterms + "license")
        cls.iri_data_rights = URIRef(cls.base_dcterms + "rights")
        cls.iri_cites_as_authority = URIRef(cls.base_cito + "citesAsAuthority")
        cls.iri_rdfs_label = URIRef(cls.base_rdfs + "label")
        cls.iri_owl_ontology = URIRef(cls.base_owl + "Ontology")
        cls.iri_owl_version_iri = URIRef(cls.base_owl + "versionIRI")
        cls.iri_owl_version_info = URIRef(cls.base_owl + "versionInfo")
        cls.iri_returned_logo = URIRef(cls.base_logo_url + cls.ingest_logo_url)
        cls.iri_expected_download_url_value = URIRef(
            cls.curie_map.get("MonarchArchive") +
            cls.data_release_version + "/rdf/" +
            cls.identifier + "." + cls.distribution_type)
        cls.iri_dipper = URIRef("https://github.com/monarch-initiative/dipper")
        cls.iri_ttl_spec = URIRef("https://www.w3.org/TR/turtle/")

    @classmethod
    def tearDownClass(cls):
        pass

    def setUp(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=self.data_release_version,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=self.license_url,
            data_rights=self.data_rights)

        # put all triples in a list for debugging below
        self.all_triples = list(self.dataset.graph.triples((None, None, None)))

    def tearDown(self):
        pass

    def test_dataset_has_graph(self):
        self.assertIsInstance(self.dataset.graph, Graph,
                              "dataset doesn't contain an RDF graph")

    def test_get_graph(self):
        self.assertIsInstance(self.dataset.get_graph(), RDFGraph,
                              "get_graph() didn't return an RDF graph")

    def test_get_license(self):
        gpl2_iri = "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"
        self.dataset.license_url = gpl2_iri
        self.assertEqual(self.dataset.get_license(), gpl2_iri,
                         "set_license didn't set license_url correctly")

    def test_set_citation(self):
        citation_iri = \
            "http://purl.obolibrary.org/obo/uberon/releases/2016-01-26/uberon.owl"
        self.dataset.set_citation(citation_iri)
        self.assertTrue(self.dataset.citation.issuperset([citation_iri]))
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI,
             URIRef(self.iri_cites_as_authority),
             URIRef(citation_iri))))
        self.assertTrue(len(triples) == 1, "missing citation triple")

    def test_set_ingest_source_file_version_num(self):
        this_version = "version1234"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_num(
            file_iri, this_version)
        triples = list(self.dataset.graph.triples(
            (URIRef(file_iri), self.iri_version, Literal(this_version))))
        self.assertTrue(len(triples) == 1,
                        "ingest source file version not set")

    def test_set_ingest_source_file_version_date(self):
        this_version = "1970-01-01"
        file_iri = "http://somefilesource.org/file.txt"
        self.dataset.set_ingest_source_file_version_date(
            file_iri, this_version)
        triples = list(self.dataset.graph.triples(
            (URIRef(file_iri), self.iri_version,
             Literal(this_version, datatype=XSD.date))))
        self.assertTrue(
            len(triples) == 1,
            "ingest source file version not set with literal type of date")

    #
    # Test summary level triples:
    #
    def test_summary_level_type(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1,
                        "missing summary level type triple")

    def test_summary_level_title(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_title,
             Literal(self.ingest_title))))
        self.assertTrue(len(triples) == 1,
                        "missing summary level title triple")

    def test_summary_level_description(self):
        # by default, description is the class's docstring
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_description,
             Literal(self.ingest_description))))
        self.assertTrue(len(triples) == 1,
                        "missing summary level description triple")

    def test_summary_level_publisher(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(len(triples) == 1,
                        "missing summary level publisher triple")

    def test_summary_level_source_web_page(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_source,
             URIRef(self.ingest_url))))
        self.assertTrue(len(triples) == 1,
                        "missing summary level source page triple")

    def test_summary_level_source_logo(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_logo,
             URIRef(self.iri_returned_logo))))
        self.assertTrue(len(triples) == 1,
                        "missing summary level source logo triple")

    def test_summary_level_ontology_type_declaration(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_rdf_type,
             self.iri_owl_ontology)))
        self.assertTrue(len(triples) == 1,
                        "missing summary level owl ontology type triple")

    def test_summary_level_owl_version_iri(self):
        triples = list(self.dataset.graph.triples(
            (self.summary_level_IRI, self.iri_owl_version_iri, None)))
        self.assertTrue(len(triples) == 1,
                        "missing summary level owl version iri triple")
        self.assertEqual(triples[0][2], URIRef(self.version_level_IRI),
                         "owl version iri triple has the wrong object")

    #
    # Test version level resource triples:
    #
    def test_version_level_type(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_rdf_type, self.iri_dataset)))
        self.assertTrue(len(triples) == 1,
                        "missing version level type triple")

    def test_version_level_title(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_title, None)))
        self.assertTrue(len(triples) == 1,
                        "missing version level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " Monarch version " +
                    self.data_release_version),
            "version level title triple has wrong value")

    def test_version_level_description(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_description,
             Literal(self.ingest_description))))
        self.assertTrue(len(triples) == 1,
                        "missing version level description triple")

    def test_version_level_created(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_created, None)))
        self.assertTrue(len(triples) == 1,
                        "didn't get exactly 1 version level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date),
            "version level created triple has the wrong timestamp")

    def test_version_level_version_default(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.data_release_version, datatype=XSD.date),
            "version level version triple (default) has the wrong timestamp")

    def test_version_level_version_set_explicitly(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=self.data_release_version,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=None,
            data_rights=self.data_rights)
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly one version level version triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.data_release_version, datatype=XSD.date),
            "version level version triple (set explicitly) is wrong")

    def test_version_level_creator(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_creator, self.iri_mi_org)))
        self.assertTrue(len(triples) == 1,
                        "missing version level creator triple")

    def test_version_level_publisher(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_publisher, self.iri_mi_org)))
        self.assertTrue(len(triples) == 1,
                        "missing version level publisher triple")

    def test_version_level_isVersionOf(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_is_version_of,
             self.summary_level_IRI)))
        self.assertTrue(len(triples) == 1,
                        "missing version level isVersionOf triple")

    def test_version_level_distribution(self):
        triples = list(self.dataset.graph.triples(
            (self.version_level_IRI, self.iri_distribution,
             self.distribution_level_IRI_ttl)))
        self.assertTrue(len(triples) == 1,
                        "missing version level distribution triple")

    #
    # Test distribution level triples:
    #
    def test_distribution_level_dataset_type(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_rdf_type,
             self.iri_dataset)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level dataset type triple")

    def test_distribution_level_distribution_type(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_rdf_type,
             self.iri_distribution)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level distribution type triple")

    def test_distribution_level_title(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_title, None)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level title triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.ingest_title + " distribution " +
                    self.distribution_type),
            "distribution level title triple has wrong value")

    def test_distribution_level_description(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_description,
             Literal(self.ingest_description))))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level description triple")

    def test_distribution_level_created(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_created, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level created triple")
        self.assertEqual(
            triples[0][2],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))

    def test_distribution_level_version(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_version, None)))
        self.assertTrue(
            len(triples) == 1,
            "didn't get exactly 1 distribution level version triple")
        self.assertEqual(
            triples[0][2],
            Literal(self.data_release_version, datatype=XSD.date))

    def test_distribution_level_creator(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_creator,
             self.iri_mi_org)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level creator triple")

    def test_distribution_level_publisher(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_publisher,
             self.iri_mi_org)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level publisher triple")

    def test_distribution_level_created_with(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_created_with,
             self.iri_dipper)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level createdWith triple")

    def test_distribution_level_format(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_format,
             self.iri_ttl_spec)))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level format triple")

    def test_distribution_level_download_url(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_download_url, None)))
        self.assertTrue(len(triples) == 1,
                        "didn't get exactly 1 downloadURL triple")
        self.assertEqual(triples[0][2], self.iri_expected_download_url_value,
                         "didn't get the expected downloadURL value")

    def test_distribution_level_license_url(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_license,
             URIRef(self.license_url))))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level license triple")

    def test_distribution_level_data_rights(self):
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl, self.iri_data_rights,
             URIRef(self.data_rights))))
        self.assertTrue(len(triples) == 1,
                        "missing distribution level data rights triple")

    def test_distribution_level_no_license_url_default_value(self):
        self.dataset = Dataset(
            identifier=self.identifier,
            data_release_version=None,
            ingest_name=self.identifier,
            ingest_title=self.ingest_title,
            ingest_url=self.ingest_url,
            ingest_logo=self.ingest_logo_url,
            ingest_description=self.ingest_description,
            license_url=None,
            data_rights=self.data_rights)
        triples = list(self.dataset.graph.triples(
            (self.distribution_level_IRI_ttl_default_version,
             self.iri_license, URIRef(self.license_url_default))))
        self.assertTrue(len(triples) == 1,
                        "distribution level default license triple not set")
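
# ----------------------------------------------------------------------
# A minimal standalone sketch (not part of the test suite above) of the
# three-level HCLS dataset-description shape these tests assert: a
# summary-level dataset, a version-level dataset linked back via
# dct:isVersionOf, and a distribution linked from the version level.
# The base IRI below is a hypothetical placeholder, not the real
# MonarchArchive prefix; note the tests above use dcat:Distribution
# (capitalized) as the version-to-distribution predicate, so this
# sketch mirrors that.

from rdflib import RDF, Graph, Namespace, URIRef

DCAT = Namespace('http://www.w3.org/ns/dcat#')
DCTERMS = Namespace('http://purl.org/dc/terms/')
DCTYPES = Namespace('http://purl.org/dc/dcmitype/')

base = 'https://archive.example.org/'  # hypothetical archive base IRI
summary_iri = URIRef(base + '#fakeingest')
version_iri = URIRef(base + '19700101/#fakeingest')
distribution_iri = URIRef(base + '19700101/rdf/fakeingest.ttl')

g = Graph()
g.add((summary_iri, RDF.type, DCTYPES.Dataset))
g.add((version_iri, RDF.type, DCTYPES.Dataset))
g.add((version_iri, DCTERMS.isVersionOf, summary_iri))
g.add((version_iri, DCAT.Distribution, distribution_iri))
g.add((distribution_iri, RDF.type, DCAT.Distribution))

# the same one-triple checks the tests make:
assert len(list(g.triples((version_iri, DCTERMS.isVersionOf, None)))) == 1
assert len(list(g.triples((version_iri, DCAT.Distribution, None)))) == 1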
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM, plus inspect the
    GeneReviews (html) books to pull the clinical descriptions in order
    to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping
    classes over OMIM disease ids (gene ids are filtered out), or are
    made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington,
    Seattle, © 1993-2015. Permission is hereby granted to reproduce,
    distribute, and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
    and copyright (University of Washington, Seattle)
    are included with each copy;
    (ii) a link to the original material is provided whenever the
    material is published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
    copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBI Bookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.
    """

    files = {
        'idmap': {
            'file': 'NBKid_shortname_OMIM.txt',
            'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {
            'file': 'GRtitle_shortname_NBKid.txt',
            'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(graph_type, are_bnodes_skolemized, 'genereviews')

        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, 'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or \
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are omim's.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """
        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        :return: None
        """
        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        # no test subset for now; test == full graph
        self.testgraph = self.graph

        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria   136132
        NBK1103 trimethylaminuria   602079
        NBK1104 cdls    122470

        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship, and
        some of the omim ids are genes (not diseases). Therefore, we
        need to create a loose coupling here. We make the assumption
        that these NBKs are generally higher-level grouping classes;
        therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        model = Model(self.graph)
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM(self.graph_type, self.are_bnodes_skized)
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:' + nbk_num
                omim_id = 'OMIM:' + omim_num
                if not (
                        (self.testMode and
                         len(self.test_ids) > 0 and
                         omim_id in self.test_ids) or not self.testMode):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted "
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                model.addClassToGraph(gr_id, None)
                model.addSynonym(gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids,
            None, None, limit)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids) - len(entries_that_are_phenotypes),
            len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:' + nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:' + omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        model.addClassToGraph(omim_id, None)
                        model.addSubClass(omim_id, gr_id)
            # add this as a generic subclass of DOID:4
            model.addSubClass(gr_id, 'DOID:4')

        return

    def _get_titles(self, limit):
        """
        The file processed here is tab-delimited, with the columns:
        GR_shortname    GR_Title    NBK_id  PMID

        Each row maps a GeneReviews short name and full title to its
        NBK book id (and, where available, a PMID).
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        model = Model(self.graph)
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            header = next(filereader)
            line_counter = 1
            colcount = len(header)
            if colcount != 4:  # ('GR_shortname', 'GR_Title', 'NBK_id', 'PMID')
                logger.error("Unexpected header: %s", header)
                exit(-1)
            for row in filereader:
                line_counter += 1
                if len(row) != colcount:
                    logger.error("Unexpected row; got: %s", row)
                    logger.error("Expected data for: %s", header)
                    exit(-1)
                (shortname, title, nbk_num, pmid) = row
                gr_id = 'GeneReviews:' + nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    model.addClassToGraph(gr_id, title)
                    model.addSynonym(gr_id, shortname)
                    # TODO include the new PMID?

        return

    def create_books(self):
        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/', 'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk + '.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/' + nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        model = Model(self.graph)
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:' + nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            with open(url) as page:
                soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = soup.find('div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = ' '.join((
                    ptext,
                    '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                    nbk_id + ']'))

                model.addDefinition(nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = re.search(
                                r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:' + str(pmnum)
                            self.graph.addTriple(
                                pmid,
                                model.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            reference = Reference(
                                self.graph, pmid,
                                Reference.ref_types['journal_article'])
                            reference.addRefToGraph()

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_missing = len(books_not_found)
        if num_missing > 0:
            if num_missing > 100:
                logger.warning("There were %d books not found.", num_missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_missing, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            c - num_missing)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase
        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)
        return test_suite
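
# ----------------------------------------------------------------------
# A minimal sketch of the loose-coupling strategy _get_equivids()
# documents: first accumulate the 1:many NBK -> OMIM map, then emit
# subclass links only for OMIM ids that survived the phenotype filter.
# The rows come from the docstring example; which OMIM ids count as
# phenotypes is assumed here purely for illustration.

id_map = {}
rows = [
    ('NBK1103', 'trimethylaminuria', '136132'),
    ('NBK1103', 'trimethylaminuria', '602079'),
    ('NBK1104', 'cdls', '122470'),
]
for nbk_num, shortname, omim_num in rows:
    id_map.setdefault(nbk_num, set()).add(omim_num)

# pretend OMIM.process_entries() kept only these as phenotype entries
entries_that_are_phenotypes = {'OMIM:602079', 'OMIM:122470'}

for nbk_num, omim_nums in id_map.items():
    for omim_num in omim_nums:
        omim_id = 'OMIM:' + omim_num
        if omim_id in entries_that_are_phenotypes:
            # the OMIM disease becomes a subclass of the GeneReviews class
            print(omim_id, 'rdfs:subClassOf', 'GeneReviews:' + nbk_num)
    # every book is also a generic subclass of DOID:4
    print('GeneReviews:' + nbk_num, 'rdfs:subClassOf', 'DOID:4')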
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. Includes baseline phenotype
    data sets as well as studies of drug, diet, disease and aging effect.
    Also includes protocols, projects and publications, and SNP,
    variation and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provides measurements for particular assays across many strains.
    Each of these measurements is mapped to an MP or VT term as a phenotype.
    Therefore, we can create strain-to-phenotype associations for those
    strains that lie outside of the "normal" range for a given measurement:
    we compute the average of the measurement across all strains tested,
    then flag any strain whose value is extreme relative to that average.
    Our default threshold is +/- 2 standard deviations from the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.
    """

    MPDDL = 'http://phenomedoc.jax.org/MPD_downloads'
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': MPDDL + '/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': MPDDL + '/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': MPDDL + '/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': MPDDL + '/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': {  # TEC does not seem to be used
        #    'file': 'mpd_datasets_metadata.xml.gz',
        #    'url': MPDDL + '/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457", "MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849", "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236",
    #   "MPD:1237"]

    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenome Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self, graph_type, are_bnodes_skolemized):
        Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')
        # @N, not sure if this step is required
        self.stdevthreshold = 2

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        self.assayhash = {}
        self.idlabel_hash = {}
        # to store the mean/zscore of each measure by strain+sex
        self.score_means_by_measure = {}
        # to store the mean value for each measure by strain+sex
        self.strain_scores_by_measure = {}

        return

    def fetch(self, is_dl_forced=False):
        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files
        (plus one xml file that is currently unused),
        which we process iteratively and write out as one large graph.

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows for each file", str(limit))

        logger.info("Parsing files...")

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")

        return

    def _process_ontology_mappings_file(self, limit):
        # line_counter = 0  # TODO unused
        logger.info("Processing ontology mappings...")
        raw = '/'.join((self.rawdir, 'ontology_mappings.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(
                self.files['ontology_mappings']['file'], f.readline())
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                except ValueError:
                    continue
                assay_id = int(assay_id)
                if re.match(r'(MP|VT)', ont_term):
                    # add the mapping de novo
                    if assay_id not in self.assayhash:
                        self.assayhash[assay_id] = {}
                        self.assayhash[assay_id]['ont_terms'] = set()
                    self.assayhash[assay_id]['ont_terms'].add(ont_term)

        return

    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing strain info ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            self.check_header(self.files['straininfo']['file'], f.readline())
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html

                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue

                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name

                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:' + stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # RIKEN BioResource Center
                        riken_id = 'RBRC:' + re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, riken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)), True)

                # add the panel information
                if panel != '':
                    desc = panel + ' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return

    def _process_measurements_file(self, limit):
        line_counter = 0
        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, 'measurements.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(
                self.files['assay_metadata']['file'], f.readline())
            for row in reader:
                line_counter += 1
                assay_id = int(row[0])
                assay_label = row[4]
                assay_units = row[5]
                assay_type = row[6] if row[6] != '' else None

                if assay_id not in self.assayhash:
                    self.assayhash[assay_id] = {}
                description = self.build_measurement_description(row)
                self.assayhash[assay_id]['description'] = description
                self.assayhash[assay_id]['assay_label'] = assay_label
                self.assayhash[assay_id]['assay_type'] = assay_type
                self.assayhash[assay_id]['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation, but easy access.
        We will loop through this later to apply cutoffs
        and add associations.
        :param limit:
        :return:
        """
        logger.info("Processing strain means ...")
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        with gzip.open(raw, 'rb') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f)
            self.check_header(self.files['strainmeans']['file'], f.readline())
            score_means_by_measure = {}
            strain_scores_by_measure = {}
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice,
                     sd, sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                line_counter += 1
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum
                # note: some assays only tested one sex!
                # we split this here by sex
                if assay_num not in score_means_by_measure:
                    score_means_by_measure[assay_num] = {}
                if sex not in score_means_by_measure[assay_num]:
                    score_means_by_measure[assay_num][sex] = list()
                score_means_by_measure[assay_num][sex].append(float(mean))

                if strain_num not in strain_scores_by_measure:
                    strain_scores_by_measure[strain_num] = {}
                if sex not in strain_scores_by_measure[strain_num]:
                    strain_scores_by_measure[strain_num][sex] = {}
                strain_scores_by_measure[strain_num][sex][assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans

        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return

    def _fill_provenance_graph(self, limit):
        logger.info("Building graph ...")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        model.addClassToGraph(taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and \
                    'MPD:' + str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:' + str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:' + str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' in measures[m]:
                        zscore = measures[m]['zscore']
                        if abs(zscore) >= self.stdevthreshold:
                            scores_passing_threshold_count += 1
                            # logger.info(
                            #   "Score passing threshold: %s | %s | %s",
                            #   strain_id, assay_id, zscore)

                            # add the G2P assoc
                            prov = Provenance(self.graph)
                            try:
                                assay_label = \
                                    self.assayhash[m]['assay_label']
                                assay_description = \
                                    self.assayhash[m]['description']
                                ont_term_ids = \
                                    self.assayhash[m].get('ont_terms')
                                comment = ' '.join(
                                    (assay_label,
                                     '(zscore=' + str(zscore) + ')'))
                            except KeyError:
                                assay_label = None
                                assay_description = None
                                ont_term_ids = None
                            if assay_label is not None:
                                assay_label += ' (' + str(m) + ')'
                            # TODO unused
                            # assay_type = self.assayhash[m]['assay_type']
                            assay_type_id = \
                                Provenance.provenance_types['assay']

                            if ont_term_ids is not None:
                                scores_passing_threshold_with_ontologies_count += 1
                                prov.add_assay_to_graph(
                                    assay_id, assay_label, assay_type_id,
                                    assay_description)
                                self._add_g2p_assoc(
                                    g, strain_id, sex, assay_id,
                                    ont_term_ids, comment)
                        else:
                            scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to associate with the strain
        :param comment:
        :return:
        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)

        # strain genotype
        genotype_id = '_' + '-'.join(
            (re.sub(r':', '', strain_id), 'genotype'))
        sex_specific_genotype_id = '_' + '-'.join(
            (re.sub(r':', '', strain_id), sex, 'genotype'))
        # fall back to the strain id when no label was found
        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + ' (' + sex + ')'

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'],
            genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # ############# BUILD THE G2P ASSOC #############
        # TODO add more provenance info when that model is completed
        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    g, self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_mpd import MPDTestCase
        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)
        return test_suite

    @staticmethod
    def normalise_units(units):
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        (measnum, mpdsector, projsym, varname, descrip, units, method,
         intervention, paneldesc, datatype, sextested, nstrainstested,
         ageweeks) = row

        if sextested == 'f':
            sextested = 'female'
        elif sextested == 'm':
            sextested = 'male'
        elif sextested == 'fm':
            sextested = 'male and female'
        else:
            logger.warning("Unknown sex tested key: %s", sextested)
        description = \
            "This is an assay of [" + descrip + "] shown as a [" + \
            datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"

        """
        As of 9/28/2017 intparm is no longer in the measurements.tsv
        if intparm is not None and intervention != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" + \
                method + "]"
        """

        description += \
            ". The overall experiment is entitled [" + projsym + "]. "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] of age in [" + nstrainstested + \
            "] different mouse strains. "

        """
        As of 9/28/2017 cat1-3 are no longer in the measurements.tsv
        description += "Keywords: " + cat1 + \
            ((", " + cat2) if cat2.strip() is not "" else "") + \
            ((", " + cat3) if cat3.strip() is not "" else "") + "."
        """
        return description

    # def _log_missing_ids(self, missing_id, name_of_file_from_which_missing):
    #    if missing_id not in self.missing_assay_hash:
    #        self.missing_assay_hash[missing_id] = set()
    #    self.missing_assay_hash[missing_id].add(
    #        name_of_file_from_which_missing)
    #    # todo: remove the offending ids from the hash
    #    return

    @staticmethod
    def check_header(filename, header):
        header = header.rstrip("\n")
        header_map = {
            'strainmeans.csv.gz':
                'measnum,varname,strain,strainid,sex,mean,'
                'nmice,sd,sem,cv,minval,maxval,logmean,'
                'logsd,zscore,logzscore',
            'straininfo.csv':
                'strainname,vendor,stocknum,panel,mpd_strainid,'
                'straintype,n_proj,n_snp_datasets,mpd_shortname,url',
            'measurements.csv':
                'measnum,mpdsector,projsym,varname,descrip,units,'
                'method,intervention,paneldesc,datatype,sextested,'
                'nstrainstested,ageweeks',
            'ontology_mappings.csv':
                'measnum,ont_term,descrip'
        }
        if header != header_map[filename]:
            raise ValueError(
                "header in {} \n {}\n"
                "does not match expected:\n {}"
                .format(filename, header, header_map[filename]))
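
# ----------------------------------------------------------------------
# A minimal sketch of the +/- 2 standard deviation cutoff the MPD class
# docstring describes: compute a z-score for each strain's mean on one
# assay and flag strains beyond the threshold. The values are invented
# for illustration; the real parser uses the precomputed zscore column
# from strainmeans.csv.gz rather than recomputing it.

from statistics import mean, stdev

strain_means = {
    'strain_1': 10.0, 'strain_2': 10.2, 'strain_3': 9.8,
    'strain_4': 10.1, 'strain_5': 9.9, 'strain_6': 10.3,
    'strain_7': 9.7, 'strain_8': 10.0, 'strain_9': 25.0,
}
mu = mean(strain_means.values())
sigma = stdev(strain_means.values())
stdevthreshold = 2  # matches MPD.__init__'s self.stdevthreshold

for strain, value in strain_means.items():
    zscore = (value - mu) / sigma
    if abs(zscore) >= stdevthreshold:
        # only strain_9 (z ~ 2.7) passes; it would get a G2P association
        print(strain, 'is an outlier for this assay; z =', round(zscore, 2))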
class MPD(Source): """ From the [MPD](http://phenome.jax.org/) website: This resource is a collaborative standardized collection of measured data on laboratory mouse strains and populations. Includes baseline phenotype data sets as well as studies of drug, diet, disease and aging effect. Also includes protocols, projects and publications, and SNP, variation and gene expression studies. Here, we pull the data and model the genotypes using GENO and the genotype-to-phenotype associations using the OBAN schema. MPD provide measurements for particular assays for several strains. Each of these measurements is itself mapped to a MP or VT term as a phenotype. Therefore, we can create a strain-to-phenotype association based on those strains that lie outside of the "normal" range for the given measurements. We can compute the average of the measurements for all strains tested, and then threshold any extreme measurements being beyond some threshold beyond the average. Our default threshold here, is +/-2 standard deviations beyond the mean. Because the measurements are made and recorded at the level of a specific sex of each strain, we associate the MP/VT phenotype with the sex-qualified genotype/strain. """ MPDDL = 'http://phenomedoc.jax.org/MPD_downloads' files = { 'ontology_mappings': { 'file': 'ontology_mappings.csv', 'url': MPDDL + '/ontology_mappings.csv'}, 'straininfo': { 'file': 'straininfo.csv', 'url': MPDDL + '/straininfo.csv'}, 'assay_metadata': { 'file': 'measurements.csv', 'url': MPDDL + '/measurements.csv'}, 'strainmeans': { 'file': 'strainmeans.csv.gz', 'url': MPDDL + '/strainmeans.csv.gz'}, # 'mpd_datasets_metadata': { #TEC does not seem to be used # 'file': 'mpd_datasets_metadata.xml.gz', # 'url': MPDDL + '/mpd_datasets_metadata.xml.gz'}, } # the following are strain ids for testing # test_ids = [ # "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18", # "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36", # "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68", # "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438", # "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770", # "MPD:849", "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005", # "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236", "MPD:1237"] test_ids = [ 'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002", "MPD:39", "MPD:2319"] mgd_agent_id = "MPD:db/q?rtn=people/allinv" mgd_agent_label = "Mouse Phenotype Database" mgd_agent_type = "foaf:organization" def __init__(self, graph_type, are_bnodes_skolemized): Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd') # @N, not sure if this step is required self.stdevthreshold = 2 # update the dataset object with details about this resource # @N: Note that there is no license as far as I can tell self.dataset = Dataset( 'mpd', 'MPD', 'http://phenome.jax.org', None, None) # TODO add a citation for mpd dataset as a whole self.dataset.set_citation('PMID:15619963') self.assayhash = {} self.idlabel_hash = {} # to store the mean/zscore of each measure by strain+sex self.score_means_by_measure = {} # to store the mean value for each measure by strain+sex self.strain_scores_by_measure = {} return def fetch(self, is_dl_forced=False): self.get_files(is_dl_forced) return def parse(self, limit=None): """ MPD data is delivered in four separate csv files and one xml file, which we process iteratively and write out as one large graph. 
:param limit: :return: """ if limit is not None: logger.info("Only parsing first %s rows fo each file", str(limit)) logger.info("Parsing files...") self._process_straininfo(limit) # the following will provide us the hash-lookups # These must be processed in a specific order # mapping between assays and ontology terms self._process_ontology_mappings_file(limit) # this is the metadata about the measurements self._process_measurements_file(limit) # get all the measurements per strain self._process_strainmeans_file(limit) # The following will use the hash populated above # to lookup the ids when filling in the graph self._fill_provenance_graph(limit) logger.info("Finished parsing.") return def _process_ontology_mappings_file(self, limit): # line_counter = 0 # TODO unused logger.info("Processing ontology mappings...") raw = '/'.join((self.rawdir, 'ontology_mappings.csv')) with open(raw, 'r') as f: reader = csv.reader(f) # read the header row; skip f.readline() for row in reader: try: (assay_id, ont_term, descrip) = row except ValueError: continue assay_id = int(assay_id) if re.match(r'(MP|VT)', ont_term): # add the mapping denovo if assay_id not in self.assayhash: self.assayhash[assay_id] = {} self.assayhash[assay_id]['ont_terms'] = set() self.assayhash[assay_id]['ont_terms'].add(ont_term) return def _process_straininfo(self, limit): # line_counter = 0 # TODO unused if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) logger.info("Processing measurements ...") raw = '/'.join((self.rawdir, self.files['straininfo']['file'])) tax_id = 'NCBITaxon:10090' with open(raw, 'r') as f: reader = csv.reader(f, delimiter=',', quotechar='\"') f.readline() # read the header row; skip for row in reader: (strain_name, vendor, stocknum, panel, mpd_strainid, straintype, n_proj, n_snp_datasets, mpdshortname, url) = row # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.testMode and \ 'MPD:' + str(mpd_strainid) not in self.test_ids: continue strain_id = 'MPD-strain:' + str(mpd_strainid) model.addIndividualToGraph(strain_id, strain_name, tax_id) if mpdshortname.strip() != '': model.addSynonym(strain_id, mpdshortname.strip()) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:'+stocknum model.addSameIndividual(strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum) model.addSameIndividual(strain_id, reiken_id) else: if url != '': model.addXref(strain_id, url, True) if vendor != '': model.addXref( strain_id, ':'.join((vendor, stocknum)), True) # add the panel information if panel != '': desc = panel+' [panel]' model.addDescription(strain_id, desc) # TODO make the panels as a resource collection return def _process_measurements_file(self, limit): line_counter = 0 logger.info("Processing measurements ...") raw = '/'.join((self.rawdir, 'measurements.csv')) with open(raw, 'r') as f: reader = csv.reader(f) # read the header row; skip header = f.readline() logger.info("HEADER: %s", header) for row in reader: # measnum,projsym,varname,descrip,units,cat1,cat2,cat3, # intervention,intparm,appmeth,panelsym,datatype,sextested, # nstrainstested,ageweeks # Again the last row has changed. 
    def _process_measurements_file(self, limit):
        line_counter = 0
        logger.info("Processing measurements ...")

        raw = '/'.join((self.rawdir, 'measurements.csv'))
        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            header = f.readline()
            logger.info("HEADER: %s", header)
            for row in reader:
                # measnum,projsym,varname,descrip,units,cat1,cat2,cat3,
                # intervention,intparm,appmeth,panelsym,datatype,sextested,
                # nstrainstested,ageweeks
                # Note: the last row of the file is a row count,
                # e.g. "(4486 rows)", so skip any row lacking all 16 columns.
                if len(row) != 16:
                    continue
                line_counter += 1
                assay_id = int(row[0])
                assay_label = row[3]
                assay_units = row[4]
                assay_type = row[10] if row[10] != '' else None

                if assay_id not in self.assayhash:
                    self.assayhash[assay_id] = {}

                description = self.build_measurement_description(row)
                self.assayhash[assay_id]['description'] = description
                self.assayhash[assay_id]['assay_label'] = assay_label
                self.assayhash[assay_id]['assay_type'] = assay_type
                self.assayhash[assay_id]['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation, but it provides easy access.
        We will loop through this later to apply the cutoffs
        and add associations.

        :param limit:
        :return:

        """
        logger.info("Processing strain means ...")
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        with gzip.open(raw, 'rb') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f)
            f.readline()  # read the header row; skip
            score_means_by_measure = {}
            strain_scores_by_measure = {}
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice,
                     sd, sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                line_counter += 1
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum
                # note: some assays only tested one sex!
                # we split this here by sex
                if assay_num not in score_means_by_measure:
                    score_means_by_measure[assay_num] = {}
                if sex not in score_means_by_measure[assay_num]:
                    score_means_by_measure[assay_num][sex] = list()
                score_means_by_measure[assay_num][sex].append(float(mean))

                if strain_num not in strain_scores_by_measure:
                    strain_scores_by_measure[strain_num] = {}
                if sex not in strain_scores_by_measure[strain_num]:
                    strain_scores_by_measure[strain_num][sex] = {}
                strain_scores_by_measure[strain_num][sex][assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans

        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return
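    # Shape of the hashes built above (illustrative; values invented):
    #   score_means_by_measure[assay_num][sex] -> list of strain means,
    #       e.g. {9904: {'f': [10.2, 11.0], 'm': [12.1]}}
    #   strain_scores_by_measure[strain_num][sex][assay_num]
    #       -> {'mean': float, 'zscore': float},
    #       e.g. {7: {'f': {9904: {'mean': 10.2, 'zscore': -0.8}}}}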
    def _fill_provenance_graph(self, limit):
        logger.info("Building graph ...")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        model.addClassToGraph(taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and \
                    'MPD:' + str(strain_num) not in self.test_ids:
                continue

            strain_id = 'MPD-strain:' + str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:' + str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' in measures[m]:
                        zscore = measures[m]['zscore']
                        if abs(zscore) >= self.stdevthreshold:
                            scores_passing_threshold_count += 1
                            # logger.info(
                            #     "Score passing threshold: %s | %s | %s",
                            #     strain_id, assay_id, zscore)

                            # add the G2P assoc
                            prov = Provenance(self.graph)
                            try:
                                assay_label = \
                                    self.assayhash[m]['assay_label']
                                assay_description = \
                                    self.assayhash[m]['description']
                                ont_term_ids = \
                                    self.assayhash[m].get('ont_terms')
                                comment = ' '.join((
                                    assay_label,
                                    '(zscore=' + str(zscore) + ')'))
                            except KeyError:
                                assay_label = None
                                assay_description = None
                                ont_term_ids = None

                            if assay_label is not None:
                                assay_label += ' (' + str(m) + ')'
                            # TODO unused
                            # assay_type = self.assayhash[m]['assay_type']
                            assay_type_id = \
                                Provenance.provenance_types['assay']

                            if ont_term_ids is not None:
                                scores_passing_threshold_with_ontologies_count \
                                    += 1
                                prov.add_assay_to_graph(
                                    assay_id, assay_label, assay_type_id,
                                    assay_description)
                                self._add_g2p_assoc(
                                    g, strain_id, sex, assay_id,
                                    ont_term_ids, comment)
                        else:
                            scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes is created as an anonymous node.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to associate with the strain
        :param comment:
        :return:

        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)

        # strain genotype
        genotype_id = '_' + '-'.join((
            re.sub(r':', '', strain_id), 'genotype'))
        sex_specific_genotype_id = '_' + '-'.join((
            re.sub(r':', '', strain_id), sex, 'genotype'))
        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + ' (' + sex + ')'

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'],
            genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    g, self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)

        return
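    # Example of the identifiers minted above (illustrative): for
    # strain_id 'MPD-strain:7' and sex 'f', the background genotype bnode
    # is '_MPD-strain7-genotype' and the sex-qualified genotype bnode is
    # '_MPD-strain7-f-genotype'; the latter is linked to the former via
    # has_sex_agnostic_genotype_part, and each G2P association carries
    # the assay id plus ECO:0000059 as evidence.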
    def getTestSuite(self):
        import unittest
        from tests.test_mpd import MPDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)
        return test_suite

    @staticmethod
    def normalise_units(units):
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        (assay_id, projsym, varname, descrip, units, cat1, cat2, cat3,
         intervention, intparm, appmeth, panelsym, datatype, sextested,
         nstrainstested, ageweeks) = row

        if sextested == 'f':
            sextested = 'female'
        elif sextested == 'm':
            sextested = 'male'
        elif sextested == 'fm':
            sextested = 'male and female'
        else:
            logger.warning("Unknown sex tested key: %s", sextested)

        description = "This is an assay of [" + descrip + "] shown as a [" + \
            datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        if intparm is not None and intparm != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" + \
                appmeth + "]"

        description += \
            ". The overall experiment is entitled [" + projsym + "]. "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] weeks of age in [" + nstrainstested + \
            "] different mouse strains. "
        description += "Keywords: " + cat1 + \
            ((", " + cat2) if cat2.strip() != "" else "") + \
            ((", " + cat3) if cat3.strip() != "" else "") + "."
        return description
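
# ---------------------------------------------------------------------
# A minimal runnable sketch of the +/- 2 SD cutoff described in the MPD
# class docstring. It is illustrative only and is NOT used by the ingest
# above, which relies on the zscore column precomputed by MPD rather
# than recomputing it; the function name and inputs are invented for
# this example.


def _example_extreme_strains(strain_means, threshold=2):
    """
    Given a mapping of strain id -> mean measurement value for a single
    assay+sex, return the strains whose z-score lies at or beyond
    +/- threshold standard deviations, mirroring the
    abs(zscore) >= stdevthreshold test in _fill_provenance_graph.
    """
    # stdlib; a local import keeps the sketch self-contained without
    # touching the module's import block
    from statistics import mean, stdev
    values = list(strain_means.values())
    if len(values) < 2:
        return {}
    mu = mean(values)
    sd = stdev(values)
    if sd == 0:
        return {}
    # keep only strains beyond the threshold, mapped to their z-scores
    return {strain: (val - mu) / sd
            for strain, val in strain_means.items()
            if abs((val - mu) / sd) >= threshold}

# Example: with nine strains measuring 10.0 and one measuring 20.0,
# the population mean is 11.0 and the sample sd is ~3.16, so only the
# outlier strain passes the cutoff (z ~= 2.85) and would yield a G2P
# association.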