def __init__(self, graph_type, are_bnodes_skolemized):
    """Initialize the CTD ingest: dataset metadata, test ids, and helpers."""
    super().__init__(graph_type, are_bnodes_skolemized, 'ctd')
    self.dataset = Dataset(
        'ctd', 'CTD', 'http://ctdbase.org', None,
        'http://ctdbase.org/about/legal.jsp')

    # gene test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.g = self.graph
    self.geno = Genotype(self.graph)
    self.pathway = Pathway(self.graph)
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
    """
    Initialize the BioGrid ingest.

    :param tax_ids: optional list of NCBI taxon ids to filter on;
        defaults to human, mouse, and zebrafish.
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'biogrid')
    self.tax_ids = tax_ids
    self.dataset = Dataset(
        'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
        'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')
    # Defaults
    # our favorite animals
    # taxids = [9606,10090,10116,7227,7955,6239,8355]
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]

    # FIX: default to an empty list so self.test_ids is always defined,
    # even when no gene test ids are configured (previously the attribute
    # was never set in that case)
    self.test_ids = []
    if 'test_ids' not in config.get_config() or \
            'gene' not in config.get_config()['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.test_ids = config.get_config()['test_ids']['gene']

    # data-source specific warnings
    # (will be removed when issues are cleared)
    logger.warning(
        "several MI experimental codes do not exactly map to ECO; "
        "using approximations.")
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the MMRRC ingest: internal caches and dataset metadata."""
    super().__init__(graph_type, are_bnodes_skolemized, 'mmrrc')
    # caches populated during parsing
    self.strain_hash = {}
    self.id_label_hash = {}
    self.dataset = Dataset(
        'mmrrc',
        'Mutant Mouse Regional Resource Centers',
        'https://www.mmrrc.org',
        None,
        'https://www.mmrrc.org/about/data_download.php')
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the WormBase ingest and its dataset metadata."""
    super().__init__(graph_type, are_bnodes_skolemized, 'wormbase')
    # update the dataset object with details about this resource;
    # NO LICENSE for this resource (license url argument is None)
    self.dataset = Dataset(
        'wormbase',
        'WormBase',
        'http://www.wormbase.org',
        None,
        None,
        'http://www.wormbase.org/about/policies#012')
    # resolved later during fetch
    self.version_num = None
    return
def setUp(self):
    """Build a fresh Dataset from the fixture attributes on self."""
    self.dataset = Dataset(
        identifier=self.identifier,
        data_release_version=self.data_release_version,
        ingest_name=self.identifier,
        ingest_title=self.ingest_title,
        ingest_url=self.ingest_url,
        ingest_logo=self.ingest_logo_url,
        ingest_description=self.ingest_description,
        license_url=self.license_url,
        data_rights=self.data_rights)

    # put all triples in a list for debugging below
    self.all_triples = list(self.dataset.graph.triples((None, None, None)))
def __init__(self):
    """Set up the CTD ingest (legacy Source API): dataset, test ids, utils."""
    Source.__init__(self, 'ctd')
    self.dataset = Dataset(
        'ctd', 'CTD', 'http://ctdbase.org', None,
        'http://ctdbase.org/about/legal.jsp')

    # gene test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'gene' in config.get_config()['test_ids']:
        self.test_geneids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # disease test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_diseaseids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.gu = GraphUtils(curie_map.get())
    self.g = self.graph
    self.geno = Genotype(self.g)
    return
def __init__(self, graph_type, are_bnodes_skolemized,
             tax_ids=None, gene_ids=None):
    """
    Initialize the ENSEMBL ingest.

    :param tax_ids: optional list of NCBI taxon ids to filter on;
        defaults to human, mouse, and zebrafish.
    :param gene_ids: optional list of gene ids to restrict to; may be
        overridden by configured gene test ids.
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'ensembl')
    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.dataset = Dataset(
        'ensembl', 'ENSEMBL', 'http://uswest.ensembl.org', None)

    # Defaults
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]
    # FIX: default gene_ids independently of tax_ids; previously it was
    # only reset to [] inside the tax_ids-is-None branch, so it could
    # remain None (or a caller-supplied list could be discarded)
    if self.gene_ids is None:
        self.gene_ids = []

    if 'test_ids' not in config.get_config() \
            or 'gene' not in config.get_config()['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = config.get_config()['test_ids']['gene']

    self.properties = Feature.properties
    logger.setLevel(logging.INFO)
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
    """
    Initialize the GeneOntology ingest.

    :param tax_ids: optional list of NCBI taxon ids to filter on;
        defaults to human, mouse, and zebrafish.
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'go')

    # Defaults
    self.tax_ids = tax_ids
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]
        # FIX: log the applied default, not the (None) argument
        logger.info("No taxa set. Defaulting to %s", str(self.tax_ids))
    else:
        logger.info("Filtering on the following taxa: %s", str(self.tax_ids))

    # update the dataset object with details about this resource
    # NO LICENSE for this resource
    self.dataset = Dataset(
        'go', 'GeneOntology', 'http://www.geneontology.org', None,
        "https://creativecommons.org/licenses/by/4.0/legalcode",
        'http://geneontology.org/page/use-and-license')

    if 'test_ids' not in config.get_config() or \
            'gene' not in config.get_config()['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.test_ids = config.get_config()['test_ids']['gene']
    return
def __init__(self, tax_ids=None):
    """
    Initialize the BioGrid ingest (legacy Source API).

    :param tax_ids: optional list of NCBI taxon ids to filter on;
        defaults to human, mouse, and zebrafish.
    """
    super().__init__("biogrid")
    self.tax_ids = tax_ids
    self.load_bindings()
    self.dataset = Dataset(
        "biogrid",
        "The BioGrid",
        "http://thebiogrid.org/",
        None,
        "http://wiki.thebiogrid.org/doku.php/terms_and_conditions",
    )
    # Defaults
    # taxids = [9606,10090,10116,7227,7955,6239,8355] #our favorite animals
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]

    # FIX: Logger.warn is a deprecated alias of Logger.warning
    if "test_ids" not in config.get_config() \
            or "gene" not in config.get_config()["test_ids"]:
        logger.warning("not configured with gene test ids.")
    else:
        self.test_ids = config.get_config()["test_ids"]["gene"]

    # data-source specific warnings (will be removed when issues are cleared)
    logger.warning(
        "several MI experimental codes do not exactly map to ECO; "
        "using approximations.")
    return
def __init__(self):
    """Set up the MPD ingest (legacy Source API): dataset and score caches."""
    Source.__init__(self, 'mpd')
    # @N, not sure if this step is required
    self.namespaces.update(curie_map.get())
    self.stdevthreshold = 2
    self.nobnodes = True  # FIXME

    # update the dataset object with details about this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset('mpd', 'MPD', 'http://phenome.jax.org', None, None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.assayhash = {}
    self.idlabel_hash = {}
    # to store the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # to store the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}

    self.geno = Genotype(self.graph)
    self.gu = GraphUtils(curie_map.get())
    return
def __init__(self):
    """Set up the HPO-annotation ingest (legacy Source API)."""
    Source.__init__(self, 'hpoa')
    self.load_bindings()
    self.dataset = Dataset(
        'hpoa', 'Human Phenotype Ontology',
        'http://www.human-phenotype-ontology.org', None,
        'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')
    self.replaced_id_count = 0

    # disease test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    # data-source specific warnings to be removed when issues are cleared
    logger.warning(
        "note that some ECO classes are missing for ICE, PCS, and ITM;" +
        " using temporary mappings.")
    return
def __init__(self, tax_ids=None):
    """Set up the BioGrid ingest (legacy Source API)."""
    super().__init__('biogrid')
    self.tax_ids = tax_ids
    self.load_bindings()
    self.dataset = Dataset(
        'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
        'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')

    # Defaults -- our favorite animals
    # taxids = [9606,10090,10116,7227,7955,6239,8355]
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]

    if 'test_ids' in config.get_config() and \
            'gene' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")

    # data-source specific warnings
    # (will be removed when issues are cleared)
    logger.warning(
        "several MI experimental codes do not exactly map to ECO; "
        "using approximations.")
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Initialize the OMIM ingest.

    Requires an OMIM API key under config['keys']['omim']; logs an error
    when it is missing, and a warning when no disease test ids are set.
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'omim')
    self.dataset = Dataset(
        'omim', 'Online Mendelian Inheritance in Man',
        'http://www.omim.org', None, 'http://omim.org/help/agreement')
    self.omim_ncbigene_idmap = {}

    # data-source specific warnings
    # (will be removed when issues are cleared)

    # check if config exists; if it doesn't, error out and let user know
    # FIX: was `and`, which raised KeyError when 'keys' was absent
    # (second clause still evaluated) and never reported a missing
    # 'omim' entry when 'keys' was present
    if 'keys' not in config.get_config() or \
            'omim' not in config.get_config()['keys']:
        logger.error("not configured with API key.")

    # check to see if there's any ids configured in the config;
    # otherwise, warn
    if 'test_ids' not in config.get_config() or \
            'disease' not in config.get_config()['test_ids']:
        logger.warning("not configured with disease test ids.")
    else:
        # select ony those test ids that are omim's.
        # NOTE(review): assumes self.test_ids was initialized by the
        # parent class -- confirm
        self.test_ids += \
            [obj.replace('OMIM:', '')
             for obj in config.get_config()['test_ids']['disease']
             if re.match(r'OMIM:', obj)]
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None,
             gene_ids=None):
    """
    Initialize the ClinVar ingest.

    :param tax_ids: optional list of taxon ids (stored on the instance)
    :param gene_ids: optional list of gene ids; may be overridden by
        configured gene test ids
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'clinvar')
    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.filter = 'taxids'
    self.dataset = Dataset(
        'ClinVar', 'National Center for Biotechnology Information',
        'http://www.ncbi.nlm.nih.gov/clinvar/', None,
        'http://www.ncbi.nlm.nih.gov/About/disclaimer.html',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    if 'test_ids' not in config.get_config() or \
            'gene' not in config.get_config()['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = config.get_config()['test_ids']['gene']

    # FIX: always define disease_ids; previously it was only assigned
    # when disease test ids were configured, leaving the attribute
    # undefined otherwise (potential AttributeError downstream)
    self.disease_ids = []
    if 'test_ids' not in config.get_config() or \
            'disease' not in config.get_config()['test_ids']:
        logger.warning("not configured with disease test ids.")
    else:
        self.disease_ids = config.get_config()['test_ids']['disease']

    self.properties = Feature.properties
    return
def test_distribution_level_no_license_url_default_value(self):
    """A Dataset built without a license_url should emit the default license."""
    self.dataset = Dataset(
        identifier=self.identifier,
        data_release_version=None,
        ingest_name=self.identifier,
        ingest_title=self.ingest_title,
        ingest_url=self.ingest_url,
        ingest_logo=self.ingest_logo_url,
        ingest_description=self.ingest_description,
        license_url=None,
        data_rights=self.data_rights)

    matches = list(self.dataset.graph.triples(
        (self.distribution_level_IRI_ttl_default_version,
         self.iri_license,
         URIRef(self.license_url_default))))
    self.assertTrue(
        len(matches) == 1,
        "distribution level default license triple not set")
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the SGD ingest: dataset metadata and the APO term map."""
    super().__init__(graph_type, are_bnodes_skolemized, 'sgd')
    self.dataset = Dataset(
        'sgd', 'SGD', 'https://www.yeastgenome.org/', None, None)
    self.global_terms = Source.open_and_parse_yaml(
        '../../translationtable/global_terms.yaml')
    self.apo_term_id = SGD.make_apo_map()
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the Monarch ingest and its dataset metadata."""
    super().__init__(graph_type, are_bnodes_skolemized, 'monarch')
    self.dataset = Dataset(
        'monarch',
        'MonarchInitiative',
        'https://monarchinitiative.org',
        None,
        'https://creativecommons.org/licenses/by/4.0/',
        None)
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None,
             version=None):
    """
    Initialize the StringDB ingest.

    :param tax_ids: optional list of taxon ids; defaults to
        StringDB.DEFAULT_TAXA
    :param version: optional STRING release label; defaults to 'v10.5'
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'string')
    self.dataset = Dataset(
        'string', 'String', 'http://string-db.org/', None,
        'http://string-db.org/cgi/access.pl?footer_active_subpage=licensing'
    )

    if tax_ids is None:
        self.tax_ids = StringDB.DEFAULT_TAXA
    else:
        logger.info("Filtering on taxa {}".format(tax_ids))
        self.tax_ids = tax_ids

    # FIX: a caller-supplied version was previously ignored -- self.version
    # was only assigned when version was None, so passing an explicit
    # version raised AttributeError when building self.files below
    if version is None:
        self.version = 'v10.5'
    else:
        self.version = version

    self.files = {
        'protein_links': {
            'path': '{}protein.links.detailed.{}/'.format(
                StringDB.STRING_BASE, self.version),
            'pattern': 'protein.links.detailed.{}.txt.gz'.format(self.version)
        }
    }

    # per-taxon mapping files from STRING ids to local gene identifiers
    self.id_map_files = {
        9606: {
            'url': 'https://string-db.org/mapping_files/entrez_mappings/'
                   'entrez_gene_id.vs.string.v10.28042015.tsv',
            'file': 'entrez_gene_id.vs.string.v10.28042015.tsv'
        },
        10090: {
            'url': 'https://data.monarchinitiative.org/dipper/'
                   'cache/10090.string2mgi.tsv',
            'file': '10090.string2mgi.tsv'
        },
        6239: {
            'url': 'https://data.monarchinitiative.org/dipper/'
                   'cache/6239.string2ensembl_gene.tsv',
            'file': '6239.string2ensembl_gene.tsv'
        },
        7227: {
            'url': 'https://data.monarchinitiative.org/dipper/'
                   'cache/7227.string2ensembl_gene.tsv',
            'file': '7227.string2ensembl_gene.tsv'
        },
        7955: {
            'url': 'https://data.monarchinitiative.org/dipper/'
                   'cache/7955.string2zfin.tsv',
            'file': '7955.string2zfin.tsv'
        }
    }
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the Elements of Morphology ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'eom')

    # update the dataset object with details about this resource
    # TODO put this into a conf file?
    self.dataset = Dataset(
        'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
        'http://www.genome.gov/copyright.cfm',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    # check if config exists; if it doesn't, error out and let user know
    if 'dbauth' not in config.get_config() \
            or 'disco' not in config.get_config()['dbauth']:
        logger.error("not configured with PG user/password.")
    # source-specific warnings. will be cleared when resolved.
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the MyChem ingest: dataset metadata and work queues."""
    super().__init__(graph_type, are_bnodes_skolemized, 'mychem')
    self.dataset = Dataset(
        'mychem', 'MYCHEM', 'https://mychem.info/', None, None)
    self.global_terms = Source.open_and_parse_yaml(
        '../../translationtable/global_terms.yaml')
    # fetch all inchikeys up front; processed downstream in chunks of 10
    self.inchikeys = MyChem.chunks(l=MyChem.get_inchikeys(), n=10)
    self.drugbank_targets = []
    self.drugcentral_interactors = []
def test_version_level_version_set_explicitly(self):
    """The version-level version triple should equal the explicit release version."""
    self.dataset = Dataset(
        identifier=self.identifier,
        data_release_version=self.data_release_version,
        ingest_name=self.identifier,
        ingest_title=self.ingest_title,
        ingest_url=self.ingest_url,
        ingest_logo=self.ingest_logo_url,
        ingest_description=self.ingest_description,
        license_url=None,
        data_rights=self.data_rights
    )
    matches = list(self.dataset.graph.triples(
        (self.version_level_IRI, self.iri_version, None)))
    self.assertTrue(
        len(matches) == 1,
        "didn't get exactly one version level version triple")
    self.assertEqual(
        matches[0][2],
        Literal(self.data_release_version, datatype=XSD.date),
        "version level version triple (set explicitly) is wrong ")
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the WormBase ingest.

    NO LICENSE exists for this resource; only the data-rights page is
    recorded.
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'wormbase')
    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'wormbase', 'WormBase', 'http://www.wormbase.org', None, None,
        'http://www.wormbase.org/about/policies#012')
    # filled in during fetch
    self.version_num = None
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the Coriell ingest: dataset metadata and config checks."""
    super().__init__(graph_type, are_bnodes_skolemized, 'coriell')
    self.dataset = Dataset(
        'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

    # data-source specific warnings
    # (will be removed when issues are cleared)
    logger.warning('We assume that if a species is not provided, '
                   'that it is a Human-derived cell line')
    logger.warning('We map all omim ids as a disease/phenotype entity, '
                   'but should be fixed in the future')  # TODO

    # check if config exists; if it doesn't, error out and let user know
    if 'dbauth' not in config.get_config() \
            or 'coriell' not in config.get_config()['dbauth']:
        logger.error("not configured with FTP user/password.")
    return
def __init__(self):
    """Set up the MMRRC ingest (legacy Source API)."""
    Source.__init__(self, 'mmrrc')
    # caches populated during parsing
    self.strain_hash = {}
    self.id_label_hash = {}
    self.load_bindings()
    self.dataset = Dataset(
        'mmrrc',
        'Mutant Mouse Regional Resource Centers',
        'https://www.mmrrc.org',
        None,
        'https://www.mmrrc.org/about/data_download.php')
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the GeneReviews ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'genereviews')
    self.dataset = Dataset(
        'genereviews', 'Gene Reviews', 'http://genereviews.org/', None,
        'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
    self.dataset.set_citation('GeneReviews:NBK1116')

    self.book_ids = set()
    self.all_books = {}

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        # select only those test ids that are omim's.
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = list()
    return
def __init__(self):
    """Set up the WormBase ingest (legacy Source API); no license exists."""
    Source.__init__(self, 'wormbase')
    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'wormbase', 'WormBase', 'http://www.wormbase.org', None, None,
        'http://www.wormbase.org/about/policies#012')
    # filled in during fetch
    self.version_num = None
    return
def __init__(self, database, username, password=None, host=None):
    """
    Initialize the CGD ingest: connection, lookup maps, and namespace
    bindings.

    :param database: database name passed to the parent connection
    :param username: database user
    :param password: optional database password
    :param host: optional database host
    """
    super().__init__('cgd', database, username, password, host)
    self.dataset = Dataset('cgd', 'cgd', 'http://ga4gh.org')
    self.gene_map = {}
    self.disease_map = {}
    self.drug_map = {}
    self.transcript_xrefs = {'RefSeq': {}, 'UniProt': {}}

    # bind each curie prefix to an rdflib Namespace
    # FIX: fetch the curie map once and iterate its items, instead of
    # re-fetching the whole map for every key inside the loop
    self.bindings = {}
    for prefix, uri in curie_map.get().items():
        self.bindings[prefix] = Namespace(uri)
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the MPD ingest: dataset metadata and measure-score caches."""
    Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')
    # @N, not sure if this step is required
    self.stdevthreshold = 2

    # update the dataset object with details about this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset('mpd', 'MPD', 'http://phenome.jax.org', None, None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.assayhash = {}
    self.idlabel_hash = {}
    # to store the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # to store the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the IMPC ingest and its dataset metadata."""
    super().__init__(graph_type, are_bnodes_skolemized, 'impc')

    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'impc',
        'IMPC',
        'http://www.mousephenotype.org',
        None,
        'https://raw.githubusercontent.com/mpi2/PhenotypeArchive/master/LICENSE'
    )

    # TODO add a citation for impc dataset as a whole
    # :impc cito:citesAsAuthority PMID:24194600
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the Animal QTLdb ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'animalqtldb')

    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'animalqtldb', 'Animal QTL db',
        'http://www.animalgenome.org/cgi-bin/QTLdb/index', None, None,
        AQDL + '/faq#23', graph_type=graph_type)

    # source-specific warnings. will be cleared when resolved.
    logger.warning(
        "No licences or rights exist for the raw data from this resource.")
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the HPO-annotation ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'hpoa')
    self.dataset = Dataset(
        'hpoa', 'Human Phenotype Ontology',
        'http://www.human-phenotype-ontology.org', None,
        'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html'
    )
    self.replaced_id_count = 0

    # disease test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    # data-source specific warnings to be removed when issues are cleared
    logger.warning(
        "note that some ECO classes are missing for ICE, PCS, and ITM;" +
        " using temporary mappings.")
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the Orphanet ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'orphanet')
    self.dataset = Dataset(
        'orphanet',
        'Orphanet',
        'http://www.orpha.net',
        None,
        'http://creativecommons.org/licenses/by-nd/3.0/',
        'http://omim.org/help/agreement')

    # check to see if there's any ids configured in the config;
    # otherwise, warn
    if 'test_ids' not in config.get_config() \
            or 'disease' not in config.get_config()['test_ids']:
        logger.warning("not configured with disease test ids.")
    return
def __init__(self):
    """
    Initialize the Coriell ingest (legacy Source API).

    Logs data-source caveats and verifies that FTP credentials exist in
    the configuration.
    """
    Source.__init__(self, 'coriell')
    self.load_bindings()
    self.dataset = Dataset(
        'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

    # data-source specific warnings (will be removed when issues are cleared)
    # FIX: Logger.warn is a deprecated alias of Logger.warning
    logger.warning(
        'We assume that if a species is not provided, '
        'that it is a Human-derived cell line')
    logger.warning(
        'We map all omim ids as a disease/phenotype entity, '
        'but should be fixed in the future')

    # check if config exists; if it doesn't, error out and let user know
    if 'dbauth' not in config.get_config() or \
            'coriell' not in config.get_config()['dbauth']:
        logger.error("not configured with FTP user/password.")
    return
def __init__(self):
    """Set up the EOM ingest (legacy Source API)."""
    Source.__init__(self, 'eom')
    self.namespaces.update(curie_map.get())

    # update the dataset object with details about this resource
    # TODO put this into a conf file?
    self.dataset = Dataset(
        'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
        'http://www.genome.gov/copyright.cfm',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    # check if config exists; if it doesn't, error out and let user know
    if 'dbauth' not in config.get_config() \
            or 'disco' not in config.get_config()['dbauth']:
        logger.error("not configured with PG user/password.")
    # source-specific warnings. will be cleared when resolved.
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
    """Set up the Monarch chromosome ontology ingest."""
    super().__init__(graph_type, are_bnodes_skolemized, 'monochrom')
    self.tax_ids = tax_ids

    # Defaults
    if self.tax_ids is None:
        self.tax_ids = [
            9606, 10090, 7955, 10116, 9913, 9031, 9823, 9940, 9796]
    self._check_tax_ids()

    # TODO add license
    self.dataset = Dataset(
        'monochrom',
        'Monarch Chromosome Ontology',
        'http://monarchinitiative.org',
        None,
        'http://creativecommons.org/licenses/by/4.0/')
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
    """
    Initialize the PANTHER ingest.

    :param tax_ids: optional list of taxon ids (stored on the instance)
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'panther')
    self.tax_ids = tax_ids
    self.dataset = Dataset(
        'panther', 'Protein ANalysis THrough Evolutionary Relationships',
        'http://pantherdb.org/', None,
        'http://www.pantherdb.org/terms/disclaimer.jsp')

    # # Defaults
    # if self.tax_ids is None:
    #     self.tax_ids = [9606, 10090, 7955]

    if 'test_ids' not in config.get_config() \
            or 'protein' not in config.get_config()['test_ids']:
        # FIX: the check is for protein test ids, but the message
        # previously said "gene test ids"
        logger.warning("not configured with protein test ids.")
    else:
        self.test_ids = config.get_config()['test_ids']['protein']
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the DECIPHER ingest: dataset, test ids, and model helpers."""
    super().__init__(graph_type, are_bnodes_skolemized, 'decipher')
    self.dataset = Dataset(
        'decipher',
        'Development Disorder Genotype – Phenotype Database',
        'https://decipher.sanger.ac.uk/', None,
        'https://decipher.sanger.ac.uk/legal')

    # disease test ids from config, when configured
    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    self.g = self.graph
    self.geno = Genotype(self.g)
    self.model = Model(self.g)
    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the GeneReviews ingest: dataset, citation, and test ids."""
    super().__init__(graph_type, are_bnodes_skolemized, 'genereviews')
    self.dataset = Dataset(
        'genereviews',
        'Gene Reviews',
        'http://genereviews.org/',
        None,
        'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
    self.dataset.set_citation('GeneReviews:NBK1116')

    self.book_ids = set()
    self.all_books = {}

    if 'test_ids' not in config.get_config() or \
            'disease' not in config.get_config()['test_ids']:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []
    else:
        # select only those test ids that are omim's.
        self.test_ids = config.get_config()['test_ids']['disease']
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None,
             gene_ids=None):
    """
    Initialize the HGNC ingest.

    :param tax_ids: optional list of taxon ids (stored on the instance)
    :param gene_ids: optional list of gene ids; may be overridden by
        configured gene test ids
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'hgnc')
    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.dataset = Dataset(
        'hgnc', 'HGNC', 'http://www.genenames.org', None)

    # FIX: previously self.gene_ids was unconditionally reset to [],
    # silently discarding the constructor argument; only default when
    # no gene_ids were supplied
    if self.gene_ids is None:
        self.gene_ids = []

    if 'test_ids' not in config.get_config() \
            or 'gene' not in config.get_config()['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = config.get_config()['test_ids']['gene']

    self.properties = Feature.properties
    return
def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None,
             version=None):
    """
    Set up the Bgee ingest.

    :param tax_ids: [int,], List of taxa; defaults to Bgee.DEFAULT_TAXA
    :param version: optional release label; defaults to 'current'
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'bgee')

    if tax_ids is None:
        self.tax_ids = Bgee.DEFAULT_TAXA
    else:
        logger.info("Filtering on taxa {}".format(tax_ids))
        self.tax_ids = tax_ids

    self.dataset = Dataset(
        'Bgee', 'Bgee Gene expression data in animals', 'http://bgee.org/')

    self.version = 'current' if version is None else version
def __init__(self, graph_type, are_bnodes_skolemized):
    """Set up the OMIA ingest: id/label caches, test ids, and dataset."""
    super().__init__(graph_type, are_bnodes_skolemized, 'omia')
    self.dataset = Dataset(
        'omia', 'Online Mendelian Inheritance in Animals',
        'http://omia.angis.org.au', None, None,
        'http://sydney.edu.au/disclaimer.shtml')

    # per-type caches of identifiers seen while parsing
    self.id_hash = {
        'article': {},
        'phene': {},
        'breed': {},
        'taxon': {},
        'gene': {}
    }
    self.label_hash = {}
    # used to store the omia to omim phene mappings
    self.omia_omim_map = {}
    # used to store the unique genes that have phenes
    # (for fetching orthology)
    self.annotated_genes = set()

    self.test_ids = {
        'disease': [
            'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
            'OMIA:000810', 'OMIA:001400'],
        'gene': [
            492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
            28713538, 291822383],
        'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825],
        # to be filled in during parsing of breed table
        # for lookup by breed-associations
        'breed': []
    }

    # to store a map of omia ids and any molecular info
    # to write a report for curation
    self.stored_omia_mol_gen = {}
    self.g = self.graph
    return
def __init__(self):
    """Set up the GeneReviews ingest (legacy Source API)."""
    Source.__init__(self, 'genereviews')
    self.load_bindings()
    self.dataset = Dataset(
        'genereviews', 'Gene Reviews', 'http://genereviews.org/', None,
        'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
    self.dataset.set_citation('GeneReviews:NBK1116')
    self.gu = GraphUtils(curie_map.get())

    self.book_ids = set()
    self.all_books = {}

    if 'test_ids' in config.get_config() \
            and 'disease' in config.get_config()['test_ids']:
        # select only those test ids that are omim's.
        self.test_ids = config.get_config()['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = list()
    return
class EOM(PostgreSQLSource):
    """
    Elements of Morphology is a resource from NHGRI that has definitions of
    morphological abnormalities, together with image depictions.
    We pull those relationships, as well as our local mapping of equivalences
    between EOM and HP terminologies.

    The website is crawled monthly by NIF's DISCO crawler system,
    which we utilize here.
    Be sure to have pg user/password connection details in your conf.json
    file, like:
    dbauth : {'disco' : {'user' : '<username>', 'password' : '<password>'}}

    Monarch-curated data for the HP to EOM mapping is stored at
    https://phenotype-ontologies.googlecode.com

    Since this resource is so small, the entirety of it is the "test" set.
    """

    # we are using the production view here; should we be using services?
    tables = [
        'dvp.pr_nlx_157874_1'
    ]

    files = {
        'map': {
            'file': 'hp-to-eom-mapping.tsv',
            'url': 'https://phenotype-ontologies.googlecode.com/svn/trunk/src/ontology/hp/mappings/hp-to-eom-mapping.tsv'
        }
    }

    def __init__(self):
        """Set up the EOM ingest: dataset metadata and db-config check."""
        super().__init__('eom')
        self.namespaces.update(curie_map.get())

        # update the dataset object with details about this resource
        # TODO put this into a conf file?
        self.dataset = Dataset(
            'eom', 'EOM', 'http://elementsofmorphology.nih.gov', None,
            'http://www.genome.gov/copyright.cfm',
            'https://creativecommons.org/publicdomain/mark/1.0/')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'disco' not in config.get_config()['dbauth']:
            logger.error("not configured with PG user/password.")

        # source-specific warnings.  will be cleared when resolved.
        return

    def fetch(self, is_dl_forced=False):
        '''create the connection details for DISCO'''
        cxn = config.get_config()['dbauth']['disco']
        # host/database/port are fixed for the DISCO crawler mirror
        cxn.update(
            {'host': 'nif-db.crbs.ucsd.edu', 'database': 'disco_crawler',
             'port': 5432})

        self.dataset.setFileAccessUrl(
            ''.join(('jdbc:postgresql://', cxn['host'], ':', str(cxn['port']),
                     '/', cxn['database'])))

        # process the tables
        # self.fetch_from_pgdb(self.tables,cxn,100)  #for testing
        self.fetch_from_pgdb(self.tables, cxn)

        self.get_files(is_dl_forced)

        # FIXME: Everything needed for data provenance?
        # use the raw table file's ctime as the dataset version date
        st = os.stat('/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')))
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        '''
        Over ride Source.parse inherited via PostgreSQLSource

        :param limit: optional cap on rows parsed per file
        '''
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.testOnly:
            self.testMode = True

        logger.info("Parsing files...")

        self._process_nlx_157874_1_view(
            '/'.join((self.rawdir, 'dvp.pr_nlx_157874_1')), limit)
        self._map_eom_terms(
            '/'.join((self.rawdir, self.files['map']['file'])), limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        # since it's so small,
        # we default to copying the entire graph to the test set
        self.testgraph = self.graph

        logger.info("Found %s nodes", len(self.graph))
        return

    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        Triples:
            <eom id> a owl:Class
                rdf:label Literal(eom label)
                OIO:hasRelatedSynonym Literal(synonym list)
                IAO:definition Literal(objective_def. subjective def)
                foaf:depiction Literal(small_image_url),
                               Literal(large_image_url)
                foaf:page Literal(page_url)
                rdfs:comment Literal(long commented text)

        :param raw: path to the raw tab-separated table file
        :param limit: optional cap on rows parsed
        :return:
        """
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified) = line
                # note:
                # e_uid v_uuid v_last_modified terminology_category_url
                # subcategory v_uid morphology_term_num
                # terminology_category_label hp_label notes
                # are currently unused.

                # Add morphology term to graph as a class
                # with label, type, and description.
                gu.addClassToGraph(self.graph, morphology_term_id,
                                   morphology_term_label)

                # Assemble the description text
                # ensure both parts end with a period before joining
                if subjective_definition != '' and not (
                        re.match(r'.+\.$', subjective_definition)):
                    # add a trailing period.
                    subjective_definition = \
                        subjective_definition.strip() + '.'
                if objective_definition != '' and not (
                        re.match(r'.+\.$', objective_definition)):
                    # add a trailing period.
                    objective_definition = objective_definition.strip() + '.'

                definition = \
                    ' '.join(
                        (objective_definition, subjective_definition)).strip()

                gu.addDefinition(self.graph, morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    gu.addComment(self.graph, morphology_term_id,
                                  comments.strip())

                # semicolon-delimited exact synonyms
                if synonyms != '':
                    for s in synonyms.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        gu.addSynonym(
                            self.graph, morphology_term_id, s.strip(),
                            gu.properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                gu.addPage(self.graph, morphology_term_id,
                           morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return

    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>

        :param raw: path to the raw mapping tsv file
        :param limit: optional cap on rows parsed
        :return:
        """
        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label,
                 notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(
                        self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s',
                                   morphology_term_label)

                if limit is not None and line_counter > limit:
                    break
        return

    def getTestSuite(self):
        """Load and return the EOM unit-test suite."""
        import unittest
        # TODO PYLINT: Unable to import 'tests.test_eom'
        from tests.test_eom import EOMTestCase
        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(EOMTestCase)
        return test_suite
class Coriell(Source):
    """
    The Coriell Catalog provided to Monarch includes metadata and descriptions
    of NIGMS, NINDS, NHGRI, and NIA cell lines.  These lines are made
    available for research purposes.  Here, we create annotations for the
    cell lines as models of the diseases from which they originate.

    We create a handle for a patient from which the given cell line is derived
    (since there may be multiple cell lines created from a given patient).
    A genotype is assembled for a patient, which includes a karyotype
    (if specified) and/or a collection of variants.
    Both the genotype (has_genotype) and disease are linked to the patient
    (has_phenotype), and the cell line is listed as derived from the patient.
    The cell line is classified by it's
    [CLO cell type](http://www.ontobee.org/browser/index.php?o=clo),
    which itself is linked to a tissue of origin.

    Unfortunately, the omim numbers listed in this file are both for genes
    & diseases; we have no way of knowing a priori if a designated omim number
    is a gene or disease; so we presently link the patient to any omim id via
    the has_phenotype relationship.

    Notice: The Coriell catalog is delivered to Monarch in a specific format,
    and requires ssh rsa fingerprint identification.
    Other groups wishing to get this data in it's raw form will need to
    contact Coriell for credential
    This needs to be placed into your configuration file for it to work.

    """

    # ontology terms used when typing nodes built by this ingest
    terms = {
        'cell_line_repository': 'CLO:0000008',
        'race': 'SIO:001015',
        'ethnic_group': 'EFO:0001799',
        'age': 'EFO:0000246',
        'sampling_time': 'EFO:0000689',
        'collection': 'ERO:0002190'
    }

    # one entry per Coriell catalog we ingest
    files = {
        'NINDS': {
            'file': 'NINDS.csv',
            'id': 'NINDS',
            'label': 'NINDS Human Genetics DNA and Cell line Repository',
            'page': 'https://catalog.coriell.org/1/NINDS'},
        'NIGMS': {
            'file': 'NIGMS.csv',
            'id': 'NIGMS',
            'label': 'NIGMS Human Genetic Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIGMS'},
        'NIA': {
            'file': 'NIA.csv',
            'id': 'NIA',
            'label': 'NIA Aging Cell Repository',
            'page': 'https://catalog.coriell.org/1/NIA'},
        'NHGRI': {
            'file': 'NHGRI.csv',
            'id': 'NHGRI',
            'label': 'NHGRI Sample Repository for Human Genetic Research',
            'page': 'https://catalog.coriell.org/1/NHGRI'}
    }

    # the following will house the specific cell lines to use for test output
    # FIX: 'AG07602' and 'AG07601' were missing a separating comma, which
    # silently concatenated them into the bogus id 'AG07602AG07601' and
    # excluded both real ids from test mode.
    test_lines = [
        'ND02380', 'ND02381', 'ND02383', 'ND02384', 'GM17897', 'GM17898',
        'GM17896', 'GM17944', 'GM17945', 'ND00055', 'ND00094', 'ND00136',
        'GM17940', 'GM17939', 'GM20567', 'AG02506', 'AG04407', 'AG07602',
        'AG07601', 'GM19700', 'GM19701', 'GM19702', 'GM00324', 'GM00325',
        'GM00142', 'NA17944', 'AG02505', 'GM01602', 'GM02455', 'AG00364',
        'GM13707', 'AG00780']

    def __init__(self):
        Source.__init__(self, 'coriell')

        self.load_bindings()

        self.dataset = Dataset(
            'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            'We assume that if a species is not provided, '
            'that it is a Human-derived cell line')
        logger.warning(
            'We map all omim ids as a disease/phenotype entity, '
            'but should be fixed in the future')

        # check if config exists; if it doesn't, error out and let user know
        if 'dbauth' not in config.get_config() or \
                'coriell' not in config.get_config()['dbauth']:
            logger.error("not configured with FTP user/password.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Here we connect to the coriell sftp server using private connection
        details.  They dump bi-weekly files with a timestamp in the filename.
        For each catalog, we poll the remote site and pull the most-recently
        updated file, renaming it to our local *_latest.csv.

        Be sure to have pg user/password connection details in your conf.json
        file, like:
        dbauth : {"coriell" : {
        "user" : "<username>", "password" : "<password>",
        "host" : "<host>", "private_key" : "/path/to/rsa_key"}
        }

        :param is_dl_forced:
        :return:

        """
        host = config.get_config()['dbauth']['coriell']['host']
        user = config.get_config()['dbauth']['coriell']['user']
        passwd = config.get_config()['dbauth']['coriell']['password']
        key = config.get_config()['dbauth']['coriell']['private_key']

        with pysftp.Connection(
                host, username=user, password=passwd, private_key=key) as sftp:
            # check to make sure each file is in there
            # get the remote files
            remote_files = sftp.listdir_attr()
            files_by_repo = {}
            for attr in remote_files:
                # for each catalog, get the most-recent filename
                m = re.match('(NIGMS|NIA|NHGRI|NINDS)', attr.filename)
                if m is not None and len(m.groups()) > 0:
                    # there should just be one now
                    files_by_repo[m.group(1)] = attr
            # sort each array in hash,
            # & get the name and time of the most-recent file for each catalog
            for r in self.files:
                logger.info("Checking on %s catalog file", r)
                fname = self.files[r]['file']
                remotef = files_by_repo[r]
                target_name = '/'.join((self.rawdir, fname))
                # check if the local file is out of date, if so, download.
                # otherwise, skip.
                # we rename (for simplicity) the original file
                st = None
                if os.path.exists(target_name):
                    st = os.stat(target_name)
                    logger.info(
                        "Local file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                if st is None or remotef.st_mtime > st[stat.ST_CTIME]:
                    if st is None:
                        logger.info(
                            "File does not exist locally; downloading...")
                    else:
                        logger.info(
                            "There's a new version of %s catalog available; "
                            "downloading...", r)
                    sftp.get(remotef.filename, target_name)
                    logger.info(
                        "Fetched remote %s -> %s",
                        remotef.filename, target_name)
                    st = os.stat(target_name)
                    filedate = \
                        datetime.utcfromtimestamp(
                            remotef.st_mtime).strftime("%Y-%m-%d")
                    logger.info(
                        "New file date: %s",
                        datetime.utcfromtimestamp(st[stat.ST_CTIME]))
                else:
                    logger.info("File %s exists; using local copy", fname)
                    filedate = \
                        datetime.utcfromtimestamp(
                            st[stat.ST_CTIME]).strftime("%Y-%m-%d")

                self.dataset.setFileAccessUrl(remotef.filename)
                self.dataset.setVersion(filedate)
        return

    def parse(self, limit=None):
        """
        Process each catalog: first its repository metadata, then its rows.

        :param limit: maximum number of rows to process per file
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)
        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for f in self.files:
            file = '/'.join((self.rawdir, self.files[f]['file']))
            self._process_collection(
                self.files[f]['id'],
                self.files[f]['label'],
                self.files[f]['page'])
            self._process_data(file, limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw: path to the catalog csv file
        :param limit: maximum number of rows to process (None = no limit)
        :return: None
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                    #     this would give a BNode that is an instance of Age.
                    #     but i don't know how to connect
                    #     the age node to the cell line? we need to ask @mbrush
                    #     age_id = '_'+re.sub('\s+','_',age)
                    #     gu.addIndividualToGraph(
                    #       g,age_id,age,self.terms['age'])
                    #     gu.addTriple(
                    #       g,age_id,self.properties['has_measurement'],age,
                    #       True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    # NOTE(review): 'vl' is only bound when gene is non-empty,
                    # but is read below whenever variant_id is non-empty --
                    # a row with variants and no gene symbol would raise
                    # NameError here; confirm against source data.
                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            # NOTE(review): var_num is only (re)bound when the
                            # regex matches; a non-matching id reuses the
                            # previous iteration's value -- confirm intended.
                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label

                        # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g,
                            patient_id,
                            geno.properties['has_genotype'],
                            genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name,
                                            patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return

    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id: local id for the repository (e.g. NIGMS)
        :param label: human-readable repository label
        :param page: repository home page url
        :return: None
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        # the repository is added to both graphs so test output is complete
        for g in [self.graph, self.testgraph]:
            # FIXME: How to devise a label for each repository?
            gu = GraphUtils(curie_map.get())
            repo_id = 'CoriellCollection:'+collection_id
            repo_label = label
            repo_page = page

            gu.addIndividualToGraph(
                g, repo_id, repo_label, self.terms['collection'])
            gu.addPage(g, repo_id, repo_page)

        return

    @staticmethod
    def _map_cell_type(sample_type):
        """
        Map a Coriell sample type string to a CL cell-type class.
        Returns None (and logs) for unmapped types.
        """
        ctype = None
        type_map = {
            # FIXME: mesenchymal stem cell of adipose
            'Adipose stromal cell': 'CL:0002570',
            # FIXME: amniocyte?
            'Amniotic fluid-derived cell line': 'CL:0002323',
            # B cell
            'B-Lymphocyte': 'CL:0000236',
            # FIXME: No Match
            'Chorionic villus-derived cell line': 'CL:0000000',
            # endothelial cell
            'Endothelial': 'CL:0000115',
            # epithelial cell
            'Epithelial': 'CL:0000066',
            # FIXME: No Match. "Abnormal precursor (virally transformed)
            # of mouse erythrocytes that can be grown in culture and
            # induced to differentiate by treatment with, for example, DMSO."
            'Erythroleukemic cell line': 'CL:0000000',
            'Fibroblast': 'CL:0000057',  # fibroblast
            'Keratinocyte': 'CL:0000312',  # keratinocyte
            'Melanocyte': 'CL:0000148',  # melanocyte
            'Mesothelial': 'CL:0000077',
            'Microcell hybrid': 'CL:0000000',  # FIXME: No Match
            'Myoblast': 'CL:0000056',  # myoblast
            'Smooth muscle': 'CL:0000192',  # smooth muscle cell
            'Stem cell': 'CL:0000034',  # stem cell
            'T-Lymphocyte': 'CL:0000084',  # T cell
            # FIXME: No Match. "Cells isolated from a mass of neoplastic
            # cells, i.e., a growth formed by abnormal cellular
            # proliferation."  Oncocyte? CL:0002198
            'Tumor-derived cell line': 'CL:0002198'
        }
        if sample_type.strip() in type_map:
            # FIX: look up with the same stripped key we tested, otherwise
            # whitespace-padded input passed the check but returned None
            ctype = type_map.get(sample_type.strip())
        else:
            logger.error("Cell type not mapped: %s", sample_type)

        return ctype

    @staticmethod
    def _map_race(race):
        """
        Map a Coriell race string to an EFO class.
        Returns None (and logs) for unmapped values.
        """
        rtype = None
        type_map = {
            'African American': 'EFO:0003150',
            # 'American Indian': 'EFO',
            'Asian': 'EFO:0003152',
            # FIXME: Asian?
            'Asian; Other': 'EFO:0003152',
            # Asian Indian
            'Asiatic Indian': 'EFO:0003153',
            # FIXME: African American? There is also African.
            'Black': 'EFO:0003150',
            'Caucasian': 'EFO:0003156',
            'Chinese': 'EFO:0003157',
            'East Indian': 'EFO:0003158',  # Eastern Indian
            'Filipino': 'EFO:0003160',
            # Hispanic: EFO:0003169, Latino: EFO:0003166 see next
            'Hispanic/Latino': 'EFO:0003169',
            'Japanese': 'EFO:0003164',
            'Korean': 'EFO:0003165',
            # 'More than one race': 'EFO',
            # 'Not Reported': 'EFO',
            # 'Other': 'EFO',
            # Asian/Pacific Islander
            'Pacific Islander': 'EFO:0003154',
            # Asian/Pacific Islander
            'Polynesian': 'EFO:0003154',
            # 'Unknown': 'EFO',
            # Asian
            'Vietnamese': 'EFO:0003152',
        }
        if race.strip() in type_map:
            # FIX: use the stripped key for the lookup as well
            rtype = type_map.get(race.strip())
        else:
            logger.warning("Race type not mapped: %s", race)

        return rtype

    @staticmethod
    def _map_species(species):
        """
        Map a species name string to an NCBITaxon class.
        Returns None (and logs) for unmapped names.
        """
        tax = None
        type_map = {
            'Mus musculus': 'NCBITaxon:10090',
            'Peromyscus peromyscus californicus': 'NCBITaxon:42520',
            'Peromyscus peromyscus maniculatus': 'NCBITaxon:10042',
            'Peromyscus peromyscus leucopus': 'NCBITaxon:10041',
            'Peromyscus peromyscus polionotus': 'NCBITaxon:42413',
            'Macaca fascicularis': 'NCBITaxon:9541',
            'Rattus norvegicus': 'NCBITaxon:10116',
            'Papio anubis': 'NCBITaxon:9555',
            'Cricetulus griseus': 'NCBITaxon:10029',
            'Geochelone elephantopus': 'NCBITaxon:66189',
            'Muntiacus muntjak': 'NCBITaxon:9888',
            'Ailurus fulgens': 'NCBITaxon:9649',
            'Sus scrofa': 'NCBITaxon:9823',
            'Bos taurus': 'NCBITaxon:9913',
            'Oryctolagus cuniculus': 'NCBITaxon:9986',
            'Macaca nemestrina': 'NCBITaxon:9545',
            'Canis familiaris': 'NCBITaxon:9615',
            'Equus caballus': 'NCBITaxon:9796',
            'Macaca mulatta': 'NCBITaxon:9544',
            'Mesocricetus auratus': 'NCBITaxon:10036',
            'Macaca nigra': 'NCBITaxon:54600',
            'Erythrocebus patas': 'NCBITaxon:9538',
            'Pongo pygmaeus': 'NCBITaxon:9600',
            'Callicebus moloch': 'NCBITaxon:9523',
            'Lagothrix lagotricha': 'NCBITaxon:9519',
            'Saguinus fuscicollis': 'NCBITaxon:9487',
            'Saimiri sciureus': 'NCBITaxon:9521',
            'Saguinus labiatus': 'NCBITaxon:78454',
            'Pan paniscus': 'NCBITaxon:9597',
            'Ovis aries': 'NCBITaxon:9940',
            'Felis catus': 'NCBITaxon:9685',
            'H**o sapiens': 'NCBITaxon:9606'
        }
        if species.strip() in type_map:
            # FIX: use the stripped key for the lookup as well
            tax = type_map.get(species.strip())
        else:
            logger.warning("Species type not mapped: %s", species)

        return tax

    @staticmethod
    def _map_collection(collection):
        """
        Map a repository/collection name to its CoriellCollection curie.
        Returns None (and logs) for unmapped names.
        """
        ctype = None
        type_map = {
            'NINDS Repository': 'CoriellCollection:NINDS',
            'NIGMS Human Genetic Cell Repository': 'CoriellCollection:NIGMS',
            'NIA Aging Cell Culture Repository': 'CoriellCollection:NIA',
            'NHGRI Sample Repository for Human Genetic Research':
                'CoriellCollection:NHGRI'
        }
        if collection.strip() in type_map:
            # FIX: use the stripped key for the lookup as well
            ctype = type_map.get(collection.strip())
        else:
            # consistency: drop the redundant "ERROR:" prefix; this is a
            # warning, matching the sibling _map_* methods
            logger.warning("Collection type not mapped: %s", collection)

        return ctype

    @staticmethod
    def _get_affected_chromosomes_from_karyotype(karyotype):
        """
        Extract the set of chromosome names mentioned inside the
        aberration clauses of a karyotype string (e.g. del(7), t(4;11)),
        plus any abnormal sex-chromosome complement.

        :param karyotype: karyotype string
        :return: set of chromosome name strings
        """
        affected_chromosomes = set()
        chr_regex = r'(\d+|X|Y|M|\?);?'
        abberation_regex = r'(?:add|del|der|i|idic|inv|r|rec|t)\([\w;]+\)'
        sex_regex = r'(?:;)(X{2,}Y+|X?Y{2,}|X{3,}|X|Y)(?:;|$)'

        # first fetch the set of abberations
        abberations = re.findall(abberation_regex, karyotype)

        # iterate over them to get the chromosomes
        for a in abberations:
            chrs = re.findall(chr_regex, a)
            affected_chromosomes = affected_chromosomes.union(set(chrs))

        # remove the ? as a chromosome, since it isn't valid
        if '?' in affected_chromosomes:
            affected_chromosomes.remove('?')

        # check to see if there are any abnormal sex chromosomes
        m = re.search(sex_regex, karyotype)
        if m is not None:
            if re.search(r'X?Y{2,}', m.group(1)):
                # this is the only case where there is an extra Y chromosome
                affected_chromosomes.add('Y')
            else:
                affected_chromosomes.add('X')

        return affected_chromosomes

    @staticmethod
    def _is_normal_karyotype(karyotype):
        """
        This will default to true if no karyotype is provided.
        This is assuming human karyotypes.

        :param karyotype: karyotype string (may be None)
        :return: True for '46;XX', '46;XY', empty, or None
        """
        is_normal = True
        if karyotype is not None:
            karyotype = karyotype.strip()
            if karyotype not in ['46;XX', '46;XY', '']:
                is_normal = False

        return is_normal

    def getTestSuite(self):
        # lazy imports keep the test framework out of normal ingest runs
        import unittest
        from tests.test_coriell import CoriellTestCase
        # TODO add G2PAssoc, Genotype tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(CoriellTestCase)

        return test_suite
class Source:
    """
    Abstract class for any data sources that we'll import and process.
    Each of the subclasses will fetch() the data, scrub() it as necessary,
    then parse() it into a graph. The graph will then be written out to
    a single self.name().<dest_fmt> file.

    Also provides a means to marshal metadata in a consistent fashion.

    Houses the global translation table (from ontology label to ontology
    term) so it may as well be used everywhere.
    """

    # per-subclass curie-prefix -> iri mappings (populated by subclasses)
    namespaces = {}
    # per-subclass remote file descriptors (populated by subclasses)
    files = {}

    def __init__(
            self,
            graph_type='rdf_graph',     # or streamed_graph
            are_bnodes_skized=False,    # typically True
            name=None,                  # identifier; make an IRI for nquads
            ingest_title=None,
            ingest_url=None,
            license_url=None,           # only if it is _our_ lic
            data_rights=None,           # external page that points to their current lic
            file_handle=None
    ):
        # pull in the common test identifiers
        self.all_test_ids = self.open_and_parse_yaml(
            '../../resources/test_ids.yaml')

        self.graph_type = graph_type
        self.are_bnodes_skized = are_bnodes_skized
        self.ingest_url = ingest_url
        self.ingest_title = ingest_title
        self.localtt = self.load_local_translationtable(name)

        if name is not None:
            self.name = name.lower()
        elif self.whoami() is not None:
            # NOTE(review): whoami() has no explicit return, so this branch
            # appears unreachable; left as-is to preserve behavior
            self.name = self.whoami().lower()

        LOG.info("Processing Source \"%s\"", self.name)
        self.test_only = False
        self.path = ""
        # to be used to store a subset of data for testing downstream.
        self.triple_count = 0
        self.outdir = 'out'
        self.testdir = 'tests'
        self.rawdir = 'raw'
        self.rawdir = '/'.join((self.rawdir, self.name))
        self.testname = name + "_test"
        self.testfile = '/'.join((self.outdir, self.testname + ".ttl"))
        self.datasetfile = None

        # still need to pull in file suffix -- this ia a curie not a url
        self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'

        # if raw data dir doesn't exist, create it
        if not os.path.exists(self.rawdir):
            os.makedirs(self.rawdir)
            pth = os.path.abspath(self.rawdir)
            LOG.info("creating raw directory for %s at %s", self.name, pth)

        # if output dir doesn't exist, create it
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)
            pth = os.path.abspath(self.outdir)
            LOG.info("created output directory %s", pth)

        LOG.info("Creating Test graph %s", self.testname)
        # note: tools such as protoge need slolemized blank nodes
        self.testgraph = RDFGraph(True, self.testname)

        if graph_type == 'rdf_graph':
            graph_id = ':MONARCH_' + str(self.name) + "_" + \
                datetime.now().isoformat(' ').split()[0]
            LOG.info("Creating graph %s", graph_id)
            self.graph = RDFGraph(are_bnodes_skized, graph_id)
        elif graph_type == 'streamed_graph':
            # need to expand on export formats
            # TODO(review): this handle is never explicitly closed
            dest_file = open(pth + '/' + name + '.nt', 'w')
            self.graph = StreamedGraph(are_bnodes_skized, dest_file)
            # leave test files as turtle (better human readibility)
        else:
            LOG.error(
                "%s graph type not supported\n"
                "valid types: rdf_graph, streamed_graph", graph_type)

        # pull in global ontology mapping datastructures
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # self.prefix_base = {v: k for k, v in self.curie_map.items()}

        # will be set to True if the intention is
        # to only process and write the test data
        self.test_only = False
        self.test_mode = False

        # this may eventually support Bagits
        self.dataset = Dataset(
            self.archive_url,
            self.ingest_title,
            self.ingest_url,
            None,           # description
            license_url,    # only _OUR_ lic
            data_rights,    # tries to point to others lics
            graph_type,
            file_handle
        )

        for graph in [self.graph, self.testgraph]:
            self.declareAsOntology(graph)

    def fetch(self, is_dl_forced=False):
        """
        abstract method to fetch all data from an external resource.
        this should be overridden by subclasses
        :return: None
        """
        raise NotImplementedError

    def parse(self, limit):
        """
        abstract method to parse all data from an external resource,
        that was fetched in fetch() this should be overridden by subclasses
        :return: None
        """
        raise NotImplementedError

    def write(self, fmt='turtle', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source.
        Right now these are hardcoded to be a single "graph"
        and a "src_dataset.ttl" and a "src_test.ttl"
        If you do not supply stream='stdout' it will default write
        these to files.
        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.

        :param fmt: str serialization format key (see fmt_ext) or a
                    raw format name passed through to the writer
        :param stream: 'stdout' to write to standard out, else files
        :return: None
        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',         # ntriples
            'nquads': 'nq',
            'n3': 'n3'          # notation3
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                # unknown format keys are used verbatim as the suffix
                dest = '.'.join((dest, fmt))
            LOG.info("Setting outfile to %s", dest)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)

            if self.dataset is not None and self.dataset.version is None:
                self.dataset.set_version_by_date()
                LOG.info("No version for %s setting to date issued.", self.name)
        else:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'

        gu = GraphUtils(None)

        # the _dataset description is always turtle
        gu.write(self.dataset.getGraph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            gu.write(self.testgraph, 'turtle', filename=self.testfile)

        # print graph out
        if stream is None:
            outfile = dest
        elif stream.lower().strip() == 'stdout':
            outfile = None
        else:
            LOG.error("I don't understand our stream.")
            return

        gu.write(self.graph, fmt, filename=outfile)

    def whoami(self):
        ''' pointless convieniance '''
        LOG.info("Ingest is %s", self.name)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        a method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1

        :param long_string: str input to digest
        :param prefix: str curie prefix (default 'MONARCH')
        :return: str curie of the form <prefix>:<hash>
        """
        return ':'.join((prefix, Source.hash_id(long_string)))

    @staticmethod
    def hash_id(wordage):  # same as graph/GraphUtils.digest_id(wordage)
        """
        prepend 'b' to avoid leading with digit
        truncate to a 20 char sized word with a leading 'b'
        return truncated sha1 hash of string.

        by the birthday paradox;
        expect 50% chance of collision after 69 billion invocations
        however these are only hoped to be unique within a single file

        Consider reducing to 17 hex chars to fit in a 64 bit word
        16 discounting a leading constant
        gives a 50% chance of collision at about 4.3b billion unique
        input strings (currently _many_ orders of magnitude below that)

        :param wordage: str string to be hashed
        :return: str hash of id
        """
        return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]

    def checkIfRemoteIsNewer(self, remote, local, headers):
        """
        Given a remote file location, and the corresponding local file
        this will check the datetime stamp on the files to see if the remote
        one is newer.
        This is a convenience method to be used so that we don't have to
        re-fetch files that we already have saved locally

        :param remote: URL of file to fetch from remote server
        :param local: pathname to save file to locally
        :param headers: dict of request headers, or None for defaults
        :return: True if the remote file is newer and should be downloaded
        """
        LOG.info("Checking if remote file is newer than local \n(%s)", local)

        # check if local file exists
        # if no local file, then remote is newer
        if os.path.exists(local):
            LOG.info("Local File exists as %s", local)
        else:
            LOG.info("Local File does NOT exist as %s", local)
            return True

        # get remote file details
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)
        LOG.info("Request header: %s", str(req.header_items()))

        # NOTE(review): urlopen itself is outside the try below, so a
        # URLError raised by the request propagates; only header access
        # is guarded — confirm that is the intent
        response = urllib.request.urlopen(req)

        try:
            resp_headers = response.info()
            size = resp_headers.get('Content-Length')
            last_modified = resp_headers.get('Last-Modified')
        except urllib.error.URLError as err:
            resp_headers = None
            size = 0
            last_modified = None
            LOG.error(err)

        if size is not None and size != '':
            size = int(size)
        else:
            size = 0

        fstat = os.stat(local)
        LOG.info(
            "Local File date: %s",
            datetime.utcfromtimestamp(fstat[ST_CTIME]))

        if last_modified is not None:
            # Thu, 07 Aug 2008 16:20:19 GMT
            dt_obj = datetime.strptime(
                last_modified, "%a, %d %b %Y %H:%M:%S %Z")

            # get local file details

            # check date on local vs remote file
            if dt_obj > datetime.utcfromtimestamp(fstat[ST_CTIME]):
                # check if file size is different
                if fstat[ST_SIZE] < size:
                    LOG.info("New Remote File exists")
                    return True
                if fstat[ST_SIZE] > size:
                    LOG.warning("New Remote File exists but it is SMALLER")
                    return True
                # filesize is a fairly imperfect metric here
                LOG.info("New Remote fFle has same filesize--will not download")
        elif fstat[ST_SIZE] != size:
            LOG.info(
                "Remote File is %i \t Local File is %i", size, fstat[ST_SIZE])
            return True

        return False

    def get_files(self, is_dl_forced, files=None):
        """
        Given a set of files for this source, it will go fetch them, and
        set a default version by date.  If you need to set the version number
        by another method, then it can be set again.

        :param is_dl_forced: boolean, force download
        :param files: dict, override instance files dict
        :return: None
        """
        fstat = None
        if files is None:
            files = self.files
        for fname in files:
            headers = None
            filesource = files[fname]
            if 'headers' in filesource:
                headers = filesource['headers']
            LOG.info("Getting %s", fname)
            # if the key 'clean' exists in the sources `files` dict
            # expose that instead of the longer url
            if 'clean' in filesource and filesource['clean'] is not None:
                self.dataset.setFileAccessUrl(filesource['clean'])
            else:
                self.dataset.setFileAccessUrl(filesource['url'])
            LOG.info('Fetching %s', filesource['url'])

            self.fetch_from_url(
                filesource['url'],
                '/'.join((self.rawdir, filesource['file'])),
                is_dl_forced, headers)

            fstat = os.stat('/'.join((self.rawdir, filesource['file'])))

        # only keeping the date from the last file
        filedate = datetime.utcfromtimestamp(fstat[ST_CTIME]).strftime("%Y-%m-%d")

        # FIXME
        # change this so the date is attached only to each file,
        # not the entire dataset
        self.dataset.set_date_issued(filedate)

    def fetch_from_url(
            self, remotefile, localfile=None, is_dl_forced=False,
            headers=None):
        """
        Given a remote url and a local filename, attempt to determine
        if the remote file is newer; if it is,
        fetch the remote file and save it to the specified localfile,
        reporting the basic file information once it is downloaded

        :param remotefile: URL of remote file to fetch
        :param localfile: pathname of file to save locally
        :param is_dl_forced: boolean, download even if local copy is current
        :param headers: dict of request headers, or None for defaults
        :return: the urlopen response, or None if no fetch was needed
        """
        response = None
        if ((is_dl_forced is True) or localfile is None or
                (self.checkIfRemoteIsNewer(remotefile, localfile, headers))):
            # TODO url verification, etc
            if headers is None:
                headers = self._get_default_request_headers()

            request = urllib.request.Request(remotefile, headers=headers)
            response = urllib.request.urlopen(request)

            if localfile is not None:
                with open(localfile, 'wb') as binwrite:
                    while True:
                        chunk = response.read(CHUNK)
                        if not chunk:
                            break
                        binwrite.write(chunk)

                LOG.info("Finished.  Wrote file to %s", localfile)
                if self.compare_local_remote_bytes(
                        remotefile, localfile, headers):
                    LOG.debug(
                        "local file is same size as remote after download")
                else:
                    raise Exception(
                        "Error downloading file: "
                        "local file size != remote file size")

                fstat = os.stat(localfile)
                LOG.info("file size: %s", fstat[ST_SIZE])
                LOG.info(
                    "file created: %s",
                    time.asctime(time.localtime(fstat[ST_CTIME])))
            else:
                LOG.error('Local filename is required')
                exit(-1)
        else:
            LOG.info("Using existing file %s", localfile)

        return response

    # TODO: rephrase as mysql-dump-xml specific format
    def process_xml_table(self, elem, table_name, processing_function, limit):
        """
        This is a convenience function to process the elements of an
        xml dump of a mysql relational database.
        The "elem" is akin to a mysql table, with it's name of
        ```table_name```.
        It will process each ```row``` given the ```processing_function```
        supplied.

        :param elem: The element data
        :param table_name: The name of the table to process
        :param processing_function: The row processing function
        :param limit: int row limit (only consulted in test mode)

        Appears to be making calls to the elementTree library
        although it not explicitly imported here.

        :return: None
        """
        line_counter = 0
        table_data = elem.find("[@name='" + table_name + "']")
        if table_data is not None:
            LOG.info("Processing " + table_name)
            row = {}
            for line in table_data.findall('row'):
                for field in line.findall('field'):
                    atts = dict(field.attrib)
                    row[atts['name']] = field.text
                processing_function(row)
                line_counter += 1
                # NOTE(review): 'continue' as the last statement of the loop
                # is a no-op; a 'break' may have been intended — confirm
                if self.test_mode and limit is not None and line_counter > limit:
                    continue

            elem.clear()  # discard the element

    @staticmethod
    def _check_list_len(row, length):
        """
        Sanity check for csv parser

        :param row: list, a parsed row
        :param length: int, expected number of columns
        :raises Exception: when the row length does not match
        :return: None
        """
        if len(row) != length:
            raise Exception(
                "row length does not match expected length of " +
                str(length) + "\nrow: " + str(row))

    @staticmethod
    def get_file_md5(directory, filename, blocksize=2**20):
        """Return the hex md5 digest of the file, read in blocks."""
        # reference:
        # http://stackoverflow.com/questions/1131220/get-md5-hash-of-big-files-in-python
        md5 = hashlib.md5()
        with open(os.path.join(directory, filename), "rb") as bin_reader:
            while True:
                buff = bin_reader.read(blocksize)
                if not buff:
                    break
                md5.update(buff)
        return md5.hexdigest()

    def get_remote_content_len(self, remote, headers=None):
        """
        :param remote: url of remote resource
        :param headers: dict of request headers, or None for defaults
        :return: size of remote file (str from header), or None on error
        """
        if headers is None:
            headers = self._get_default_request_headers()

        req = urllib.request.Request(remote, headers=headers)

        try:
            response = urllib.request.urlopen(req)
            resp_header = response.info()
            byte_size = resp_header.get('Content-length')
        except OSError as err:
            byte_size = None
            LOG.error(err)

        return byte_size

    @staticmethod
    def get_local_file_size(localfile):
        """
        :param localfile: path of local file
        :return: size of file in bytes
        """
        byte_size = os.stat(localfile)
        return byte_size[ST_SIZE]

    def compare_local_remote_bytes(self, remotefile, localfile,
                                   remote_headers=None):
        """
        test to see if fetched file is the same size as the remote file
        using information in the content-length field in the HTTP header

        :return: True or False
        """
        is_equal = True
        remote_size = self.get_remote_content_len(remotefile, remote_headers)
        local_size = self.get_local_file_size(localfile)
        if remote_size is not None and local_size != int(remote_size):
            is_equal = False
            LOG.error(
                'local file and remote file different sizes\n'
                '%s has size %s, %s has size %s',
                localfile, local_size, remotefile, remote_size)
        return is_equal

    @staticmethod
    def file_len(fname):
        """Return the number of lines in the file at ``fname``."""
        with open(fname) as lines:
            length = sum(1 for line in lines)
        return length

    @staticmethod
    def get_eco_map(url):
        """
        To convert the three column file to
        a hashmap we join primary and secondary keys,
        for example
        IEA	GO_REF:0000002	ECO:0000256
        IEA	GO_REF:0000003	ECO:0000501
        IEA	Default	ECO:0000501

        becomes
        IEA-GO_REF:0000002: ECO:0000256
        IEA-GO_REF:0000003: ECO:0000501
        IEA:    ECO:0000501

        :return: dict
        """
        # this would go in a translation table but it is generated dynamically
        # maybe when we move to a make driven system
        eco_map = {}
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)

        for line in response:
            line = line.decode('utf-8').rstrip()
            if re.match(r'^#', line):
                continue
            (code, go_ref, eco_curie) = line.split('\t')
            if go_ref != 'Default':
                eco_map["{}-{}".format(code, go_ref)] = eco_curie
            else:
                eco_map[code] = eco_curie
        return eco_map

    def settestonly(self, testonly):
        """
        Set that this source should only be processed in testMode

        :param testonly: boolean
        :return: None
        """
        self.test_only = testonly

    def settestmode(self, mode):
        """
        Set testMode to (mode).

        - True: run the Source in testMode;
        - False: run it in full mode

        :param mode: boolean
        :return: None
        """
        self.test_mode = mode

    def getTestSuite(self):
        """
        An abstract method that should be overwritten with
        tests appropriate for the specific source.

        :return: None (a unittest suite when overridden)
        """
        return None

    # TODO: pramaterising the release date
    def declareAsOntology(self, graph):
        """
        The file we output needs to be declared as an ontology,
        including it's version information.

        TEC: I am not convinced dipper reformatting external data as RDF
        triples makes an OWL ontology (nor that it should be considered a
        goal).

        Proper ontologies are built by ontologists. Dipper reformats data
        and annotates/decorates it with a minimal set of carefully arranged
        terms drawn from from multiple proper ontologies.
        Which allows the whole (dipper's RDF triples and parent ontologies)
        to function as a single ontology we can reason over when combined
        in a store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        Further information will be augmented in the dataset object.

        :param graph: the RDF graph to decorate
        :return: None
        """
        # <http://data.monarchinitiative.org/ttl/biogrid.ttl> a owl:Ontology ;
        # owl:versionInfo
        # <https://archive.monarchinitiative.org/YYYYMM/ttl/biogrid.ttl>

        model = Model(graph)

        # is self.outfile suffix set yet???
        ontology_file_id = 'MonarchData:' + self.name + ".ttl"
        model.addOntologyDeclaration(ontology_file_id)

        # add timestamp as version info
        cur_time = datetime.now()
        t_string = cur_time.strftime("%Y-%m-%d")
        ontology_version = t_string

        # TEC this means the MonarchArchive IRI needs the release updated
        # maybe extract the version info from there

        # should not hardcode the suffix as it may change
        archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl'
        model.addOWLVersionIRI(ontology_file_id, archive_url)
        model.addOWLVersionInfo(ontology_file_id, ontology_version)
        # TODO make sure this is synced with the Dataset class

    @staticmethod
    def remove_backslash_r(filename, encoding):
        """
        A helpful utility to remove Carriage Return from any file.
        This will read a file into memory,
        and overwrite the contents of the original file.

        TODO: This function may be a liability

        :param filename: path to the file to rewrite in place
        :param encoding: text encoding used to read the file
        :return: None
        """
        # newline='\n' disables universal-newline translation so the
        # raw '\r' characters are visible to the regex below.
        # (was newline=r'\n', a two-character string that open() rejects
        # with ValueError)
        with open(filename, 'r', encoding=encoding, newline='\n') as filereader:
            contents = filereader.read()

        contents = re.sub(r'\r', '', contents)
        with open(filename, "w") as filewriter:
            filewriter.truncate()
            filewriter.write(contents)

    @staticmethod
    def open_and_parse_yaml(yamlfile):
        """
        :param yamlfile: String, path to yaml file relative to this module
        :return: dict, parsed yaml content (empty dict if file is missing)
        """
        # ??? what if the yaml file does not contain a dict datastructure?
        mapping = dict()
        if os.path.exists(os.path.join(os.path.dirname(__file__), yamlfile)):
            map_file = open(os.path.join(os.path.dirname(__file__), yamlfile), 'r')
            mapping = yaml.safe_load(map_file)
            map_file.close()
        else:
            LOG.warning("file: %s not found", yamlfile)

        return mapping

    @staticmethod
    def parse_mapping_file(file):
        """
        :param file: String, path to file containing label-id mappings
                     in the first two columns of each row
        :return: dict where keys are labels and values are ids
        """
        id_map = {}
        if os.path.exists(os.path.join(os.path.dirname(__file__), file)):
            with open(os.path.join(os.path.dirname(__file__), file)) as tsvfile:
                reader = csv.reader(tsvfile, delimiter="\t")
                for row in reader:
                    key = row[0]
                    value = row[1]
                    id_map[key] = value

        return id_map

    @staticmethod
    def _get_default_request_headers():
        # minimal headers so remote servers don't reject the default
        # urllib user agent
        return {'User-Agent': USER_AGENT}

    # @staticmethod
    # def getTestSuite(ingest):    # WIP
    #    '''
    #    try to avoid having one of these per ingest
    #    '''
    #    import unittest
    #    testcase = ingest + 'TestCase'
    #    # construct import names ... how
    #    from tests.test_ + ingest import testcase
    #    return unittest.TestLoader().loadTestsFromTestCase(testcase)

    def load_local_translationtable(self, name):
        '''
        Load "ingest specific" translation from whatever they called something
        to the ontology label we need to map it to.
        To facilitate seeing more ontology labels in dipper ingests
        a reverse mapping from ontology labels to external strings is also
        generated and available as a dict localtcid

        :param name: str ingest name; selects translationtable/<name>.yaml
        :return: dict, the local translation table
        '''
        localtt_file = 'translationtable/' + name + '.yaml'

        try:
            with open(localtt_file):
                pass
        except IOError:
            # write a stub file as a place holder if none exists
            with open(localtt_file, 'w') as write_yaml:
                print('---\n# %s.yaml\n"": ""  # example' % name,
                      file=write_yaml)
        finally:
            with open(localtt_file, 'r') as read_yaml:
                localtt = yaml.safe_load(read_yaml)

        # inverse local translation.
        # note: keeping this invertable will be work.
        # Useful to not litter an ingest with external syntax
        self.localtcid = {v: k for k, v in localtt.items()}

        return localtt

    def resolve(self, word, mandatory=True, default=None):
        '''
        composite mapping
        given f(x) and g(x)    here: localtt & globaltt respectivly
        return g(f(x))|g(x)||f(x)|x in order of preference
        returns x on fall through if finding a mapping
        is not mandatory (by default finding is mandatory).

        This may be specialized further from any mapping
        to a global mapping only; if need be.

        :param word: the srting to find as a key in translation tables
        :param mandatory: boolean to cauae failure when no key exists
        :param default: value to return when resolution fails and
                        finding a mapping is not mandatory
        :return: value from global translation table,
            or value from local translation table,
            or the query key if finding a value is not mandatory (in this order)
        '''
        assert word is not None

        # we may not agree with a remote sources use of a global term we have
        # this provides opportunity for us to override
        if word in self.localtt:
            label = self.localtt[word]
            if label in self.globaltt:
                term_id = self.globaltt[label]
            else:
                logging.info(
                    "Translated to '%s' but no global term_id for: '%s'",
                    label, word)
                term_id = label
        elif word in self.globaltt:
            term_id = self.globaltt[word]
        else:
            if mandatory:
                raise KeyError("Mapping required for: ", word)
            logging.warning("We have no translation for: '%s'", word)
            if default is not None:
                term_id = default
            else:
                term_id = word
        return term_id

    @staticmethod
    def check_fileheader(expected, received):
        '''
        Compare file headers received versus file headers expected
        if the expected headers are a subset (proper or not)
        of received headers report suscess (warn if proper subset)

        param: expected  list
        param: received  list

        return: truthyness
        '''
        exp = set(expected)
        got = set(received)
        if expected != received:
            LOG.error(
                '\nExpected header: %s\nRecieved header: %s',
                expected, received)

            # pass reordering and adding new columns (after protesting)
            # hard fail on missing expected columns
            # (temper with mandatory cols?)
            if exp - got != set():
                LOG.error('Missing: %s', exp - got)
                raise AssertionError('Incomming headers are missing expected column.')

            if got - exp != set():
                # was LOG.warrning (AttributeError on logging.Logger)
                LOG.warning('Addtional new columns: %s', got - exp)
            else:
                # was LOG.warrning (AttributeError on logging.Logger)
                LOG.warning('Check columns order')

        return (exp ^ got) & exp == set()
class CTD(Source): """ The Comparative Toxicogenomics Database (CTD) includes curated data describing cross-species chemical–gene/protein interactions and chemical– and gene–disease associations to illuminate molecular mechanisms underlying variable susceptibility and environmentally influenced diseases. Here, we fetch, parse, and convert data from CTD into triples, leveraging only the associations based on DIRECT evidence (not using the inferred associations). We currently process the following associations: * chemical-disease * gene-pathway * gene-disease CTD curates relationships between genes and chemicals/diseases with marker/mechanism and/or therapeutic. Unfortunately, we cannot disambiguate between marker (gene expression) and mechanism (causation) for these associations. Therefore, we are left to relate these simply by "marker". CTD also pulls in genes and pathway membership from KEGG and REACTOME. We create groups of these following the pattern that the specific pathway is a subclass of 'cellular process' (a go process), and the gene is "involved in" that process. For diseases, we preferentially use OMIM identifiers when they can be used uniquely over MESH. Otherwise, we use MESH ids. 
Note that we scrub the following identifiers and their associated data: * REACT:REACT_116125 - generic disease class * MESH:D004283 - dog diseases * MESH:D004195 - disease models, animal * MESH:D030342 - genetic diseases, inborn * MESH:D040181 - genetic dieases, x-linked * MESH:D020022 - genetic predisposition to a disease """ files = { 'chemical_disease_interactions': { 'file': 'CTD_chemicals_diseases.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz' }, 'gene_pathway': { 'file': 'CTD_genes_pathways.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_genes_pathways.tsv.gz' }, 'gene_disease': { 'file': 'CTD_genes_diseases.tsv.gz', 'url': 'http://ctdbase.org/reports/CTD_genes_diseases.tsv.gz' } } static_files = { 'publications': {'file': 'CTD_curated_references.tsv'} } def __init__(self, graph_type, are_bnodes_skolemized): super().__init__(graph_type, are_bnodes_skolemized, 'ctd') self.dataset = Dataset( 'ctd', 'CTD', 'http://ctdbase.org', None, 'http://ctdbase.org/about/legal.jsp') if 'test_ids' not in config.get_config() \ or 'gene' not in config.get_config()['test_ids']: logger.warning("not configured with gene test ids.") self.test_geneids = [] else: self.test_geneids = config.get_config()['test_ids']['gene'] if 'test_ids' not in config.get_config() \ or 'disease' not in config.get_config()['test_ids']: logger.warning("not configured with disease test ids.") self.test_diseaseids = [] else: self.test_diseaseids = config.get_config()['test_ids']['disease'] self.g = self.graph self.geno = Genotype(self.graph) self.pathway = Pathway(self.graph) return def fetch(self, is_dl_forced=False): """ Override Source.fetch() Fetches resources from CTD using the CTD.files dictionary Args: :param is_dl_forced (bool): Force download Returns: :return None """ self.get_files(is_dl_forced) self._fetch_disambiguating_assoc() # consider creating subsets of the files that # only have direct annotations (not inferred) return def parse(self, limit=None): """ Override 
Source.parse() Parses version and interaction information from CTD Args: :param limit (int, optional) limit the number of rows processed Returns: :return None """ if limit is not None: logger.info("Only parsing first %d rows", limit) logger.info("Parsing files...") # pub_map = dict() # file_path = '/'.join((self.rawdir, # self.static_files['publications']['file'])) # if os.path.exists(file_path) is True: # pub_map = self._parse_publication_file( # self.static_files['publications']['file'] # ) if self.testOnly: self.testMode = True if self.testMode: self.g = self.testgraph else: self.g = self.graph self.geno = Genotype(self.g) self.pathway = Pathway(self.g) self._parse_ctd_file( limit, self.files['chemical_disease_interactions']['file']) self._parse_ctd_file(limit, self.files['gene_pathway']['file']) self._parse_ctd_file(limit, self.files['gene_disease']['file']) self._parse_curated_chem_disease(limit) logger.info("Done parsing files.") return def _parse_ctd_file(self, limit, file): """ Parses files in CTD.files dictionary Args: :param limit (int): limit the number of rows processed :param file (str): file name (must be defined in CTD.file) Returns: :return None """ row_count = 0 version_pattern = re.compile(r'^# Report created: (.+)$') is_versioned = False file_path = '/'.join((self.rawdir, file)) with gzip.open(file_path, 'rt') as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: # Scan the header lines until we get the version # There is no official version sp we are using # the upload timestamp instead if is_versioned is False: match = re.match(version_pattern, ' '.join(row)) if match: version = re.sub(r'\s|:', '-', match.group(1)) # TODO convert this timestamp to a proper timestamp self.dataset.setVersion(version) is_versioned = True elif re.match(r'^#', ' '.join(row)): pass else: row_count += 1 if file == self.files[ 'chemical_disease_interactions']['file']: self._process_interactions(row) elif file == self.files['gene_pathway']['file']: 
self._process_pathway(row) elif file == self.files['gene_disease']['file']: self._process_disease2gene(row) if not self.testMode and \ limit is not None and row_count >= limit: break return def _process_pathway(self, row): """ Process row of CTD data from CTD_genes_pathways.tsv.gz and generate triples Args: :param row (list): row of CTD data Returns: :return None """ model = Model(self.g) self._check_list_len(row, 4) (gene_symbol, gene_id, pathway_name, pathway_id) = row if self.testMode and (int(gene_id) not in self.test_geneids): return entrez_id = 'NCBIGene:' + gene_id pathways_to_scrub = [ 'REACT:REACT_116125', # disease "REACT:REACT_111045", # developmental biology "REACT:REACT_200794", # Mus musculus biological processes "REACT:REACT_13685"] # neuronal system ? if pathway_id in pathways_to_scrub: # these are lame "pathways" like generic # "disease" and "developmental biology" return # convert KEGG pathway ids... KEGG:12345 --> KEGG-path:map12345 if re.match(r'KEGG', pathway_id): pathway_id = re.sub(r'KEGG:', 'KEGG-path:map', pathway_id) # just in case, add it as a class model.addClassToGraph(entrez_id, None) self.pathway.addPathway(pathway_id, pathway_name) self.pathway.addGeneToPathway(entrez_id, pathway_id) return def _fetch_disambiguating_assoc(self): """ For any of the items in the chemical-disease association file that have ambiguous association types we fetch the disambiguated associations using the batch query API, and store these in a file. Elsewhere, we can loop through the file and create the appropriate associations. 
        :return:

        """
        # NOTE(review): this is the tail of a fetch/disambiguation routine
        # whose signature and docstring begin above this chunk.
        disambig_file = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        assoc_file = '/'.join(
            (self.rawdir, self.files['chemical_disease_interactions']['file']))

        # check if there is a local association file,
        # and download if it's dated later than the original intxn file
        if os.path.exists(disambig_file):
            dfile_dt = os.stat(disambig_file)
            afile_dt = os.stat(assoc_file)
            # NOTE(review): this compares whole os.stat_result objects, which
            # compare tuple-wise starting with st_mode -- NOT by date.
            # Presumably this should compare .st_mtime values; confirm intent.
            if dfile_dt < afile_dt:
                logger.info(
                    "Local file date before chem-disease assoc file. "
                    " Downloading...")
            else:
                logger.info(
                    "Local file date after chem-disease assoc file. "
                    " Skipping download.")
                return

        all_pubs = set()
        dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')
        # first get all the unique publications
        with gzip.open(assoc_file, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # skip comment lines
                if re.match(r'^#', ' '.join(row)):
                    continue
                self._check_list_len(row, 10)
                (chem_name, chem_id, cas_rn, disease_name, disease_id,
                 direct_evidence, inferred_gene_symbol, inference_score,
                 omim_ids, pubmed_ids) = row

                # keep only rows with the dual-evidence flag; those are the
                # ambiguous ones we must disambiguate via the batch query
                if direct_evidence == '' or not \
                        re.match(dual_evidence, direct_evidence):
                    continue

                if pubmed_ids is not None and pubmed_ids != '':
                    all_pubs.update(set(re.split(r'\|', pubmed_ids)))
        sorted_pubs = sorted(list(all_pubs))

        # now in batches of 4000, we fetch the chemical-disease associations
        batch_size = 4000
        params = {
            'inputType': 'reference',
            'report': 'diseases_curated',
            'format': 'tsv',
            'action': 'Download'
        }

        url = 'http://ctdbase.org/tools/batchQuery.go?q'
        start = 0
        end = min((batch_size, len(all_pubs)))  # get them in batches of 4000
        with open(disambig_file, 'wb') as f:
            while start < len(sorted_pubs):
                params['inputTerms'] = '|'.join(sorted_pubs[start:end])
                # fetch the data from url
                logger.info(
                    'fetching %d (%d-%d) refs: %s',
                    len(re.split(r'\|', params['inputTerms'])),
                    start, end, params['inputTerms'])
                data = urllib.parse.urlencode(params)
                encoding = 'utf-8'
                binary_data = data.encode(encoding)
                req = urllib.request.Request(url, binary_data)
                resp = urllib.request.urlopen(req)
                f.write(resp.read())
                start = end
                end = min((start + batch_size, len(sorted_pubs)))

        return

    def _process_interactions(self, row):
        """
        Process row of CTD data from CTD_chemicals_diseases.tsv.gz
        and generate triples.
        Only create associations based on direct evidence
        (not using the inferred-via-gene), and unambiguous relationships.
        (Ambiguous ones will be processed in the sister method using the
        disambiguated file). There are no OMIM ids for diseases in these
        cases, so we associate with only the mesh disease ids.
        Args:
            :param row (list): row of CTD data
        Returns:
            :return None
        """
        model = Model(self.g)
        self._check_list_len(row, 10)
        (chem_name, chem_id, cas_rn, disease_name, disease_id,
         direct_evidence, inferred_gene_symbol, inference_score,
         omim_ids, pubmed_ids) = row

        if direct_evidence == '':
            return

        # NOTE(review): the alternation binds loosely here -- the pattern is
        # (^therapeutic) OR (marker/mechanism$). It behaves correctly for the
        # three known evidence values only because re.match anchors at the
        # start; r'^(therapeutic|marker/mechanism)$' is likely the intent.
        evidence_pattern = re.compile(r'^therapeutic|marker\/mechanism$')
        # dual_evidence = re.compile(r'^marker\/mechanism\|therapeutic$')

        # filter on those diseases that are mapped to omim ids in the test set
        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.testMode and len(intersect) < 1:
            return
        chem_id = 'MESH:' + chem_id
        reference_list = self._process_pubmed_ids(pubmed_ids)
        if re.match(evidence_pattern, direct_evidence):
            rel_id = self._get_relationship_id(direct_evidence)
            model.addClassToGraph(chem_id, chem_name)
            model.addClassToGraph(disease_id, None)
            self._make_association(chem_id, disease_id, rel_id, reference_list)
        else:
            # there's dual evidence, but haven't mapped the pubs
            pass
            # logger.debug(
            #   "Dual evidence for %s (%s) and %s (%s)",
            #   chem_name, chem_id, disease_name, disease_id)

        return

    def _process_disease2gene(self, row):
        """
        Here, we process the disease-to-gene associations.
        Note that we ONLY process direct associations
        (not inferred through chemicals).
        Furthermore, we also ONLY process "marker/mechanism" associations.

        We preferentially utilize OMIM identifiers over MESH identifiers
        for disease/phenotype.
        Therefore, if a single OMIM id is listed under the "omim_ids" list,
        we will choose this over any MeSH id that might be listed as
        the disease_id.
        If multiple OMIM ids are listed in the omim_ids column,
        we toss this for now.
        (Mostly, we are not sure what to do with this information.)

        We associate "some variant of gene X" with the phenotype,
        rather than the gene directly.

        We also pull in the MeSH labels here (but not OMIM) to ensure that
        we have them (as they may not be brought in separately).
        :param row:
        :return:

        """

        # if self.testMode:
        #     g = self.testgraph
        # else:
        #     g = self.graph
        # self._check_list_len(row, 9)
        # geno = Genotype(g)
        # gu = GraphUtils(curie_map.get())
        model = Model(self.g)
        (gene_symbol, gene_id, disease_name, disease_id, direct_evidence,
         inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row

        # we only want the direct associations; skipping inferred for now
        if direct_evidence == '' or direct_evidence != 'marker/mechanism':
            return

        # scrub some of the associations...
        # it seems odd to link human genes to the following "diseases"
        diseases_to_scrub = [
            'MESH:D004283',  # dog diseases
            'MESH:D004195',  # disease models, animal
            'MESH:D030342',  # genetic diseases, inborn
            'MESH:D040181',  # genetic dieases, x-linked
            'MESH:D020022']  # genetic predisposition to a disease

        if disease_id in diseases_to_scrub:
            logger.info(
                "Skipping association between NCBIGene:%s and %s",
                str(gene_id), disease_id)
            return

        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.testMode and (
                int(gene_id) not in self.test_geneids or
                len(intersect) < 1):
            return

        # there are three kinds of direct evidence:
        # (marker/mechanism | marker/mechanism|therapeutic | therapeutic)
        # we are only using the "marker/mechanism" for now
        # TODO what does it mean for a gene to be therapeutic for disease?
        # a therapeutic target?

        gene_id = 'NCBIGene:' + gene_id
        preferred_disease_id = disease_id
        if omim_ids is not None and omim_ids != '':
            omim_id_list = re.split(r'\|', omim_ids)
            # If there is only one OMIM ID for the Disease ID
            # or in the omim_ids list,
            # use the OMIM ID preferentially over any MeSH ID.
            if re.match(r'OMIM:.*', disease_id):
                if len(omim_id_list) > 1:
                    # the disease ID is an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    # Currently no entries satisfy this condition
                    pass
                elif disease_id != ('OMIM:' + omim_ids):
                    # the disease ID is an OMIM ID and
                    # there is only one non-equiv OMIM entry in omim_ids
                    # we preferentially use the disease_id here
                    logger.warning(
                        "There may be alternate identifier for %s: %s",
                        disease_id, omim_ids)
                    # TODO: What should be done with the alternate
                    # disease IDs?
            else:
                if len(omim_id_list) == 1:
                    # the disease ID is not an OMIM ID
                    # and there is only one OMIM entry in omim_ids.
                    preferred_disease_id = 'OMIM:' + omim_ids
                elif len(omim_id_list) > 1:
                    # This is when the disease ID is not an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    pass

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # So we make an anonymous alternate locus,
        # and put that in the association.
        alt_id = gene_id + '-' + preferred_disease_id + 'VL'
        # can't have colons in the bnodes
        alt_locus = re.sub(r':', '', alt_id)
        alt_locus = "_:" + alt_locus
        alt_label = 'some variant of ' + gene_symbol + ' that is ' \
            + direct_evidence + ' for ' + disease_name
        model.addIndividualToGraph(
            alt_locus, alt_label, self.geno.genoparts['variant_locus'])
        # assume that the label gets added elsewhere
        model.addClassToGraph(gene_id, None)
        self.geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

        # not sure if MESH is getting added separately.
        # adding labels here for good measure
        dlabel = None
        if re.match(r'MESH', preferred_disease_id):
            dlabel = disease_name
        model.addClassToGraph(preferred_disease_id, dlabel)

        # Add the disease to gene relationship.
        rel_id = self._get_relationship_id(direct_evidence)
        refs = self._process_pubmed_ids(pubmed_ids)

        self._make_association(alt_locus, preferred_disease_id, rel_id, refs)

        return

    def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
        """
        Make a reified association given an array of pubmed identifiers.

        Args:
            :param subject_id  id of the subject of the association
                               (gene/chem)
            :param object_id  id of the object of the association (disease)
            :param rel_id  relationship id
            :param pubmed_ids an array of pubmed identifiers
        Returns:
            :return None

        """

        # TODO pass in the relevant Assoc class rather than relying on G2P
        assoc = G2PAssoc(self.g, self.name, subject_id, object_id, rel_id)
        if pubmed_ids is not None and len(pubmed_ids) > 0:
            eco = self._get_evidence_code('TAS')
            for pmid in pubmed_ids:
                r = Reference(
                    self.g, pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph()
                assoc.add_source(pmid)
                assoc.add_evidence(eco)

        assoc.add_association_to_graph()
        return

    @staticmethod
    def _process_pubmed_ids(pubmed_ids):
        """
        Take a list of pubmed IDs and add PMID prefix
        Args:
            :param pubmed_ids -  string representing publication
                                 ids seperated by a | symbol
        Returns:
            :return list: Pubmed curies
        """
        if pubmed_ids.strip() == '':
            id_list = []
        else:
            id_list = pubmed_ids.split('|')
        for (i, val) in enumerate(id_list):
            id_list[i] = 'PMID:' + val
        return id_list

    @staticmethod
    def _get_evidence_code(evidence):
        """
        Get curie for evidence class label
        Args:
            :param evidence (str): evidence label
        Label:
            :return str: curie for evidence label from ECO
        """
        eco_map = {
            'TAS': 'ECO:0000033'
        }
        return eco_map[evidence]

    @staticmethod
    def _get_relationship_id(rel):
        """
        Get curie from relationship property label
        Args:
            :param rel (str): relationship label
        Returns:
            :return str: curie for relationship label
        """
        rel_map = {
            'therapeutic': Model.object_properties['substance_that_treats'],
            'marker/mechanism': Model.object_properties['is_marker_for'],
        }
        return str(rel_map[rel])

    @staticmethod
    def _get_class_id(clslab):
        """
        Get curie from CLASS_MAP dictionary
        Args:
            :param cls (str): class label
        Returns:
            :return str: curie for class label
        """
        class_map = {
            'pathway': 'PW:0000001',
            'signal transduction': 'GO:0007165'
        }
        return class_map[clslab]

    def _parse_curated_chem_disease(self, limit):
        """
        Parse the curated chemical-disease publication file (the
        disambiguation file written during fetch) and emit one association
        per row, with its PMID as the source where present.

        :param limit: maximum number of rows to process (None = all)
        :return: None
        """
        model = Model(self.g)
        line_counter = 0
        file_path = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if re.match(r'^#', ' '.join(row)):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                if disease_id.strip() == '' or chem_id.strip() == '':
                    continue

                rel_id = self._get_relationship_id(evidence)
                chem_id = 'MESH:' + chem_id
                model.addClassToGraph(chem_id, chem_label)
                model.addClassToGraph(disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:' + pub_id
                    # NOTE(review): this Reference call differs from
                    # _make_association above, which uses
                    # Reference(self.g, pmid, type) + r.addRefToGraph().
                    # One of the two is probably using a stale API signature;
                    # verify against dipper.models.Reference.
                    r = Reference(
                        pub_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(self.g)
                    pubids = [pub_id]
                else:
                    pubids = None
                self._make_association(chem_id, disease_id, rel_id, pubids)

                if not self.testMode and limit is not None \
                        and line_counter >= limit:
                    break
        return

    def getTestSuite(self):
        """Return the unittest suite covering this source."""
        import unittest
        from tests.test_ctd import CTDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(CTDTestCase)
        # test_suite.addTests(
        #    unittest.TestLoader().loadTestsFromTestCase(InteractionsTestCase))

        return test_suite
def __init__( self, graph_type='rdf_graph', # or streamed_graph are_bnodes_skized=False, # typically True name=None, # identifier; make an IRI for nquads ingest_title=None, ingest_url=None, license_url=None, # only if it is _our_ lic data_rights=None, # external page that points to their current lic file_handle=None ): # pull in the common test identifiers self.all_test_ids = self.open_and_parse_yaml('../../resources/test_ids.yaml') self.graph_type = graph_type self.are_bnodes_skized = are_bnodes_skized self.ingest_url = ingest_url self.ingest_title = ingest_title self.localtt = self.load_local_translationtable(name) if name is not None: self.name = name.lower() elif self.whoami() is not None: self.name = self.whoami().lower() LOG.info("Processing Source \"%s\"", self.name) self.test_only = False self.path = "" # to be used to store a subset of data for testing downstream. self.triple_count = 0 self.outdir = 'out' self.testdir = 'tests' self.rawdir = 'raw' self.rawdir = '/'.join((self.rawdir, self.name)) self.testname = name + "_test" self.testfile = '/'.join((self.outdir, self.testname + ".ttl")) self.datasetfile = None # still need to pull in file suffix -- this ia a curie not a url self.archive_url = 'MonarchArchive:' + 'ttl/' + self.name + '.ttl' # if raw data dir doesn't exist, create it if not os.path.exists(self.rawdir): os.makedirs(self.rawdir) pth = os.path.abspath(self.rawdir) LOG.info("creating raw directory for %s at %s", self.name, pth) # if output dir doesn't exist, create it if not os.path.exists(self.outdir): os.makedirs(self.outdir) pth = os.path.abspath(self.outdir) LOG.info("created output directory %s", pth) LOG.info("Creating Test graph %s", self.testname) # note: tools such as protoge need slolemized blank nodes self.testgraph = RDFGraph(True, self.testname) if graph_type == 'rdf_graph': graph_id = ':MONARCH_' + str(self.name) + "_" + \ datetime.now().isoformat(' ').split()[0] LOG.info("Creating graph %s", graph_id) self.graph = 
RDFGraph(are_bnodes_skized, graph_id) elif graph_type == 'streamed_graph': # need to expand on export formats dest_file = open(pth + '/' + name + '.nt', 'w') # where is the close? self.graph = StreamedGraph(are_bnodes_skized, dest_file) # leave test files as turtle (better human readibility) else: LOG.error( "%s graph type not supported\n" "valid types: rdf_graph, streamed_graph", graph_type) # pull in global ontology mapping datastructures self.globaltt = self.graph.globaltt self.globaltcid = self.graph.globaltcid self.curie_map = self.graph.curie_map # self.prefix_base = {v: k for k, v in self.curie_map.items()} # will be set to True if the intention is # to only process and write the test data self.test_only = False self.test_mode = False # this may eventually support Bagits self.dataset = Dataset( self.archive_url, self.ingest_title, self.ingest_url, None, # description license_url, # only _OUR_ lic data_rights, # tries to point to others lics graph_type, file_handle ) for graph in [self.graph, self.testgraph]: self.declareAsOntology(graph)
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical
    descriptions in order to populate the definitions of the
    terms in the ontology.
    We define the GeneReviews items as classes that are either grouping
    classes over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
    and copyright (University of Washington, Seattle)
    are included with each copy;
    (ii) a link to the original material is provided whenever the material is
    published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
    copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBIBookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.

    """

    # raw files fetched from GRDL (id-omim map and id-title map)
    files = {
        'idmap': {'file': 'NBKid_shortname_OMIM.txt',
                  'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {'file': 'GRtitle_shortname_NBKid.txt',
                   'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
    }

    def __init__(self):
        """Set up the dataset metadata and test-id configuration."""
        Source.__init__(self, 'genereviews')

        self.load_bindings()

        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, 'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.gu = GraphUtils(curie_map.get())

        # book_ids: NBK numbers seen in the titles file
        # all_books: NBK number -> {'file': ..., 'url': ...}
        self.book_ids = set()
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select ony those test ids that are omim's.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        self.load_bindings()

        # no test subset for now; test == full graph
        self.testgraph = self.graph
        logger.info("Found %d nodes", len(self.graph))

        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes(not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:'+nbk_num
                omim_id = 'OMIM:'+omim_num
                # in test mode, only keep rows whose omim id is a test id
                if not (
                        (self.testMode and
                         len(self.test_ids) > 0 and
                         omim_id in self.test_ids) or not self.testMode):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids)-len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:'+omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        gu.addClassToGraph(self.graph, omim_id, None)
                        gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

        return

    def _get_titles(self, limit):
        """
        Process the shortname-title-NBKid mapping file
        (GRtitle_shortname_NBKid.txt), whose rows are
        (shortname, title, nbk_num).
        For each book we record the NBK number in self.book_ids,
        and add the GeneReviews id to the graph with the title as its label
        and the shortname as a synonym.

        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (shortname, title, nbk_num) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    gu.addClassToGraph(self.graph, gr_id, title)
                    gu.addSynonym(self.graph, gr_id, shortname)

        return

    def create_books(self):
        """Populate self.all_books with a file path and url per NBK id."""

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/',
                     'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk+'.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/'+nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        :param limit:
        :return:
        """
        c = 0
        books_not_found = set()
        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:'+nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            book_dir = '/'.join((self.rawdir, 'books'))
            book_files = os.listdir(book_dir)
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping",
                #                nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # NOTE(review): 'page' is never closed (resource leak), and
            # BeautifulSoup is called without an explicit parser; consider a
            # with-block and BeautifulSoup(..., 'html.parser').
            page = open(url)
            soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            if clin_summary is not None:
                p = clin_summary.find('p')
                ptext = p.text
                ptext = re.sub(r'\s+', ' ', ptext)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = list()
                    for li in ul.find_all('li'):
                        item_text.append(re.sub(r'\s+', ' ', li.text))
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pmid_set = set()
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for r in ref_list:
                    # NOTE(review): the loop variable 'r' is clobbered by the
                    # Reference assignment below; harmless today but fragile.
                    for a in r.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            pmnum = \
                                re.search(
                                    r'\/pubmed\/(\d+)$', a['href']).group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            pmid_set.add(pmnum)
                            r = Reference(
                                pmid,
                                Reference.ref_types['journal_article'])
                            r.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        l = len(books_not_found)
        if len(books_not_found) > 0:
            if l > 100:
                logger.warning("There were %d books not found.", l)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    l, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions", c-l)

        return

    def getTestSuite(self):
        """Return the unittest suite covering this source."""
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group
    curates and assembles over 115,000 annotations to hereditary diseases
    using the HPO ontology. Here we create OBAN-style associations
    between diseases and phenotypic features, together with their evidence,
    and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.
    Association to "remarkable normality" will be added in the near future.

    We create additional associations from text mining.
    See info at http://pubmed-browser.human-phenotype-ontology.org/.

    Also, you can read about these annotations in
    [PMID:26119816](http://www.ncbi.nlm.nih.gov/pubmed/26119816).

    In order to properly test this class,
    you should have a conf.json file configured with some test ids, in
    the structure of:
    # as examples.  put your favorite ids in the config.
    <pre>
    test_ids: {"disease" : ["OMIM:119600", "OMIM:120160"]}
    </pre>

    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': HPOADL + '/phenotype_annotation.tab'},
        'version': {
            'file': 'data_version.txt',
            'url': HPOADL + '/data_version.txt'},
        # 'neg_annot': {
        #   'file': 'phenotype_annotation.tab',
        #    'url': HPOADL + '/negative_phenotype_annotation.tab'},
        'doid': {
            'file': 'doid.owl',
            'url': 'http://purl.obolibrary.org/obo/doid.owl'
        }
    }

    # note, two of these codes are awaiting term requests.  see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    # TODO TEC see if the GC issue translated into a GH issue
    eco_dict = {
        # FIXME currently using "curator inference used in manual assertion"
        "ICE": "ECO:0000305",
        # Inferred from Electronic Annotation
        "IEA": "ECO:0000501",
        # FIXME currently is"experimental evidence used in manual assertion"
        "PCS": "ECO:0000269",
        # Traceable Author Statement
        "TAS": "ECO:0000304",
        # FIXME currently using computational combinatorial evidence
        # in automatic assertion
        "ITM": "ECO:0000246",
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        """Register dataset metadata and pull disease test ids from config."""
        super().__init__(graph_type, are_bnodes_skolemized, 'hpoa')

        self.dataset = Dataset(
            'hpoa', 'Human Phenotype Ontology',
            'http://www.human-phenotype-ontology.org', None,
            'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')

        self.replaced_id_count = 0

        if 'test_ids' not in config.get_config()\
                or 'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings to be removed when issues are cleared
        logger.warning(
            "note that some ECO classes are missing for ICE, PCS, and ITM;" +
            " using temporary mappings.")

        return

    def fetch(self, is_dl_forced=False):
        """Download the annotation files, scrub them, and set the version."""
        self.get_files(is_dl_forced)

        self.scrub()

        # get the latest build from jenkins
        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))

        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(
                v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")
        # NOTE(review): redundant close -- the with-block above already
        # closed the file.
        f.close()
        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)

        self.get_common_files()

        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs

        :return: None

        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace(r'PubMed:', 'PMID:', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace(r'pmid:', 'PMID:', f)

        logger.info('scrubbing PMID: 12345 --> PMID:12345')
        pysed.replace(r'PMID: *', 'PMID:', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace(r'PMID([0-9][0-9]*)', r'PMID:\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace(r'MIM([0-9][0-9]*)', r'OMIM:\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        pysed.replace(r";MIM", ";OMIM", f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace("ORPHANET", "Orphanet", f)

        logger.info('scrubbing ORPHA --> Orphanet')
        pysed.replace("ORPHA", "Orphanet", f)
        return

    def parse(self, limit=None):
        """Parse the rare and common disease-phenotype annotation files."""
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        self.add_common_files_to_file_list()

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        # rare disease-phenotype associations
        self._process_phenotype_tab('/'.join((self.rawdir,
                                              self.files['annot']['file'])),
                                    limit)

        # TODO add negative phenotype statements #113
        # self._process_negative_phenotype_tab(self.rawfile,self.outfile,limit)

        # common disease-phenotype associations from text mining work
        self.process_all_common_disease_files(limit)

        logger.info("Finished parsing.")

        return

    def _map_evidence_to_codes(self, code_string):
        """
        A simple mapping of the code_string to it's ECO class
        using the dictionary defined here
        Currently includes ICE, IEA, PCS, TAS
        :param code_string:
        :return:

        """
        # returns None for any code not in eco_dict
        return self.eco_dict.get(code_string)

    def _process_phenotype_tab(self, raw, limit):
        """
        see info on format here:
        http://www.human-phenotype-ontology.org/contao/index.php/annotation-guide.html

        :param raw:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        line_counter = 0
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                row = [str(col).strip() for col in row]
                # Note from Seb in Dec 2017, a 15th column was added
                # inadverterntly and will be removed in the winter 2018
                # release of hpo data
                (db, num, name, qual, pheno_id, publist, eco, onset, freq,
                 w, asp, syn, date, curator, extra) = row
                disease_id = db + ":" + num

                if self.testMode:
                    try:
                        id_list = self.test_ids
                        if id_list is None \
                                or disease_id not in id_list:
                            continue
                    except AttributeError:
                        continue

                # logger.info('adding %s', disease_id)

                model.addClassToGraph(disease_id, None)
                model.addClassToGraph(pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                model.addClassToGraph(eco_id, None)
                if onset is not None and onset != '':
                    model.addClassToGraph(onset, None)

                # we want to do things differently depending on
                # the aspect of the annotation
                # TODO PYLINT Redefinition of assoc type from
                # dipper.models.assoc.D2PAssoc.D2PAssoc to
                # dipper.models.assoc.DispositionAssoc.DispositionAssoc
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(
                        g, self.name, disease_id, pheno_id, onset, freq)
                elif asp == 'I':  # inheritance patterns for whole disease
                    assoc = DispositionAssoc(
                        g, self.name, disease_id, pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(
                        g, self.name, disease_id, pheno_id)
                else:
                    # NOTE(review): when the aspect is unrecognized, 'assoc'
                    # keeps its value from the previous iteration (or is
                    # unbound on the first row), so the add_evidence call
                    # below operates on the wrong/missing association.
                    logger.error("I don't know what this aspect is: %s", asp)

                assoc.add_evidence(eco_id)

                publist = re.split(r'[,;]', publist)
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    pubtype = None
                    if pub != '':
                        # if re.match(
                        #   r'http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene',
                        #   pub):
                        #   #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search(r'part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search(
                        #   r'http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=',
                        #   pub):
                        #     m = re.search(r'Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)
                        if re.match(r'(PMID|ISBN-13|ISBN-10|ISBN|HPO)', pub):
                            if re.match(r'PMID', pub):
                                pubtype = \
                                    Reference.ref_types['journal_article']
                            elif re.match(r'HPO', pub):
                                pubtype = Reference.ref_types['person']
                            else:
                                pubtype = Reference.ref_types['publication']
                            r = Reference(g, pub, pubtype)
                            r.addRefToGraph()
                        elif re.match(r'(OMIM|Orphanet|DECIPHER)', pub):
                            # make the pubs a reference to the website,
                            # instead of the curie
                            if re.match(r'OMIM', pub):
                                omimnum = re.sub(r'OMIM:', '', pub)
                                omimurl = '/'.join(('http://omim.org/entry',
                                                    str(omimnum).strip()))
                                pub = omimurl
                            elif re.match(r'Orphanet:', pub):
                                orphanetnum = re.sub(r'Orphanet:', '', pub)
                                orphaneturl = \
                                    ''.join((
                                        'http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert=',
                                        str(orphanetnum)))
                                pub = orphaneturl
                            elif re.match(r'DECIPHER:', pub):
                                deciphernum = re.sub(r'DECIPHER:', '', pub)
                                decipherurl = '/'.join(
                                    ('https://decipher.sanger.ac.uk/syndrome',
                                     deciphernum))
                                pub = decipherurl
                            pubtype = Reference.ref_types['webpage']
                        elif re.match(r'http', pub):
                            pass
                        else:
                            logger.error(
                                'Unknown pub type for %s: %s',
                                disease_id, pub)
                            print(disease_id, 'pubs:', str(publist))
                            continue

                        if pub is not None:
                            assoc.add_source(pub)

                        # TODO add curator

                assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def get_common_files(self):
        """
        Fetch the raw hpo-annotation-data by cloning/pulling the
        [repository](https://github.com/monarch-initiative/hpo-annotation-data.git)
        These files get added to the files object,
        and iterated over separately.
        :return:

        """

        repo_dir = '/'.join((self.rawdir, 'git'))
        REMOTE_URL = \
            "git@github.com:monarch-initiative/hpo-annotation-data.git"
        HTTPS_URL = \
            "https://github.com/monarch-initiative/hpo-annotation-data.git"

        # TODO if repo doesn't exist, then clone otherwise pull
        if os.path.isdir(repo_dir):
            shutil.rmtree(repo_dir)

        logger.info("Cloning common disease files from %s", REMOTE_URL)
        try:
            Repo.clone_from(REMOTE_URL, repo_dir)
        except GitCommandError:
            # Try with https and if this doesn't work fail
            Repo.clone_from(HTTPS_URL, repo_dir)

        return

    def add_common_files_to_file_list(self):
        """Register each cloned common-disease .tab file in self.files."""
        repo_dir = '/'.join((self.rawdir, 'git'))
        common_disease_dir = '/'.join((repo_dir, 'common-diseases'))

        # add the files to the self.files object
        filelist = os.listdir(common_disease_dir)
        fcount = 0
        for f in filelist:
            if not re.search(r'\.tab', f):
                continue
            fcount += 1
            self.files['common'+str(fcount).zfill(7)] = {
                'file': '/'.join((common_disease_dir, f)),
                # TODO add url to reference the file?
                # need to get git version somehow?
            }
        # TODO add this to the dataset
        logger.info("Found %d common disease files", fcount)

        return

    def process_all_common_disease_files(self, limit=None):
        """
        Loop through all of the files that we previously fetched from git,
        creating the disease-phenotype assoc.
        :param limit:
        :return:

        """

        self.replaced_id_count = 0
        unpadded_doids = self.get_doid_ids_for_unpadding()
        total_processed = 0
        logger.info("Iterating over all common disease files")
        common_file_count = 0
        for f in self.files:
            if not re.match(r'common', f):
                continue
            common_file_count += 1
            raw = self.files[f]['file']
            total_processed += self.process_common_disease_file(
                raw, unpadded_doids, limit)
            if not self.testMode \
                    and limit is not None and total_processed > limit:
                break
        logger.info("Finished iterating over all common disease files.")
        logger.info("Fixed %d/%d incorrectly zero-padded ids",
                    self.replaced_id_count, common_file_count)
        return

    def get_doid_ids_for_unpadding(self):
        """
        Here, we fetch the doid owl file, and get all the doids.
        We figure out which are not zero-padded,
        so we can map the DOID to the correct identifier
        when processing the common annotation files.

        This may become obsolete when
        https://github.com/monarch-initiative/hpo-annotation-data/issues/84
        is addressed.

        :return:

        """
        logger.info("Building list of non-zero-padded DOIDs")
        raw_file = '/'.join((self.rawdir, self.files['doid']['file']))
        doids = set()
        # scan the file and get all doids
        with open(raw_file, 'r', encoding="utf8") as f:
            for line in f:
                matches = re.search(r'(DOID_\d+)', line)
                if matches is not None:
                    for m in matches.groups():
                        doids.add(re.sub(r'_', ':', m))

        nopad_doids = set()
        for d in doids:
            num = re.sub(r'DOID[:_]', '', d)
            # look for things not starting with zero
            if not re.match(r'^0', str(num)):
                nopad_doids.add(num)

        logger.info("Found %d/%d DOIDs are not zero-padded",
                    len(nopad_doids), len(doids))

        return nopad_doids

    def process_common_disease_file(self, raw, unpadded_doids, limit=None):
        """
        Make disaese-phenotype associations.
        Some identifiers need clean up:
        * DOIDs are listed as DOID-DOID: --> DOID:
        * DOIDs may be unnecessarily zero-padded.
        these are remapped to their non-padded equivalent.

        :param raw:
        :param unpadded_doids:
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        assoc_count = 0
        replace_id_flag = False

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # the readline() on the underlying file skips the header before
            # the csv reader starts pulling rows (reader reads lazily)
            header = csvfile.readline()  # skip the header row
            logger.info("HEADER: %s", header)
            disease_id = None
            for row in filereader:
                if 21 == len(row):
                    (did, dname, gid, gene_name, genotype, gene_symbols,
                     phenotype_id, phenotype_name, age_of_onset_id,
                     age_of_onset_name, eid, evidence_name, frequency,
                     sex_id, sex_name, negation_id, negation_name,
                     description, pub_ids, assigned_by,
                     date_created) = [str(col).strip() for col in row]
                else:
                    logger.warning(
                        "Wrong number of columns! expected 21, got: %s in: %s",
                        len(row), raw)
                    logger.warning("%s", row)
                    continue
                # b/c "PMID: 17223397"
                pub_ids = re.sub(r' *', '', pub_ids)

                disease_id = re.sub(r'DO(ID)?[-\:](DOID:)?', 'DOID:', did)
                disease_id = re.sub(r'MESH-', 'MESH:', disease_id)
                if not re.search(r'(DOID\:|MESH\:\w)\d+', disease_id):
                    logger.warning("Invalid id format: %s", disease_id)

                # figure out if the doid should be unpadded,
                # then use the unpadded version instead
                if re.match(r'DOID', disease_id):
                    unpadded_num = re.sub(r'DOID:', '', disease_id)
                    unpadded_num = unpadded_num.lstrip('0')
                    if unpadded_num in unpadded_doids:
                        fixed_id = 'DOID:' + unpadded_num
                        replace_id_flag = True
                        disease_id = fixed_id.strip()

                if self.testMode and disease_id not in self.test_ids:
                    # since these are broken up into disease-by-disease,
                    # just skip the whole file
                    return 0
                else:
                    line_counter += 1

                if negation_id != '':
                    continue  # TODO add negative associations

                if disease_id != '' and phenotype_id != '':
                    assoc = D2PAssoc(
                        g, self.name, disease_id, phenotype_id.strip())
                    if age_of_onset_id != '':
                        assoc.onset = age_of_onset_id
                    if frequency != '':
                        assoc.frequency = frequency
                    eco_id = self._map_evidence_to_codes(eid)
                    if eco_id is None:
                        eco_id = self._map_evidence_to_codes('ITM')
assoc.add_evidence(eco_id) # TODO add sex? - not in dataset yet if description != '': assoc.set_description(description) if pub_ids != '': for p in pub_ids.split(';'): p = re.sub(r' *', '', p) if re.search(r'(DOID|MESH)', p) \ or re.search(r'Disease name contained', description): # skip "pubs" that are derived from # the classes themselves continue assoc.add_source(p.strip()) # TODO assigned by? assoc.add_association_to_graph() assoc_count += 1 if not self.testMode and limit is not None\ and line_counter > limit: break if replace_id_flag: logger.info("replaced DOID with unpadded version") self.replaced_id_count += 1 logger.info( "Added %d associations for %s.", assoc_count, disease_id) return assoc_count def getTestSuite(self): import unittest from tests.test_hpoa import HPOATestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(HPOATestCase) return test_suite
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. Includes baseline phenotype
    data sets as well as studies of drug, diet, disease and aging effect.
    Also includes protocols, projects and publications, and SNP,
    variation and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provide measurements for particular assays for several strains.
    Each of these measurements is itself mapped to a MP or VT term
    as a phenotype.  Therefore, we can create a strain-to-phenotype
    association based on those strains that lie outside of the "normal"
    range for the given measurements.  We can compute the average of the
    measurements for all strains tested, and then threshold any extreme
    measurements being beyond some threshold beyond the average.

    Our default threshold here, is +/-2 standard deviations beyond the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.

    """
    MPDDL = 'http://phenomedoc.jax.org/MPD_downloads'
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': MPDDL + '/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': MPDDL + '/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': MPDDL + '/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': MPDDL + '/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': {  # TEC does not seem to be used
        #     'file': 'mpd_datasets_metadata.xml.gz',
        #     'url': MPDDL + '/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849", "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236",
    #   "MPD:1237"]
    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenotype Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self, graph_type, are_bnodes_skolemized):
        Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')
        # @N, not sure if this step is required
        # strains beyond +/- this many stdevs from the mean get an assoc
        self.stdevthreshold = 2

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        # assay_num -> {'ont_terms', 'description', 'assay_label', ...}
        self.assayhash = {}
        # strain_id -> strain label
        self.idlabel_hash = {}
        # to store the mean/zscore of each measure by strain+sex
        self.score_means_by_measure = {}
        # to store the mean value for each measure by strain+sex
        self.strain_scores_by_measure = {}
        return

    def fetch(self, is_dl_forced=False):
        """Download the MPD files listed in self.files."""
        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files and one xml file,
        which we process iteratively and write out as one large graph.

        :param limit:
        :return:
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")
        return

    def _process_ontology_mappings_file(self, limit):
        """
        Populate self.assayhash[assay_id]['ont_terms'] with the MP/VT
        terms that each assay is mapped to.

        :param limit: unused here
        :return:
        """
        # line_counter = 0  # TODO unused
        logger.info("Processing ontology mappings...")
        raw = '/'.join((self.rawdir, 'ontology_mappings.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(self.files['ontology_mappings']['file'],
                              f.readline())
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                except ValueError:
                    continue
                assay_id = int(assay_id)
                # only MP and VT terms are phenotype-relevant here
                if re.match(r'(MP|VT)', ont_term):
                    # add the mapping denovo
                    if assay_id not in self.assayhash:
                        self.assayhash[assay_id] = {}
                        self.assayhash[assay_id]['ont_terms'] = set()
                    self.assayhash[assay_id]['ont_terms'].add(ont_term)
        return

    def _process_straininfo(self, limit):
        """
        Create an individual for each MPD strain (typed as Mus musculus),
        record its label in self.idlabel_hash, and cross-reference it to
        vendor stock numbers where available.

        :param limit: unused here
        :return:
        """
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        # FIX: was the copy-pasted "Processing measurements ..." message
        logger.info("Processing strain info ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))
        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            self.check_header(self.files['straininfo']['file'],
                              f.readline())
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue

                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, reiken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return

    def _process_measurements_file(self, limit):
        """
        Fill self.assayhash with per-assay metadata (label, units, type,
        and a human-readable description).

        :param limit: unused here
        :return:
        """
        line_counter = 0
        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, 'measurements.csv'))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            self.check_header(
                self.files['assay_metadata']['file'], f.readline())
            for row in reader:
                line_counter += 1
                assay_id = int(row[0])
                assay_label = row[4]
                assay_units = row[5]
                # FIX: was `row[6] is not ''` -- identity comparison with a
                # literal is interning-dependent and a SyntaxWarning in 3.8+
                assay_type = row[6] if row[6] != '' else None

                if assay_id not in self.assayhash:
                    self.assayhash[assay_id] = {}
                description = self.build_measurement_description(row)
                self.assayhash[assay_id]['description'] = description
                self.assayhash[assay_id]['assay_label'] = assay_label
                self.assayhash[assay_id]['assay_type'] = assay_type
                self.assayhash[assay_id]['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation,
        but easy access.
        We will loop through this later to then apply cutoffs
        and add associations.

        :param limit: unused here
        :return:
        """
        logger.info("Processing strain means ...")
        line_counter = 0
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        with gzip.open(raw, 'rb') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f)
            self.check_header(self.files['strainmeans']['file'],
                              f.readline())
            score_means_by_measure = {}
            strain_scores_by_measure = {}
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice,
                     sd, sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                line_counter += 1
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum
                # note: some assays only tested one sex!
                # we split this here by sex
                if assay_num not in score_means_by_measure:
                    score_means_by_measure[assay_num] = {}
                if sex not in score_means_by_measure[assay_num]:
                    score_means_by_measure[assay_num][sex] = list()
                score_means_by_measure[assay_num][sex].append(float(mean))

                if strain_num not in strain_scores_by_measure:
                    strain_scores_by_measure[strain_num] = {}
                if sex not in strain_scores_by_measure[strain_num]:
                    strain_scores_by_measure[strain_num][sex] = {}
                strain_scores_by_measure[strain_num][sex][assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans
        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return

    def _fill_provenance_graph(self, limit):
        """
        Walk the per-strain scores collected above and, for any
        sex-qualified score at least self.stdevthreshold stdevs from the
        mean, emit the assay plus a genotype-to-phenotype association.

        :param limit: unused here
        :return:
        """
        logger.info("Building graph ...")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        model.addClassToGraph(taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue

            strain_id = 'MPD-strain:'+str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:'+str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' in measures[m]:
                        zscore = measures[m]['zscore']
                        if abs(zscore) >= self.stdevthreshold:
                            scores_passing_threshold_count += 1
                            # logger.info(
                            #     "Score passing threshold: %s | %s | %s",
                            #     strain_id, assay_id, zscore)

                            # add the G2P assoc
                            prov = Provenance(self.graph)
                            try:
                                assay_label = \
                                    self.assayhash[m]['assay_label']
                                assay_description = \
                                    self.assayhash[m]['description']
                                ont_term_ids = \
                                    self.assayhash[m].get('ont_terms')
                                comment = ' '.join(
                                    (assay_label,
                                     '(zscore='+str(zscore)+')'))
                            except KeyError:
                                # assay metadata missing for this measure
                                assay_label = None
                                assay_description = None
                                ont_term_ids = None
                            if assay_label is not None:
                                assay_label += ' ('+str(m)+')'
                            # TODO unused
                            # assay_type = self.assayhash[m]['assay_type']
                            assay_type_id = \
                                Provenance.provenance_types['assay']
                            if ont_term_ids is not None:
                                scores_passing_threshold_with_ontologies_count += 1
                                prov.add_assay_to_graph(
                                    assay_id, assay_label, assay_type_id,
                                    assay_description)
                                self._add_g2p_assoc(
                                    g, strain_id, sex, assay_id,
                                    ont_term_ids, comment)
                        else:
                            scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to association with the strain
        :param comment:
        :return:
        """
        geno = Genotype(g)
        model = Model(g)
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)
        # strain genotype
        genotype_id = '_'+'-'.join((re.sub(r':', '', strain_id), 'genotype'))
        sex_specific_genotype_id = \
            '_'+'-'.join((re.sub(r':', '', strain_id), sex, 'genotype'))
        # FIX: the labels are now both guarded on strain_label; previously
        # genotype_label was built unconditionally, so a missing label
        # ('[' + None + ']') raised a TypeError
        if strain_label is not None:
            genotype_label = '[' + strain_label + ']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            genotype_label = '[' + strain_id + ']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        g.addTriple(
            strain_id, Genotype.object_properties['has_genotype'],
            genotype_id)

        geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        g.addTriple(
            sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # ############# BUILD THE G2P ASSOC #############
        # TODO add more provenance info when that model is completed
        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    g, self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)

        return

    def getTestSuite(self):
        """Return the unittest suite for this source."""
        import unittest
        from tests.test_mpd import MPDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)

        return test_suite

    @staticmethod
    def normalise_units(units):
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        """
        Assemble a one-paragraph English description of a measurement row
        from measurements.csv.

        :param row: the 13 columns of a measurements.csv record
        :return: the description string
        """
        (measnum, mpdsector, projsym, varname, descrip, units, method,
         intervention, paneldesc, datatype, sextested, nstrainstested,
         ageweeks,) = row

        if sextested == 'f':
            sextested = 'female'
        elif sextested == 'm':
            sextested = 'male'
        elif sextested == 'fm':
            sextested = 'male and female'
        else:
            logger.warning("Unknown sex tested key: %s", sextested)
        description = "This is an assay of [" + descrip + "] shown as a [" + \
            datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        """
        As of 9/28/2017 intparm is no longer in the measurements.tsv
        if intparm is not None and intervention != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" + \
                method + "]"
        """
        description += \
            ". The overall experiment is entitled [" + projsym + "]. "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] of age in" + " [" + nstrainstested + \
            "] different mouse strains. "
        """
        As of 9/28/2017 cat1-3 are no longer in the measurements.tsv
        description += "Keywords: " + cat1 + \
            ((", " + cat2) if cat2.strip() is not "" else "") + \
            ((", " + cat3) if cat3.strip() is not "" else "") + "."
        """
        return description

    # def _log_missing_ids(self, missing_id, name_of_file_from_which_missing):
    #     if missing_id not in self.missing_assay_hash:
    #         self.missing_assay_hash[missing_id] = set()
    #     self.missing_assay_hash[missing_id].add(
    #         name_of_file_from_which_missing)
    #     # todo: remove the offending ids from the hash
    #     return

    @staticmethod
    def check_header(filename, header):
        """
        Compare the first line of a download against the expected header
        for that file; raise ValueError on mismatch so a silently-changed
        upstream format is caught early.

        :param filename: key into the header_map below
        :param header: the raw first line of the file
        :raises ValueError: if the header does not match
        """
        header = header.rstrip("\n")
        header_map = {
            'strainmeans.csv.gz':
                'measnum,varname,strain,strainid,sex,mean,'
                'nmice,sd,sem,cv,minval,maxval,logmean,'
                'logsd,zscore,logzscore',
            'straininfo.csv':
                'strainname,vendor,stocknum,panel,mpd_strainid,'
                'straintype,n_proj,n_snp_datasets,mpd_shortname,url',
            'measurements.csv':
                'measnum,mpdsector,projsym,varname,descrip,units,'
                'method,intervention,paneldesc,datatype,sextested,'
                'nstrainstested,ageweeks',
            'ontology_mappings.csv':
                'measnum,ont_term,descrip'
        }
        if header != header_map[filename]:
            raise ValueError(
                "header in {} \n {}\n"
                "does not match expected:\n {}"
                .format(filename, header, header_map[filename])
            )
class BioGrid(Source):
    """
    Biogrid interaction data

    """
    # TODO write up class summary for docstring
    files = {
        'interactions': {
            'file': 'interactions.mitab.zip',
            'url': BGDL + '/BIOGRID-ALL-LATEST.mitab.zip'},
        'identifiers': {
            'file': 'identifiers.tab.zip',
            'url': BGDL + '/BIOGRID-IDENTIFIERS-LATEST.tab.zip'}
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638, 107308, 107506, 107674, 107675, 108277, 108506, 108767,
        108814, 108899, 110308, 110364, 110678, 111642, 112300, 112365,
        112771, 112898, 199832, 203220, 247276, 120150, 120160, 124085]

    def __init__(self, graph_type, are_bnodes_skolemized, tax_ids=None):
        """
        :param graph_type: handed through to the Source base class
        :param are_bnodes_skolemized: handed through to the Source base class
        :param tax_ids: taxa to process; defaults to [9606, 10090, 7955]
        """
        # FIX: aligned with the current Source API used by the other
        # ingest classes in this codebase (graph_type/are_bnodes_skolemized
        # are required by the base class); also dropped the obsolete
        # self.load_bindings() call from the old-style constructor
        super().__init__(graph_type, are_bnodes_skolemized, 'biogrid')
        self.tax_ids = tax_ids
        self.dataset = Dataset(
            'biogrid', 'The BioGrid', 'http://thebiogrid.org/', None,
            'http://wiki.thebiogrid.org/doku.php/terms_and_conditions')
        # Defaults
        # our favorite animals
        # taxids = [9606,10090,10116,7227,7955,6239,8355]
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if 'test_ids' not in config.get_config() or \
                'gene' not in config.get_config()['test_ids']:
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['gene']

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            "several MI experimental codes do not exactly map to ECO; "
            "using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """
        Download the interaction and identifier zips, and set the dataset
        version from the dated filename inside the interactions zip.

        :param is_dl_forced:
        :return: None
        """
        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to
        # BIOGRID-ALL-3.2.119.mitab.txt, where the version number is 3.2.119
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        st = os.stat(f)
        filedate = \
            datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        with ZipFile(f, 'r') as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            version = \
                re.match(r'BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab.txt', fname)
        # FIX: removed the redundant myzip.close(); the with-statement
        # already closes the archive

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """
        Parse interactions and identifier mappings into the graph(s).

        :param limit:
        :return:
        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        # NOTE(review): load_bindings() is from the older Source API --
        # confirm it still exists on the base class
        self.load_bindings()

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
        """
        Process the mitab interactions file, creating gene-gene
        interaction associations (filtered by taxon, or by the gene
        test ids in test mode).

        :param limit:
        :return:
        """
        logger.info("getting interactions")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['interactions']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r'^#', line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b,
                 aliases_a, aliases_b, detection_method, pub_author, pub_id,
                 taxid_a, taxid_b, interaction_type, source_db,
                 interaction_id, confidence_val) = line.split('\t')

                # get the actual gene ids,
                # typically formated like: gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_a).groups()[0]
                gene_b_num = re.search(
                    r'locuslink\:(\d+)\|?', interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or\
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r'taxid:', '', taxid_a.rstrip())) not in\
                            self.tax_ids or\
                            int(re.sub(
                                r'taxid:', '', taxid_b.rstrip())) not in\
                            self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = 'NCBIGene:'+gene_a_num
                gene_b = 'NCBIGene:'+gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r'MI:\d+', interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r'pubmed', 'PMID', pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r'MI:\d+', detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal
                # biogrid identifier that does not map to a public URI.
                # we will construct a monarch identifier from this
                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

        myzip.close()

        return

    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms.

        :param limit:
        :return:
        """
        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'H**o sapiens,Mus musculus,Drosophila
        # melanogaster,Danio rerio, Caenorhabditis elegans,Xenopus
        # laevis'.split(',')
        speciesfilters = 'H**o sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        myzip.close()

        return

    @staticmethod
    def _map_MI_to_RO(mi_id):
        """Map a PSI-MI interaction type to a relation (RO) property."""
        rel = InteractionAssoc.interaction_object_properties
        mi_ro_map = {
            # colocalization
            'MI:0403': rel['colocalizes_with'],
            # direct interaction
            'MI:0407': rel['interacts_with'],
            # synthetic genetic interaction defined by inequality
            'MI:0794': rel['genetically_interacts_with'],
            # suppressive genetic interaction defined by inequality
            'MI:0796': rel['genetically_interacts_with'],
            # additive genetic interaction defined by inequality
            'MI:0799': rel['genetically_interacts_with'],
            # association
            'MI:0914': rel['interacts_with'],
            # physical association
            'MI:0915': rel['interacts_with']
        }

        ro_id = rel['interacts_with']  # default
        if mi_id in mi_ro_map:
            ro_id = mi_ro_map.get(mi_id)

        return ro_id

    @staticmethod
    def _map_MI_to_ECO(mi_id):
        """Map a PSI-MI detection method to an ECO evidence code."""
        eco_id = 'ECO:0000006'  # default to experimental evidence
        mi_to_eco_map = {
            'MI:0018': 'ECO:0000068',  # yeast two-hybrid
            'MI:0004': 'ECO:0000079',  # affinity chromatography
            'MI:0047': 'ECO:0000076',  # far western blotting
            'MI:0055': 'ECO:0000021',
            # should be FRET, but using physical_interaction FIXME
            'MI:0090': 'ECO:0000012',
            # desired: protein complementation,
            # using: functional complementation
            'MI:0096': 'ECO:0000085',
            # desired: pull down, using: immunoprecipitation
            'MI:0114': 'ECO:0000324',
            # desired: x-ray crystallography, using: imaging assay
            'MI:0254': 'ECO:0000011',
            # desired: genetic interference,
            # using: genetic interaction evidence
            'MI:0401': 'ECO:0000172',
            # desired: biochemical, using: biochemical trait evidence
            'MI:0415': 'ECO:0000005',
            # desired: enzymatic study, using: enzyme assay evidence
            'MI:0428': 'ECO:0000324',  # imaging
            'MI:0686': 'ECO:0000006',
            # desired: unspecified, using: experimental evidence
            'MI:1313': 'ECO:0000006'  # None?
        }
        if mi_id in mi_to_eco_map:
            eco_id = mi_to_eco_map.get(mi_id)
        else:
            logger.warning(
                "unmapped code %s. Defaulting to experimental_evidence",
                mi_id)

        return eco_id

    @staticmethod
    def _map_idtype_to_prefix(idtype):
        """
        Here we need to reformat the BioGrid source prefixes
        to standard ones used in our curie-map.

        :param idtype:
        :return: the curie-map prefix, or None if intentionally unmapped
        """
        prefix = idtype
        idtype_to_prefix_map = {
            'XENBASE': 'XenBase',
            'TREMBL': 'TrEMBL',
            'MGI': 'MGI',
            'REFSEQ_DNA_ACCESSION': 'RefSeqNA',
            'MAIZEGDB': 'MaizeGDB',
            'BEEBASE': 'BeeBase',
            'ENSEMBL': 'ENSEMBL',
            'TAIR': 'TAIR',
            'GENBANK_DNA_GI': 'NCBIgi',
            'CGNC': 'CGNC',
            'RGD': 'RGD',
            'GENBANK_GENOMIC_DNA_GI': 'NCBIgi',
            'SWISSPROT': 'Swiss-Prot',
            'MIM': 'OMIM',
            'FLYBASE': 'FlyBase',
            'VEGA': 'VEGA',
            'ANIMALQTLDB': 'AQTLDB',
            'ENTREZ_GENE_ETG': 'ETG',
            'HPRD': 'HPRD',
            'APHIDBASE': 'APHIDBASE',
            'GENBANK_PROTEIN_ACCESSION': 'NCBIProtein',
            'ENTREZ_GENE': 'NCBIGene',
            'SGD': 'SGD',
            'GENBANK_GENOMIC_DNA_ACCESSION': 'NCBIGenome',
            'BGD': 'BGD',
            'WORMBASE': 'WormBase',
            'ZFIN': 'ZFIN',
            'DICTYBASE': 'dictyBase',
            'ECOGENE': 'ECOGENE',
            'BIOGRID': 'BIOGRID',
            'GENBANK_DNA_ACCESSION': 'NCBILocus',
            'VECTORBASE': 'VectorBase',
            'MIRBASE': 'miRBase',
            'IMGT/GENE-DB': 'IGMT',
            'HGNC': 'HGNC',
            'SYSTEMATIC_NAME': None,
            'OFFICIAL_SYMBOL': None,
            'REFSEQ_GENOMIC_DNA_ACCESSION': 'NCBILocus',
            'GENBANK_PROTEIN_GI': 'NCBIgi',
            'REFSEQ_PROTEIN_ACCESSION': 'RefSeqProt',
            'SYNONYM': None,
            'GRID_LEGACY': None,
            # the following showed up in 3.3.124
            'UNIPROT-ACCESSION': 'UniprotKB',
            'SWISS-PROT': 'Swiss-Prot',
            'OFFICIAL SYMBOL': None,
            'ENSEMBL RNA': None,
            'GRID LEGACY': None,
            'ENSEMBL PROTEIN': None,
            'REFSEQ-RNA-GI': None,
            'REFSEQ-RNA-ACCESSION': None,
            'REFSEQ-PROTEIN-GI': None,
            'REFSEQ-PROTEIN-ACCESSION-VERSIONED': None,
            'REFSEQ-PROTEIN-ACCESSION': None,
            'REFSEQ-LEGACY': None,
            'SYSTEMATIC NAME': None,
            'ORDERED LOCUS': None,
            'UNIPROT-ISOFORM': 'UniprotKB',
            'ENSEMBL GENE': 'ENSEMBL',
            'CGD': None,  # Not sure what this is?
            'WORMBASE-OLD': 'WormBase'
        }
        if idtype in idtype_to_prefix_map:
            prefix = idtype_to_prefix_map.get(idtype)
        else:
            logger.warning("unmapped prefix %s", prefix)

        return prefix

    def getTestSuite(self):
        """Return the unittest suite for this source."""
        import unittest
        from tests.test_biogrid import BioGridTestCase
        # TODO add InteractionAssoc tests
        # TODO add test about if all prefixes are mapped?

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(BioGridTestCase)

        return test_suite
class WormBase(Source):
    """
    This is the parser for the
    [C. elegans Model Organism Database (WormBase)](http://www.wormbase.org),
    from which we process genotype and phenotype data for laboratory worms
    (C.elegans and other nematodes).

    We generate the wormbase graph to include the following information:
    * genes
    * sequence alterations (includes SNPs/del/ins/indel and
      large chromosomal rearrangements)
    * RNAi as expression-affecting reagents
    * genotypes, and their components
    * strains
    * publications (and their mapping to PMIDs, if available)
    * allele-to-phenotype associations (including variants by RNAi)
    * genetic positional information for genes and sequence alterations

    Genotypes leverage the GENO genotype model and includes both
    intrinsic and extrinsic genotypes.  Where necessary, we create anonymous
    nodes of the genotype partonomy
    (i.e. for variant single locus complements,
    genomic variation complements, variant loci, extrinsic genotypes,
    and extrinsic genotype parts).

    TODO:  get people and gene expression
    """

    wbftp = \
        'ftp://ftp.wormbase.org/pub/wormbase/releases/current-development-release'

    # WSNUMBER in the urls is replaced with the probed release number
    # at fetch time (see update_wsnum_in_files).
    files = {
        'gene_ids': {
            'file': 'c_elegans.PRJNA13758.geneIDs.txt.gz',
            'url': wbftp + '/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WSNUMBER.geneIDs.txt.gz'},
        # 'gene_desc': {  # TEC: no functional_descriptions available 2016 Mar 03
        #   'file': 'c_elegans.PRJNA13758.functional_descriptions.txt.gz',
        #   'url': wbftp+'/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WSNUMBER.functional_descriptions.txt.gz'},
        'allele_pheno': {
            'file': 'phenotype_association.wb',
            'url': wbftp + '/ONTOLOGY/phenotype_association.WSNUMBER.wb'},
        'rnai_pheno': {
            'file': 'rnai_phenotypes.wb',
            'url': wbftp + '/ONTOLOGY/rnai_phenotypes.WSNUMBER.wb'},
        'pub_xrefs': {
            'file': 'pub_xrefs.txt',
            'url': 'http://tazendra.caltech.edu/~azurebrd/cgi-bin/forms/generic.cgi?action=WpaXref'},
        'feature_loc': {
            'file': 'c_elegans.PRJNA13758.annotations.gff3.gz',
            'url': wbftp + '/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WSNUMBER.annotations.gff3.gz'},
        'disease_assoc': {
            'file': 'disease_association.wb',
            'url': 'ftp://ftp.sanger.ac.uk/pub/wormbase/releases/WSNUMBER/ONTOLOGY/disease_association.WSNUMBER.wb'},
        # 'genes_during_development': {
        #   'file': 'development_association.wb',
        #   'url wbftp+'/ONTOLOGY/development_association.WS249.wb'},
        # 'genes_in_anatomy': {
        #   'file': 'anatomy_association.wb',
        #   'url': wbftp+'/ONTOLOGY/anatomy_association.WS249.wb'},
        # 'gene_interaction': {
        #   'file': 'c_elegans.PRJNA13758.gene_interactions.txt.gz',
        #   'url': wbftp+'/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WSNUMBER.gene_interactions.txt.gz'},
        # 'orthologs': {
        #   'file': 'c_elegans.PRJNA13758.orthologs.txt.gz',
        #   'url': wbftp+'/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS249.orthologs.txt.gz'},
        'xrefs': {
            'file': 'c_elegans.PRJNA13758.xrefs.txt.gz',
            'url': wbftp + '/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WSNUMBER.xrefs.txt.gz'},
        'letter': {
            'file': 'letter.WSNUMBER',
            'url': wbftp + '/letter.WSNUMBER'},
    }

    test_ids = {
        'gene': [
            'WBGene00001414', 'WBGene00004967', 'WBGene00003916',
            'WBGene00004397', 'WBGene00001531'],
        'allele': [
            'WBVar00087800', 'WBVar00087742', 'WBVar00144481',
            'WBVar00248869', 'WBVar00250630'],
        'strain': ['BA794', 'RK1', 'HE1006'],
        'pub': []  # FIXME
    }

    def __init__(self):
        Source.__init__(self, 'wormbase')

        # update the dataset object with details about this resource
        # NO LICENSE for this resource
        self.dataset = Dataset(
            'wormbase', 'WormBase', 'http://www.wormbase.org', None, None,
            'http://www.wormbase.org/about/policies#012')

        self.version_num = None
        return

    def fetch(self, is_dl_forced=False):
        """
        Figure out the version number by probing the "current_release"
        directory on the WormBase FTP site, then edit the file dict
        accordingly before downloading all the files.

        :param is_dl_forced: force re-download even if files exist
        :return: None
        """
        # connect to wormbase ftp
        current_dev_release_dir = \
            'pub/wormbase/releases/current-development-release'
        ftp = FTP('ftp.wormbase.org')
        ftp.login()
        ftp.cwd(current_dev_release_dir)
        # the current release dir is a redirect to a versioned release.
        # pull that from the pwd.
        pwd = ftp.pwd()
        ftp.quit()
        wsver = re.search(r'releases\/(WS\d+)', pwd)
        if wsver is None or len(wsver.groups()) < 1:
            logger.error(
                "Couldn't figure out version number from FTP site. Exiting.")
            exit(1)
        else:
            self.update_wsnum_in_files(wsver.group(1))

        self.dataset.set_version_by_num(self.version_num)

        # fetch all the files
        self.get_files(is_dl_forced)

        return

    def update_wsnum_in_files(self, vernum):
        """
        With the given version number ```vernum```,
        update the source's version number, and replace in the file hashmap.
        We also save the "letter" for the version
        to maintain the version number.
        :param vernum:
        :return:
        """
        self.version_num = vernum
        # replace the WSNUMBER in the url paths with the real WS###
        for f in self.files:
            url = self.files[f].get('url')
            url = re.sub(r'WSNUMBER', self.version_num, url)
            self.files[f]['url'] = url
            logger.debug(
                "Replacing WSNUMBER in %s with %s", f, self.version_num)

        # also the letter file - keep this so we know the version number
        self.files['letter']['file'] = re.sub(
            r'WSNUMBER', self.version_num, self.files['letter']['file'])
        return

    def parse(self, limit=None):
        """
        Parse all (enabled) WormBase files into the graph, optionally
        limiting each file to the first ```limit``` data rows.

        :param limit: max number of rows to process per file (None = all)
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows of each file", limit)

        if self.version_num is None:
            import os
            logger.info("Figuring out version num for files")
            # probe the raw directory for the WSnumber on
            # the "letter.WS###" file.
            # this is the only one that we keep the version number on
            files = os.listdir(self.rawdir)
            letter_file = next(f for f in files if re.match(r'letter', f))
            vernum = re.search(r'(WS\d+)', letter_file)
            self.update_wsnum_in_files(vernum.group(1))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        self.nobnodes = True  # FIXME

        # to hold any label for a given id
        self.id_label_map = {}
        # to hold the mappings between genotype and background
        self.genotype_backgrounds = {}
        self.extrinsic_id_to_enviro_id_hash = {}
        # to hold the genes variant due to a seq alt
        self.variant_loci_genes = {}
        # to hold the parts of an environment
        self.environment_hash = {}
        self.wildtype_genotypes = []
        # stores the rnai_reagent to gene targets
        self.rnai_gene_map = {}

        self.process_gene_ids(limit)
        # self.process_gene_desc(limit)  # TEC imput file is mia 2016-Mar-03
        self.process_allele_phenotype(limit)
        self.process_rnai_phenotypes(limit)
        self.process_pub_xrefs(limit)
        self.process_feature_loc(limit)
        self.process_disease_association(limit)
        # TODO add this when when complete
        # self.process_gene_interaction(limit)

        logger.info("Finished parsing.")

        self.load_bindings()
        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, Genotype.object_properties)

        logger.info("Found %d nodes in graph", len(self.graph))
        logger.info("Found %d nodes in testgraph", len(self.testgraph))

        return

    def process_gene_ids(self, limit):
        """
        Add gene classes (with taxon, synonyms, and deprecation status)
        from the geneIDs file.

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        logger.info("Processing Gene IDs")
        line_counter = 0
        geno = Genotype(g)
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter=',',
                quotechar='\"')
            for row in filereader:
                line_counter += 1
                (taxon_num, gene_num, gene_symbol, gene_synonym, live) = row
                # 6239,WBGene00000001,aap-1,Y110A7A.10,Live

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                taxon_id = 'NCBITaxon:'+taxon_num
                gene_id = 'WormBase:'+gene_num
                # fall back to the synonym as the label when there is
                # no symbol; use None if both are missing
                if gene_symbol == '':
                    gene_symbol = gene_synonym
                if gene_symbol == '':
                    gene_symbol = None
                gu.addClassToGraph(
                    g, gene_id, gene_symbol, Genotype.genoparts['gene'])
                if live == 'Dead':
                    gu.addDeprecatedClass(g, gene_id)
                geno.addTaxon(taxon_id, gene_id)
                if gene_synonym != '':
                    gu.addSynonym(g, gene_id, gene_synonym)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_gene_desc(self, limit):
        """
        Add definitions and descriptions to genes from the
        functional-descriptions file.  (Currently disabled in parse();
        the input file is unavailable.)

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        logger.info("Processing Gene descriptions")
        line_counter = 0
        # geno = Genotype(g)  # TODO unused
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                line_counter += 1
                if line_counter == 1:
                    continue
                (gene_num, public_name, molecular_name, concise_description,
                 provisional_description, detailed_description,
                 automated_description, gene_class_description) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num

                if concise_description != 'none available':
                    gu.addDefinition(g, gene_id, concise_description)

                # remove the description if it's identical to the concise
                descs = {
                    'provisional': provisional_description,
                    'automated': automated_description,
                    'detailed': detailed_description,
                    'gene class': gene_class_description
                }
                for d in descs:
                    text = descs.get(d)
                    if text == concise_description \
                            or re.match(r'none', text) or text == '':
                        pass  # don't use it
                    else:
                        # tag each description with its flavor
                        text = ' '.join((text, '['+d+']'))
                        descs[d] = text
                        gu.addDescription(g, gene_id, text)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that
        each variant is individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning(
                        "Encountered an ECO code we don't have: %s",
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:'+allele_num
                        gene_id = 'WormBase:'+gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item
                            # directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num, self.nobnodes)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:'+gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_'+'-'.join((gene_num, allele_num))
                            if self.nobnodes:
                                vl_id = ':'+vl_id
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(
                                r'(WB:|WB_REF:)', 'WormBase:', ref)
                            r = Reference(ref)
                            if re.search(r'Person', ref):
                                r.setType(r.ref_types['person'])
                                # also add
                                # inferred from background
                                # scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            r.addRefToGraph(g)
                            assoc.add_source(ref)

                        assoc.add_association_to_graph(g)

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_rnai_phenotypes(self, limit=None):
        """
        Build reagent-targeted-gene G2P associations from the RNAi
        phenotypes file, also populating self.rnai_gene_map for later use.

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # gu = GraphUtils(curie_map.get())  # TODO unused

        logger.info("Processing RNAi phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
                 rnai_and_refs) = row
                # WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
                # WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
                # WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:'+gene_num
                # refs = list()  # TODO unused

                # the rnai_and_refs has this so that
                # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                # space delimited between RNAi sets;
                # then each RNAi should have a paper
                rnai_sets = re.split(r' ', rnai_and_refs)

                for s in rnai_sets:
                    # get the rnai_id
                    (rnai_num, ref_num) = re.split(r'\|', s)
                    if len(re.split(r'\|', s)) > 2:
                        logger.warning(
                            "There's an unexpected number of items in %s", s)
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_id = 'WormBase:'+rnai_num
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'],
                        gene_id)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num, self.nobnodes)
                    allele_label = gene_alt_symbol+'<'+rnai_num+'>'
                    geno.addReagentTargetedGene(
                        rnai_id, gene_id, allele_id, allele_label)

                    assoc = G2PAssoc(self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:'+ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph(g)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_pub_xrefs(self, limit=None):
        """
        Link WBPaper ids to their PMID/DOI equivalents (sameAs).

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row
                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:'+wb_ref
                xref_id = r = None
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    xref_id = 'PMID:'+re.sub(r'pmid\s*', '', xref)
                    r = Reference(
                        xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    xref_id = 'DOI:'+re.sub(r'doi', '', xref.strip())
                    r = Reference(xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                if xref_id is not None:
                    r.addRefToGraph(g)
                    gu.addSameIndividual(g, ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_feature_loc(self, limit):
        """
        Add genomic features (genes, variations, etc.) with their
        chromosomal locations from the GFF3 annotations file.

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        logger.info("Processing Feature location and attributes")
        line_counter = 0
        geno = Genotype(g)
        strain_to_variant_map = {}
        build_num = self.version_num
        build_id = 'WormBase:'+build_num
        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar='\"')
            for row in filereader:
                if re.match(r'\#', ''.join(row)):
                    continue
                (chrom, db, feature_type_label, start, end, score, strand,
                 phase, attributes) = row
                # I	interpolated_pmap_position	gene	1	559768	.	.	.	ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
                # I	WormBase	gene	3747	3909	.	-	.	ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
                # I	absolute_pmap_position	gene	4119	10230	.	.	.	ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

                # dbs = re.split(
                #   r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
                #
                # if db not in dbs:
                #     continue

                if feature_type_label not in [
                        'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                        'duplication', 'enhancer', 'binding_site',
                        'biological_region', 'complex_substitution',
                        'substitution', 'insertion', 'inverted_repeat']:
                    # note biological_regions include balancers
                    # other options here: promoter, regulatory_region, reagent
                    continue
                line_counter += 1

                attribute_dict = {}
                if attributes != '':
                    attribute_dict = dict(
                        item.split("=")
                        for item in re.sub(r'"', '', attributes).split(";"))

                fid = flabel = desc = None
                if 'ID' in attribute_dict:
                    fid = attribute_dict.get('ID')
                    if re.search(r'WB(Gene|Var|sf)', fid):
                        fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                    elif re.match(r'(gmap|landmark)', fid):
                        continue
                    else:
                        logger.info('other identifier %s', fid)
                        fid = None
                elif 'variation' in attribute_dict:
                    fid = 'WormBase:'+attribute_dict.get('variation')
                    flabel = attribute_dict.get('public_name')
                    sub = attribute_dict.get('substitution')
                    ins = attribute_dict.get('insertion')
                    # if it's a variation:
                    # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                    desc = ''
                    if sub is not None:
                        desc = 'substitution='+sub
                    if ins is not None:
                        desc = 'insertion='+ins

                    # keep track of the strains with this variation,
                    # for later processing
                    strain_list = attribute_dict.get('strain')
                    if strain_list is not None:
                        for s in re.split(r',', strain_list):
                            if s.strip() not in strain_to_variant_map:
                                strain_to_variant_map[s.strip()] = set()
                            strain_to_variant_map[s.strip()].add(fid)

                # if feature_type_label == 'RNAi_reagent':
                #     Target=WBRNAi00096030 1 4942
                #     this will tell us where the RNAi is actually binding
                #     target = attribute_dict.get('Target')  # TODO unused
                #     rnai_num = re.split(r' ', target)[0]  # TODO unused
                #     it will be the reagent-targeted-gene
                #     that has a position, (i think)
                #     TODO finish the RNAi binding location

                name = attribute_dict.get('Name')
                polymorphism = attribute_dict.get('polymorphism')

                if fid is None:
                    if name is not None and re.match(r'WBsf', name):
                        fid = 'WormBase:'+name
                        name = None
                    else:
                        continue

                if self.testMode \
                        and re.sub(r'WormBase:', '', fid) \
                        not in self.test_ids['gene']+self.test_ids['allele']:
                    continue

                # these really aren't that interesting
                if polymorphism is not None:
                    continue

                if name is not None and not re.search(name, fid):
                    if flabel is None:
                        flabel = name
                    else:
                        gu.addSynonym(g, fid, name)

                if desc is not None:
                    gu.addDescription(g, fid, desc)

                alias = attribute_dict.get('Alias')

                biotype = attribute_dict.get('biotype')
                note = attribute_dict.get('Note')
                other_name = attribute_dict.get('other_name')
                # FIXED: previously added other_name for every entry in the
                # list (alias was never added as a synonym); add each
                # non-None name itself.
                for n in [alias, other_name]:
                    if n is not None:
                        gu.addSynonym(g, fid, n)

                ftype = self.get_feature_type_by_class_and_biotype(
                    feature_type_label, biotype)

                chr_id = makeChromID(chrom, build_id, 'CHR')
                geno.addChromosomeInstance(chrom, build_id, build_num)

                f = Feature(fid, flabel, ftype)
                f.addFeatureStartLocation(start, chr_id, strand)
                # FIXED: previously passed `start` here, which made every
                # feature a zero-length location; the GFF3 end column
                # belongs in the end location.
                f.addFeatureEndLocation(end, chr_id, strand)

                feature_is_class = False
                if feature_type_label == 'gene':
                    feature_is_class = True

                f.addFeatureToGraph(g, True, None, feature_is_class)

                if note is not None:
                    gu.addDescription(g, fid, note)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

                # RNAi reagents:
                # I	RNAi_primary	RNAi_reagent	4184	10232	.	+	.	Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
                # I	RNAi_primary	RNAi_reagent	4223	10147	.	+	.	Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
                # I	RNAi_primary	RNAi_reagent	5693	9391	.	+	.	Target=WBRNAi00066135 1 3699 +;laboratory=CH

                # TODO TF bindiing sites and network:
                # I	TF_binding_site_region	TF_binding_site	1861	2048	.	+	.	Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
                # I	TF_binding_site_region	TF_binding_site	3403	4072	.	+	.	Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

        return

    def process_disease_association(self, limit):
        """
        Create model-of associations between (unspecified variants of)
        genes and diseases from the disease-association file.

        :param limit: max number of rows to process (None = all)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())
        logger.info("Processing disease models")
        geno = Genotype(g, self.nobnodes)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:'+gene_num

                # make a variant of the gene
                vl = '_'+'-'.join((gene_num, 'unspecified'))
                if self.nobnodes:
                    vl = ':'+vl
                vl_label = 'some variant of '+gene_symbol
                geno.addAlleleOfGene(vl, gene_id)
                animal_id = geno.make_experimental_model_with_genotype(
                    g, vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(
                    self.name, animal_id,
                    disease_id, gu.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph(g)

                # FIXED: honor the limit parameter, consistent with the
                # other process_* methods (it was previously ignored).
                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def process_gene_interaction(self, limit):
        """
        The gene interaction file includes identified interactions,
        that are between two or more gene (products).
        In the case of interactions with >2 genes, this requires creating
        groups of genes that are involved in the interaction.
        From the wormbase help list: In the example WBInteraction000007779
        it would likely be misleading to suggest that lin-12 interacts with
        (suppresses in this case) smo-1 ALONE or that lin-12 suppresses
        let-60 ALONE; the observation in the paper; see Table V in paper
        PMID:15990876 was that a lin-12 allele (heterozygous lin-12(n941/+))
        could suppress the "multivulva" phenotype induced synthetically by
        simultaneous perturbation of BOTH smo-1 (by RNAi)
        AND let-60 (by the n2021 allele).
        So this is necessarily a three-gene interaction.

        Therefore, we can create groups of genes based on their "status" of
        Effector | Effected.

        Status:  IN PROGRESS

        :param limit:
        :return:
        """
        raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        gu = GraphUtils(curie_map.get())

        logger.info("Processing gene interaction associations")
        line_counter = 0

        with gzip.open(raw, 'rb') as csvfile:
            filereader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
                quotechar="'")

            for row in filereader:
                line_counter += 1
                if re.match(r'#', ''.join(row)):
                    continue

                (interaction_num, interaction_type, interaction_subtype,
                 summary, citation) = row[0:5]
                # FIXED: removed stray debug print(row) left in the loop
                interaction_id = 'WormBase:'+interaction_num

                # TODO deal with subtypes
                interaction_type_id = None
                if interaction_type == 'Genetic':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'genetically_interacts_with']
                elif interaction_type == 'Physical':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'molecularly_interacts_with']
                elif interaction_type == 'Regulatory':
                    interaction_type_id = \
                        InteractionAssoc.interaction_object_properties[
                            'regulates']
                else:
                    logger.info(
                        "An interaction type I don't understand %s",
                        interaction_type)

                num_interactors = (len(row) - 5) / 3
                if num_interactors != 2:
                    logger.info(
                        "Skipping interactions with !=2 participants:\n %s",
                        str(row))
                    continue

                gene_a_id = 'WormBase:'+row[5]
                gene_b_id = 'WormBase:'+row[8]

                if self.testMode \
                        and gene_a_id not in self.test_ids['gene'] \
                        and gene_b_id not in self.test_ids['gene']:
                    continue

                assoc = InteractionAssoc(
                    self.name, gene_a_id, gene_b_id, interaction_type_id)
                assoc.set_association_id(interaction_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                # citation is not a pmid or WBref - get this some other way
                gu.addDescription(g, assoc_id, summary)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

    def get_feature_type_by_class_and_biotype(self, ftype, biotype):
        """
        Map a GFF3 feature type (and, for genes, the biotype attribute)
        to a Sequence Ontology class id; returns None when unmapped.

        :param ftype: the GFF3 feature type label
        :param biotype: the biotype attribute value (genes only)
        :return: SO curie or None
        """
        ftype_id = None
        biotype_map = {
            'lincRNA': 'SO:0001641',
            'miRNA': 'SO:0001265',
            'ncRNA': 'SO:0001263',
            'piRNA': 'SO:0001638',
            'rRNA': 'SO:0001637',
            'scRNA': 'SO:0001266',
            'snRNA': 'SO:0001268',
            'snoRNA': 'SO:0001267',
            'tRNA': 'SO:0001272',
            # transposable element gene
            'transposon_protein_coding': 'SO:0000111',
            'transposon_pseudogene': 'SO:0001897',
            'pseudogene': 'SO:0000336',
            'protein_coding': 'SO:0001217',
            'asRNA': 'SO:0001263',  # using ncRNA gene TODO make term request
        }

        ftype_map = {
            'point_mutation': 'SO:1000008',
            'deletion': 'SO:0000159',
            'RNAi_reagent': 'SO:0000337',
            'duplication': 'SO:1000035',
            'enhancer': 'SO:0000165',
            'binding_site': 'SO:0000409',
            'biological_region': 'SO:0001411',
            'complex_substitution': 'SO:1000005'
        }
        if ftype == 'gene':
            if biotype in biotype_map:
                ftype_id = biotype_map.get(biotype)
        else:
            # NOTE(review): 'substitution', 'insertion', and
            # 'inverted_repeat' pass the process_feature_loc filter but are
            # absent here, so they yield None — confirm intended.
            ftype_id = ftype_map.get(ftype)

        return ftype_id

    def make_reagent_targeted_gene_id(
            self, gene_id, reagent_id, nobnodes=False):
        """
        Create the (blank-node or skolemized) id for the
        reagent-targeted-gene node for a given gene/reagent pair.
        """
        rtg_id = '_'+'-'.join((gene_id, reagent_id))
        # TODO targeted_gene_id unused
        # targeted_gene_id = re.sub(r'W(orm)?B(ase)?:', '', rtg_id)
        if nobnodes:
            rtg_id = ':'+rtg_id

        return rtg_id

    def getTestSuite(self):
        import unittest
        from tests.test_wormbase import WormBaseTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(WormBaseTestCase)

        return test_suite
class HPOAnnotations(Source):
    """
    The [Human Phenotype Ontology](http://human-phenotype-ontology.org) group
    curates and assembles over 115,000 annotations to hereditary diseases
    using the HPO ontology.  Here we create OBAN-style associations between
    diseases and phenotypic features, together with their evidence,
    and age of onset and frequency (if known).
    The parser currently only processes the "abnormal" annotations.
    Association to "remarkable normality" will be added in the near future.

    In order to properly test this class,
    you should have a conf.json file configured with some test ids, in
    the structure of:
    <pre>
    test_ids: {
        "disease" : ["OMIM:119600", "OMIM:120160"]  # as examples.  put your favorite ids in the config.
    }
    </pre>
    """

    files = {
        'annot': {
            'file': 'phenotype_annotation.tab',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/phenotype_annotation.tab'},
        'version': {
            'file': 'data_version.txt',
            'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/data_version.txt'},
        # 'neg_annot': {
        #     'file': 'phenotype_annotation.tab',
        #     'url': 'http://compbio.charite.de/hudson/job/hpo.annotations/lastStableBuild/artifact/misc/negative_phenotype_annotation.tab'
        # },
    }

    # note, two of these codes are awaiting term requests.  see #114 and
    # https://code.google.com/p/evidenceontology/issues/detail?id=32
    eco_dict = {
        # FIXME currently using "curator inference used in manual assertion"
        "ICE": "ECO:0000305",
        # Inferred from Electronic Annotation
        "IEA": "ECO:0000501",
        # FIXME currently using
        # "experimental evidence used in manual assertion"
        "PCS": "ECO:0000269",
        # Traceable Author Statement
        "TAS": "ECO:0000304"
    }

    def __init__(self):
        Source.__init__(self, 'hpoa')

        self.load_bindings()

        self.dataset = Dataset(
            'hpoa', 'Human Phenotype Ontology',
            'http://www.human-phenotype-ontology.org', None,
            'http://www.human-phenotype-ontology.org/contao/index.php/legal-issues.html')

        if 'test_ids' not in config.get_config() \
                or 'disease' not in config.get_config()['test_ids']:
            # logger.warn is deprecated; use warning()
            logger.warning("not configured with disease test ids.")
        else:
            self.test_ids = config.get_config()['test_ids']['disease']

        # data-source specific warnings
        # (will be removed when issues are cleared)
        logger.warning(
            "note that some ECO classes are missing for ICE and PCS; "
            "using temporary mappings.")

        return

    def fetch(self, is_dl_forced=False):
        """
        Download the annotation files, scrub them, and set the dataset
        version from the distributed data_version file + file ctime.
        """
        self.get_files(is_dl_forced)

        self.scrub()

        # get the latest build from jenkins
        # NOT DOING THIS ANY MORE - but leaving it in for reference
        # jenkins_info = eval(urllib.request.urlopen('http://compbio.charite.de/hudson/job/hpo.annotations/lastSuccessfulBuild/api/python').read())
        # version = jenkins_info['number']

        # use the files['version'] file as the version
        fname = '/'.join((self.rawdir, self.files['version']['file']))

        with open(fname, 'r', encoding="utf8") as f:
            # 2015-04-23 13:01
            v = f.readline()  # read the first line (the only line, really)
            d = datetime.strptime(
                v.strip(), '%Y-%m-%d %H:%M').strftime("%Y-%m-%d-%H-%M")
        f.close()

        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")

        # this will cause two dates to be attached to the dataset
        # (one from the filedate, and the other from here)
        # TODO when #112 is implemented,
        # this will result in only the whole dataset being versioned
        self.dataset.setVersion(filedate, d)

        return

    def scrub(self):
        """
        Perform various data-scrubbing on the raw data files prior to parsing.
        For this resource, this currently includes:
        * revise errors in identifiers for some OMIM and PMIDs

        :return: None
        """
        # scrub file of the oddities...lots of publication rewriting
        f = '/'.join((self.rawdir, self.files['annot']['file']))
        logger.info('scrubbing PubMed:12345 --> PMID:12345')
        pysed.replace("PubMed", 'PMID', f)

        logger.info('scrubbing pmid:12345 --> PMID:12345')
        pysed.replace("pmid", 'PMID', f)

        logger.info('scrubbing PMID12345 --> PMID:12345')
        pysed.replace("PMID([0-9][0-9]*)", 'PMID:\\1', f)

        logger.info('scrubbing MIM12345 --> OMIM:12345')
        pysed.replace('MIM([0-9][0-9]*)', 'OMIM:\\1', f)

        logger.info('scrubbing MIM:12345 --> OMIM:12345')
        pysed.replace(";MIM", ";OMIM", f)

        logger.info('scrubbing ORPHANET --> Orphanet')
        pysed.replace("ORPHANET", "Orphanet", f)
        return

    # here we're reading and building a full named graph of this resource,
    # then dumping it all at the end.
    # we can investigate doing this line-by-line later.
    # supply a limit if you want to test out parsing the head X lines of file
    def parse(self, limit=None):
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_tab(
            '/'.join((self.rawdir, self.files['annot']['file'])), limit)

        # TODO add negative phenotype statements #113
        # self._process_negative_phenotype_tab(self.rawfile,self.outfile,limit)

        logger.info("Finished parsing.")

        return

    def _map_evidence_to_codes(self, code_string):
        """
        A simple mapping of the code_string to it's ECO class
        using the dictionary defined here
        Currently includes ICE, IEA, PCS, TAS
        :param code_string:
        :return:
        """
        return self.eco_dict.get(code_string)

    def _process_phenotype_tab(self, raw, limit):
        """
        Parse the phenotype_annotation.tab file into disease-to-phenotype
        (or disposition) associations with evidence and sources.

        :param raw: path to the annotation file
        :param limit: max number of rows to process (None = all)
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (db, num, name, qual, pheno_id, publist, eco, onset, freq,
                 w, asp, syn, date, curator) = row
                disease_id = db + ":" + str(num)

                if self.testMode and \
                        disease_id.strip() not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                # logger.info('adding %s', disease_id)

                gu.addClassToGraph(g, disease_id, None)
                gu.addClassToGraph(g, pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                gu.addClassToGraph(g, eco_id, None)
                if onset is not None and onset.strip() != '':
                    gu.addClassToGraph(g, onset, None)

                # we want to do things differently depending on
                # the aspect of the annotation
                if asp == 'O' or asp == 'M':  # organ abnormality or mortality
                    assoc = D2PAssoc(
                        self.name, disease_id, pheno_id, onset, freq)
                elif asp == 'I':  # inheritance patterns for the whole disease
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                elif asp == 'C':  # clinical course / onset
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                else:
                    # FIXED: the message had no %s placeholder (asp was
                    # silently dropped), and execution fell through to use
                    # an unbound/stale `assoc` — skip the row instead.
                    logger.error(
                        "I don't know what this aspect is: %s", asp)
                    continue

                assoc.add_evidence(eco_id)

                publist = publist.split(';')
                # blow these apart if there is a list of pubs
                for pub in publist:
                    pub = pub.strip()
                    if pub != '':
                        # if re.match('http://www.ncbi.nlm.nih.gov/bookshelf/br\.fcgi\?book=gene', pub):
                        #     #http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ced
                        #     m = re.search('part\=(\w+)', pub)
                        #     pub_id = 'GeneReviews:'+m.group(1)
                        # elif re.search('http://www.orpha.net/consor/cgi-bin/OC_Exp\.php\?lng\=en\&Expert\=', pub):
                        #     m = re.search('Expert=(\d+)', pub)
                        #     pub_id = 'Orphanet:'+m.group(1)

                        if not re.match('http', pub):
                            r = Reference(pub)
                            if re.match('PMID', pub):
                                r.setType(
                                    Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                        # TODO add curator
                        assoc.add_source(pub)

                assoc.add_association_to_graph(g)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

            Assoc(None).load_all_properties(g)

        return

    def getTestSuite(self):
        import unittest
        from tests.test_hpoa import HPOATestCase
        # TODO add D2PAssoc tests

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(HPOATestCase)

        return test_suite
class MMRRC(Source):
    """
    Here we process the Mutant Mouse Resource and Research Center
    (https://www.mmrrc.org) strain data, which includes:
    * strains, their mutant alleles
    * phenotypes of the alleles
    * descriptions of the research uses of the strains

    Note that some gene identifiers are not included
    (for many of the transgenics with human genes) in the raw data.
    We do our best to process the links between the variant and
    the affected gene, but sometimes the mapping is not clear,
    and we do not include it.
    Many of these details will be solved by merging this source with
    the MGI data source, who has the variant-to-gene designations.

    Also note that even though the strain pages at the MMRRC site do list
    phenotypic differences in the context of the strain backgrounds,
    they do not provide that data to us,
    and thus we cannot supply that disambiguation here.
    """

    # single downloadable catalog file (CSV) holding all strain records
    files = {
        'catalog': {
            'file': 'mmrrc_catalog_data.csv',
            'url': 'https://www.mmrrc.org/about/mmrrc_catalog_data.csv'},
    }

    # strain identifiers used to subset the data when in test mode
    test_ids = [
        'MMRRC:037507-MU', 'MMRRC:041175-UCD', 'MMRRC:036933-UNC',
        'MMRRC:037884-UCD', 'MMRRC:000255-MU', 'MMRRC:037372-UCD',
        'MMRRC:000001-UNC'
    ]

    def __init__(self):
        Source.__init__(self, 'mmrrc')
        # strain id -> {'variants': set(), 'genes': set()}, filled in
        # _process_phenotype_data and used to build genotypes afterwards
        self.strain_hash = {}
        # curie -> display label, for strains, alleles, and genes
        self.id_label_hash = {}
        self.load_bindings()
        self.dataset = Dataset(
            'mmrrc', 'Mutant Mouse Regional Resource Centers',
            'https://www.mmrrc.org', None,
            'https://www.mmrrc.org/about/data_download.php')

        return

    def fetch(self, is_dl_forced=False):
        """Download the catalog file and record its date as the version."""
        self.get_files(is_dl_forced)
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))
        st = os.stat(fname)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        # TODO note: can set the data version to what is in the header
        # first line like:
        # This MMRRC catalog data file was generated on 2015-04-22
        self.dataset.setVersion(filedate)

        return

    def parse(self, limit=None):
        """Parse the catalog file, optionally stopping after ``limit`` rows."""
        if limit is not None:
            logger.info("Only parsing first %s rows", limit)
        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        self._process_phenotype_data(limit)

        logger.info("Finished parsing.")

        return

    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit: maximum number of rows to process, or None for all
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)

        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #     self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    # normalize "Gene ID: NNN" entries to NCBIGene curies
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    for i in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:'+i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                # ES-state strains are typed as stem cells, not mice
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(
                            self.name, mgi_allele_id, pid,
                            gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

        # now that we've collected all of the variant information, build it
        # we don't know their zygosities
        for s in self.strain_hash:
            h = self.strain_hash.get(s)
            variants = h['variants']
            genes = h['genes']
            vl_set = set()
            # make variant loci for each gene
            if len(variants) > 0:
                for v in variants:
                    vl_id = v
                    vl_symbol = self.id_label_hash[vl_id]
                    geno.addAllele(vl_id, vl_symbol,
                                   geno.genoparts['variant_locus'])
                    vl_set.add(vl_id)
                    # only link the allele to a gene when the mapping is
                    # unambiguous (exactly one variant and one gene)
                    if len(variants) == 1 and len(genes) == 1:
                        for gene in genes:
                            geno.addAlleleOfGene(vl_id, gene)
                    else:
                        geno.addAllele(vl_id, vl_symbol)
            else:  # len(vars) == 0
                # it's just anonymous variants in some gene
                for gene in genes:
                    # anonymous variant-locus bnode, e.g. _MGI12345-VL
                    vl_id = '_'+gene+'-VL'
                    vl_id = re.sub(r':', '', vl_id)
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_symbol = self.id_label_hash[gene]+'<?>'
                    self.id_label_hash[vl_id] = vl_symbol
                    geno.addAllele(vl_id, vl_symbol,
                                   geno.genoparts['variant_locus'])
                    geno.addGene(gene, self.id_label_hash[gene])
                    geno.addAlleleOfGene(vl_id, gene)
                    vl_set.add(vl_id)

            # make the vslcs
            vl_list = sorted(vl_set)
            vslc_list = []
            for vl in vl_list:
                # for unknown zygosity
                vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':' + vslc_id
                vslc_label = self.id_label_hash[vl] + '/?'
                self.id_label_hash[vslc_id] = vslc_label
                vslc_list.append(vslc_id)
                geno.addPartsToVSLC(
                    vslc_id, vl, None, geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'], None)
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_label,
                    geno.genoparts['variant_single_locus_complement'])
            if len(vslc_list) > 0:
                if len(vslc_list) > 1:
                    gvc_id = '-'.join(vslc_list)
                    gvc_id = re.sub(r':', '', gvc_id)
                    if self.nobnodes:
                        gvc_id = ':'+gvc_id
                    gvc_label = \
                        '; '.join(self.id_label_hash[v] for v in vslc_list)
                    gu.addIndividualToGraph(
                        g, gvc_id, gvc_label,
                        geno.genoparts['genomic_variation_complement'])
                    for vslc_id in vslc_list:
                        geno.addVSLCtoParent(vslc_id, gvc_id)
                else:
                    # the GVC == VSLC, so don't have to make an extra piece
                    gvc_id = vslc_list.pop()
                    gvc_label = self.id_label_hash[gvc_id]

                # [n.s.] = background "not specified"
                genotype_label = gvc_label + ' [n.s.]'
                bkgd_id = \
                    '_' + re.sub(r':', '', '-'.join(
                        (geno.genoparts['unspecified_genomic_background'],
                         s)))
                genotype_id = '-'.join((gvc_id, bkgd_id))
                if self.nobnodes:
                    bkgd_id = ':'+bkgd_id
                geno.addTaxon(mouse_taxon, bkgd_id)
                geno.addGenomicBackground(
                    bkgd_id, 'unspecified ('+s+')',
                    geno.genoparts['unspecified_genomic_background'],
                    "A placeholder for the " +
                    "unspecified genetic background for "+s)
                geno.addGenomicBackgroundToGenotype(
                    bkgd_id, genotype_id,
                    geno.genoparts['unspecified_genomic_background'])
                geno.addParts(
                    gvc_id, genotype_id,
                    geno.object_properties['has_alternate_part'])
                geno.addGenotype(genotype_id, genotype_label)
                gu.addTriple(
                    g, s, geno.object_properties['has_genotype'],
                    genotype_id)
            else:
                # logger.debug(
                #     "Strain %s is not making a proper genotype.", s)
                pass

        gu.loadProperties(
            g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadProperties(
            g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadAllProperties(g)

        logger.warning(
            "The following gene symbols did not list identifiers: %s",
            str(sorted(list(genes_with_no_ids))))

        return

    @staticmethod
    def _get_variant_type_from_abbrev(abbrev):
        """
        All variants are generically typed as "sequence_alterations"
        unless otherwise stated.

        :param abbrev: MMRRC mutation-type abbreviation (e.g. 'SM', 'INV')
        :return: SO curie for the variant type, or None if unrecognized
        """
        variant_type = None

        var_dict = {
            'SM': 'SO:0001059',   # spontaneous mutation
            'TM': 'SO:0001059',   # targeted mutation
            'TG': 'SO:xxxxxxx',   # transgenic
            'GT': 'SO:0001059',   # gene trap
            'CI': 'SO:0001059',   # chemically induced mutation
            'RAD': 'SO:0001059',  # radiation induced mutation
            # chromosomal aberration --> chromosomal structure variation
            'CH': 'SO:1000183',
            'RB': 'SO:1000043',   # Robertsonian translocation
            'TL': 'SO:1000048',   # reciprocal translocation
            'TP': 'SO:0000453',   # transposition
            'INV': 'SO:1000036',  # inversion
            'INS': 'SO:0000667',  # insertion
            'DEL': 'SO:0000159',  # deletion
            'DP': 'SO:1000035',   # duplication
            'OTH': 'SO:0001059'   # other
        }
        if abbrev in var_dict:
            variant_type = var_dict[abbrev]
        else:
            logger.warning("Variant type not recognized: %s", abbrev)

        return variant_type

    def getTestSuite(self):
        """Return the unittest suite exercising this source."""
        import unittest
        from tests.test_mmrrc import MMRRCTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MMRRCTestCase)

        return test_suite
class BioGrid(Source):
    """
    Biogrid interaction data
    """

    # TODO write up class summary for docstring

    files = {
        "interactions": {
            "file": "interactions.mitab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-ALL-LATEST.mitab.zip",
        },
        "identifiers": {
            "file": "identifiers.tab.zip",
            "url": "http://thebiogrid.org/downloads/archives/Latest%20Release/BIOGRID-IDENTIFIERS-LATEST.tab.zip",
        },
    }

    # biogrid-specific identifiers for use in subsetting identifier mapping
    biogrid_ids = [
        106638, 107308, 107506, 107674, 107675, 108277, 108506, 108767, 108814,
        108899, 110308, 110364, 110678, 111642, 112300, 112365, 112771, 112898,
        199832, 203220, 247276, 120150, 120160, 124085,
    ]

    def __init__(self, tax_ids=None):
        """
        :param tax_ids: list of NCBI taxon ids to keep; defaults to
                        human, mouse, and zebrafish when None
        """
        super().__init__("biogrid")
        self.tax_ids = tax_ids
        self.load_bindings()

        self.dataset = Dataset(
            "biogrid",
            "The BioGrid",
            "http://thebiogrid.org/",
            None,
            "http://wiki.thebiogrid.org/doku.php/terms_and_conditions",
        )

        # Defaults
        # taxids = [9606,10090,10116,7227,7955,6239,8355] #our favorite animals
        if self.tax_ids is None:
            self.tax_ids = [9606, 10090, 7955]

        if "test_ids" not in config.get_config() or "gene" not in config.get_config()["test_ids"]:
            # logger.warn() is deprecated; warning() is the documented API
            logger.warning("not configured with gene test ids.")
        else:
            self.test_ids = config.get_config()["test_ids"]["gene"]

        # data-source specific warnings (will be removed when issues are cleared)
        logger.warning("several MI experimental codes do not exactly map to ECO; using approximations.")
        return

    def fetch(self, is_dl_forced=False):
        """
        Download the interaction and identifier archives, and set the
        dataset version from the file date and the version embedded in
        the zipped filename.

        :param is_dl_forced:
        :return: None
        """
        self.get_files(is_dl_forced)

        # the version number is encoded in the filename in the zip.
        # for example, the interactions file may unzip to
        # BIOGRID-ALL-3.2.119.mitab.txt, where the version number is 3.2.119
        f = "/".join((self.rawdir, self.files["interactions"]["file"]))
        st = os.stat(f)
        filedate = datetime.utcfromtimestamp(st[ST_CTIME]).strftime("%Y-%m-%d")
        with ZipFile(f, "r") as myzip:
            flist = myzip.namelist()
            # assume that the first entry is the item
            fname = flist[0]
            # get the version from the filename
            # NOTE(review): if the archive layout changes, this match could be
            # None and the setVersion call below would raise AttributeError
            version = re.match(r"BIOGRID-ALL-(\d+\.\d+\.\d+)\.mitab.txt", fname)
        myzip.close()

        self.dataset.setVersion(filedate, str(version.groups()[0]))

        return

    def parse(self, limit=None):
        """
        Parse the interaction and identifier files.

        :param limit: maximum number of rows to process, or None for all
        :return:
        """
        if self.testOnly:
            self.testMode = True

        self._get_interactions(limit)
        self._get_identifiers(limit)

        self.load_bindings()

        logger.info("Loaded %d test graph nodes", len(self.testgraph))
        logger.info("Loaded %d full graph nodes", len(self.graph))

        return

    def _get_interactions(self, limit):
        """
        Process the MITAB interaction file: create gene-gene interaction
        associations typed by MI interaction code, with ECO evidence
        mapped from the MI detection method.

        :param limit: maximum number of rows to process, or None for all
        """
        logger.info("getting interactions")
        line_counter = 0
        f = "/".join((self.rawdir, self.files["interactions"]["file"]))
        myzip = ZipFile(f, "r")
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        matchcounter = 0

        with myzip.open(fname, "r") as csvfile:
            for line in csvfile:
                # skip comment lines
                if re.match(r"^#", line.decode()):
                    logger.debug("Skipping header line")
                    continue
                line_counter += 1
                line = line.decode().strip()
                # print(line)
                (interactor_a, interactor_b, alt_ids_a, alt_ids_b, aliases_a,
                 aliases_b, detection_method, pub_author, pub_id, taxid_a,
                 taxid_b, interaction_type, source_db, interaction_id,
                 confidence_val) = line.split("\t")

                # get the actual gene ids, typically formated like:
                # gene/locuslink:351|BIOGRID:106848
                gene_a_num = re.search(
                    r"locuslink\:(\d+)\|?", interactor_a).groups()[0]
                gene_b_num = re.search(
                    r"locuslink\:(\d+)\|?", interactor_b).groups()[0]

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if (int(gene_a_num) not in self.test_ids) or \
                            (int(gene_b_num) not in self.test_ids):
                        continue
                else:
                    g = self.graph
                    # when not in test mode, filter by taxon
                    if int(re.sub(r"taxid:", "", taxid_a.rstrip())) not in self.tax_ids or \
                            int(re.sub(r"taxid:", "", taxid_b.rstrip())) not in self.tax_ids:
                        continue
                    else:
                        matchcounter += 1

                gene_a = "NCBIGene:" + gene_a_num
                gene_b = "NCBIGene:" + gene_b_num

                # get the interaction type
                # psi-mi:"MI:0407"(direct interaction)
                int_type = re.search(r"MI:\d+", interaction_type).group()
                rel = self._map_MI_to_RO(int_type)

                # scrub pubmed-->PMID prefix
                pub_id = re.sub(r"pubmed", "PMID", pub_id)
                # remove bogus whitespace
                pub_id = pub_id.strip()

                # get the method, and convert to evidence code
                det_code = re.search(r"MI:\d+", detection_method).group()
                evidence = self._map_MI_to_ECO(det_code)

                # note that the interaction_id is some kind of internal
                # biogrid identifier that does not map to a public URI.
                # we will construct a monarch identifier from this
                assoc = InteractionAssoc(self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence)
                assoc.add_source(pub_id)
                assoc.add_association_to_graph(g)
                assoc.load_all_properties(g)

                if not self.testMode and (limit is not None and line_counter > limit):
                    break

        myzip.close()

        return

    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit: maximum number of rows to process, or None for all
        :return:
        """
        logger.info("getting identifier mapping")
        line_counter = 0
        f = "/".join((self.rawdir, self.files["identifiers"]["file"]))
        myzip = ZipFile(f, "r")
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'H**o sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio,Caenorhabditis elegans,Xenopus laevis'.split(',')
        speciesfilters = "H**o sapiens,Mus musculus".split(",")
        with myzip.open(fname, "r") as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r"BIOGRID_ID", line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID IDENTIFIER_VALUE IDENTIFIER_TYPE ORGANISM_OFFICIAL_NAME
                # 1  814566  ENTREZ_GENE  Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = line.split("\t")

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                # for each one of these, create the node and add equivalent classes
                biogrid_id = "BIOGRID:" + biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = "NCBIGene,MGI,ENSEMBL,ZFIN,HGNC".split(",")
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) and (prefix in geneidtypefilters):
                        mapped_id = ":".join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    elif id_type == "OFFICIAL_SYMBOL":
                        # this symbol will only get attached to the biogrid class
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #     # FIXME - i am not sure these are synonyms, altids?
                    #     gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        myzip.close()

        return

    @staticmethod
    def _map_MI_to_RO(mi_id):
        """Map an MI interaction-type code to an interaction relation."""
        rel = InteractionAssoc.interaction_object_properties
        mi_ro_map = {
            "MI:0403": rel["colocalizes_with"],  # colocalization
            "MI:0407": rel["interacts_with"],  # direct interaction
            # synthetic genetic interaction defined by inequality
            "MI:0794": rel["genetically_interacts_with"],
            # suppressive genetic interaction defined by inequality
            "MI:0796": rel["genetically_interacts_with"],
            # additive genetic interaction defined by inequality
            "MI:0799": rel["genetically_interacts_with"],
            "MI:0914": rel["interacts_with"],  # association
            "MI:0915": rel["interacts_with"],  # physical association
        }

        ro_id = rel["interacts_with"]  # default
        if mi_id in mi_ro_map:
            ro_id = mi_ro_map.get(mi_id)

        return ro_id

    @staticmethod
    def _map_MI_to_ECO(mi_id):
        """Map an MI detection-method code to an ECO evidence code."""
        eco_id = "ECO:0000006"  # default to experimental evidence
        mi_to_eco_map = {
            "MI:0018": "ECO:0000068",  # yeast two-hybrid
            "MI:0004": "ECO:0000079",  # affinity chromatography
            "MI:0047": "ECO:0000076",  # far western blotting
            "MI:0055": "ECO:0000021",  # should be FRET, but using physical_interaction FIXME
            "MI:0090": "ECO:0000012",  # desired: protein complementation, using: functional complementation
            "MI:0096": "ECO:0000085",  # desired: pull down, using: immunoprecipitation
            "MI:0114": "ECO:0000324",  # desired: x-ray crystallography, using: imaging assay
            "MI:0254": "ECO:0000011",  # desired: genetic interference, using: genetic interaction evidence
            "MI:0401": "ECO:0000172",  # desired: biochemical, using: biochemical trait evidence
            "MI:0415": "ECO:0000005",  # desired: enzymatic study, using: enzyme assay evidence
            "MI:0428": "ECO:0000324",  # imaging
            "MI:0686": "ECO:0000006",  # desired: unspecified, using: experimental evidence
            "MI:1313": "ECO:0000006",  # None?
        }
        if mi_id in mi_to_eco_map:
            eco_id = mi_to_eco_map.get(mi_id)
        else:
            # logger.warn() is deprecated; warning() is the documented API
            logger.warning("unmapped code %s. Defaulting to experimental_evidence", mi_id)

        return eco_id

    @staticmethod
    def _map_idtype_to_prefix(idtype):
        """
        Here we need to reformat the BioGrid source prefixes
        to standard ones used in our curie-map.

        :param idtype: BioGrid identifier-type string
        :return: curie prefix, None if deliberately unmapped,
                 or the input unchanged when unrecognized
        """
        prefix = idtype
        idtype_to_prefix_map = {
            "XENBASE": "XenBase",
            "TREMBL": "TrEMBL",
            "MGI": "MGI",
            "REFSEQ_DNA_ACCESSION": "RefSeqNA",
            "MAIZEGDB": "MaizeGDB",
            "BEEBASE": "BeeBase",
            "ENSEMBL": "ENSEMBL",
            "TAIR": "TAIR",
            "GENBANK_DNA_GI": "NCBIgi",
            "CGNC": "CGNC",
            "RGD": "RGD",
            "GENBANK_GENOMIC_DNA_GI": "NCBIgi",
            "SWISSPROT": "Swiss-Prot",
            "MIM": "OMIM",
            "FLYBASE": "FlyBase",
            "VEGA": "VEGA",
            "ANIMALQTLDB": "AQTLDB",
            "ENTREZ_GENE_ETG": "ETG",
            "HPRD": "HPRD",
            "APHIDBASE": "APHIDBASE",
            "GENBANK_PROTEIN_ACCESSION": "NCBIProtein",
            "ENTREZ_GENE": "NCBIGene",
            "SGD": "SGD",
            "GENBANK_GENOMIC_DNA_ACCESSION": "NCBIGenome",
            "BGD": "BGD",
            "WORMBASE": "WormBase",
            "ZFIN": "ZFIN",
            "DICTYBASE": "dictyBase",
            "ECOGENE": "ECOGENE",
            "BIOGRID": "BIOGRID",
            "GENBANK_DNA_ACCESSION": "NCBILocus",
            "VECTORBASE": "VectorBase",
            "MIRBASE": "miRBase",
            "IMGT/GENE-DB": "IGMT",
            "HGNC": "HGNC",
            "SYSTEMATIC_NAME": None,
            "OFFICIAL_SYMBOL": None,
            "REFSEQ_GENOMIC_DNA_ACCESSION": "NCBILocus",
            "GENBANK_PROTEIN_GI": "NCBIgi",
            "REFSEQ_PROTEIN_ACCESSION": "RefSeqProt",
            "SYNONYM": None,
            "GRID_LEGACY": None,
            # the following showed up in 3.3.124
            "UNIPROT-ACCESSION": "UniprotKB",
            "SWISS-PROT": "Swiss-Prot",
            "OFFICIAL SYMBOL": None,
            "ENSEMBL RNA": None,
            "GRID LEGACY": None,
            "ENSEMBL PROTEIN": None,
            "REFSEQ-RNA-GI": None,
            "REFSEQ-RNA-ACCESSION": None,
            "REFSEQ-PROTEIN-GI": None,
            "REFSEQ-PROTEIN-ACCESSION-VERSIONED": None,
            "REFSEQ-PROTEIN-ACCESSION": None,
            "REFSEQ-LEGACY": None,
            "SYSTEMATIC NAME": None,
            "ORDERED LOCUS": None,
            "UNIPROT-ISOFORM": "UniprotKB",
            "ENSEMBL GENE": "ENSEMBL",
            "CGD": None,  # Not sure what this is?
            "WORMBASE-OLD": "WormBase",
        }
        if idtype in idtype_to_prefix_map:
            prefix = idtype_to_prefix_map.get(idtype)
        else:
            # logger.warn() is deprecated; warning() is the documented API
            logger.warning("unmapped prefix %s", prefix)

        return prefix

    def getTestSuite(self):
        """Return the unittest suite exercising this source."""
        import unittest
        from tests.test_biogrid import BioGridTestCase
        # TODO add InteractionAssoc tests
        # TODO add test about if all prefixes are mapped?

        test_suite = unittest.TestLoader().loadTestsFromTestCase(BioGridTestCase)

        return test_suite