def __init__(self, tax_ids=None, gene_ids=None):
    """
    Set up the 'ensembl' source: taxon filter, gene test ids and dataset.

    :param tax_ids: taxon ids to work on; when None the list
        [9606, 10090, 7955] is used instead
    :param gene_ids: stored first, then always replaced by the configured
        gene test ids or an empty list.
        NOTE(review): the argument never survives construction --
        confirm whether that is intended.
    """
    Source.__init__(self, 'ensembl')

    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.load_bindings()

    self.dataset = Dataset(
        'ensembl', 'ENSEMBL', 'http://www.ensembl.org', None)

    # fall back to the default taxa when the caller supplied none
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]

    # gene ids come from the configured test ids, otherwise stay empty
    settings = config.get_config()
    if 'test_ids' in settings and 'gene' in settings['test_ids']:
        self.gene_ids = settings['test_ids']['gene']
    else:
        self.gene_ids = []
        logger.warning("not configured with gene test ids.")

    self.properties = Feature.properties

    logger.setLevel(logging.INFO)
def __init__(self, tax_ids=None, gene_ids=None):
    """
    Set up the 'clinvar' source: filters, dataset record and test ids.

    :param tax_ids: taxon ids, stored as given
    :param gene_ids: gene ids, stored as given but replaced by the
        configured gene test ids when the config provides them
    """
    Source.__init__(self, 'clinvar')

    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.filter = 'taxids'
    self.load_bindings()

    self.dataset = Dataset(
        'ClinVar', 'National Center for Biotechnology Information',
        'http://www.ncbi.nlm.nih.gov/clinvar/', None,
        'http://www.ncbi.nlm.nih.gov/About/disclaimer.html',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    # read the configuration once instead of once per lookup
    conf = config.get_config()

    if 'test_ids' not in conf or 'gene' not in conf['test_ids']:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = conf['test_ids']['gene']

    if 'test_ids' not in conf or 'disease' not in conf['test_ids']:
        # NOTE(review): self.disease_ids is not assigned on this path;
        # confirm nothing reads it when disease test ids are absent.
        logger.warning("not configured with disease test ids.")
    else:
        self.disease_ids = conf['test_ids']['disease']

    self.properties = Feature.properties

    return
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'mpd' (Mouse Phenome Database) source.

    :param graph_type: forwarded unchanged to ``Source.__init__``
    :param are_bnodes_skolemized: forwarded unchanged to ``Source.__init__``
    """
    Source.__init__(
        self, graph_type, are_bnodes_skolemized, 'mpd',
        ingest_title='Mouse Phenome Database',
        ingest_url='https://phenome.jax.org/',
        # license_url=None,
        data_rights='https://phenome.jax.org/about/termsofuse',
        # file_handle=None
    )

    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.stdevthreshold = 2

    # lookup tables, all empty at construction time
    self.assayhash = {}
    self.idlabel_hash = {}
    # the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}
def __init__(self, tax_ids=None):
    """
    Set up the 'go' source: taxon filter, dataset record and test ids.

    :param tax_ids: taxon ids to filter on; when None the list
        [9606, 10090, 7955] is used
    """
    Source.__init__(self, 'go')

    # Defaults
    self.tax_ids = tax_ids
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]
        # report the defaults actually applied; the argument itself is
        # None on this path, so logging it printed "Defaulting to None"
        logger.info("No taxa set.  Defaulting to %s", self.tax_ids)
    else:
        logger.info("Filtering on the following taxa: %s", self.tax_ids)

    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'go', 'GeneOntology', 'http://www.geneontology.org', None,
        "https://creativecommons.org/licenses/by/4.0/legalcode",
        'http://geneontology.org/page/use-and-license')

    # read the configuration once instead of once per lookup
    conf = config.get_config()
    if 'test_ids' not in conf or 'gene' not in conf['test_ids']:
        logger.warning("not configured with gene test ids.")
    else:
        self.test_ids = conf['test_ids']['gene']

    return
def __init__(self):
    """
    Set up the 'ctd' source: dataset record, gene/disease test ids
    and the graph helper objects.
    """
    Source.__init__(self, 'ctd')

    self.dataset = Dataset(
        'ctd', 'CTD', 'http://ctdbase.org', None,
        'http://ctdbase.org/about/legal.jsp')

    settings = config.get_config()
    has_test_ids = 'test_ids' in settings

    # gene test ids, or an empty list (with a warning) when unconfigured
    if has_test_ids and 'gene' in settings['test_ids']:
        self.test_geneids = settings['test_ids']['gene']
    else:
        logger.warning("not configured with gene test ids.")
        self.test_geneids = []

    # same treatment for the disease test ids
    if has_test_ids and 'disease' in settings['test_ids']:
        self.test_diseaseids = settings['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_diseaseids = []

    self.gu = GraphUtils(curie_map.get())
    # short alias for the graph, shared with the Genotype helper
    self.g = self.graph
    self.geno = Genotype(self.g)
def __init__(self, tax_ids=None, gene_ids=None):
    """
    Set up the 'ncbigene' source: filters, dataset record and test ids.

    :param tax_ids: taxon ids to filter on; when None the list
        [9606, 10090, 7955] is used
    :param gene_ids: stored first, then always replaced by the configured
        gene test ids or an empty list.
        NOTE(review): the argument never survives construction --
        confirm whether that is intended.
    """
    Source.__init__(self, 'ncbigene')

    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.filter = 'taxids'
    self.load_bindings()

    self.dataset = Dataset(
        'ncbigene', 'National Center for Biotechnology Information',
        'http://ncbi.nih.nlm.gov/gene', None,
        'http://www.ncbi.nlm.nih.gov/About/disclaimer.html',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    # Defaults
    if self.tax_ids is None:
        self.tax_ids = [9606, 10090, 7955]
        # report the defaults actually applied; the argument itself is
        # None on this path, so logging it printed "Defaulting to None"
        logger.info("No taxa set.  Defaulting to %s", self.tax_ids)
    else:
        logger.info("Filtering on the following taxa: %s", self.tax_ids)

    self.gene_ids = []
    # read the configuration once instead of once per lookup
    conf = config.get_config()
    if 'test_ids' not in conf or 'gene' not in conf['test_ids']:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = conf['test_ids']['gene']

    self.properties = Feature.properties

    return
def __init__(self):
    """
    Set up the 'hpoa' source: bindings, dataset record, disease test ids
    and a counter of replaced ids (starting at zero).
    """
    Source.__init__(self, 'hpoa')
    self.load_bindings()

    self.dataset = Dataset(
        'hpoa', 'Human Phenotype Ontology',
        'http://www.human-phenotype-ontology.org', None,
        'http://www.human-phenotype-ontology.org/contao/index.php/'
        'legal-issues.html')

    self.replaced_id_count = 0

    # disease test ids, or an empty list (with a warning) when unconfigured
    settings = config.get_config()
    if 'test_ids' in settings and 'disease' in settings['test_ids']:
        self.test_ids = settings['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    # data-source specific warnings to be removed when issues are cleared
    logger.warning(
        "note that some ECO classes are missing for ICE, PCS, and ITM;"
        " using temporary mappings.")
def __init__(self):
    """
    Set up the 'mpd' source: namespaces, dataset record with citation,
    empty lookup tables and the graph helper objects.
    """
    Source.__init__(self, 'mpd')

    # @N, not sure if this step is required
    self.namespaces.update(curie_map.get())

    # update the dataset object with details about this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset(
        'mpd', 'MPD', 'http://phenome.jax.org', None, None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.stdevthreshold = 2
    self.nobnodes = True  # FIXME

    # lookup tables, all empty at construction time
    self.assayhash = {}
    self.idlabel_hash = {}
    # the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}

    self.geno = Genotype(self.graph)
    self.gu = GraphUtils(curie_map.get())
def __init__(self):
    """
    Set up the 'mmrrc' source: empty strain/label lookups, bindings
    and the dataset record.
    """
    Source.__init__(self, 'mmrrc')

    # both lookups start empty
    self.strain_hash = {}
    self.id_label_hash = {}

    self.load_bindings()

    self.dataset = Dataset(
        'mmrrc',
        'Mutant Mouse Regional Resource Centers',
        'https://www.mmrrc.org',
        None,
        'https://www.mmrrc.org/about/data_download.php')
def __init__(self):
    """
    Set up the 'wormbase' source: dataset record and an unset
    version number.
    """
    Source.__init__(self, 'wormbase')

    # describe this resource; no license is recorded for it
    self.dataset = Dataset(
        'wormbase',
        'WormBase',
        'http://www.wormbase.org',
        None,
        None,
        'http://www.wormbase.org/about/policies#012')

    # not known at construction time
    self.version_num = None
def __init__(self):
    """Set up the 'impc' source and its dataset record."""
    Source.__init__(self, 'impc')

    # describe this resource
    self.dataset = Dataset(
        'impc',
        'IMPC',
        'http://www.mousephenotype.org',
        None,
        'https://raw.githubusercontent.com/mpi2/PhenotypeArchive/'
        'master/LICENSE')

    # TODO add a citation for impc dataset as a whole
    # :impc cito:citesAsAuthority PMID:24194600
def __init__(self):
    """
    Set up the 'animalqtldb' source and its dataset record, and warn
    that the raw data carries no licence or rights statement.
    """
    Source.__init__(self, 'animalqtldb')

    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'animalqtldb', 'Animal QTL db',
        'http://www.animalgenome.org/cgi-bin/QTLdb/index', None, None,
        'http://www.animalgenome.org/QTLdb/faq#23')

    # source-specific warnings.  will be cleared when resolved.
    # Logger.warn is a deprecated alias; use Logger.warning
    logger.warning(
        "No licences or rights exist for the raw data from this resource.")

    return
def _get_curie_and_type_from_id(variant_id): """ Given a variant id, our best guess at its curie and type (snp, haplotype, etc) 'None' will be used for both curie and type for IDs that we can't process # 2019-May three snp-id have ' e' or ' a' appended. note space. # examples: 'rs2440154 e-A' and 'rs2440154 e' # including the suffix in the url is a web noop but breaks rdflib :param variant_id: :return: """ curie = None variant_type = None # remove space before hyphens variant_id = re.sub(r' -', '-', variant_id).strip() if re.search(r' x ', variant_id) or re.search(r',', variant_id): # TODO deal with rs1234 x rs234... (haplotypes?) LOG.warning("Cannot parse variant groups of this format: %s", variant_id) elif re.search(r';', variant_id): curie = ':haplotype_' + Source.hash_id( variant_id) # deliberate 404 variant_type = "haplotype" elif variant_id[:2] == 'rs': # remove whitespace from errant id, rs6194 5053-? curie = 'dbSNP:' + variant_id.split('-')[0].replace(' ', '') # curie = re.sub(r'-.*$', '', curie).strip() variant_type = "snp" # remove the alteration elif variant_id[:3] == 'kgp': # http://www.1000genomes.org/faq/what-are-kgp-identifiers curie = 'GWAS:' + variant_id.split('-')[0] variant_type = "snp" elif variant_id[:3] == 'chr': # like: chr10:106180121-G variant_id = re.sub(r'-?', '-N', variant_id) variant_id = re.sub(r' ', '', variant_id) # going to hate myself but ... # moving this from a broken base node to yet another blank node # It had produced this monstrocity with the embedded quote # :gwas--Nc-Nh-Nr-N1-N1-N--N1-N0-N2-N7-N5-N1-N1-N0-N2-N"-N?-N curie = Source.make_id('gwas-' + re.sub(r':', '-', variant_id), '_') variant_type = "snp" elif variant_id.strip() == '': pass else: LOG.warning("There's a snp id i can't manage: %s", variant_id) return curie, variant_type
def __init__(self):
    """
    Set up the 'eom' source: namespaces and dataset record, and log an
    error when the database credentials are not configured.
    """
    Source.__init__(self, 'eom')
    self.namespaces.update(curie_map.get())

    # describe this resource
    # TODO put this into a conf file?
    self.dataset = Dataset(
        'eom',
        'EOM',
        'http://elementsofmorphology.nih.gov',
        None,
        'http://www.genome.gov/copyright.cfm',
        'https://creativecommons.org/publicdomain/mark/1.0/')

    # credentials are expected under dbauth/disco; their absence is only
    # logged here, construction carries on
    settings = config.get_config()
    credentials_present = (
        'dbauth' in settings and 'disco' in settings['dbauth'])
    if not credentials_present:
        logger.error("not configured with PG user/password.")
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'sgd' source: dataset record, global term table and
    the APO term map.

    :param graph_type: forwarded unchanged to the parent constructor
    :param are_bnodes_skolemized: forwarded unchanged to the parent
        constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'sgd')

    self.dataset = Dataset(
        'sgd',
        'SGD',
        'https://www.yeastgenome.org/',
        None,
        None)

    # term table loaded from a path given relative to the working
    # directory (NOTE(review): confirm how the path is resolved)
    terms_path = '../../translationtable/global_terms.yaml'
    self.global_terms = Source.open_and_parse_yaml(terms_path)

    self.apo_term_id = SGD.make_apo_map()
def __init__(self):
    """
    Set up the 'coriell' source: bindings and dataset record, emit the
    standing data caveats, and log an error when the FTP credentials
    are not configured.
    """
    Source.__init__(self, 'coriell')

    self.load_bindings()

    self.dataset = Dataset(
        'coriell', 'Coriell', 'http://ccr.coriell.org/', None)

    # data-source specific warnings
    # (will be removed when issues are cleared)
    # Logger.warn is a deprecated alias; use Logger.warning
    logger.warning(
        'We assume that if a species is not provided, '
        'that it is a Human-derived cell line')
    logger.warning(
        'We map all omim ids as a disease/phenotype entity, '
        'but should be fixed in the future')

    # check if config exists; if it doesn't, log an error to let the
    # user know (construction still completes)
    conf = config.get_config()
    if 'dbauth' not in conf or 'coriell' not in conf['dbauth']:
        logger.error("not configured with FTP user/password.")

    return
def __init__(self):
    """
    Set up the 'orphanet' source: bindings and dataset record, warning
    when no disease test ids are configured.
    """
    Source.__init__(self, 'orphanet')
    self.load_bindings()

    self.dataset = Dataset(
        'orphanet',
        'Orphanet',
        'http://www.orpha.net',
        None,
        'http://creativecommons.org/licenses/by-nd/3.0/',
        'http://omim.org/help/agreement')

    # only a warning is issued; nothing is read from the config here
    settings = config.get_config()
    disease_ids_configured = (
        'test_ids' in settings and 'disease' in settings['test_ids'])
    if not disease_ids_configured:
        logger.warning("not configured with disease test ids.")
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Build the 'sgd' source.

    Creates the dataset record, parses the global terms yaml and
    builds the APO term map.

    :param graph_type: passed straight through to the parent constructor
    :param are_bnodes_skolemized: passed straight through to the parent
        constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'sgd')

    dataset_args = ('sgd', 'SGD', 'https://www.yeastgenome.org/', None, None)
    self.dataset = Dataset(*dataset_args)

    self.global_terms = Source.open_and_parse_yaml(
        '../../translationtable/global_terms.yaml')
    self.apo_term_id = SGD.make_apo_map()
def __init__(self, tax_ids=None, gene_ids=None):
    """
    Set up the 'hgnc' source: filters, dataset record and gene test ids.

    :param tax_ids: taxon ids, stored as given
    :param gene_ids: stored first, then always replaced by the configured
        gene test ids or an empty list.
        NOTE(review): the argument never survives construction --
        confirm whether that is intended.
    """
    Source.__init__(self, 'hgnc')

    self.tax_ids = tax_ids
    self.gene_ids = gene_ids
    self.load_bindings()

    self.dataset = Dataset(
        'hgnc', 'HGNC', 'http://www.genenames.org', None)

    self.gene_ids = []
    # read the configuration once instead of once per lookup
    conf = config.get_config()
    if 'test_ids' not in conf or 'gene' not in conf['test_ids']:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with gene test ids.")
    else:
        self.gene_ids = conf['test_ids']['gene']

    self.properties = Feature.properties

    return
def __init__(self):
    """
    Set up the 'gwascatalog' source: bindings, dataset record and
    test ids.
    """
    Source.__init__(self, 'gwascatalog')
    self.load_bindings()

    self.dataset = Dataset(
        'gwascatalog',
        'GWAS Catalog',
        'http://www.ebi.ac.uk/gwas/',
        'The NHGRI-EBI Catalog of published genome-wide association studies',
        'http://creativecommons.org/licenses/by/3.0/',
        None)
    # 'http://www.ebi.ac.uk/gwas/docs/about'  # TODO add this

    settings = config.get_config()
    if 'test_ids' in settings and 'gene' in settings['test_ids']:
        # the whole 'test_ids' section is kept, not just its 'gene' entry
        # NOTE(review): confirm that is what downstream code expects
        self.test_ids = settings['test_ids']
    else:
        logger.warning("not configured with gene test ids.")
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'mychem' source: dataset record, global term table,
    chunked inchikeys and empty result lists.

    :param graph_type: forwarded unchanged to the parent constructor
    :param are_bnodes_skolemized: forwarded unchanged to the parent
        constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'mychem')

    self.dataset = Dataset(
        'mychem',
        'MYCHEM',
        'https://mychem.info/',
        None,
        None)

    self.global_terms = Source.open_and_parse_yaml(
        '../../translationtable/global_terms.yaml')

    # the inchikeys are handed to MyChem.chunks with n=10
    all_inchikeys = MyChem.get_inchikeys()
    self.inchikeys = MyChem.chunks(l=all_inchikeys, n=10)

    # both start empty
    self.drugbank_targets = []
    self.drugcentral_interactors = []
def __init__(self):
    """
    Set up the "orphanet" source: bindings and dataset record, warning
    when no disease test ids are configured.
    """
    Source.__init__(self, "orphanet")

    self.load_bindings()

    self.dataset = Dataset(
        "orphanet",
        "Orphanet",
        "http://www.orpha.net",
        None,
        "http://creativecommons.org/licenses/by-nd/3.0/",
        "http://omim.org/help/agreement",
    )

    # check to see if there's any ids configured in the config;
    # otherwise, warn
    conf = config.get_config()
    if "test_ids" not in conf or "disease" not in conf["test_ids"]:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with disease test ids.")

    return
def __init__(self):
    """
    Set up the 'kegg' source: dataset record, disease test ids and
    empty lookup tables.
    """
    Source.__init__(self, 'kegg')

    # update the dataset object with details about this resource
    self.dataset = Dataset(
        'kegg', 'KEGG', 'http://www.genome.jp/kegg/', None, None,
        'http://www.kegg.jp/kegg/legal.html')

    # check to see if there's any ids configured in the config;
    # otherwise, warn
    conf = config.get_config()
    if 'test_ids' not in conf or 'disease' not in conf['test_ids']:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with disease test ids.")
    else:
        # extends an existing self.test_ids['disease'] list
        # (presumably provided by Source -- confirm)
        self.test_ids['disease'] += conf['test_ids']['disease']

    self.label_hash = {}
    self.omim_disease_hash = {}  # to hold the mappings of omim:kegg ids
    self.kegg_disease_hash = {}  # to hold the mappings of kegg:omim ids

    return
def __init__(self):
    """
    Set up the 'omia' source: bindings, dataset record, empty lookup
    tables, the built-in test ids and the graph helper objects.
    """
    Source.__init__(self, 'omia')
    self.load_bindings()

    self.dataset = Dataset(
        'omia',
        'Online Mendelian Inheritance in Animals',
        'http://omia.angis.org.au',
        None,
        None,
        'http://sydney.edu.au/disclaimer.shtml')

    # one empty lookup per record kind
    self.id_hash = {
        'article': {},
        'phene': {},
        'breed': {},
        'taxon': {},
        'gene': {},
    }
    self.label_hash = {}
    self.gu = GraphUtils(curie_map.get())

    # the omia to omim phene mappings
    self.omia_omim_map = {}

    # the unique genes that have phenes (for fetching orthology)
    self.annotated_genes = set()

    disease_test_ids = [
        'OMIA:001702', 'OMIA:001867', 'OMIA:000478',
        'OMIA:000201', 'OMIA:000810', 'OMIA:001400']
    gene_test_ids = [
        492297, 434, 492296, 3430235, 200685834,
        394659996, 200685845, 28713538, 291822383]
    taxon_test_ids = [
        9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825]
    self.test_ids = {
        'disease': disease_test_ids,
        'gene': gene_test_ids,
        'taxon': taxon_test_ids,
        # to be filled in during parsing of breed table
        # for lookup by breed-associations
        'breed': [],
    }

    # a map of omia ids and any molecular info,
    # used to write a report for curation
    self.stored_omia_mol_gen = {}

    # short alias for the graph, shared with the Genotype helper
    self.g = self.graph
    self.geno = Genotype(self.g)
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'mpd' source: dataset record with citation and empty
    lookup tables.

    :param graph_type: forwarded unchanged to ``Source.__init__``
    :param are_bnodes_skolemized: forwarded unchanged to ``Source.__init__``
    """
    Source.__init__(self, graph_type, are_bnodes_skolemized, 'mpd')

    # describe this resource
    # @N: Note that there is no license as far as I can tell
    self.dataset = Dataset(
        'mpd',
        'MPD',
        'http://phenome.jax.org',
        None,
        None)
    # TODO add a citation for mpd dataset as a whole
    self.dataset.set_citation('PMID:15619963')

    self.stdevthreshold = 2

    # lookup tables, all empty at construction time
    self.assayhash = {}
    self.idlabel_hash = {}
    # the mean/zscore of each measure by strain+sex
    self.score_means_by_measure = {}
    # the mean value for each measure by strain+sex
    self.strain_scores_by_measure = {}
def __init__(self):
    """
    Set up the 'decipher' source: bindings, dataset record, disease
    test ids and the graph helper objects.
    """
    Source.__init__(self, 'decipher')
    self.load_bindings()

    self.dataset = Dataset(
        'decipher',
        'Development Disorder Genotype – Phenotype Database',
        'https://decipher.sanger.ac.uk/',
        None,
        'https://decipher.sanger.ac.uk/legal')

    # disease test ids, or an empty list (with a warning) when unconfigured
    settings = config.get_config()
    if 'test_ids' in settings and 'disease' in settings['test_ids']:
        self.test_ids = settings['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []

    self.gu = GraphUtils(curie_map.get())
    # short alias for the graph, shared with the Genotype helper
    self.g = self.graph
    self.geno = Genotype(self.g)
def __init__(self):
    """
    Set up the 'omim' source: bindings, dataset record, an API-key
    sanity check and the OMIM subset of the configured disease test ids.
    """
    Source.__init__(self, 'omim')

    self.load_bindings()

    self.dataset = Dataset(
        'omim', 'Online Mendelian Inheritance in Man',
        'http://www.omim.org', None, 'http://omim.org/help/agreement')

    # read the configuration once instead of once per lookup
    conf = config.get_config()

    # check if config exists; if it doesn't, log an error to let the
    # user know.  This must be `or`: with `and`, a config lacking the
    # 'keys' section went on to evaluate conf['keys'] and raised
    # KeyError, and a present 'keys' section without 'omim' was never
    # reported.
    if 'keys' not in conf or 'omim' not in conf['keys']:
        logger.error("not configured with API key.")

    # check to see if there's any ids configured in the config;
    # otherwise, warn
    if 'test_ids' not in conf or 'disease' not in conf['test_ids']:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("not configured with disease test ids.")
    else:
        # select only those test ids that are omim's, without the prefix;
        # extends an existing self.test_ids list
        # (presumably provided by Source -- confirm)
        self.test_ids += [
            obj.replace('OMIM:', '')
            for obj in conf['test_ids']['disease']
            if obj.startswith('OMIM:')]

    return
def __init__(self):
    """
    Set up the 'genereviews' source: bindings, dataset record with
    citation, empty book lookups and disease test ids.
    """
    Source.__init__(self, 'genereviews')
    self.load_bindings()

    self.dataset = Dataset(
        'genereviews',
        'Gene Reviews',
        'http://genereviews.org/',
        None,
        'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
    self.dataset.set_citation('GeneReviews:NBK1116')

    self.gu = GraphUtils(curie_map.get())

    # both start empty
    self.book_ids = set()
    self.all_books = {}

    settings = config.get_config()
    if 'test_ids' in settings and 'disease' in settings['test_ids']:
        # every configured disease test id is taken as is (no filtering)
        self.test_ids = settings['test_ids']['disease']
    else:
        logger.warning("not configured with disease test ids.")
        self.test_ids = []
def _get_curie_and_type_from_id(variant_id): """ Given a variant id, our best guess at its curie and type (snp, haplotype, etc) None will be used for both curie and type for IDs that we can't process :param variant_id: :return: """ curie = None variant_type = None # remove space before hyphens variant_id = re.sub(r' -', '-', variant_id) if re.search(r' x ', variant_id) \ or re.search(r',', variant_id): # TODO deal with rs1234 x rs234... (haplotypes?) logger.warning( "Cannot parse variant groups of this format: %s", variant_id) elif re.search(r';', variant_id): curie = ':haplotype_' + Source.hash_id(variant_id) variant_type = "haplotype" elif re.match(r'rs', variant_id): curie = 'dbSNP:' + variant_id.strip() curie = re.sub(r'-.*$', '', curie).strip() variant_type = "snp" # remove the alteration elif re.match(r'kgp', variant_id): # http://www.1000genomes.org/faq/what-are-kgp-identifiers curie = ':kgp-' + variant_id.strip() variant_type = "snp" elif re.match(r'chr', variant_id): # like: chr10:106180121-G # variant_id = re.sub(r'-?', '-N', variant_id) variant_id = re.sub(r' ', '', variant_id) curie = ':gwas-' + re.sub( r':', '-', variant_id.strip()) variant_type = "snp" elif variant_id.strip() == '': pass else: logger.warning( "There's a snp id i can't manage: %s", variant_id) return curie, variant_type
def make_reagent_targeted_gene_id(gene_id, reagent_id):
    """
    Build an identifier from a gene id and a reagent id.

    The two ids are joined with a hyphen and the result is handed to
    ``Source.make_id`` with '_' as its second argument.

    :param gene_id: gene identifier string
    :param reagent_id: reagent identifier string
    :return: whatever ``Source.make_id`` returns for the joined key
    """
    combined_key = '-'.join([gene_id, reagent_id])
    return Source.make_id(combined_key, '_')
def __init__(self, graph_type, are_bnodes_skolemized):
    """
    Set up the 'rgd' source: dataset record and global term table.

    :param graph_type: forwarded unchanged to the parent constructor
    :param are_bnodes_skolemized: forwarded unchanged to the parent
        constructor
    """
    super().__init__(graph_type, are_bnodes_skolemized, 'rgd')

    self.dataset = Dataset(
        'rgd',
        'RGD',
        'http://rgd.mcw.edu/',
        None,
        None)

    terms_path = '../../translationtable/global_terms.yaml'
    self.global_terms = Source.open_and_parse_yaml(terms_path)