def _map_eom_terms(self, raw, limit=None):
    """
    Load the EOM-to-HP id mappings from the local tsv file.

    For every data row whose HP id (underscores replaced by colons)
    contains ``HP:`` the HP term is added as a class and the triple
        <eom id> owl:equivalentClass <hp id>
    is added.  Rows without such an id are reported with a warning.

    :param raw: path of the tab-separated mapping file (first line is a header)
    :param limit: if not None, stop once the row count exceeds this value
    :return: None
    """
    model = Model(self.graph)
    with open(raw, 'r') as tsv:
        tsv.readline()  # the header row carries no data
        for row_num, record in enumerate(tsv, start=1):
            # a row must hold exactly five tab-separated fields
            (eom_id, eom_label, hp_id, _hp_label, _notes) = record.split('\t')
            # Sub out the underscores for colons.
            hp_id = re.sub('_', ':', hp_id)
            if re.match(".*HP:.*", hp_id) is None:
                logger.warning('No matching HP term for %s', eom_label)
            else:
                model.addClassToGraph(hp_id, None)
                model.addEquivalentClass(eom_id, hp_id)
            if limit is not None and row_num > limit:
                break
    return
def _map_eom_terms(self, raw, limit=None):
    """
    Read the local tsv of EOM morphology terms and their HP counterparts.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    A row contributes only when its HP id, after swapping underscores for
    colons, contains ``HP:``; otherwise a warning naming the morphology
    label is logged.

    :param raw: path of the tsv file; the first line is skipped as a header
    :param limit: optional row cap; reading stops when the count exceeds it
    :return: None
    """
    model = Model(self.graph)
    rows_seen = 0
    with open(raw, 'r') as handle:
        handle.readline()  # header
        for line in handle:
            rows_seen += 1
            fields = line.split('\t')
            term_id, term_label, hp_id, _hp_label, _notes = fields
            hp_id = re.sub('_', ':', hp_id)
            if re.match(".*HP:.*", hp_id):
                # the HP term becomes a class, then the equivalence is stated
                model.addClassToGraph(hp_id, None)
                model.addEquivalentClass(term_id, hp_id)
            else:
                LOG.warning('No matching HP term for %s', term_label)
            if limit is None:
                continue
            if rows_seen > limit:
                break
    return
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships for a gene's dbxrefs.

    The clique leader map is read from the 'clique_leader' resource
    (/resources/clique_leader.yaml) and consulted per taxon.

    :param xrefs: pipe-delimited dbxrefs, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the dbxrefs belong to
    :param taxon: key looked up in the clique leader map
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    model = Model(self.testgraph if self.test_mode else self.graph)
    skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']

    for dbxref in xrefs.strip().split('|'):
        pieces = dbxref.split(':')
        # everything before the last colon is the prefix
        prefix = ':'.join(pieces[:-1]).strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        dbxref_curie = ':'.join((prefix, pieces[-1]))

        if prefix == '':
            continue
        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue
        # skip some of these for now based on curie prefix
        if prefix in skipped_prefixes:
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)
        elif prefix == 'OMIM':
            if dbxref_curie in self.omim_replaced:
                # the last replacement typed as a gene wins
                for omim in self.omim_replaced[dbxref_curie]:
                    if omim in self.omim_type and \
                            self.omim_type[omim] == self.globaltt['gene']:
                        dbxref_curie = omim
            if dbxref_curie in self.omim_type and \
                    self.omim_type[dbxref_curie] != self.globaltt['gene']:
                continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, dbxref_curie)
            else:
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    leader_prefix = clique_map[taxon]
                    if leader_prefix == prefix:
                        model.makeLeader(dbxref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Turn a gene's dbxref string into equivalentClass / sameAs statements.

    Uses the external resource map located in /resources/clique_leader.yaml
    to determine whether an ID space is the clique leader for ``taxon``.

    :param xrefs: dbxrefs separated by '|', each shaped ``prefix:local_id``
        (a stuttered ``HGNC:HGNC:16851`` keeps ``HGNC:HGNC`` as its prefix
        unless the local translation table maps it)
    :param gene_id: curie of the gene being described
    :param taxon: key into the clique leader map
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    if self.test_mode:
        model = Model(self.testgraph)
    else:
        model = Model(self.graph)
    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for dbxref in xrefs.strip().split('|'):
        head, _sep, local_id = dbxref.rpartition(':')
        prefix = head.strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        curie = ':'.join((prefix, local_id))

        if prefix != '':
            if prefix == 'HPRD':  # proteins are not == genes.
                has_product = self.globaltt['has gene product']
                model.addTriple(gene_id, has_product, curie)
                continue
            if prefix in filter_out:
                continue
            if prefix == 'ENSEMBL':
                model.addXref(gene_id, curie)
            if prefix == 'OMIM':
                if curie in self.omim_replaced:
                    replacements = self.omim_replaced[curie]
                    for candidate in replacements:
                        if candidate in self.omim_type and \
                                self.omim_type[candidate] == self.globaltt['gene']:
                            curie = candidate
                if curie in self.omim_type and \
                        self.omim_type[curie] != self.globaltt['gene']:
                    # an OMIM entry that is not a gene gets no equivalence
                    continue
            try:
                is_class = self.class_or_indiv.get(gene_id) == 'C'
                if is_class:
                    model.addEquivalentClass(gene_id, curie)
                    if taxon in clique_map:
                        if clique_map[taxon] == prefix:
                            model.makeLeader(curie)
                        elif clique_map[taxon] == gene_id.split(':')[0]:
                            model.makeLeader(gene_id)
                else:
                    model.addSameIndividual(gene_id, curie)
            except AssertionError as err:
                LOG.warning("Error parsing %s: %s", gene_id, err)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships between a gene and its xrefs.

    Uses the external resource map located in /resources/clique_leader.yaml
    to determine if an ID space is a clique leader.

    :param xrefs: pipe-delimited xref string, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number as a string; it selects the taxon-specific
        xref prefixes and is converted with int() for the clique lookup
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    graph = self.testgraph if self.testMode else self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
    # These will be made xrefs
    taxon_spec_xref_filters = {'10090': ['ENSEMBL'], '9606': ['ENSEMBL']}
    taxon_spec_filters = taxon_spec_xref_filters.get(taxon, [])

    model = Model(graph)
    # deal with the xrefs
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue
        xref_prefix = xref_curie.split(':')[0]

        if re.match(r'HPRD', xref_curie):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue
        # skip some of these for now
        if xref_prefix in filter_out:
            continue
        # Compare against the prefixes chosen for this taxon; testing the
        # prefix against the taxon-keyed dict itself could never match.
        if xref_prefix in taxon_spec_filters:
            model.addXref(gene_id, xref_curie)
        if re.match(r'^OMIM', xref_curie):
            if DipperUtil.is_omim_disease(xref_curie):
                continue
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                if int(taxon) in clique_map:
                    leader_prefix = clique_map[int(taxon)]
                    if leader_prefix == xref_prefix:
                        model.makeLeader(xref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as err:
            # Logger.warn is deprecated; let logging format lazily
            logger.warning("Error parsing %s: %s", gene_id, err)
    return
def _process_trait_mappings(self, raw, src_key, limit=None):
    """
    Read the trait mapping csv and add the ATO trait classes with their links.

    For each row with the expected number of columns the ATO term is added
    as a class with its label.  A ``VT:`` id is added as a class and as an
    equivalent class of the ATO term; ``LPT:`` and ``CMO:`` ids are added
    as classes and as xrefs of the ATO term.

    :param raw: path of the comma-separated mapping file
    :param src_key: key into self.files; its 'columns' list is checked
        against the file header and used to locate fields
    :param limit: not used by this method
    :return: None
    """
    model = Model(self.testgraph if self.test_mode else self.graph)
    col = self.files[src_key]['columns']

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        self.check_fileheader(col, next(filereader, None))
        for line_counter, row in enumerate(filereader, start=1):
            # need to skip the last line
            if len(row) != len(col):
                LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                continue

            vto_id = row[col.index('VT')].strip()
            pto_id = row[col.index('LPT')].strip()
            cmo_id = row[col.index('CMO')].strip()
            ato_column = row[col.index('ATO')].strip()

            # "[ATO #123] label" -> id "AQTLTrait:123", label "label"
            unbracketed = re.sub(r'\[', '', ato_column)
            id_part = re.sub(r'\].*', '', unbracketed)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', id_part).strip()
            ato_label = re.sub(r'.*\]\s*', '', ato_column).strip()
            model.addClassToGraph(ato_id, ato_label)

            if re.match(r'VT:.*', vto_id):
                model.addClassToGraph(vto_id, None)
                model.addEquivalentClass(ato_id, vto_id)
            for pattern, xref_id in ((r'LPT:.*', pto_id), (r'CMO:.*', cmo_id)):
                if re.match(pattern, xref_id):
                    model.addClassToGraph(xref_id, None)
                    model.addXref(ato_id, xref_id)

    LOG.info("Done with trait mappings")
    return
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG human gene IDs to the corresponding NCBI Gene IDs.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        <kegg_gene_id> biolink:category biolink:Gene
        <ncbi_gene_id> biolink:category biolink:Gene

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value (None means no cap)
    :return: None
    """
    src_key = 'ncbi'
    LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
    model = Model(self.testgraph if self.test_mode else self.graph)
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            # every row is exactly (kegg id, ncbi id, link type)
            (kegg_gene_id, ncbi_gene_id, _link_type) = row

            if self.test_mode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            # Adjust the NCBI gene ID prefix.
            ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
            kegg_gene_id = 'KEGG-' + kegg_gene_id

            # Adding the KEGG gene ID to the graph here is redundant,
            # unless there happens to be additional gene IDs in this table
            # not present in the genes table.
            for gene_curie in (kegg_gene_id, ncbi_gene_id):
                model.addClassToGraph(
                    gene_curie, None, class_category=blv.terms['Gene'])
            model.addEquivalentClass(
                kegg_gene_id, ncbi_gene_id,
                subject_category=blv.terms['Gene'],
                object_category=blv.terms['Gene'])

            over_limit = limit is not None and reader.line_num > limit
            if not self.test_mode and over_limit:
                break

    LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships between a gene and its xrefs.

    Uses the external resource map located in /resources/clique_leader.yaml
    to determine if an ID space is a clique leader.

    :param xrefs: pipe-delimited xref string, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number as a string; selects extra prefixes to skip
        and is converted with int() for the clique leader lookup
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    graph = self.testgraph if self.testMode else self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
    # additional prefixes that are skipped for particular taxa
    taxon_spec_filters = {
        '10090': ['ENSEMBL']
    }
    if taxon in taxon_spec_filters:
        filter_out += taxon_spec_filters[taxon]

    model = Model(graph)
    # deal with the xrefs
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue
        xref_prefix = xref_curie.split(':')[0]

        if re.match(r'HPRD', xref_curie):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue
        # skip some of these for now
        if xref_prefix in filter_out:
            continue
        if re.match(r'^OMIM', xref_curie):
            if DipperUtil.is_omim_disease(xref_curie):
                continue
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                if int(taxon) in clique_map:
                    leader_prefix = clique_map[int(taxon)]
                    if leader_prefix == xref_prefix:
                        model.makeLeader(xref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as err:
            # Logger.warn is deprecated; pass arguments so logging formats
            # the (unchanged) message lazily
            logger.warning("Error parsing %s: %s", gene_id, err)
    return
def _process_trait_mappings(self, raw, limit=None):
    """
    Read the trait mapping csv and add the ATO trait classes with their links.

    Rows with fewer than eight columns are logged and skipped.  For the
    rest, the ATO term is added as a class with its label; a ``VT:`` id is
    added as a class and as an equivalent class of it, and ``LPT:`` /
    ``CMO:`` ids are added as classes and as xrefs of it.

    :param raw: path of the comma-separated mapping file (first line is a header)
    :param limit: not used by this method
    :return: None
    """
    model = Model(self.testgraph if self.test_mode else self.graph)

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip header line
        for line_counter, row in enumerate(filereader, start=1):
            # need to skip the last line
            if len(row) < 8:
                LOG.info("skipping line %d: %s", line_counter, '\t'.join(row))
                continue

            (vto_id, pto_id, cmo_id, ato_column,
             _species, _trait_class, _trait_type, _qtl_count) = row

            # "[ATO #123] label" -> id "AQTLTrait:123", label "label"
            unbracketed = re.sub(r'\[', '', ato_column)
            id_part = re.sub(r'\].*', '', unbracketed)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', id_part).strip()
            ato_label = re.sub(r'.*\]\s*', '', ato_column).strip()
            model.addClassToGraph(ato_id, ato_label)

            if re.match(r'VT:.*', vto_id):
                model.addClassToGraph(vto_id, None)
                model.addEquivalentClass(ato_id, vto_id)
            for pattern, xref_id in ((r'LPT:.*', pto_id), (r'CMO:.*', cmo_id)):
                if re.match(pattern, xref_id):
                    model.addClassToGraph(xref_id, None)
                    model.addXref(ato_id, xref_id)

    LOG.info("Done with trait mappings")
    return
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG human gene IDs to the corresponding NCBI Gene IDs.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: outside test mode, stop once the number of rows read
        exceeds this value (None means no cap)
    :return: None
    """
    logger.info("Processing KEGG gene IDs to NCBI gene IDs")
    model = Model(self.testgraph if self.testMode else self.graph)
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            # every row is exactly (kegg id, ncbi id, link type)
            (kegg_gene_id, ncbi_gene_id, _link_type) = row

            if self.testMode and kegg_gene_id not in self.test_ids['genes']:
                continue

            # Adjust the NCBI gene ID prefix.
            ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
            kegg_gene_id = 'KEGG-'+kegg_gene_id

            # Adding the KEGG gene ID to the graph here is redundant,
            # unless there happens to be additional gene IDs in this table
            # not present in the genes table.
            for gene_curie in (kegg_gene_id, ncbi_gene_id):
                model.addClassToGraph(gene_curie, None)
            model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

            capped = limit is not None and line_counter > limit
            if capped and not self.testMode:
                break

    logger.info("Done with KEGG gene IDs to NCBI gene IDs")
    return
def _process_trait_mappings(self, raw, limit=None):
    """
    Add ATO traits and their VT / LPT / CMO mappings from the csv file.

    Triples created per usable row:
        the ATO term as a class with its label;
        a ``VT:`` id as a class plus ATO equivalentClass VT;
        ``LPT:`` and ``CMO:`` ids as classes plus an xref from the ATO term.
    Rows with fewer than eight columns are logged and skipped.

    :param raw: path of the comma-separated file; the header line is skipped
    :param limit: not used by this method
    :return: None
    """
    if self.test_mode:
        model = Model(self.testgraph)
    else:
        model = Model(self.graph)
    rows_read = 0

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # header
        for row in filereader:
            rows_read += 1
            if len(row) < 8:
                # need to skip the last line
                LOG.info("skipping line %d: %s", rows_read, '\t'.join(row))
                continue

            # exactly eight columns are expected from here on
            (vto_id, pto_id, cmo_id, ato_column,
             _species, _trait_class, _trait_type, _qtl_count) = row

            # peel "[ATO #nnn] label" apart into an id and a label
            ato_id = re.sub(r'\[', '', ato_column)
            ato_id = re.sub(r'\].*', '', ato_id)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', ato_id)
            ato_id = ato_id.strip()
            ato_label = re.sub(r'.*\]\s*', '', ato_column)
            model.addClassToGraph(ato_id, ato_label.strip())

            if re.match(r'VT:.*', vto_id) is not None:
                model.addClassToGraph(vto_id, None)
                model.addEquivalentClass(ato_id, vto_id)
            if re.match(r'LPT:.*', pto_id) is not None:
                model.addClassToGraph(pto_id, None)
                model.addXref(ato_id, pto_id)
            if re.match(r'CMO:.*', cmo_id) is not None:
                model.addClassToGraph(cmo_id, None)
                model.addXref(ato_id, cmo_id)

    LOG.info("Done with trait mappings")
    return
def _process_gene_xref(self, limit):
    """
    Make equivalentClass axioms between flybase gene ids and NCBIGene and
    HGNC, noting these are not expected to be orthologs, just bad renaming.

    Note that there are a lot of genes in flybase from other organisms;
    we make the eq axioms so that they clique merge in our large graph
    (for example, human genes should merge with HGNC).

    Rows whose xref source is neither 'EntrezGene' nor 'HGNC' have no
    known curie prefix; they are logged and skipped.

    Adds triples to self.graph

    :param limit: number of rows to process (None means all)
    :return: None
    """
    model = Model(self.graph)
    src_key = 'gene_xref'
    raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
    LOG.info("processing gene xrefs")
    col = self.queries[src_key]['columns']
    # xref source name as found in the file -> curie prefix
    source_to_prefix = {'EntrezGene': 'NCBIGene', 'HGNC': 'HGNC'}

    with open(raw, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        row = next(reader)  # headers
        self.check_fileheader(col, row)

        for row in reader:
            gene_id = row[col.index('gene_id')]
            xref_id = row[col.index('xref_id')]
            xref_source = row[col.index('xref_source')]

            xref_prefix = source_to_prefix.get(xref_source)
            if xref_prefix is None:
                # previously this fell through to None + ':' (TypeError)
                LOG.warning(
                    "Skipping xref %s of gene %s: unknown source %s",
                    xref_id, gene_id, xref_source)
            else:
                gene_curie = 'FlyBase:' + gene_id
                xref_curie = xref_prefix + ':' + xref_id
                model.addEquivalentClass(
                    gene_curie, xref_curie,
                    object_category=blv.terms['Gene'])

            if limit is not None and reader.line_num > limit:
                break
def _get_mapped_gene_ids(self, entry, graph):
    """
    Collect the NCBI gene ids listed in an OMIM entry's external links.

    The ids are stored in self.omim_ncbigene_idmap under the entry's OMIM
    curie.  When the entry's OMIM type is 'gene' or 'has_affected_feature'
    (as resolved through self.globaltt), each id is also added as an
    equivalent class of the OMIM curie.

    :param entry: OMIM entry mapping; must contain 'mimNumber'
    :param graph: graph handed to Model
    :return: list of gene id strings, empty when the entry has none
    """
    model = Model(graph)
    omim_num = str(entry['mimNumber'])
    omim_curie = 'OMIM:' + omim_num

    if 'externalLinks' not in entry:
        return []
    links = entry['externalLinks']
    # looked up for every entry with external links, even without geneIDs
    omimtype = self.omim_type[omim_num]
    if 'geneIDs' not in links:
        return []

    gene_ids = links['geneIDs'].split(',')
    self.omim_ncbigene_idmap[omim_curie] = gene_ids
    gene_like_types = [
        self.globaltt['gene'], self.globaltt['has_affected_feature']]
    if omimtype in gene_like_types:
        for ncbi in gene_ids:
            model.addEquivalentClass(omim_curie, 'NCBIGene:' + str(ncbi))
    return gene_ids
def _get_mapped_gene_ids(self, entry, g):
    """
    Collect the NCBI gene ids listed in an OMIM entry's external links.

    The ids are recorded in self.omim_ncbigene_idmap under the entry's
    OMIM curie; when the entry's type equals Genotype.genoparts['gene'],
    each id is also added as an equivalent class of the OMIM curie.

    :param entry: OMIM entry mapping; must contain 'mimNumber'
    :param g: graph handed to Model
    :return: list of gene id strings, empty when the entry has none
    """
    model = Model(g)
    omimid = 'OMIM:'+str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return []
    links = entry['externalLinks']
    # resolved for every entry with external links, even without geneIDs
    omimtype = self._get_omimtype(entry)
    if 'geneIDs' not in links:
        return []

    gene_ids = links['geneIDs'].split(',')
    self.omim_ncbigene_idmap[omimid] = gene_ids
    if omimtype == Genotype.genoparts['gene']:
        for ncbi_num in gene_ids:
            model.addEquivalentClass(omimid, 'NCBIGene:'+str(ncbi_num))
    return gene_ids
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map the KEGG human gene IDs to the corresponding NCBI Gene IDs.

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: outside test mode, stop once the reader's line number
        exceeds this value (None means no cap)
    :return: None
    """
    LOG.info("Processing KEGG gene IDs to NCBI gene IDs")
    if self.test_mode:
        model = Model(self.testgraph)
    else:
        model = Model(self.graph)
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            kegg_gene_id, ncbi_gene_id, _link_type = row  # exactly 3 fields

            in_test_set = kegg_gene_id in self.test_ids['genes'] \
                if self.test_mode else True
            if not in_test_set:
                continue

            # Adjust the NCBI gene ID prefix.
            ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
            kegg_gene_id = 'KEGG-' + kegg_gene_id

            # Adding the KEGG gene ID to the graph here is redundant,
            # unless there happens to be additional gene IDs in this table
            # not present in the genes table.
            model.addClassToGraph(kegg_gene_id, None)
            model.addClassToGraph(ncbi_gene_id, None)
            model.addEquivalentClass(kegg_gene_id, ncbi_gene_id)

            if self.test_mode or limit is None:
                continue
            if reader.line_num > limit:
                break

    LOG.info("Done with KEGG gene IDs to NCBI gene IDs")
def _map_eom_terms(self, raw, limit=None):
    """
    This table contains the HP ID mappings from the local tsv file.

    Triples:
        <eom id> owl:equivalentClass <hp id>

    The header row is checked against the column list configured under
    self.files['map']['columns']; that list is also used to locate the
    'morphology_term_id' and 'HP ID' fields of every data row.

    :param raw: path of the tab-separated mapping file
    :param limit: if not None, stop once the row count exceeds this value
    :return: None
    """
    model = Model(self.graph)
    line_counter = 0
    col = self.files['map']['columns']

    with open(raw, 'r') as reader:
        # strip() already removes the trailing newline; the former extra
        # strip('/n') was a typo that trimmed '/' and 'n' characters instead
        header = reader.readline().strip()
        self.check_fileheader(col, header.split('\t'))

        for line in reader:
            line_counter += 1
            row = line.strip('\n').split('\t')
            morphology_term_id = row[col.index('morphology_term_id')].strip()
            hp_id = row[col.index('HP ID')].strip()

            # Sub out the underscores for colons.
            hp_id = re.sub('_', ':', hp_id)
            if re.match(".*HP:.*", hp_id):
                # add the HP term as a class
                model.addClassToGraph(hp_id, None)
                # Add the HP ID as an equivalent class
                model.addEquivalentClass(morphology_term_id, hp_id)
            else:
                LOG.warning('No matching HP term for %s', morphology_term_id)

            if limit is not None and line_counter > limit:
                break
def _process_pathway_pathway(self, limit):
    """
    There are "map" and "ko" identifiers for pathways.
    This makes equivalence mapping between them, where they exist.

    Each row holds two pathway ids; both get the ``KEGG-`` prefix and,
    when they differ, an equivalentClass axiom is added between them.

    :param limit: outside test mode, stop once the number of rows read
        exceeds this value (None means no cap)
    :return: None
    """
    logger.info("Processing KEGG pathways to other ids")
    model = Model(self.testgraph if self.testMode else self.graph)
    raw = '/'.join((self.rawdir, self.files['pathway_pathway']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, start=1):
            (first_id, second_id) = row  # exactly two fields per row

            if self.testMode and first_id not in self.test_ids['pathway']:
                continue

            # will look like KEGG-path:map04130 or KEGG-path:ko04130
            first_curie = 'KEGG-'+first_id
            second_curie = 'KEGG-'+second_id
            if first_curie != second_curie:
                model.addEquivalentClass(first_curie, second_curie)

            if self.testMode or limit is None:
                continue
            if line_counter > limit:
                break
    return
def _process_genes(self, limit=None):
    """
    Read the HGNC complete-set tsv and add each gene to the graph.

    For every data row this adds the gene class (id, symbol, type, name),
    marks withdrawn loci as deprecated or makes the HGNC id a clique
    leader, adds equivalentClass axioms to NCBIGene / ENSEMBL / OMIM
    (OMIM only when a single id is given and it is not a disease), adds
    the human taxon, links PubMed ids with 'is_about', and attaches the
    gene to its chromosome band (or chromosome) parsed from the location.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-flattened source; confirm it against the repository.

    :param limit: outside test mode, stop once the number of rows read
        (header included) exceeds this value; None means no cap
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    model = Model(g)
    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0
    logger.info("Processing HGNC genes")

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # To list the (numbered) column names of the current file:
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .

        for row in filereader:
            # the unpack requires exactly 49 columns, header row included
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            # in test mode only genes from the test set are kept
            # (rows without an entrez id are always kept)
            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self._get_gene_type(locus_type)
            model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)
            if entrez_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # a '|' means several OMIM ids are listed; those are not mapped
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for p in re.split(r'\|', pubmed_id.strip()):
                    if str(p) != '':
                        g.addTriple(
                            'PMID:' + str(p.strip()),
                            model.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            band = chrom = None
            # NOTE(review): inside the character class '$' is a literal
            # dollar sign, not an end-of-string anchor -- confirm intent
            chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
            chr_match = re.match(chr_pattern, location)
            if chr_match is not None and len(chr_match.groups()) > 0:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
                band_match = re.search(band_pattern, location)
                f = Feature(g, hgnc_id, None, None)
                if band_match is not None and len(band_match.groups()) > 0:
                    band = band_match.group(1)
                    band = chrom + band
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    # TEC Monoch? Monarchdom??
                    band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                    model.addClassToGraph(band_id, None)
                    f.addSubsequenceOfFeature(band_id)
                else:
                    # no band found: fall back to the whole chromosome
                    model.addClassToGraph(chrom_id, None)
                    f.addSubsequenceOfFeature(chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

        # end loop through file

    return
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships for a gene's dbxrefs.

    Uses the external resource map located in /resources/clique_leader.yaml
    to determine if an NCBITaxon ID space is a clique leader.

    Entries that are malformed (no ``prefix:local_id`` shape, empty parts)
    or whose prefix is filtered out are skipped.

    :param dbxrefs: pipe-delimited dbxref string, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the dbxrefs belong to
    :param taxon: key into the clique leader map and self.informal_species
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '', None]

    # deal with the dbxrefs
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for dbxref in dbxrefs.strip().split('|'):
        # de stutter dbxref: keep only the last two colon-separated parts
        parts = dbxref.strip().split(':')[-2:]
        if len(parts) < 2:
            # no colon at all (e.g. a '-' placeholder): nothing to map;
            # unpacking this used to raise ValueError
            continue
        prefix = parts[0].strip()
        local_id = parts[1].strip()

        # skip some of these based on curie prefix or malformatting
        if prefix in filter_out or local_id == '':
            continue

        if prefix in self.localtt:
            prefix = self.localtt[prefix]

        if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
            prefix = self.informal_species[taxon] + 'QTL'
        elif prefix == 'AnimalQTLdb':
            LOG.warning(
                'Unknown AnimalQTLdb species %s for %s:%s',
                taxon, prefix, local_id)
        # else: taxon is not in informal species (not unexpected)

        dbxref_curie = ':'.join((prefix, local_id))

        if prefix == 'HPRD':  # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)
            # For Ensembl xrefs, don't proceed to equivalent class code
            # these are more loose xrefs than equivalent identifiers
            continue

        if prefix == 'OMIM':
            omim_num = dbxref_curie[5:]
            if omim_num in self.omim_replaced:
                for omim in self.omim_replaced[omim_num]:
                    if omim in self.omim_type and \
                            self.omim_type[omim] == self.globaltt['gene']:
                        dbxref_curie = 'OMIM:' + omim
                        omim_num = omim  # last "gene" wins (is never > 2)
            if omim_num in self.omim_type and \
                    self.omim_type[omim_num] == self.globaltt['gene']:
                model.addXref(gene_id, dbxref_curie)
            else:
                # OMIM disease/phenotype is not considered a gene at all:
                # no equivalence between an NCBI gene and an OMIM non-gene,
                # and NCBI is never a human clique leader in any case
                continue

        # designate clique leaders and equivalentClass/sameAs triples
        # (perhaps premature as this ingest can't know what else exists)
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    if clique_map[taxon] == prefix:
                        model.makeLeader(dbxref_curie)
                    elif clique_map[taxon] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, dbxref_curie)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _get_identifiers(self, limit):
    """
    This will process the id mapping file provided by Biogrid.
    The file has a very large header, which we scan past,
    then pull the identifiers, and make equivalence axioms.

    Only rows for the species in ``speciesfilters`` are used.  For those,
    identifiers whose translated type is in ``geneidtypefilters`` become
    equivalent classes of the BIOGRID id, and OFFICIAL_SYMBOL rows supply
    the label of the BIOGRID class.

    :param limit: outside test mode, stop once the number of rows for the
        selected species exceeds this value (None means no cap)
    :return: None
    """
    LOG.info("getting identifier mapping")
    line_counter = 0
    zip_path = '/'.join((self.rawdir, self.files['identifiers']['file']))

    # TODO align this species filter with the one above
    # (candidates: Drosophila melanogaster, Danio rerio,
    #  Caenorhabditis elegans, Xenopus laevis)
    # The first entry previously read 'H**o sapiens', which can never
    # equal an organism name, so human rows were silently dropped.
    speciesfilters = ('Homo sapiens', 'Mus musculus')

    # TODO make these filters available as commandline options
    # (candidates: OMIM, FlyBase, WormBase, XenBase, miRBase)
    geneidtypefilters = ('NCBIGene', 'MGI', 'ENSEMBL', 'ZFIN', 'HGNC')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # the graph choice does not change per row, so build the model once
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    foundheader = False

    # context managers close the archive even if a row fails to parse
    with ZipFile(zip_path, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                text = raw_line.decode()

                # skip header lines, up to and including the column names
                if not foundheader:
                    if text.startswith('BIOGRID_ID'):
                        foundheader = True
                    continue

                # BIOGRID_ID  IDENTIFIER_VALUE  IDENTIFIER_TYPE  ORGANISM_OFFICIAL_NAME
                # 1  814566  ENTREZ_GENE  Arabidopsis thaliana
                (biogrid_num, id_num, id_type, organism_label) = \
                    text.strip().split('\t')

                # skip any genes that don't match our test set
                if self.test_mode and \
                        int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self.localtt[id_type]

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    elif id_type == 'OFFICIAL_SYMBOL':
                        # this symbol will only get attached to the
                        # biogrid class
                        model.addClassToGraph(biogrid_id, id_num)
                    # FIXME - not sure SYNONYM rows are synonyms, altids?

                if not self.test_mode and \
                        limit is not None and line_counter > limit:
                    break
    return
def _process_diseasegene(self, limit):
    """
    Parse the disease-gene XML file and add, for every 'Disorder' element,
    the disorder class, the genes associated with it, the genes' external
    identifiers (as equivalent classes) and synonyms, and finally the
    gene-to-disease association via self.add_gene_to_disease.

    :param limit: compared against a counter that is never advanced here
        (see the note at the end of the loop)
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    line_counter = 0
    model = Model(graph)
    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    for _event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # get the element name and id, ignore element name
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'ORPHA:' + str(disorder_num)

            if self.test_mode and \
                    disorder_id not in self.all_test_ids['disease']:
                continue
            disorder_label = elem.find('Name').text

            # assuming that these are in the ontology (...any particular one?)
            model.addClassToGraph(disorder_id, disorder_label)
            assoc_list = elem.find('DisorderGeneAssociationList')
            expected_genes = assoc_list.get('count')
            LOG.info(
                'Expecting %s genes assdciated with disorder %s.',
                expected_genes, disorder_id)
            processed_genes = 0

            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                processed_genes += 1
                gene = assoc.find('Gene')

                # get gene's curie HGNC or Ensembl ...
                lclid = gene.find('OrphaNumber').text
                gene_curie = 'ORPHA:' + lclid
                gene_set = {'ORPHA': lclid}
                for gene_ref in gene.findall(
                        './ExternalReferenceList/ExternalReference'):
                    gene_set[gene_ref.find('Source').text] = \
                        gene_ref.find('Reference').text

                # set priority (clique leader if available)
                # but default to ORPHA
                for source in ('HGNC', 'Ensembl', 'SwissProt'):
                    if source in gene_set:
                        # gene_set is keyed by the source name as found in
                        # the file, so translate only the prefix used in
                        # the curie; indexing gene_set with the translated
                        # prefix raised KeyError whenever they differed
                        pfx = source
                        if source in self.localtt:
                            pfx = self.localtt[source]
                        gene_curie = pfx + ':' + gene_set.pop(source)
                        model.addClassToGraph(gene_curie, None)
                        break

                # TEC have reservations w.r.t aggerator links being
                # gene classes
                for prefix in gene_set:
                    lclid = gene_set[prefix]
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    dbxref = prefix + ':' + lclid
                    if gene_curie != dbxref:
                        model.addClassToGraph(dbxref, None)
                        model.addEquivalentClass(gene_curie, dbxref)

                # TEC. would prefer this not happen here. let HGNC handle it
                # except there are some w/o explicit external links ...
                gene_symbol = gene.find('Symbol').text

                syn_list = gene.find('./SynonymList')
                if int(syn_list.get('count')) > 0:
                    for syn in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_curie, syn.text)

                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text

                # use dg association status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                eco_id = self.resolve(
                    assoc.find('DisorderGeneAssociationStatus/Name').text)

                self.add_gene_to_disease(
                    dg_label, gene_curie, gene_symbol, disorder_id, eco_id)

            elem.clear()  # empty the element

            if int(expected_genes) != processed_genes:
                # the first placeholder was a bare '% ', which parses as a
                # '% e' float conversion and fails on the string id
                LOG.warning(
                    '%s expected %s associated genes but we processed %i',
                    disorder_id, expected_genes, processed_genes)

        # NOTE(review): line_counter is never incremented, so this check
        # can only fire for a negative limit, and it is tied to test mode
        # unlike sibling ingests -- confirm the intended limit semantics.
        if self.test_mode and limit is not None and line_counter > limit:
            return
    return
def _process_genes(self, taxid, limit=None):
    """
    Read the Ensembl gene table of one taxon and add its genes to the graph.

    Each row adds the gene as a class, links it to its Entrez gene (as an
    xref for human, as an equivalent class otherwise) and, for human, to its
    HGNC id, attaches the taxon, and adds the Ensembl peptide and
    UniProt/SwissProt entries as gene products.

    :param taxid: taxon number as a string; selects the raw file through
        ``self.files[taxid]`` and, when '9606', enables the trailing HGNC
        column
    :param limit: outside of test mode, stop once more than this many rows
        have been processed
    :raises ValueError: when a row has fewer columns than are consumed
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    is_human = taxid == '9606'
    # seven shared columns, plus the HGNC id as the last column for human
    min_columns = 8 if is_human else 7
    line_counter = 0
    LOG.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                LOG.warning("Too few columns in: %s", row)
                raise ValueError("Data error for file %s" % raw)
            (ensembl_gene_id, external_gene_name, description, gene_biotype,
             entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if is_human else None

            if self.test_mode and entrezgene != '' and \
                    int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            if description == '':
                description = None
            gene_biotype = gene_biotype.strip()
            gene_type_id = self.resolve(gene_biotype, False)
            if gene_type_id == gene_biotype:  # did not resolve
                gene_type_id = self.globaltt['polypeptide']

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                if is_human:
                    # Use HGNC for eq in human data
                    model.addXref(gene_id, entrez_curie)
                else:
                    model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            # Only emit product nodes whose identifier is present; an empty
            # column would otherwise produce a curie with no local id.
            has_peptide = ensembl_peptide_id != ''
            if has_peptide:
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                geno.addGeneProduct(gene_id, peptide_curie)
            if uniprotswissprot != '':
                model.addIndividualToGraph(uniprot_curie, None, gene_type_id)
                geno.addGeneProduct(gene_id, uniprot_curie)
                if has_peptide:
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.test_mode and \
                    limit is not None and line_counter > limit:
                break
def _process_diseasegene(self, limit):
    """
    Walk the Orphanet disease-gene XML and record disorder/gene associations.

    Every ``Disorder`` element becomes a class.  Each of its gene
    associations yields a gene class with synonyms, a blank-node variant
    locus affecting that gene, a G2P association from the variant locus to
    the disorder carrying an ECO evidence code, and equivalent classes for
    the Ensembl, HGNC and OMIM external references of the gene.

    :param limit: compared against ``line_counter`` in test mode only
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    # line_counter is never incremented in this method, so the limit test
    # at the bottom of the loop always sees a counter of zero.
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)
    xml_path = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # external reference sources that are turned into equivalent classes
    curie_prefix_by_source = {
        'Ensembl': 'ENSEMBL:',
        'HGNC': 'HGNC:',
        'OMIM': 'OMIM:',
    }

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for event, elem in ET.iterparse(xml_path):
        if elem.tag == 'Disorder':
            # elem.get('id') is some internal identifier and is not used
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'Orphanet:' + str(disorder_num)

            if self.testMode and \
                    disorder_id not in \
                    config.get_config()['test_ids']['disease']:
                continue

            disorder_label = elem.find('Name').text

            # internal gene id -> gene type id, for lookup further down
            gene_iid_to_type = {
                gene.get('id'): gene.find('GeneType').get('id')
                for gene in elem.find('GeneList').findall('Gene')}

            # assuming that these are in the ontology
            model.addClassToGraph(disorder_id, disorder_label)

            associations = elem.find('DisorderGeneAssociationList')
            for dga in associations.findall('DisorderGeneAssociation'):
                gene_iid = dga.find('.//Gene').get('id')
                gene_name = dga.find('.//Gene/Name').text
                gene_symbol = dga.find('.//Gene/Symbol').text
                gene_num = dga.find('./Gene/OrphaNumber').text
                gene_id = 'Orphanet:' + str(gene_num)
                gene_type_id = self._map_gene_type_id(
                    gene_iid_to_type[gene_iid])
                model.addClassToGraph(
                    gene_id, gene_symbol, gene_type_id, gene_name)

                synonyms = dga.find('./Gene/SynonymList')
                if int(synonyms.get('count')) > 0:
                    for synonym in synonyms.findall('./Synonym'):
                        model.addSynonym(gene_id, synonym.text)

                rel_id = self._map_rel_id(
                    dga.find('DisorderGeneAssociationType').get('id'))
                dg_label = dga.find(
                    './DisorderGeneAssociationType/Name').text
                if rel_id is None:
                    logger.warning(
                        "Cannot map association type (%s) to RO "
                        "for association (%s | %s). Skipping.",
                        dg_label, disorder_label, gene_symbol)
                    continue

                alt_locus_id = '_:' + gene_num + '-' + disorder_num + 'VL'
                alt_label = ' '.join((
                    'some variant of', gene_symbol.strip(),
                    'that is a', dg_label.lower(), disorder_label))

                model.addIndividualToGraph(
                    alt_locus_id, alt_label, geno.genoparts['variant_locus'])
                geno.addAffectedLocus(alt_locus_id, gene_id)
                model.addBlankNodeAnnotation(alt_locus_id)

                # consider typing the gain/loss-of-function variants like:
                # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                # use "assessed" status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                # TODO are these internal ids stable between releases?
                status_code = dga.find(
                    'DisorderGeneAssociationStatus').get('id')
                # '17991' (Assessed): imported manually asserted information
                # used in automatic assertion; anything else: imported
                # automatically asserted information used in automatic
                # assertion
                eco_id = 'ECO:0000322' \
                    if status_code == '17991' else 'ECO:0000323'
                # Non-traceable author statement ECO_0000034
                # imported information in automatic assertion ECO_0000313

                g2p = G2PAssoc(
                    g, self.name, alt_locus_id, disorder_id, rel_id)
                g2p.add_evidence(eco_id)
                g2p.add_association_to_graph()

                eqid = None
                references = dga.find('./Gene/ExternalReferenceList')
                for ref in references.findall('ExternalReference'):
                    source = ref.find('Source').text
                    if source in curie_prefix_by_source:
                        eqid = curie_prefix_by_source[source] + \
                            ref.find('Reference').text
                    # other sources are skipped for now; eqid keeps the
                    # value of the last recognised reference
                    if eqid is not None:
                        model.addClassToGraph(eqid, None)
                        model.addEquivalentClass(gene_id, eqid)

            elem.clear()  # empty the element

        if self.testMode and limit is not None and line_counter > limit:
            return

    return
def _process_trait_mappings(self, raw, limit=None):
    """
    Load the trait mapping CSV and relate each ATO trait to other ontologies.

    The header row is skipped, as is any row with fewer than eight columns.
    For each remaining row the bracketed ATO number becomes an
    ``AQTLTrait:`` class labelled with the text after the bracket.  A
    ``VT:`` id is added as an equivalent class; ``LPT:`` and ``CMO:`` ids
    are added as xrefs.

    :param raw: path of the CSV file to read
    :param limit: not used by this method
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip header line
        for line_counter, row in enumerate(filereader, 1):
            # need to skip the last line
            if len(row) < 8:
                logger.info(
                    "skipping line %d: %s", line_counter, '\t'.join(row))
                continue

            (vto_id, pto_id, cmo_id, ato_column,
             _species, _trait_class, _trait_type, _qtl_count) = row

            # "[ATO #nnn] label" -> id from inside the bracket,
            # label from what follows it
            without_open = re.sub(r'\[', '', ato_column)
            bracket_text = re.sub(r'\].*', '', without_open)
            ato_id = re.sub(r'ATO #', 'AQTLTrait:', bracket_text).strip()
            ato_label = re.sub(r'.*\]\s*', '', ato_column)

            model.addClassToGraph(ato_id, ato_label.strip())

            # (id pattern, candidate id, how to link it to the ATO trait)
            linked_terms = (
                (r'VT:.*', vto_id, model.addEquivalentClass),
                (r'LPT:.*', pto_id, model.addXref),
                (r'CMO:.*', cmo_id, model.addXref),
            )
            for pattern, term_id, link in linked_terms:
                if re.match(pattern, term_id):
                    model.addClassToGraph(term_id, None)
                    link(ato_id, term_id)

    logger.info("Done with trait mappings")
    return
def _process_genes(self, taxid, limit=None):
    """
    Read the Ensembl gene table of one taxon and add its genes to the graph.

    Each row adds the gene as a class, makes it equivalent to its Entrez
    gene and (for human) its HGNC id, attaches the taxon, and adds the
    Ensembl peptide and UniProt/SwissProt entries as gene products.

    :param taxid: taxon number as a string; selects the raw file through
        ``self.files[taxid]`` and, when '9606', enables the trailing HGNC
        column
    :param limit: outside of test mode, stop once more than this many rows
        have been processed
    :raises ValueError: when a row has fewer columns than are consumed
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    model = Model(g)
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    is_human = taxid == '9606'
    # seven shared columns, plus the HGNC id as the last column for human
    min_columns = 8 if is_human else 7
    line_counter = 0
    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                raise ValueError("Data error for file %s" % raw)
            (ensembl_gene_id, external_gene_name, description, gene_biotype,
             entrezgene, peptide_id, uniprot_swissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
            uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)

            if description == '':
                description = None
            # the gene biotype is not mapped to a type for now
            gene_type_id = None
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                model.addEquivalentClass(gene_id, entrez_curie)
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            # Only emit product nodes whose identifier is present; an empty
            # column would otherwise produce a curie with no local id.
            has_peptide = peptide_id != ''
            if has_peptide:
                model.addIndividualToGraph(
                    peptide_curie, None, self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, peptide_curie)
            if uniprot_swissprot != '':
                model.addIndividualToGraph(
                    uniprot_curie, None, self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, uniprot_curie)
                if has_peptide:
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
def _process_omim2gene(self, limit=None):
    """
    Relate KEGG gene ids to OMIM ids, branching on the link type of each row.

    * ``equivalent``: the OMIM id is added as a class and the KEGG id as a
      gene; they are made equivalent classes unless
      ``DipperUtil.is_omim_disease`` reports the OMIM id as a disease.
    * ``reverse``: a variant locus of the KEGG gene is created and
      associated with the OMIM id through 'is marker for'.
    * ``original``: only logged.
    * anything else: logged as a warning.

    :param limit: outside of test mode, stop once more than this many rows
        have been read
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(filereader, 1):
            (kegg_gene_id, omim_id, link_type) = row

            # the test id list holds the raw (unprefixed) KEGG gene ids
            if self.test_mode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)
                if not DipperUtil.is_omim_disease(omim_id):
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                variant_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # Add the disease to gene relationship.
                marker_rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, variant_id, omim_id, marker_rel
                ).add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            past_limit = limit is not None and line_counter > limit
            if not self.test_mode and past_limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
    return
def _process_diseasegene(self, limit):
    """
    Stream the Orphanet disease-gene XML file into the graph.

    Each ``Disorder`` element is added as a class.  For every gene
    association of the disorder a gene curie is chosen (the HGNC, Ensembl
    or SwissProt reference when one is present, the ORPHA number
    otherwise), the remaining references are added as equivalent classes,
    the gene synonyms are attached, and a G2P association between the gene
    and the disorder is added with the resolved relation and evidence code.

    :param limit: compared against ``line_counter``, and only when
        ``self.test_mode`` is set
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    # NOTE(review): line_counter is never incremented in this method, so the
    # limit check at the bottom of the loop cannot fire -- confirm the intent.
    line_counter = 0
    model = Model(graph)
    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    for event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # elem.get('id') is some internal identifier and is ignored
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'ORPHA:' + str(disorder_num)

            if self.test_mode and \
                    disorder_id not in self.all_test_ids['disease']:
                continue
            disorder_label = elem.find('Name').text

            # assuming that these are in the ontology (...any particular one?)
            model.addClassToGraph(disorder_id, disorder_label)
            assoc_list = elem.find('DisorderGeneAssociationList')
            expected_genes = assoc_list.get('count')
            LOG.info(
                'Expecting %s genes associated with disorder %s.',
                expected_genes, disorder_id)
            processed_genes = 0
            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                processed_genes += 1
                gene = assoc.find('Gene')

                # collect every identifier of the gene, keyed by its source
                lclid = gene.find('OrphaNumber').text
                gene_curie = 'ORPHA:' + lclid
                gene_set = {'ORPHA': lclid}
                for gene_ref in gene.findall(
                        './ExternalReferenceList/ExternalReference'):
                    gene_set[gene_ref.find('Source').text] = \
                        gene_ref.find('Reference').text

                # set priority (clique leader if available),
                # but default to ORPHA
                for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                    if pfx in gene_set:
                        # Translate the prefix for the curie only: gene_set
                        # is keyed by the untranslated source name.
                        curie_prefix = pfx
                        if pfx in self.localtt:
                            curie_prefix = self.localtt[pfx]
                        gene_curie = curie_prefix + ':' + gene_set.pop(pfx)
                        model.addClassToGraph(gene_curie, None)
                        break

                # TEC have reservations w.r.t aggregator links
                # being gene classes
                for prefix in gene_set:
                    lclid = gene_set[prefix]
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    dbxref = prefix + ':' + lclid
                    if gene_curie != dbxref:
                        model.addClassToGraph(dbxref, None)
                        model.addEquivalentClass(gene_curie, dbxref)

                # TEC. would prefer this not happen here. let HGNC handle it
                # except there are some w/o explicit external links ...
                syn_list = gene.find('./SynonymList')
                if int(syn_list.get('count')) > 0:
                    for syn in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_curie, syn.text)

                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text

                # use dg association status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                eco_id = self.resolve(
                    assoc.find('DisorderGeneAssociationStatus/Name').text)
                rel_id = self.resolve(dg_label)

                # write to the graph selected above so that test mode does
                # not leak associations into the main graph
                g2p_assoc = G2PAssoc(
                    graph, self.name, gene_curie, disorder_id, rel_id)
                g2p_assoc.add_evidence(eco_id)
                g2p_assoc.add_association_to_graph()

            elem.clear()  # empty the element
            if int(expected_genes) != processed_genes:
                LOG.warning(
                    '%s expected %s associated genes but we processed %i',
                    disorder_id, expected_genes, processed_genes)

        if self.test_mode and limit is not None and line_counter > limit:
            return
def _process_omim2disease(self, limit=None):
    """
    Make 1:1 KEGG disease / OMIM disease pairs equivalent classes.

    The whole file is first read into ``self.omim_disease_hash`` (OMIM id
    -> KEGG ids) and ``self.kegg_disease_hash`` (KEGG id -> OMIM ids).
    Only an OMIM id with exactly one KEGG id, whose KEGG id in turn has
    exactly one OMIM id, is written out.

    Triples created:
    <kegg_disease_id> is a class
    <omim_disease_id> is a class
    <kegg_disease_id> equivalentClass <omim_disease_id>

    :param limit: outside of test mode, stop once more than this many OMIM
        ids have been visited
    :return: None
    """
    LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    graph = self.testgraph if self.test_mode else self.graph
    visited = 0
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (omim_disease_id, kegg_disease_id, link_type) = row

            kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
            omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

            # links from OMIM ID -> KEGG ID
            self.omim_disease_hash.setdefault(
                omim_disease_id, []).append(kegg_disease_id)
            # links from KEGG ID -> OMIM ID
            self.kegg_disease_hash.setdefault(
                kegg_disease_id, []).append(omim_disease_id)

    # Now process the disease hashes
    # and only pass 1:1 omim disease:KEGG disease entries.
    for omim_disease_id in self.omim_disease_hash:
        if self.test_mode and \
                omim_disease_id not in self.test_ids['disease']:
            continue

        if not self.test_mode and limit is not None and visited > limit:
            break
        visited += 1

        kegg_ids = self.omim_disease_hash[omim_disease_id]
        if len(kegg_ids) != 1:
            continue
        kegg_disease_id = kegg_ids[0]
        if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
            # TODO add xrefs if >1:1 mapping?
            continue

        # add ids, and deal with the labels separately
        model.addClassToGraph(kegg_disease_id, None)
        model.addClassToGraph(omim_disease_id, None)
        # TODO is this safe?
        model.addEquivalentClass(kegg_disease_id, omim_disease_id)

    LOG.info("Done with KEGG disease to OMIM disease mappings.")
    return
def _process_omim2gene(self, limit=None):
    """
    Relate KEGG gene ids to OMIM ids, branching on the link type of each row.

    * ``equivalent``: the OMIM id is added as a class and the KEGG id as a
      gene.  A replaced OMIM id is swapped for its gene-typed replacement,
      and the pair is made equivalent only when the OMIM id is typed as a
      gene in ``self.omim_type``.
    * ``reverse``: a variant locus of the KEGG gene is created and
      associated with the OMIM id through 'is marker for'.
    * ``original``: only logged.
    * anything else: logged as a warning.

    :param limit: outside of test mode, stop once the reader has consumed
        more than this many lines
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for record in reader:
            (kegg_gene_id, omim_id, link_type) = record

            # the test id list holds the raw (unprefixed) KEGG gene ids
            if self.test_mode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)

                # previous: if omim type is not disease-ish then use
                # now is:   if omim type is gene then use
                if omim_id in self.omim_replaced:
                    for candidate in self.omim_replaced[omim_id]:
                        if candidate in self.omim_type and \
                                self.omim_type[candidate] == \
                                self.globaltt['gene']:
                            omim_id = candidate  # last gene wins
                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID & the KEGG gene ID
                # we do this with omim ids because
                # they are more atomic than KEGG ids
                variant_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # Add the disease to gene relationship.
                marker_rel = self.globaltt['is marker for']
                G2PAssoc(
                    graph, self.name, variant_id, omim_id, marker_rel
                ).add_association_to_graph()
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            past_limit = limit is not None and reader.line_num > limit
            if not self.test_mode and past_limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
def _process_genes(self, limit=None):
    """
    Parse the HGNC gene table and add each gene to the graph.

    After the file header has been checked against the configured column
    list, every row either deprecates the HGNC id (locus type
    'withdrawn') or adds it as a class and clique leader.  The gene is made
    equivalent to its Entrez, Ensembl and gene-typed OMIM ids, tagged with
    the human taxon, linked from its PubMed ids with 'is_about', and placed
    on its chromosome band (or chromosome, when no band is recognised).

    The columns read are: hgnc_id, symbol, name, locus_type, location,
    entrez_id, ensembl_gene_id, pubmed_id and omim_id (the last two are
    pipe separated).

    :param limit: outside of test mode, stop once the reader has consumed
        more than this many lines
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    # NOTE(review): '$' inside the character class is a literal dollar sign,
    # not end-of-string, so a location naming only a chromosome does not
    # match -- confirm whether that is intended.
    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        row = next(filereader)
        if not self.check_fileheader(col, row):
            exit(-1)

        # resolve the column positions once instead of on every row
        (hgnc_col, symbol_col, name_col, locus_type_col, location_col,
         entrez_col, ensembl_col, pubmed_col, omim_col) = [
             col.index(field) for field in (
                 'hgnc_id', 'symbol', 'name', 'locus_type', 'location',
                 'entrez_id', 'ensembl_gene_id', 'pubmed_id', 'omim_id')]

        for row in filereader:
            hgnc_id = row[hgnc_col].strip()
            symbol = row[symbol_col].strip()
            name = row[name_col].strip()
            locus_type = row[locus_type_col].strip()
            location = row[location_col].strip()
            entrez_id = row[entrez_col].strip()
            ensembl_gene_id = row[ensembl_col].strip()
            pubmed_ids = row[pubmed_col].strip()  # pipe separated!
            omim_ids = row[omim_col].strip()  # pipe separated!

            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                gene_type_id = self.resolve(locus_type, False)
                # an unchanged value means the locus type did not resolve
                if gene_type_id != locus_type:
                    model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    for omim in repl:
                        # a replacement may be missing from omim_type
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim
                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                if pubmed_id == '':
                    # an empty column splits into one empty string
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _process_omim2disease(self, limit=None):
    """
    Make 1:1 KEGG disease / OMIM disease pairs equivalent classes.

    The whole file is first read into ``self.omim_disease_hash`` (OMIM id
    -> KEGG ids) and ``self.kegg_disease_hash`` (KEGG id -> OMIM ids).
    Only an OMIM id with exactly one KEGG id, whose KEGG id in turn has
    exactly one OMIM id, is written out.

    Triples created:
    <kegg_disease_id> is a class
    <omim_disease_id> is a class
    <kegg_disease_id> equivalentClass <omim_disease_id>

    :param limit: outside of test mode, stop once more than this many OMIM
        ids have been visited
    :return: None
    """
    LOG.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            (omim_disease_id, kegg_disease_id, link_type) = row

            kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
            omim_disease_id = re.sub(r'omim', 'OMIM', omim_disease_id)

            # links from OMIM ID -> KEGG ID
            self.omim_disease_hash.setdefault(
                omim_disease_id, []).append(kegg_disease_id)
            # links from KEGG ID -> OMIM ID
            self.kegg_disease_hash.setdefault(
                kegg_disease_id, []).append(omim_disease_id)

    # Now process the disease hashes
    # and only pass 1:1 omim disease:KEGG disease entries.
    # The limit is counted over the entries visited here: the reader's line
    # number is already final once the file has been read, so it cannot be
    # used to stop part way through this loop.
    for visited, omim_disease_id in enumerate(self.omim_disease_hash):
        if self.test_mode and \
                omim_disease_id not in self.test_ids['disease']:
            continue

        if not self.test_mode and limit is not None and visited > limit:
            break

        kegg_ids = self.omim_disease_hash[omim_disease_id]
        if len(kegg_ids) != 1:
            continue
        kegg_disease_id = kegg_ids[0]
        if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
            # TODO add xrefs if >1:1 mapping?
            continue

        # add ids, and deal with the labels separately
        model.addClassToGraph(kegg_disease_id, None)
        model.addClassToGraph(omim_disease_id, None)
        # TODO is this safe?
        model.addEquivalentClass(kegg_disease_id, omim_disease_id)

    LOG.info("Done with KEGG disease to OMIM disease mappings.")
def _process_diseasegene(self, limit):
    """
    Stream the Orphanet disease-gene XML file into the graph.

    Each ``Disorder`` element is added as a class.  For every gene
    association of the disorder a gene curie is chosen (the HGNC, Ensembl
    or SwissProt reference when one is present, the ORPHA number
    otherwise), the remaining references are added as equivalent classes,
    the gene synonyms are attached, and a G2P association between the gene
    and the disorder is added with the resolved relation and evidence code.

    :param limit: compared against ``line_counter``, and only when
        ``self.test_mode`` is set
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    # NOTE(review): line_counter is never incremented in this method, so the
    # limit check at the bottom of the loop cannot fire -- confirm the intent.
    line_counter = 0
    model = Model(graph)
    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    for event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # elem.get('id') is some internal identifier and is ignored
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'ORPHA:' + str(disorder_num)

            if self.test_mode and \
                    disorder_id not in self.all_test_ids['disease']:
                continue
            disorder_label = elem.find('Name').text

            # assuming that these are in the ontology (...any particular one?)
            model.addClassToGraph(disorder_id, disorder_label)
            assoc_list = elem.find('DisorderGeneAssociationList')
            expected_genes = assoc_list.get('count')
            LOG.info(
                'Expecting %s genes associated with disorder %s.',
                expected_genes, disorder_id)
            processed_genes = 0
            for assoc in assoc_list.findall('DisorderGeneAssociation'):
                processed_genes += 1
                gene = assoc.find('Gene')

                # collect every identifier of the gene, keyed by its source
                lclid = gene.find('OrphaNumber').text
                gene_curie = 'ORPHA:' + lclid
                gene_set = {'ORPHA': lclid}
                for gene_ref in gene.findall(
                        './ExternalReferenceList/ExternalReference'):
                    gene_set[gene_ref.find('Source').text] = \
                        gene_ref.find('Reference').text

                # set priority (clique leader if available),
                # but default to ORPHA
                for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                    if pfx in gene_set:
                        # Translate the prefix for the curie only: gene_set
                        # is keyed by the untranslated source name.
                        curie_prefix = pfx
                        if pfx in self.localtt:
                            curie_prefix = self.localtt[pfx]
                        gene_curie = curie_prefix + ':' + gene_set.pop(pfx)
                        model.addClassToGraph(gene_curie, None)
                        break

                # TEC have reservations w.r.t aggregator links
                # being gene classes
                for prefix in gene_set:
                    lclid = gene_set[prefix]
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    dbxref = prefix + ':' + lclid
                    if gene_curie != dbxref:
                        model.addClassToGraph(dbxref, None)
                        model.addEquivalentClass(gene_curie, dbxref)

                # TEC. would prefer this not happen here. let HGNC handle it
                # except there are some w/o explicit external links ...
                syn_list = gene.find('./SynonymList')
                if int(syn_list.get('count')) > 0:
                    for syn in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_curie, syn.text)

                dg_label = assoc.find(
                    './DisorderGeneAssociationType/Name').text

                # use dg association status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                eco_id = self.resolve(
                    assoc.find('DisorderGeneAssociationStatus/Name').text)
                rel_id = self.resolve(dg_label)

                # write to the graph selected above so that test mode does
                # not leak associations into the main graph
                g2p_assoc = G2PAssoc(
                    graph, self.name, gene_curie, disorder_id, rel_id)
                g2p_assoc.add_evidence(eco_id)
                g2p_assoc.add_association_to_graph()

            elem.clear()  # empty the element
            if int(expected_genes) != processed_genes:
                LOG.warning(
                    '%s expected %s associated genes but we processed %i',
                    disorder_id, expected_genes, processed_genes)

        if self.test_mode and limit is not None and line_counter > limit:
            return
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an NCBITaxon ID space is a clique leader

    Each pipe-separated entry of ``dbxrefs`` is turned into a curie whose
    prefix is translated through ``self.localtt``.  HPRD entries become
    gene products, ENSEMBL and gene-typed OMIM entries are also added as
    xrefs, and OMIM entries that are not typed as a gene are skipped.  What
    remains is made an equivalent class of ``gene_id`` when
    ``self.class_or_indiv`` marks the gene as 'C', and a same-individual
    otherwise.

    :param dbxrefs: pipe-separated references, for example
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the references belong to
    :param taxon: taxon key looked up in the clique leader map and in
        ``self.informal_species``
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    # prefixes that are skipped for now; '' covers entries without a prefix
    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '']

    for dbxref in dbxrefs.strip().split('|'):
        parts = dbxref.split(':')
        prefix = ':'.join(parts[:-1]).strip()  # restore nonterminal ':'

        if prefix in self.localtt:
            prefix = self.localtt[prefix]

        # skip some of these for now based on curie prefix
        if prefix in filter_out:
            continue

        if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
            prefix = self.informal_species[taxon] + 'QTL'

        dbxref_curie = ':'.join((prefix, parts[-1]))

        if prefix == 'HPRD':  # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)

        if prefix == 'OMIM':
            omim_num = dbxref_curie[5:]
            if omim_num in self.omim_replaced:
                found_gene = False
                for omim in self.omim_replaced[omim_num]:
                    if omim in self.omim_type and \
                            self.omim_type[omim] == self.globaltt['gene']:
                        dbxref_curie = 'OMIM:' + omim  # last wins
                        model.addXref(gene_id, dbxref_curie)
                        found_gene = True
                if not found_gene:
                    # none of the replacements is a gene, so there is
                    # nothing the gene can be made equivalent to
                    continue
            elif omim_num in self.omim_type and \
                    self.omim_type[omim_num] == self.globaltt['gene']:
                model.addXref(gene_id, dbxref_curie)
            else:
                # no equivalence between ncbigene and omim-nongene
                continue

        # designate clique leaders
        # (perhaps premature as this ingest can't know what else exists)
        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    if clique_map[taxon] == prefix:
                        model.makeLeader(dbxref_curie)
                    elif clique_map[taxon] == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, dbxref_curie)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _process_diseasegene(self, limit):
    """
    Stream the 'disease-gene' XML file and add, for every Disorder
    element, the disorder class, its genes (with synonyms), a blank-node
    variant locus per gene/disorder pair, a G2P association with an
    evidence code, and gene equivalences to external references.

    :param limit: row limit; see the review note at the end of the
        loop, as written it cannot take effect
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    # NOTE(review): line_counter is never incremented in this function
    line_counter = 0
    geno = Genotype(g)
    model = Model(g)

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    # `event` is not used; only completed elements are inspected.
    for event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # get the element name and id, ignores element name
            # id = elem.get('id') # some internal identifier
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'Orphanet:'+str(disorder_num)

            # in test mode only the configured test diseases are kept
            if self.testMode and \
                    disorder_id not in \
                    config.get_config()['test_ids']['disease']:
                continue

            disorder_label = elem.find('Name').text

            # make a hash of internal gene id to type for later lookup
            gene_iid_to_type = {}
            gene_list = elem.find('GeneList')
            for gene in gene_list.findall('Gene'):
                gene_iid = gene.get('id')
                gene_type = gene.find('GeneType').get('id')
                gene_iid_to_type[gene_iid] = gene_type

            # assuming that these are in the ontology
            model.addClassToGraph(disorder_id, disorder_label)

            assoc_list = elem.find('DisorderGeneAssociationList')
            for a in assoc_list.findall('DisorderGeneAssociation'):
                gene_iid = a.find('.//Gene').get('id')
                gene_name = a.find('.//Gene/Name').text
                gene_symbol = a.find('.//Gene/Symbol').text
                gene_num = a.find('./Gene/OrphaNumber').text
                gene_id = 'Orphanet:'+str(gene_num)
                # the type comes from the GeneList hash built above,
                # keyed by the gene's internal id
                gene_type_id = \
                    self._map_gene_type_id(gene_iid_to_type[gene_iid])
                model.addClassToGraph(
                    gene_id, gene_symbol, gene_type_id, gene_name)
                syn_list = a.find('./Gene/SynonymList')
                if int(syn_list.get('count')) > 0:
                    for s in syn_list.findall('./Synonym'):
                        model.addSynonym(gene_id, s.text)

                dgtype = a.find('DisorderGeneAssociationType').get('id')
                rel_id = self._map_rel_id(dgtype)
                dg_label = \
                    a.find('./DisorderGeneAssociationType/Name').text
                # without a mapped relation no association can be made
                if rel_id is None:
                    logger.warning(
                        "Cannot map association type (%s) to RO " +
                        "for association (%s | %s). Skipping.",
                        dg_label, disorder_label, gene_symbol)
                    continue

                # blank node standing for "some variant of this gene"
                # in the context of this disorder
                alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
                alt_label = \
                    ' '.join(('some variant of', gene_symbol.strip(),
                              'that is a', dg_label.lower(),
                              disorder_label))
                model.addIndividualToGraph(alt_locus_id, alt_label,
                                           geno.genoparts['variant_locus'])
                geno.addAffectedLocus(alt_locus_id, gene_id)
                model.addBlankNodeAnnotation(alt_locus_id)

                # consider typing the gain/loss-of-function variants like:
                # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                # use "assessed" status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                status_code = \
                    a.find('DisorderGeneAssociationStatus').get('id')
                # default evidence:
                # imported automatically asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000323'
                # status '17991' is treated as "Assessed"
                # TODO are these internal ids stable between releases?
                if status_code == '17991':
                    # imported manually asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000322'
                # Non-traceable author statement ECO_0000034
                # imported information in automatic assertion ECO_0000313

                assoc = G2PAssoc(g, self.name, alt_locus_id,
                                 disorder_id, rel_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()

                rlist = a.find('./Gene/ExternalReferenceList')
                eqid = None

                # NOTE(review): eqid is not reset per reference, so a
                # reference from an unhandled source re-adds the
                # equivalence made for the previous handled one
                for r in rlist.findall('ExternalReference'):
                    if r.find('Source').text == 'Ensembl':
                        eqid = 'ENSEMBL:'+r.find('Reference').text
                    elif r.find('Source').text == 'HGNC':
                        eqid = 'HGNC:'+r.find('Reference').text
                    elif r.find('Source').text == 'OMIM':
                        eqid = 'OMIM:'+r.find('Reference').text
                    else:
                        pass  # skip the others for now
                    if eqid is not None:
                        model.addClassToGraph(eqid, None)
                        model.addEquivalentClass(gene_id, eqid)
            elem.clear()  # empty the element

        # NOTE(review): line_counter stays 0, so this only fires for a
        # negative limit; sibling parsers test `not testMode` here
        # -- confirm which was intended
        if self.testMode and limit is not None and line_counter > limit:
            return

    return
def _get_identifiers(self, limit):
    """
    This will process the id mapping file provided by Biogrid.
    The file has a very large header, which we scan past,
    then pull the identifiers, and make equivalence axioms.

    Only rows for the species in ``speciesfilters`` are used.  For those,
    identifiers whose mapped prefix is in ``geneidtypefilters`` become
    equivalent classes of the BIOGRID class, and OFFICIAL_SYMBOL rows
    supply the label of the BIOGRID class.

    :param limit: stop after more than this many species-matching rows
        (ignored in test mode); None means no limit
    :return: None
    """
    logger.info("getting identifier mapping")
    line_counter = 0
    zip_path = '/'.join((self.rawdir, self.files['identifiers']['file']))

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    speciesfilters = ('Homo sapiens', 'Mus musculus')

    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = ('NCBIGene', 'MGI', 'ENSEMBL', 'ZFIN', 'HGNC')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    # the target graph does not change while reading, build the model once
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)

    foundheader = False
    # context managers close the archive and member even on a parse error
    with ZipFile(zip_path, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for raw_line in csvfile:
                line = raw_line.decode()

                # skip everything up to and including the column header
                if not foundheader:
                    if line.startswith('BIOGRID_ID'):
                        foundheader = True
                    continue

                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.strip().split('\t')

                # skip any genes that don't match our test set
                if self.testMode and \
                        int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        model.addEquivalentClass(biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        model.addClassToGraph(biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

    return
def _process_genes(self, taxid, limit=None):
    """
    Read the tab separated Ensembl gene file for one taxon and add each
    gene as a class, with its taxon, its NCBIGene / HGNC relationships
    and its peptide and UniProt gene products.

    :param taxid: taxon number (as text) keying ``self.files``;
        '9606' additionally carries an hgnc_id column
    :param limit: stop once the reader's line number exceeds this
        (ignored in test mode); None means no limit
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))

    attributes = self.columns['bmq_attributes']
    headers = self.columns['bmq_headers']
    # work on a copy, only human data carries the hgnc_id column
    col = list(attributes)
    if taxid != '9606' and 'hgnc_id' in col:
        col.remove('hgnc_id')
    col_exp = [headers[attributes.index(name)] for name in col]

    LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)

    with open(raw, 'r', encoding="utf8") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')

        # the header is checked but a mismatch does not stop the parse
        self.check_fileheader(col_exp, next(reader))

        for row in reader:

            def field(name, record=row):
                """Value of the named column in the current record."""
                return record[col.index(name)]

            ensembl_gene_id = field('ensembl_gene_id')
            external_gene_name = field('external_gene_name')
            description = field('description').strip()
            gene_biotype = field('gene_biotype').strip()
            entrezgene = field('entrezgene_id').strip()
            ensembl_peptide_id = field('ensembl_peptide_id').strip()
            uniprotswissprot = field('uniprotswissprot').strip()

            # in the case of human genes, we also get the hgnc id
            human_with_hgnc = taxid == '9606' and 'hgnc_id' in col
            hgnc_curie = field('hgnc_id').strip() if human_with_hgnc else None

            # in test mode keep only the genes of interest
            # (genes lacking an entrez id are kept as well)
            if self.test_mode and entrezgene != '' and \
                    entrezgene not in self.gene_ids:
                continue

            gene_id = 'ENSEMBL:' + ensembl_gene_id
            entrez_curie = 'NCBIGene:{}'.format(entrezgene)
            description = None if description == '' else description

            gene_type_id = self.resolve(
                gene_biotype, mandatory=False,
                default=self.globaltt['polypeptide'])

            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                if taxid != '9606':
                    model.addEquivalentClass(gene_id, entrez_curie)
                else:
                    # Use HGNC for eq in human data
                    model.addXref(gene_id, entrez_curie)

            if hgnc_curie is not None and hgnc_curie != '':
                model.addEquivalentClass(gene_id, hgnc_curie)

            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if ensembl_peptide_id:
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                geno.addGeneProduct(gene_id, peptide_curie)
                # the UniProt entry is xref'd from the peptide,
                # so it is only handled when a peptide is present
                if uniprotswissprot != '':
                    uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                    model.addIndividualToGraph(
                        uniprot_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, uniprot_curie)
                    model.addXref(peptide_curie, uniprot_curie)

            if limit is not None and not self.test_mode and \
                    reader.line_num > limit:
                break
def _process_genes(self, limit=None):
    """
    Read the tab separated HGNC gene file and add each gene to the graph
    with its type, NCBIGene / ENSEMBL / OMIM equivalences, taxon,
    publications and chromosome (band) location.

    Withdrawn genes are added as deprecated classes; symbols ending in
    '@' (regions / clusters) are skipped entirely.

    :param limit: stop once the reader's line number exceeds this
        (ignored in test mode); None means no limit
    :return: None
    """
    graph = self.testgraph if self.test_mode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    col = self.files['genes']['columns']
    LOG.info("Processing HGNC genes")

    chr_pattern = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_pattern = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    # resolve the positions of the columns we use once, not per row
    # (all other columns of the HGNC complete set are ignored)
    at = {
        name: col.index(name) for name in (
            'hgnc_id', 'symbol', 'name', 'locus_type', 'location',
            'entrez_id', 'ensembl_gene_id', 'pubmed_id', 'omim_id')}

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # the header is checked but a mismatch does not stop the parse
        self.check_fileheader(col, next(filereader))

        for row in filereader:
            hgnc_id = row[at['hgnc_id']].strip()
            symbol = row[at['symbol']].strip()
            name = row[at['name']].strip()
            locus_type = row[at['locus_type']].strip()
            location = row[at['location']].strip()
            entrez_id = row[at['entrez_id']].strip()
            ensembl_gene_id = row[at['ensembl_gene_id']].strip()
            pubmed_ids = row[at['pubmed_id']].strip()  # pipe separated!
            omim_ids = row[at['omim_id']].strip()  # pipe separated!

            # in test mode keep only the genes of interest
            # (genes lacking an entrez id are kept as well)
            if self.test_mode and entrez_id != '' and \
                    entrez_id not in self.gene_ids:
                continue

            if name == '':
                name = None

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            elif symbol.endswith('@'):
                # 10) region (HOX), RNA cluster, gene (PCDH)
                # endswith() also tolerates an empty symbol
                continue
            else:
                gene_type_id = self.resolve(locus_type, mandatory=False)
                # resolve() hands back its input when there is no mapping
                if gene_type_id != locus_type:
                    model.addClassToGraph(
                        hgnc_id, symbol, gene_type_id, name)
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            for omim_id in omim_ids.split('|'):
                if omim_id == '':
                    continue  # splitting an empty field yields ['']
                if omim_id in self.omim_replaced:
                    repl = self.omim_replaced[omim_id]
                    LOG.warning('%s is replaced with %s', omim_id, repl)
                    # prefer a gene typed replacement; ids of unknown
                    # type are passed over instead of raising KeyError
                    for omim in repl:
                        if omim in self.omim_type and \
                                self.omim_type[omim] == self.globaltt['gene']:
                            omim_id = omim
                if omim_id in self.omim_type and \
                        self.omim_type[omim_id] == self.globaltt['gene']:
                    model.addEquivalentClass(hgnc_id, 'OMIM:' + omim_id)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pubmed_id in pubmed_ids.split('|'):
                pubmed_id = pubmed_id.strip()
                if pubmed_id == '':
                    # no publication: do not emit a bare 'PMID:' subject
                    continue
                graph.addTriple(
                    'PMID:' + pubmed_id, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_pattern.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_pattern.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.test_mode and limit is not None and \
                    filereader.line_num > limit:
                break
def _process_genes(self, limit=None):
    """
    Read the tab separated HGNC complete set and add each gene to the
    graph with its type, NCBIGene / ENSEMBL / OMIM equivalences, taxon,
    publications and chromosome (band) location.

    Every row, the header included, must have exactly the 49 columns
    unpacked below.

    :param limit: stop once more than this many rows were read
        (ignored in test mode); None means no limit
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    logger.info("Processing HGNC genes")

    chrom_re = re.compile(r'(\d+|X|Y|Z|W|MT)[pq$]')
    band_re = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for line_counter, row in enumerate(filereader, 1):
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db,
             rna_central_ids) = row

            # the first row is the header
            if line_counter <= 1:
                continue

            # in test mode keep only the genes of interest
            # (genes lacking an entrez id are kept as well)
            if self.testMode and entrez_id != '' and \
                    int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None

            gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
            # only add the class when the locus type changed on resolving
            if gene_type_id != locus_type:
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)

            if locus_type != 'withdrawn':
                model.makeLeader(hgnc_id)
            else:
                model.addDeprecatedClass(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)

            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)

            # only an unambiguous (single) OMIM id is considered
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for pmid in pubmed_id.strip().split('|'):
                    if pmid != '':
                        graph.addTriple(
                            'PMID:' + pmid.strip(),
                            self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chrom_re.match(location)
            if chr_match is not None and len(chr_match.groups()) > 0:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_re.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is None or len(band_match.groups()) == 0:
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)
                else:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)

            if not self.testMode and limit is not None and \
                    line_counter > limit:
                break

    # end loop through file
    return
def _process_genes(self, taxid, limit=None):
    """
    Read the tab separated Ensembl gene file for one taxon and add each
    gene as a class, with its taxon, its NCBIGene / HGNC equivalences
    and its peptide and UniProt gene products.

    Expected columns: ensembl gene id, external gene name, description,
    gene biotype, entrez gene, peptide id, uniprot swissprot and,
    for human ('9606') only, the hgnc id as the last column.

    :param taxid: taxon number (as text) keying ``self.files``
    :param limit: stop after more than this many accepted rows
        (ignored in test mode); None means no limit
    :return: None
    :raises ValueError: when a row has fewer than the 7 common columns
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0
    logger.info("Processing Ensembl genes for tax %s", taxid)

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            # seven columns are unpacked below, so require all seven
            # and say which file is at fault
            if len(row) < 7:
                raise ValueError(
                    "Data error for file {}: expected at least 7 columns, "
                    "found {}".format(raw, len(row)))

            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene, peptide_id,
             uniprot_swissprot) = row[0:7]

            # in the case of human genes, we also get the hgnc id,
            # and is the last col
            hgnc_id = row[7] if taxid == '9606' else None

            # in test mode keep only the genes of interest
            # (genes lacking an entrez id are kept as well)
            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            if description == '':
                description = None

            # gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None
            model.addClassToGraph(
                gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                model.addEquivalentClass(
                    gene_id, 'NCBIGene:{}'.format(entrezgene))
            if hgnc_id is not None and hgnc_id != '':
                model.addEquivalentClass(gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            # products are only created for ids that are present, an
            # empty column must not yield 'ENSEMBL:' / 'UniProtKB:' nodes
            has_peptide = peptide_id != ''
            if has_peptide:
                peptide_curie = 'ENSEMBL:{}'.format(peptide_id)
                model.addIndividualToGraph(
                    peptide_curie, None, self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, peptide_curie)

            if uniprot_swissprot != '':
                uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot)
                model.addIndividualToGraph(
                    uniprot_curie, None, self._get_gene_type("polypeptide"))
                geno.addGeneProduct(gene_id, uniprot_curie)
                if has_peptide:
                    model.addXref(peptide_curie, uniprot_curie)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return