def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships

    Uses external resource map located in
    /resources/clique_leader.yaml to determine
    if an ID space is a clique leader

    :param xrefs: pipe-delimited dbxref string, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number; matched as a string against the
        taxon-specific xref table and cast with int() for the
        clique leader lookup
    :return: None
    """
    # NOTE(review): the yaml map is re-read on every call; confirm
    # whether open_and_parse_yaml caches before optimizing.
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

    # These will be made xrefs
    taxon_spec_xref_filters = {
        '10090': ['ENSEMBL'],
        '9606': ['ENSEMBL']
    }
    # prefixes that get an xref for *this* taxon (empty when none apply)
    taxon_spec_filters = taxon_spec_xref_filters.get(taxon, [])

    model = Model(graph)

    # deal with the xrefs
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue

        prefix = xref_curie.split(':')[0]

        if re.match(r'HPRD', xref_curie):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue

        # skip some of these for now
        if prefix in filter_out:
            continue

        # Compare the prefix with the per-taxon prefix list; the table
        # itself is keyed by taxon id, so testing membership in the
        # table could never match a curie prefix.
        if prefix in taxon_spec_filters:
            model.addXref(gene_id, xref_curie)

        if re.match(r'^OMIM', xref_curie) \
                and DipperUtil.is_omim_disease(xref_curie):
            continue

        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                taxon_num = int(taxon)
                if taxon_num in clique_map:
                    leader_prefix = clique_map[taxon_num]
                    if leader_prefix == prefix:
                        model.makeLeader(xref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as e:
            logger.warning("Error parsing %s: %s", gene_id, e)

    return
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Relate a gene to each of its dbxrefs, as equivalent classes when the
    gene is recorded as a class ('C' in self.class_or_indiv) and as
    same-individuals otherwise.

    The clique leader map is read from the resource registered under
    self.resources['clique_leader'] and decides which ID space is
    flagged as leader for a given taxon.

    :param xrefs: pipe-delimited dbxref string
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: taxon number (string key for the skip table,
        cast with int() for the clique map)
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    graph = self.testgraph if self.testMode else self.graph

    # prefixes that are never turned into equivalences
    skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']
    per_taxon_skips = {
        '10090': ['ENSEMBL']
    }
    if taxon in per_taxon_skips:
        skipped_prefixes += per_taxon_skips[taxon]

    model = Model(graph)

    # example input:
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for raw_ref in xrefs.strip().split('|'):
        curie = self._cleanup_id(raw_ref)
        if curie is None or curie.strip() == '':
            continue

        if re.match(r'HPRD', curie):
            # a protein is a product of the gene, not the gene itself
            model.addTriple(
                gene_id, self.properties['has_gene_product'], curie)
            continue

        if curie.split(':')[0] in skipped_prefixes:
            continue

        # OMIM ids that denote diseases must not be equated with genes
        if re.match(r'^OMIM', curie) and DipperUtil.is_omim_disease(curie):
            continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, curie)
            else:
                model.addEquivalentClass(gene_id, curie)
                taxon_num = int(taxon)
                if taxon_num in clique_map:
                    leader_space = clique_map[taxon_num]
                    if leader_space == curie.split(':')[0]:
                        model.makeLeader(curie)
                    elif leader_space == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as e:
            logger.warn("Error parsing {0}: {1}".format(gene_id, e))

    return
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Relate a gene to each of its dbxrefs, as equivalent classes when the
    gene is recorded as a class ('C' in self.class_or_indiv) and as
    same-individuals otherwise.

    The clique leader map is read from the resource registered under
    self.resources['clique_leader']; it is keyed by taxon and names the
    curie prefix that should be flagged as leader.

    :param xrefs: pipe-delimited dbxref string
    :param gene_id: curie of the gene the xrefs belong to
    :param taxon: key used as-is to look up the clique leader prefix
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)

    ignored_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport']

    # example input:
    # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
    for dbxref in xrefs.strip().split('|'):
        # everything before the last ':' is the prefix, the rest the local id
        raw_prefix, _, local_id = dbxref.rpartition(':')
        prefix = raw_prefix.strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]
        dbxref_curie = ':'.join((prefix, local_id))

        if prefix == '':
            continue

        if prefix == 'HPRD':
            # a protein is a product of the gene, not the gene itself
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue

        # some curie prefixes are skipped for now
        if prefix in ignored_prefixes:
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)

        # OMIM ids that denote diseases must not be equated with genes
        if prefix == 'OMIM' and DipperUtil.is_omim_disease(dbxref_curie):
            continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, dbxref_curie)
            else:
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    leader_prefix = clique_map[taxon]
                    if leader_prefix == prefix:
                        model.makeLeader(dbxref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)

    return
def _process_genes(self, limit=None):
    """
    Read the HGNC gene table and add every gene to the graph.

    The input is the tab-delimited file at
    rawdir/files['genes']['file']. Each row must have exactly the 49
    columns unpacked below (otherwise the unpack raises ValueError);
    the first row is treated as the header and skipped.

    For each gene this adds: the gene class (only when the locus type
    resolves to something other than the raw string), a deprecation
    marker for withdrawn entries or a clique-leader marker otherwise,
    equivalences to NCBIGene / ENSEMBL / single non-disease OMIM ids,
    the taxon, "is about" links from PubMed ids, and the chromosome or
    chromosome band as the parent feature.

    :param limit: when not in test mode, stop once more than this many
        lines have been read; None reads the whole file
    :return: None
    """
    if self.testMode:
        graph = self.testgraph
    else:
        graph = self.graph

    geno = Genotype(graph)
    model = Model(graph)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0

    # Chromosome at the start of the location, followed by an arm or by
    # the end of the string.  The anchor has to live in an alternation:
    # inside a character class ([pq$]) the `$` is a literal dollar sign,
    # which kept band-less locations such as "X" from ever matching.
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    logger.info("Processing HGNC genes")

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            # in test mode only keep the genes listed in self.gene_ids
            # (rows without an entrez id are not filtered here)
            if self.testMode and entrez_id != '' and \
                    int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None

            gene_type_id = self.resolve(locus_type, False)  # withdrawn -> None?
            # an unchanged value means the locus type could not be mapped
            if gene_type_id != locus_type:
                model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # only a single OMIM id is unambiguous enough to equate,
            # and only when it is not a disease entry
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon(self.hs_txid, hgnc_id)

            # add pubs as "is about"
            for pmid in pubmed_id.strip().split('|'):
                pmid = pmid.strip()
                # strip before testing so blank tokens never become
                # a bare 'PMID:' subject
                if pmid != '':
                    graph.addTriple(
                        'PMID:' + pmid, self.globaltt['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_regex.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, self.hs_txid, 'CHR')
                band_match = band_regex.search(location)
                feat = Feature(graph, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, self.hs_txid, 'CHR')
                    model.addClassToGraph(band_id, None)
                    feat.addSubsequenceOfFeature(band_id)
                else:
                    # no band given: hang the gene off the chromosome
                    model.addClassToGraph(chrom_id, None)
                    feat.addSubsequenceOfFeature(chrom_id)

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break

    # end loop through file
    return
def _process_omim2gene(self, limit=None):
    """
    Link KEGG gene IDs to OMIM IDs, branching on the link_type column.

    'equivalent' links: the OMIM id is added as a class, the KEGG id as
    a gene, and the two are made equivalent classes unless the OMIM id
    is a disease.
    'reverse' links: a variant locus of the KEGG gene is created and
    associated with the OMIM id through 'is marker for'.
    'original' links are only logged; anything else is logged as a
    warning.

    Triples created:
    <kegg_gene_id> is a Gene
    <omim_gene_id> is a Gene
    <kegg_gene_id> equivalentClass <omim_gene_id>

    <assoc_id> has subject <omim_disease_id>
    <assoc_id> has object <kegg_gene_id>

    :param limit: when not in test mode, stop once more than this many
        rows have been read; None reads the whole file
    :return: None
    """
    LOG.info("Processing OMIM to KEGG gene")

    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    geno = Genotype(graph)

    source_path = '/'.join((self.rawdir, self.files['omim2gene']['file']))
    rows_seen = 0

    with open(source_path, 'r', encoding="iso-8859-1") as tsvfile:
        for record in csv.reader(tsvfile, delimiter='\t', quotechar='\"'):
            rows_seen += 1
            kegg_gene_id, omim_id, link_type = record

            # the test filter sees the raw id, before the KEGG- prefix
            if self.test_mode \
                    and kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub(r'omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # both sides are genes: declare them, then equate them
                model.addClassToGraph(omim_id, None)
                geno.addGene(kegg_gene_id, None)
                if not DipperUtil.is_omim_disease(omim_id):
                    model.addEquivalentClass(kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # associate the OMIM id with a variant locus of the gene;
                # OMIM ids are used because they are more atomic than
                # KEGG ids
                variant_id = self._make_variant_locus_id(
                    kegg_gene_id, omim_id)
                variant_label = self.label_hash[variant_id]
                model.addIndividualToGraph(
                    variant_id, variant_label,
                    self.globaltt['variant_locus'])
                geno.addAffectedLocus(variant_id, kegg_gene_id)
                model.addBlankNodeAnnotation(variant_id)

                # the disease to gene relationship itself
                marker_rel = self.globaltt['is marker for']
                association = G2PAssoc(
                    graph, self.name, variant_id, omim_id, marker_rel)
                association.add_association_to_graph()
            elif link_type == 'original':
                # may be either a gene or a disease, so not handled
                LOG.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # unknown link type
                LOG.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if not self.test_mode and limit is not None \
                    and rows_seen > limit:
                break

    LOG.info("Done with OMIM to KEGG gene")
    return
def _process_genes(self, limit=None):
    """
    Read the HGNC gene table and add every gene to the graph.

    The input is the tab-delimited file at
    rawdir/files['genes']['file']. Each row must have exactly the 49
    columns unpacked below (otherwise the unpack raises ValueError);
    the first row is treated as the header and skipped.

    For each gene this adds: the gene class typed by
    self._get_gene_type(locus_type), a deprecation marker for withdrawn
    entries or a clique-leader marker otherwise, equivalences to
    NCBIGene / ENSEMBL / single non-disease OMIM ids, the human taxon,
    "is about" links from PubMed ids, and the chromosome or chromosome
    band as the parent feature.

    :param limit: when not in test mode, stop once more than this many
        lines have been read; None reads the whole file
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    model = Model(g)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0

    # Chromosome at the start of the location, followed by an arm or by
    # the end of the string.  The anchor has to live in an alternation:
    # inside a character class ([pq$]) the `$` is a literal dollar sign,
    # which kept band-less locations such as "X" from ever matching.
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    logger.info("Processing HGNC genes")

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')

        # curl -s ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt | head -1 | tr '\t' '\n' | grep -n .
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db, rna_central_ids) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            # in test mode only keep the genes listed in self.gene_ids
            # (rows without an entrez id are not filtered here)
            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None

            gene_type_id = self._get_gene_type(locus_type)
            model.addClassToGraph(hgnc_id, symbol, gene_type_id, name)

            if locus_type == 'withdrawn':
                model.addDeprecatedClass(hgnc_id)
            else:
                model.makeLeader(hgnc_id)

            if entrez_id != '':
                model.addEquivalentClass(hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                model.addEquivalentClass(
                    hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            # only a single OMIM id is unambiguous enough to equate,
            # and only when it is not a disease entry
            if omim_id != '' and "|" not in omim_id:
                omim_curie = 'OMIM:' + omim_id
                if not DipperUtil.is_omim_disease(omim_curie):
                    model.addEquivalentClass(hgnc_id, omim_curie)

            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"
            for pmid in pubmed_id.strip().split('|'):
                pmid = pmid.strip()
                # strip before testing so blank tokens never become
                # a bare 'PMID:' subject
                if pmid != '':
                    g.addTriple(
                        'PMID:' + pmid,
                        model.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_regex.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                band_match = band_regex.search(location)
                f = Feature(g, hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                    model.addClassToGraph(band_id, None)
                    f.addSubsequenceOfFeature(band_id)
                else:
                    # no band given: hang the gene off the chromosome
                    model.addClassToGraph(chrom_id, None)
                    f.addSubsequenceOfFeature(chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    # end loop through file
    return