def _map_eom_terms(self, raw, limit=None):
    """
    Load the EOM-to-HP term mappings from a local tab-separated file.

    The first line of the file is treated as a header and discarded.
    Every following line is split on tabs into exactly five columns.
    Underscores in the HP column are replaced with colons; when the
    result contains ``HP:`` these are added to ``self.graph``:

        <hp id> as a class (no label)
        <eom id> owl:equivalentClass <hp id>

    Rows without an HP id only produce a warning.

    :param raw: path of the tsv file to read
    :param limit: optional; reading stops after the first data row whose
        1-based number exceeds this value (that row is still processed)
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    with open(raw, 'r') as tsv:
        next(tsv, None)  # header row, not data
        for row_num, line in enumerate(tsv, start=1):
            (morphology_term_id, morphology_term_label,
             hp_id, hp_label, notes) = line.split('\t')

            # the file writes HP_0000001; CURIEs use a colon
            hp_id = hp_id.replace('_', ':')

            if 'HP:' in hp_id:
                gu.addClassToGraph(self.graph, hp_id, None)
                gu.addEquivalentClass(
                    self.graph, morphology_term_id, hp_id)
            else:
                logger.warning(
                    'No matching HP term for %s', morphology_term_label)

            if limit is not None and row_num > limit:
                break
    return
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    For every item of ``entry['allelicVariantList']`` an id of the form
    ``OMIM:<mimNumber>.<variant number zero-padded to 4>`` is built, then:

    * status ``live``: the variant is added as an allele of the entry
      (label from ``mutations``, description from ``text``, when present).
      Each reference number cited in the text as ``{<number>:`` is looked
      up in the mapping returned by ``self._get_pubs`` and linked to the
      variant with ``is_about``. dbSNP ids are added as individuals and
      equivalents, ClinVar RCV accessions as xrefs, and the OMIM entry
      page is attached.
    * status containing ``moved`` (covers ``moved`` and ``removed``): the
      variant is added as a deprecated individual, pointing at
      ``movedTo`` when that key is present.
    * any other status is logged as an error.

    :param entry: entry mapping (presumably one parsed OMIM API record --
        TODO confirm with caller), or None, in which case nothing is added
    :param g: graph the triples are written to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(g)
    if entry is None:
        return

    # raw strings: the previous non-raw literals relied on invalid
    # escape sequences (\{ \d \: \;), which newer Pythons warn about
    cited_ref_re = re.compile(r'\{(\d+):')
    rcv_re = re.compile(r'(RCV\d+);;')

    entry_num = entry['mimNumber']
    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(4)
        status = variant['status']

        if status == 'live':
            al_label = variant['mutations'] if 'mutations' in variant \
                else None
            al_description = None
            cited_refs = set()  # reference numbers mentioned in the text
            if 'text' in variant:
                al_description = variant['text']
                cited_refs = set(cited_ref_re.findall(al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, 'OMIM:' + str(entry_num),
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            for ref_num in cited_refs:
                pmid = ref_to_pmid[int(ref_num)]
                gu.addTriple(
                    g, pmid, gu.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    gu.addIndividualToGraph(g, did, None)
                    gu.addEquivalentClass(g, al_id, did)

            if 'clinvarAccessions' in variant:
                # triple-semicolon delimited list, each item like
                # RCV000020059;;1
                for item in variant['clinvarAccessions'].split(';;;'):
                    rcv_match = rcv_re.match(item)
                    if rcv_match is None:
                        # previously this raised AttributeError on None
                        logger.warning(
                            'Unparseable ClinVar accession %s for %s',
                            item, al_id)
                        continue
                    gu.addXref(g, al_id, 'ClinVar:' + rcv_match.group(1))

            gu.addPage(
                g, al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" +
                str(al_num).zfill(4))

        elif re.search('moved', status):
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            gu.addDeprecatedIndividual(g, al_id, moved_ids)

        else:
            logger.error('Uncaught alleleic variant status %s', status)
    # end loop allelicVariantList
    return
def _process_genes(self, taxid, limit=None):
    """
    Add the Ensembl genes of one taxon to the graph.

    Reads the tab-separated file registered under ``self.files[taxid]``.
    Each row supplies, in order: Ensembl gene id, external gene name,
    description, gene biotype and Entrez gene id; rows for taxon
    ``'9606'`` carry an HGNC id as a sixth column.

    Per row the gene is added as a class (``ENSEMBL:<id>``) with the
    external name as label and the description (if non-empty), made
    equivalent to ``NCBIGene:<entrez>`` and to the HGNC id when those
    are present, and assigned the taxon ``NCBITaxon:<taxid>``.

    A row with too few columns is logged as an error and ends processing
    immediately (the property-loading calls at the end are then skipped,
    as before).

    :param taxid: taxon number as a string, key into ``self.files``
    :param limit: optional; outside test mode, reading stops after the
        first processed row whose count exceeds this value
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files[taxid]['file']))
    line_counter = 0

    # in the case of human genes we also get the hgnc id as the last col
    is_human = taxid == '9606'
    # The old guard (len(row) < 4) let 4-column rows through to a
    # 5-name unpack and 5-column human rows through to row[5].
    min_columns = 6 if is_human else 5

    logger.info("Processing Ensembl genes for tax %s", taxid)
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t')
        for row in filereader:
            if len(row) < min_columns:
                logger.error("Data error for file %s", raw)
                return
            (ensembl_gene_id, external_gene_name, description,
             gene_biotype, entrezgene) = row[0:5]
            hgnc_id = row[5] if is_human else None

            if self.testMode and entrezgene != '' \
                    and int(entrezgene) not in self.gene_ids:
                continue

            line_counter += 1
            gene_id = 'ENSEMBL:' + ensembl_gene_id
            if description == '':
                description = None

            # NOTE(review): the mapped gene type is computed and then
            # discarded, so genes are added untyped. Kept exactly as
            # found; confirm whether the override is still intended.
            gene_type_id = self._get_gene_type(gene_biotype)
            gene_type_id = None

            gu.addClassToGraph(
                g, gene_id, external_gene_name, gene_type_id, description)

            if entrezgene != '':
                gu.addEquivalentClass(g, gene_id, 'NCBIGene:' + entrezgene)
            if hgnc_id is not None and hgnc_id != '':
                gu.addEquivalentClass(g, gene_id, hgnc_id)
            geno.addTaxon('NCBITaxon:' + taxid, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _process_genes_kegg2ncbi(self, limit=None):
    """
    Map KEGG gene ids to their NCBI Gene counterparts.

    Reads the three-column, tab-separated file registered under
    ``self.files['ncbi']`` (KEGG gene id, NCBI gene id, link type).

    Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>

    :param limit: optional; outside test mode, reading stops after the
        first row whose 1-based number exceeds this value
    :return: None
    """
    logger.info("Processing KEGG gene IDs to NCBI gene IDs")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['ncbi']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row_num, row in enumerate(filereader, start=1):
            (kegg_gene_id, ncbi_gene_id, link_type) = row

            # test mode only lets the configured test genes through
            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            # swap the file's prefix for the CURIE prefix
            ncbi_curie = ncbi_gene_id.replace('ncbi-geneid', 'NCBIGene')
            kegg_curie = 'KEGG-' + kegg_gene_id

            # Adding the KEGG gene here is redundant unless this table
            # lists genes that the genes table does not.
            gu.addClassToGraph(g, kegg_curie, None)
            gu.addClassToGraph(g, ncbi_curie, None)
            gu.addEquivalentClass(g, kegg_curie, ncbi_curie)

            over_limit = limit is not None and row_num > limit
            if not self.testMode and over_limit:
                break

    logger.info("Done with KEGG gene IDs to NCBI gene IDs")
    return
def _process_pathway_pathway(self, limit):
    """
    Assert equivalence between the two identifiers a pathway may have.

    KEGG pathways exist under both "map" and "ko" identifiers. Each row
    of the two-column, tab-separated file registered under
    ``self.files['pathway_pathway']`` pairs two of them; both get the
    ``KEGG-`` prefix and, when they differ, are made equivalent classes.

    :param limit: outside test mode, reading stops after the first row
        whose 1-based number exceeds this value; None means no limit
    :return: None
    """
    logger.info("Processing KEGG pathways to other ids")
    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_pathway']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row_num, row in enumerate(filereader, start=1):
            (first_id, second_id) = row

            if self.testMode and \
                    first_id not in self.test_ids['pathway']:
                continue

            # will look like KEGG-path:map04130 or KEGG-path:ko04130
            first_curie = 'KEGG-' + first_id
            second_curie = 'KEGG-' + second_id

            if first_curie != second_curie:
                gu.addEquivalentClass(g, first_curie, second_curie)

            if not self.testMode and \
                    limit is not None and row_num > limit:
                break

    return
def _get_mappedids(self, entry, g):
    """
    Add the external id mappings found in one OMIM entry to the graph.

    Reads ``entry['externalLinks']`` (nothing is added when absent):

    * ``orphanetDiseases`` -- triple-semicolon delimited list of
      double-semicolon delimited ``number;;internal number;;label``
      items, e.g. ``2970;;566;;Prune belly syndrome``. Each Orphanet id
      is added as a labelled class and as an xref of the OMIM id.
    * ``umlsIDs`` -- comma delimited; each is added as a class and as
      an xref of the OMIM id.
    * ``geneIDs`` -- comma delimited Entrez ids; added as equivalent
      classes, but only when ``self._get_omimtype(entry)`` says the
      entry is a gene.

    :param entry: entry mapping providing ``mimNumber`` and, optionally,
        ``externalLinks``
    :param g: graph the triples are written to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    omimid = 'OMIM:' + str(entry['mimNumber'])

    if 'externalLinks' not in entry:
        return
    links = entry['externalLinks']

    if 'orphanetDiseases' in links:
        for item in links['orphanetDiseases'].split(';;;'):
            (orpha_num, internal_num, orpha_label) = item.split(';;')
            orpha_id = 'Orphanet:' + orpha_num.strip()
            gu.addClassToGraph(g, orpha_id, orpha_label.strip())
            gu.addXref(g, omimid, orpha_id)

    if 'umlsIDs' in links:
        for umls_num in links['umlsIDs'].split(','):
            # strip like the Orphanet branch does, so a list written
            # with spaces after the commas cannot leak into the id
            umls_id = 'UMLS:' + umls_num.strip()
            gu.addClassToGraph(g, umls_id, None)
            gu.addXref(g, omimid, umls_id)

    if self._get_omimtype(entry) == Genotype.genoparts['gene'] \
            and 'geneIDs' in links:
        for entrez_num in links['geneIDs'].split(','):
            gu.addEquivalentClass(
                g, omimid, 'NCBIGene:' + str(entrez_num).strip())

    return
def _process_diseasegene(self, limit):
    """
    Add Orphanet disorder-to-gene associations to the graph.

    Streams the XML file registered under ``self.files["disease-gene"]``
    and, for every ``Disorder`` element:

    * adds the disorder as a labelled class (``Orphanet:<OrphaNumber>``);
    * for each ``DisorderGeneAssociation`` adds the gene as a class
      (typed through ``self._map_gene_type_id``) with its synonyms;
    * maps the association type through ``self._map_rel_id``; unmapped
      types are logged and skipped;
    * creates a variant-locus individual for the gene/disorder pair and
      links it to the disorder with a ``G2PAssoc`` carrying an ECO
      evidence code chosen from the association status;
    * makes the gene equivalent to its Ensembl, HGNC and OMIM
      references.

    In test mode only disorders listed in the configured test ids are
    processed.

    :param limit: outside test mode, processing stops after the first
        disorder whose count exceeds this value; None means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())

    myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

    # reference sources that are asserted equivalent to the gene;
    # all other sources are skipped for now
    eq_prefix = {"Ensembl": "ENSEMBL:", "HGNC": "HGNC:", "OMIM": "OMIM:"}

    for _event, elem in ET.iterparse(myfile):
        if elem.tag != "Disorder":
            continue

        # elem.get('id') is some internal identifier; not used
        disorder_num = elem.find("OrphaNumber").text
        disorder_id = "Orphanet:" + str(disorder_num)

        if self.testMode and \
                disorder_id not in config.get_config()["test_ids"]["disease"]:
            continue

        # The counter was never incremented before, so `limit` had no
        # effect; it now counts processed disorders.
        line_counter += 1
        disorder_label = elem.find("Name").text

        # make a hash of internal gene id to type for later lookup
        gene_iid_to_type = {}
        gene_list = elem.find("GeneList")
        for gene in gene_list.findall("Gene"):
            gene_iid_to_type[gene.get("id")] = \
                gene.find("GeneType").get("id")

        # assuming that these are in the ontology
        gu.addClassToGraph(g, disorder_id, disorder_label)

        assoc_list = elem.find("DisorderGeneAssociationList")
        for a in assoc_list.findall("DisorderGeneAssociation"):
            gene_iid = a.find(".//Gene").get("id")
            gene_name = a.find(".//Gene/Name").text
            gene_symbol = a.find(".//Gene/Symbol").text
            gene_num = a.find("./Gene/OrphaNumber").text
            gene_id = "Orphanet:" + str(gene_num)
            gene_type_id = \
                self._map_gene_type_id(gene_iid_to_type[gene_iid])
            gu.addClassToGraph(
                g, gene_id, gene_symbol, gene_type_id, gene_name)

            syn_list = a.find("./Gene/SynonymList")
            if int(syn_list.get("count")) > 0:
                for s in syn_list.findall("./Synonym"):
                    gu.addSynonym(g, gene_id, s.text)

            dgtype = a.find("DisorderGeneAssociationType").get("id")
            rel_id = self._map_rel_id(dgtype)
            dg_label = a.find("./DisorderGeneAssociationType/Name").text
            if rel_id is None:
                logger.warning(
                    "Cannot map association type (%s) to RO for "
                    "association (%s | %s). Skipping.",
                    dg_label, disorder_label, gene_symbol)
                continue

            alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
            alt_label = " ".join(
                ("some variant of", gene_symbol.strip(), "that is a",
                 dg_label.lower(), disorder_label))
            if self.nobnodes:
                alt_locus_id = ":" + alt_locus_id
            gu.addIndividualToGraph(
                g, alt_locus_id, alt_label,
                geno.genoparts["variant_locus"])
            geno.addAlleleOfGene(alt_locus_id, gene_id)

            # consider typing the gain/loss-of-function variants like:
            # http://sequenceontology.org/browser/current_svn/term/SO:0002054
            # http://sequenceontology.org/browser/current_svn/term/SO:0002053

            # use "assessed" status to issue an evidence code
            # FIXME I think that these codes are sub-optimal
            status_code = a.find("DisorderGeneAssociationStatus").get("id")
            # imported automatically asserted information
            # used in automatic assertion
            eco_id = "ECO:0000323"
            # Assessed
            # TODO are these internal ids stable between releases?
            if status_code == "17991":
                # imported manually asserted information
                # used in automatic assertion
                eco_id = "ECO:0000322"
            # Non-traceable author statement ECO_0000034
            # imported information in automatic assertion ECO_0000313

            assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph(g)

            rlist = a.find("./Gene/ExternalReferenceList")
            for r in rlist.findall("ExternalReference"):
                # Decide per reference; the old code carried the previous
                # id over to references from unhandled sources.
                prefix = eq_prefix.get(r.find("Source").text)
                if prefix is None:
                    continue
                eqid = prefix + r.find("Reference").text
                gu.addClassToGraph(g, eqid, None)
                gu.addEquivalentClass(g, gene_id, eqid)

        elem.clear()  # discard the element

        # Same rule as the sibling parsers: the limit applies outside
        # test mode, and leaving the loop must not skip property loading.
        if not self.testMode and limit is not None \
                and line_counter > limit:
            break

    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadAllProperties(g)

    return
def _get_gene_info(self, limit):
    """
    Loop through the gene_info file and create the genes as classes.

    Each gene is typed through ``self._map_type_of_gene`` and gets its
    symbol as label (none for ``NEWENTRY``), its full name and alternate
    labels as synonyms, and its alternate ids as equivalent classes.
    HPRD ids are attached as gene products instead, and Vega and
    IMGT/GENE-DB ids are skipped. The gene is made a subsequence of its
    chromosome band when the map location looks like a regular band,
    otherwise of the chromosome.

    Only rows whose taxon is in ``self.tax_ids`` are processed; in test
    mode the gene must also be in ``self.gene_ids``.

    :param limit: outside test mode, reading stops after the first
        processed row whose count exceeds this value; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)

    # not unzipping the file
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        gu.addClassToGraph(g, tax_id, None)

    # Raw string: the old non-raw literal relied on invalid escapes.
    # TODO we probably need a different regex per organism
    band_re = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
    related_syn = Assoc.annotation_properties['hasRelatedSynonym']

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip comments
            if line.startswith('#'):
                continue

            # `chrom` rather than `chr`, which shadowed the builtin
            (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
             map_loc, desc, gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date) = line.split('\t')

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue
            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)
            label = None if symbol == 'NEWENTRY' else symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            # we have to do special things here for genes,
            # because they're classes not individuals
            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    gu.addSynonym(g, gene_id, s.strip(), related_syn)
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    gu.addSynonym(g, gene_id, s.strip(), related_syn)

            # deal with the xrefs, which look like
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
            if xrefs.strip() != '-':
                for r in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(r)
                    if fixedr is None or fixedr.strip() == '':
                        continue
                    if re.match('HPRD', fixedr):
                        # proteins are not == genes.
                        gu.addTriple(
                            g, gene_id,
                            self.properties['has_gene_product'], fixedr)
                    elif fixedr.split(':')[0] not in \
                            ['Vega', 'IMGT/GENE-DB']:
                        # skip some of these for now
                        gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263    AMD1P2  X|Y  with Xq28 and Yq12
            # 438    ASMT    X|Y  with Xp22.3 or Yp11.3   # in PAR
            # 419    ART3    4    4q21.1|4p15.1-p14
            # 28227  PPP2R3B X|Y  Xp22.33; Yp11.3         # in PAR
            # 619538 OMS     10|19|3  10q26.3;19q13.42-q13.43;3p25.3
            # 101928066 LOC101928066 1|Un  -   # unlocated scaffold
            # 11435  Chrna1  2    2 C3|2 43.76 cM         # mouse --> 2C3
            # 323212 wu:fb92e12 19|20 -                   # fish
            # The chr placement can't be trusted in this table when more
            # than one is listed; with the exception of human X|Y, only
            # genes aligned to one chr are taken.
            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom != '-' and chrom != '':
                if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                    # this means that there's uncertainty in the mapping.
                    # TODO figure out how to deal with >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                    # NOTE(review): this also skips the taxon assertion
                    # and the limit check below for this gene. Kept as
                    # found; confirm that is intended.
                    continue

                # rewrite the PAR regions for processing
                if chrom == 'X; Y':
                    chrom = 'X|Y'

                # do this in a loop to allow PAR regions like X|Y
                for c in chrom.split('|'):
                    # assume that the chromosome label
                    # will get added elsewhere
                    geno.addChromosomeClass(c, tax_id, None)
                    mychrom = makeChromID(c, tax_num, 'CHR')
                    # temporarily use the taxnum
                    # for the disambiguating label
                    mychrom_syn = makeChromLabel(c, tax_num)
                    gu.addSynonym(g, mychrom, mychrom_syn)

                    if band_re.match(map_loc) is not None:
                        # this matches the regular kind of chrs,
                        # so make that kind of band.
                        # The map_loc already starts with the chromosome
                        # name; strip it first. A plain prefix test
                        # replaces the old re.sub('^'+c, ...), which put
                        # the name into a pattern unescaped.
                        bid = map_loc[len(c):] \
                            if map_loc.startswith(c) else map_loc
                        # the generic location (no coordinates)
                        maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                        # Assume its type will be added elsewhere
                        band = Feature(maploc_id, None, None)
                        band.addFeatureToGraph(g)
                        # add the band as the containing feature
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            maploc_id)
                    else:
                        # TODO handle these cases; examples are:
                        # 15q11-q22, Xp21.2-p11.23, 15q22-qter,
                        # 10q11.1-q24, 12p13.3-p13.2|12p13-p12,
                        # 1p13.3|1p21.3-p13.1, 12cen-q21
                        logger.debug(
                            'not regular band pattern for %s: %s',
                            gene_id, map_loc)
                        # add the gene as a subsequence of the chromosome
                        gu.addTriple(
                            g, gene_id,
                            Feature.object_properties['is_subsequence_of'],
                            mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _get_identifiers(self, limit):
    """
    Process the id mapping file provided by Biogrid.

    The zip archive registered under ``self.files['identifiers']`` is
    opened and its first member read. The file has a very large header,
    which is scanned past (everything up to and including the line
    starting with ``BIOGRID_ID``). Each remaining line holds four
    tab-separated values:

        BIOGRID_ID  IDENTIFIER_VALUE  IDENTIFIER_TYPE  ORGANISM_OFFICIAL_NAME
        1           814566            ENTREZ_GENE      Arabidopsis thaliana

    For organisms in the species filter, identifiers whose mapped prefix
    is in the gene id type filter become equivalent classes of
    ``BIOGRID:<id>``; an ``OFFICIAL_SYMBOL`` becomes the label of the
    biogrid class.

    In test mode only biogrid ids in ``self.biogrid_ids`` are processed.

    :param limit: outside test mode, reading stops once the number of
        rows passing the species filter exceeds this value; None means
        no limit
    :return: None
    """
    logger.info("getting identifier mapping")
    line_counter = 0
    f = '/'.join((self.rawdir, self.files['identifiers']['file']))
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph

    # TODO align this species filter with the one above
    # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
    # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
    # The literal previously read 'H**o sapiens', which no organism
    # name in the file can equal, so human rows were dropped.
    speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

    # TODO make these filters available as commandline options
    # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
    # WormBase,XenBase,ENSEMBL,miRBase'.split(',')
    geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
    # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'

    foundheader = False
    # context managers close the archive even if a row fails to parse
    with ZipFile(f, 'r') as myzip:
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if line.decode().startswith('BIOGRID_ID'):
                        foundheader = True
                    continue

                line = line.decode().strip()
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                # skip any genes that don't match our test set
                if self.testMode and \
                        int(biogrid_num) not in self.biogrid_ids:
                    continue

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:' + biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                if organism_label.strip() in speciesfilters:
                    line_counter += 1
                    if prefix in geneidtypefilters:
                        mapped_id = ':'.join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    elif id_type == 'OFFICIAL_SYMBOL':
                        # this symbol will only get attached
                        # to the biogrid class
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

    return
def _process_omim2disease(self, limit=None):
    """
    Map KEGG disease ids to OMIM disease ids, 1:1 pairs only.

    The whole file registered under ``self.files['omim2disease']`` is
    first read into two lookup tables on the instance
    (``omim_disease_hash``: OMIM id -> KEGG ids, ``kegg_disease_hash``:
    KEGG id -> OMIM ids). Only pairs where each side lists exactly one
    partner are written to the graph.

    Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> equivalentClass <omim_disease_id>

    :param limit: optional; outside test mode, the second pass stops
        once the number of OMIM ids already visited exceeds this value
    :return: None
    """
    logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))

    # first pass: collect the links in both directions
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for (omim_col, kegg_col, link_type) in filereader:
            kegg_id = 'KEGG-' + kegg_col.strip()
            omim_id = omim_col.replace('omim', 'OMIM')

            # OMIM ID -> KEGG IDs
            if omim_id in self.omim_disease_hash:
                self.omim_disease_hash[omim_id].append(kegg_id)
            else:
                self.omim_disease_hash[omim_id] = [kegg_id]

            # KEGG ID -> OMIM IDs
            if kegg_id in self.kegg_disease_hash:
                self.kegg_disease_hash[kegg_id].append(omim_id)
            else:
                self.kegg_disease_hash[kegg_id] = [omim_id]

    # second pass: only 1:1 omim disease:KEGG disease entries are written
    for omim_disease_id in self.omim_disease_hash:
        if self.testMode and \
                omim_disease_id not in self.test_ids['disease']:
            continue

        if not self.testMode and \
                limit is not None and line_counter > limit:
            break
        line_counter += 1

        kegg_partners = self.omim_disease_hash[omim_disease_id]
        if len(kegg_partners) != 1:
            continue
        kegg_disease_id = kegg_partners[0]
        if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
            continue

        # add ids, and deal with the labels separately
        gu.addClassToGraph(g, kegg_disease_id, None)
        gu.addClassToGraph(g, omim_disease_id, None)
        # safe?
        gu.addEquivalentClass(g, kegg_disease_id, omim_disease_id)
        # gu.addXref(g, kegg_disease_id, omim_disease_id)

    logger.info("Done with KEGG disease to OMIM disease mappings.")
    return
def _process_omim2gene(self, limit=None):
    """
    Map OMIM ids to KEGG gene ids, split on the link_type column.

    Reads the three-column, tab-separated file registered under
    ``self.files['omim2gene']`` (KEGG gene id, OMIM id, link type):

    * ``equivalent`` -- the OMIM id is a gene: it is added as a class,
      the KEGG id as a gene, and the two are made equivalent classes.
    * ``reverse`` -- the OMIM id is treated as a disease: a
      variant-locus individual of the KEGG gene is created (its label is
      read from ``self.label_hash``) and associated with the OMIM id
      through a ``G2PAssoc`` using ``is_marker_for``.
    * ``original`` -- sometimes a gene, sometimes a disease; only logged.
    * anything else is logged as a warning.

    :param limit: optional; outside test mode, reading stops after the
        first row whose 1-based number exceeds this value
    :return: None
    """
    logger.info("Processing OMIM to KEGG gene")
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            (kegg_gene_id, omim_id, link_type) = row

            if self.testMode and \
                    kegg_gene_id not in self.test_ids['genes']:
                continue

            kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
            omim_id = re.sub('omim', 'OMIM', omim_id)

            if link_type == 'equivalent':
                # these are genes!
                # so add them as a class then make equivalence
                gu.addClassToGraph(g, omim_id, None)
                geno.addGene(kegg_gene_id, None)
                gu.addEquivalentClass(g, kegg_gene_id, omim_id)
            elif link_type == 'reverse':
                # make an association between an OMIM ID and the KEGG
                # gene ID. We do this with omim ids because they are
                # more atomic than KEGG ids.
                alt_locus_id = \
                    self._make_variant_locus_id(kegg_gene_id, omim_id)
                alt_label = self.label_hash[alt_locus_id]
                gu.addIndividualToGraph(
                    g, alt_locus_id, alt_label,
                    geno.genoparts['variant_locus'])
                geno.addAlleleOfGene(alt_locus_id, kegg_gene_id)

                # Add the disease to gene relationship.
                rel = gu.object_properties['is_marker_for']
                assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel)
                assoc.add_association_to_graph(g)
            elif link_type == 'original':
                # these are sometimes a gene, and sometimes a disease
                logger.info(
                    'Unable to handle original link for %s-%s',
                    kegg_gene_id, omim_id)
            else:
                # don't know what these are
                # (logger.warn is a deprecated alias of logger.warning)
                logger.warning(
                    'Unhandled link type for %s-%s: %s',
                    kegg_gene_id, omim_id, link_type)

            if (not self.testMode) and (
                    limit is not None and line_counter > limit):
                break

    logger.info("Done with OMIM to KEGG gene")
    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)

    return
def _process_genes(self, limit=None):
    """
    Add the HGNC genes to the graph.

    Reads the 48-column, tab-separated file registered under
    ``self.files['genes']``; the first row is a header and is skipped.

    Per row the gene is added as a class under its HGNC id (label =
    symbol, description = name, type through ``self._get_gene_type``),
    deprecated when its locus type is ``withdrawn``, made equivalent to
    its Entrez and Ensembl ids when present, assigned taxon
    ``NCBITaxon:9606``, linked from each listed PubMed id with
    ``is_about``, and made a subsequence of its chromosome band, or of
    the chromosome when the location carries no band.

    In test mode only rows whose Entrez id is empty or listed in
    ``self.gene_ids`` are processed.

    :param limit: optional; outside test mode, reading stops after the
        first processed row whose 1-based number exceeds this value
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    g = self.testgraph if self.testMode else self.graph
    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0

    # Locations are sometimes listed as two ("10p11.2 or 17q25" -- only
    # FRA10A and MPFD), like "1 not on reference assembly", like
    # "10q24.1-q24.3", or like "11q11 alternate reference locus".
    # The chromosome must be followed by an arm or by end-of-string.
    # The old class [pq$] treated "$" as a literal character, so a
    # chromosome-only location never matched and the chromosome
    # fallback below could not be reached.
    chr_re = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_re = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    logger.info("Processing HGNC genes")
    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id, ensembl_gene_id,
             vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
             pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
             homeodb, snornabase, bioparadigms_slc, orphanet,
             pseudogene_org, horde_id, merops, imgt, iuphar,
             kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
             intermediate_filament_db) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self._get_gene_type(locus_type)
            gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                gu.addDeprecatedClass(g, hgnc_id)
            if entrez_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for p in pubmed_id.strip().split('|'):
                    if str(p) != '':
                        gu.addTriple(
                            g, 'PMID:' + str(p.strip()),
                            gu.object_properties['is_about'], hgnc_id)

            # add chr location
            chr_match = chr_re.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                band_match = band_re.search(location)
                feature = Feature(hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                    gu.addClassToGraph(g, band_id, None)
                    feature.addSubsequenceOfFeature(g, band_id)
                else:
                    gu.addClassToGraph(g, chrom_id, None)
                    feature.addSubsequenceOfFeature(g, chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

        # end loop through file

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _process_trait_mappings(self, raw, limit=None):
    """
    Add the animal QTL trait terms and their ontology mappings.

    Reads a comma-separated file whose first row is a header. Rows with
    fewer than eight columns are logged and skipped (the file ends with
    such a line). The fourth column holds the trait as
    ``[ATO #<number>] <label>``; the bracketed part becomes the id
    ``AQTLTrait:<number>`` and the rest the label.

    Triples created:
        <ato_id> is a class, with the trait label
        <ato_id> equivalentClass <vto_id>   (when the id starts with VT:)
        <ato_id> equivalentClass <pto_id>   (when the id starts with PT:)
        <ato_id> hasXref <cmo_id>           (when the id starts with CMO:)

    :param raw: path of the csv file to read
    :param limit: accepted for signature parity with the other parsers;
        it is not applied here, every row is read
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())

    # raw strings: the previous non-raw literals relied on invalid
    # escape sequences (\[ \] \s), which newer Pythons warn about
    after_bracket_re = re.compile(r'\].*')
    through_bracket_re = re.compile(r'.*\]\s*')

    with open(raw, 'r') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip header line
        for row in filereader:
            line_counter += 1
            # need to skip the last line
            if len(row) < 8:
                logger.info(
                    "skipping line %d: %s", line_counter, '\t'.join(row))
                continue
            (vto_id, pto_id, cmo_id, ato_column, species, trait_class,
             trait_type, qtl_count) = row

            # "[ATO #123] label" -> "AQTLTrait:123"
            bracket_content = after_bracket_re.sub(
                '', ato_column.replace('[', ''))
            ato_id = bracket_content.replace('ATO #', 'AQTLTrait:')
            ato_label = through_bracket_re.sub('', ato_column)

            gu.addClassToGraph(g, ato_id, ato_label.strip())

            if vto_id.startswith('VT:'):
                gu.addClassToGraph(g, vto_id, None)
                gu.addEquivalentClass(g, ato_id, vto_id)
            if pto_id.startswith('PT:'):
                gu.addClassToGraph(g, pto_id, None)
                gu.addEquivalentClass(g, ato_id, pto_id)
            if cmo_id.startswith('CMO:'):
                gu.addClassToGraph(g, cmo_id, None)
                gu.addXref(g, ato_id, cmo_id)

    logger.info("Done with trait mappings")
    return
def _process_diseasegene(self, limit):
    """
    Add Orphanet disorder-to-gene associations to the graph.

    Streams the XML file registered under ``self.files['disease-gene']``.
    For each ``Disorder`` element the disorder is added as a labelled
    class, and every ``DisorderGeneAssociation`` below it yields:

    * the gene as a class (typed through ``self._map_gene_type_id``)
      together with its synonyms;
    * a variant-locus individual for the gene/disorder pair, associated
      with the disorder through a ``G2PAssoc`` whose relation comes from
      ``self._map_rel_id`` and whose ECO evidence code depends on the
      association status;
    * equivalence between the gene and its Ensembl, HGNC and OMIM
      references.

    Associations whose type cannot be mapped are logged and skipped.
    In test mode only disorders listed in the configured test ids are
    processed.

    :param limit: outside test mode, processing stops after the first
        disorder whose count exceeds this value; None means no limit
    :return: None
    """
    g = self.testgraph if self.testMode else self.graph
    line_counter = 0
    geno = Genotype(g)
    gu = GraphUtils(curie_map.get())

    myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

    def curie_for(reference):
        """Return the CURIE for a handled reference source, else None."""
        source = reference.find('Source').text
        if source == 'Ensembl':
            return 'ENSEMBL:' + reference.find('Reference').text
        if source == 'HGNC':
            return 'HGNC:' + reference.find('Reference').text
        if source == 'OMIM':
            return 'OMIM:' + reference.find('Reference').text
        return None  # skip the others for now

    # PYLINT complains iterparse deprecated,
    # but as of py 3.4 only the optional & unsupplied parse arg is.
    for _event, elem in ET.iterparse(myfile):
        if elem.tag == 'Disorder':
            # elem.get('id') is some internal identifier; not used
            disorder_num = elem.find('OrphaNumber').text
            disorder_id = 'Orphanet:' + str(disorder_num)

            if self.testMode and \
                    disorder_id not in \
                    config.get_config()['test_ids']['disease']:
                continue

            # count processed disorders; without this the counter
            # stayed at 0 and `limit` could never take effect
            line_counter += 1

            disorder_label = elem.find('Name').text

            # make a hash of internal gene id to type for later lookup
            gene_iid_to_type = {
                gene.get('id'): gene.find('GeneType').get('id')
                for gene in elem.find('GeneList').findall('Gene')}

            # assuming that these are in the ontology
            gu.addClassToGraph(g, disorder_id, disorder_label)

            assoc_list = elem.find('DisorderGeneAssociationList')
            for a in assoc_list.findall('DisorderGeneAssociation'):
                gene_iid = a.find('.//Gene').get('id')
                gene_name = a.find('.//Gene/Name').text
                gene_symbol = a.find('.//Gene/Symbol').text
                gene_num = a.find('./Gene/OrphaNumber').text
                gene_id = 'Orphanet:' + str(gene_num)
                gene_type_id = \
                    self._map_gene_type_id(gene_iid_to_type[gene_iid])
                gu.addClassToGraph(
                    g, gene_id, gene_symbol, gene_type_id, gene_name)

                syn_list = a.find('./Gene/SynonymList')
                if int(syn_list.get('count')) > 0:
                    for s in syn_list.findall('./Synonym'):
                        gu.addSynonym(g, gene_id, s.text)

                dgtype = a.find('DisorderGeneAssociationType').get('id')
                rel_id = self._map_rel_id(dgtype)
                dg_label = \
                    a.find('./DisorderGeneAssociationType/Name').text
                if rel_id is None:
                    logger.warning(
                        "Cannot map association type (%s) to RO " +
                        "for association (%s | %s). Skipping.",
                        dg_label, disorder_label, gene_symbol)
                    continue

                alt_locus_id = '_' + gene_num + '-' + disorder_num + 'VL'
                alt_label = \
                    ' '.join(('some variant of', gene_symbol.strip(),
                              'that is a', dg_label.lower(),
                              disorder_label))
                if self.nobnodes:
                    alt_locus_id = ':' + alt_locus_id
                gu.addIndividualToGraph(g, alt_locus_id, alt_label,
                                        geno.genoparts['variant_locus'])
                geno.addAlleleOfGene(alt_locus_id, gene_id)

                # consider typing the gain/loss-of-function variants like:
                # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                # use "assessed" status to issue an evidence code
                # FIXME I think that these codes are sub-optimal
                status_code = \
                    a.find('DisorderGeneAssociationStatus').get('id')
                # imported automatically asserted information
                # used in automatic assertion
                eco_id = 'ECO:0000323'
                # Assessed
                # TODO are these internal ids stable between releases?
                if status_code == '17991':
                    # imported manually asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000322'
                # Non-traceable author statement ECO_0000034
                # imported information in automatic assertion ECO_0000313

                assoc = G2PAssoc(self.name, alt_locus_id,
                                 disorder_id, rel_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(g)

                rlist = a.find('./Gene/ExternalReferenceList')
                for r in rlist.findall('ExternalReference'):
                    # Resolve each reference on its own; the old code
                    # reused the id of an earlier reference whenever the
                    # current source was one it did not handle.
                    eqid = curie_for(r)
                    if eqid is not None:
                        gu.addClassToGraph(g, eqid, None)
                        gu.addEquivalentClass(g, gene_id, eqid)

            elem.clear()  # discard the element

        # Matches the sibling parsers: the limit applies outside test
        # mode, and stopping early still loads the properties below.
        # The old test (`self.testMode and ...`, then `return`) was
        # inverted and skipped them.
        if not self.testMode and limit is not None \
                and line_counter > limit:
            break

    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadAllProperties(g)

    return