def _add_gene_equivalencies(self, xrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an NCBITaxon ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport'] # deal with the dbxrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for dbxref in xrefs.strip().split('|'): prefix = ':'.join(dbxref.split(':')[:-1]).strip() if prefix in self.localtt: prefix = self.localtt[prefix] dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1])) if dbxref_curie is not None and prefix != '': if prefix == 'HPRD': # proteins are not == genes. model.addTriple( gene_id, self.globaltt['has gene product'], dbxref_curie) continue # skip some of these for now based on curie prefix if prefix in filter_out: continue if prefix == 'ENSEMBL': model.addXref(gene_id, dbxref_curie) if prefix == 'OMIM': if dbxref_curie in self.omim_replaced: repl = self.omim_replaced[dbxref_curie] for omim in repl: if omim in self.omim_type and \ self.omim_type[omim] == self.globaltt['gene']: dbxref_curie = omim if dbxref_curie in self.omim_type and \ self.omim_type[dbxref_curie] != self.globaltt['gene']: continue try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, dbxref_curie) if taxon in clique_map: if clique_map[taxon] == prefix: model.makeLeader(dbxref_curie) elif clique_map[taxon] == gene_id.split(':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, dbxref_curie) except AssertionError as err: LOG.warning("Error parsing %s: %s", gene_id, err)
def _get_mappedids(self, entry, g): """ Extract the Orphanet and UMLS ids as equivalences from the entry :param entry: :return: """ model = Model(g) omimid = 'OMIM:'+str(entry['mimNumber']) orpha_mappings = [] if 'externalLinks' in entry: links = entry['externalLinks'] if 'orphanetDiseases' in links: # triple semi-colon delimited list of # double semi-colon delimited orphanet ID/disease pairs # 2970;;566;;Prune belly syndrome items = links['orphanetDiseases'].split(';;;') for i in items: # note 'internal_num unused (orpha_num, internal_num, orpha_label) = i.split(';;') orpha_id = 'Orphanet:'+orpha_num.strip() orpha_mappings.append(orpha_id) model.addClassToGraph(orpha_id, orpha_label.strip()) model.addXref(omimid, orpha_id) if 'umlsIDs' in links: umls_mappings = links['umlsIDs'].split(',') for i in umls_mappings: umls_id = 'UMLS:'+i model.addClassToGraph(umls_id, None) model.addXref(omimid, umls_id) return
def _add_gene_equivalencies(self, xrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an NCBITaxon ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport'] # deal with the dbxrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for dbxref in xrefs.strip().split('|'): prefix = ':'.join(dbxref.split(':')[:-1]).strip() if prefix in self.localtt: prefix = self.localtt[prefix] dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1])) if dbxref_curie is not None and prefix != '': if prefix == 'HPRD': # proteins are not == genes. model.addTriple(gene_id, self.globaltt['has gene product'], dbxref_curie) continue # skip some of these for now based on curie prefix if prefix in filter_out: continue if prefix == 'ENSEMBL': model.addXref(gene_id, dbxref_curie) if prefix == 'OMIM': if dbxref_curie in self.omim_replaced: repl = self.omim_replaced[dbxref_curie] for omim in repl: if omim in self.omim_type and \ self.omim_type[omim] == self.globaltt['gene']: dbxref_curie = omim if dbxref_curie in self.omim_type and \ self.omim_type[dbxref_curie] != self.globaltt['gene']: continue try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, dbxref_curie) if taxon in clique_map: if clique_map[taxon] == prefix: model.makeLeader(dbxref_curie) elif clique_map[taxon] == gene_id.split(':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, dbxref_curie) except AssertionError as err: LOG.warning("Error parsing %s: %s", gene_id, err)
def _get_mappedids(entry, graph): """ Extract the Orphanet and UMLS ids as equivalences from the entry :param entry: :return: """ model = Model(graph) omim_num = str(entry['mimNumber']) omim_curie = 'OMIM:' + omim_num orpha_mappings = [] if 'externalLinks' in entry: links = entry['externalLinks'] if 'orphanetDiseases' in links: # triple semi-colon delimited list of # double semi-colon delimited orphanet ID/disease pairs # 2970;;566;;Prune belly syndrome items = links['orphanetDiseases'].strip().split(';;;') for item in items: orphdis = item.strip().split(';;') orpha_num = orphdis[0].strip() orpha_label = orphdis[2].strip() orpha_curie = 'ORPHA:' + orpha_num orpha_mappings.append(orpha_curie) model.addClassToGraph(orpha_curie, orpha_label) model.addXref(omim_curie, orpha_curie) if 'umlsIDs' in links: umls_mappings = links['umlsIDs'].split(',') for umls in umls_mappings: umls_curie = 'UMLS:' + umls model.addClassToGraph(umls_curie, None) model.addXref(omim_curie, umls_curie)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.testMode: graph = self.testgraph else: graph = self.graph filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport'] # These will be made xrefs taxon_spec_xref_filters = {'10090': ['ENSEMBL'], '9606': ['ENSEMBL']} if taxon in taxon_spec_xref_filters: taxon_spec_filters = taxon_spec_xref_filters[taxon] else: taxon_spec_filters = [] model = Model(graph) # deal with the xrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for ref in xrefs.strip().split('|'): xref_curie = self._cleanup_id(ref) if xref_curie is not None and xref_curie.strip() != '': if re.match(r'HPRD', xref_curie): # proteins are not == genes. model.addTriple(gene_id, self.properties['has_gene_product'], xref_curie) continue # skip some of these for now if xref_curie.split(':')[0] in filter_out: continue if xref_curie.split(':')[0] in taxon_spec_xref_filters: model.addXref(gene_id, xref_curie) if re.match(r'^OMIM', xref_curie): if DipperUtil.is_omim_disease(xref_curie): continue try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, xref_curie) if int(taxon) in clique_map: if clique_map[int(taxon)] == xref_curie.split( ':')[0]: model.makeLeader(xref_curie) elif clique_map[int(taxon)] == gene_id.split( ':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, xref_curie) except AssertionError as e: logger.warn("Error parsing {0}: {1}".format(gene_id, e)) return
def _process_trait_mappings(self, raw, src_key, limit=None): """ This method mapps traits from/to ... Triples created: :param limit: :return: """ if self.test_mode: graph = self.testgraph else: graph = self.graph line_counter = 0 model = Model(graph) col = self.files[src_key]['columns'] with open(raw, 'r') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') header = next(filereader, None) self.check_fileheader(col, header) for row in filereader: line_counter += 1 # need to skip the last line if len(row) != len(col): LOG.info("skipping line %d: %s", line_counter, '\t'.join(row)) continue vto_id = row[col.index('VT')].strip() pto_id = row[col.index('LPT')].strip() cmo_id = row[col.index('CMO')].strip() ato_column = row[col.index('ATO')].strip() # species = row[col.index('Species')].strip() # trait_class = row[col.index('Class')].strip() # trait_type = row[col.index('Type')].strip() # qtl_count = row[col.index('QTL_Count')].strip() ato_id = re.sub( r'ATO #', 'AQTLTrait:', re.sub( r'\].*', '', re.sub(r'\[', '', ato_column))) ato_id = ato_id.strip() ato_label = re.sub(r'.*\]\s*', '', ato_column) model.addClassToGraph(ato_id, ato_label.strip()) if re.match(r'VT:.*', vto_id): model.addClassToGraph(vto_id, None) model.addEquivalentClass(ato_id, vto_id) if re.match(r'LPT:.*', pto_id): model.addClassToGraph(pto_id, None) model.addXref(ato_id, pto_id) if re.match(r'CMO:.*', cmo_id): model.addClassToGraph(cmo_id, None) model.addXref(ato_id, cmo_id) LOG.info("Done with trait mappings") return
def _process_lida_links_row(self, row): model = Model(self.graph) # lidaurl, omia_id, added_by omia_id = 'OMIA:' + row['omia_id'] lidaurl = row['lidaurl'] if self.test_mode and omia_id not in self.test_ids['disease']: return model.addXref(omia_id, lidaurl, True)
def _process_straininfo(self, limit): # line_counter = 0 # TODO unused if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) logger.info("Processing measurements ...") raw = '/'.join((self.rawdir, self.files['straininfo']['file'])) tax_id = 'NCBITaxon:10090' with open(raw, 'r') as f: reader = csv.reader(f, delimiter=',', quotechar='\"') f.readline() # read the header row; skip for row in reader: (strain_name, vendor, stocknum, panel, mpd_strainid, straintype, n_proj, n_snp_datasets, mpdshortname, url) = row # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.testMode and \ 'MPD:' + str(mpd_strainid) not in self.test_ids: continue strain_id = 'MPD-strain:' + str(mpd_strainid) model.addIndividualToGraph(strain_id, strain_name, tax_id) if mpdshortname.strip() != '': model.addSynonym(strain_id, mpdshortname.strip()) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:'+stocknum model.addSameIndividual(strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum) model.addSameIndividual(strain_id, reiken_id) else: if url != '': model.addXref(strain_id, url, True) if vendor != '': model.addXref( strain_id, ':'.join((vendor, stocknum)), True) # add the panel information if panel != '': desc = panel+' [panel]' model.addDescription(strain_id, desc) # TODO make the panels as a resource collection return
def _process_straininfo(self, limit): # line_counter = 0 # TODO unused if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) logger.info("Processing measurements ...") raw = '/'.join((self.rawdir, self.files['straininfo']['file'])) tax_id = 'NCBITaxon:10090' with open(raw, 'r') as f: reader = csv.reader(f, delimiter=',', quotechar='\"') self.check_header(self.files['straininfo']['file'], f.readline()) for row in reader: (strain_name, vendor, stocknum, panel, mpd_strainid, straintype, n_proj, n_snp_datasets, mpdshortname, url) = row # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.testMode and \ 'MPD:' + str(mpd_strainid) not in self.test_ids: continue strain_id = 'MPD-strain:' + str(mpd_strainid) model.addIndividualToGraph(strain_id, strain_name, tax_id) if mpdshortname.strip() != '': model.addSynonym(strain_id, mpdshortname.strip()) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:'+stocknum model.addSameIndividual(strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum) model.addSameIndividual(strain_id, reiken_id) else: if url != '': model.addXref(strain_id, url, True) if vendor != '': model.addXref( strain_id, ':'.join((vendor, stocknum)), True) # add the panel information if panel != '': desc = panel+' [panel]' model.addDescription(strain_id, desc) # TODO make the panels as a resource collection return
def _process_lida_links_row(self, row): model = Model(self.graph) # lidaurl, omia_id, added_by omia_id = 'OMIA:' + row['omia_id'] lidaurl = row['lidaurl'] if self.test_mode and omia_id not in self.test_ids['disease']: return # LIDIA is hard to find/redolve (404s; suspect offline) # consider changing to model.addSynonym((omia_id, lidaurl) # b/c uri are not literals model.addXref(omia_id, urllib.parse.quote(lidaurl))
def _process_trait_mappings(self, raw, limit=None): """ This method mapps traits from/to ... Triples created: :param limit: :return: """ if self.test_mode: graph = self.testgraph else: graph = self.graph line_counter = 0 model = Model(graph) with open(raw, 'r') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip header line for row in filereader: line_counter += 1 # need to skip the last line if len(row) < 8: LOG.info("skipping line %d: %s", line_counter, '\t'.join(row)) continue (vto_id, pto_id, cmo_id, ato_column, species, trait_class, trait_type, qtl_count) = row ato_id = re.sub( r'ATO #', 'AQTLTrait:', re.sub(r'\].*', '', re.sub(r'\[', '', ato_column))) ato_id = ato_id.strip() ato_label = re.sub(r'.*\]\s*', '', ato_column) model.addClassToGraph(ato_id, ato_label.strip()) if re.match(r'VT:.*', vto_id): model.addClassToGraph(vto_id, None) model.addEquivalentClass(ato_id, vto_id) if re.match(r'LPT:.*', pto_id): model.addClassToGraph(pto_id, None) model.addXref(ato_id, pto_id) if re.match(r'CMO:.*', cmo_id): model.addClassToGraph(cmo_id, None) model.addXref(ato_id, cmo_id) LOG.info("Done with trait mappings") return
def _process_trait_mappings(self, raw, limit=None): """ This method mapps traits from/to ... Triples created: :param limit: :return: """ if self.test_mode: graph = self.testgraph else: graph = self.graph line_counter = 0 model = Model(graph) with open(raw, 'r') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip header line for row in filereader: line_counter += 1 # need to skip the last line if len(row) < 8: LOG.info("skipping line %d: %s", line_counter, '\t'.join(row)) continue (vto_id, pto_id, cmo_id, ato_column, species, trait_class, trait_type, qtl_count) = row ato_id = re.sub( r'ATO #', 'AQTLTrait:', re.sub( r'\].*', '', re.sub(r'\[', '', ato_column))) ato_id = ato_id.strip() ato_label = re.sub(r'.*\]\s*', '', ato_column) model.addClassToGraph(ato_id, ato_label.strip()) if re.match(r'VT:.*', vto_id): model.addClassToGraph(vto_id, None) model.addEquivalentClass(ato_id, vto_id) if re.match(r'LPT:.*', pto_id): model.addClassToGraph(pto_id, None) model.addXref(ato_id, pto_id) if re.match(r'CMO:.*', cmo_id): model.addClassToGraph(cmo_id, None) model.addXref(ato_id, cmo_id) LOG.info("Done with trait mappings") return
def _process_omia_omim_map(self, row): """ Links OMIA groups to OMIM equivalents. :param row: :return: """ # omia_id, omim_id, added_by model = Model(self.graph) omia_id = 'OMIA:' + row['omia_id'] omim_id = 'OMIM:' + row['omim_id'] # also store this for use when we say that a given animal is # a model of a disease if omia_id not in self.omia_omim_map: self.omia_omim_map[omia_id] = set() self.omia_omim_map[omia_id].add(omim_id) if self.test_mode and omia_id not in self.test_ids['disease']: return model.addXref(omia_id, omim_id)
def _get_process_allelic_variants(self, entry, g): model = Model(g) reference = Reference(g) geno = Genotype(g) if entry is not None: # to hold the entry-specific publication mentions # for the allelic variants publist = {} entry_num = entry['mimNumber'] # process the ref list just to get the pmids ref_to_pmid = self._get_pubs(entry, g) if 'allelicVariantList' in entry: allelicVariantList = entry['allelicVariantList'] for al in allelicVariantList: al_num = al['allelicVariant']['number'] al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4) al_label = None al_description = None if al['allelicVariant']['status'] == 'live': publist[al_id] = set() if 'mutations' in al['allelicVariant']: al_label = al['allelicVariant']['mutations'] if 'text' in al['allelicVariant']: al_description = al['allelicVariant']['text'] m = re.findall(r'\{(\d+)\:', al_description) publist[al_id] = set(m) geno.addAllele( al_id, al_label, geno.genoparts['variant_locus'], al_description) geno.addAlleleOfGene( al_id, 'OMIM:'+str(entry_num), geno.object_properties[ 'is_sequence_variant_instance_of']) for r in publist[al_id]: pmid = ref_to_pmid[int(r)] g.addTriple( pmid, model.object_properties['is_about'], al_id) # look up the pubmed id in the list of references if 'dbSnps' in al['allelicVariant']: dbsnp_ids = \ re.split(r',', al['allelicVariant']['dbSnps']) for dnum in dbsnp_ids: did = 'dbSNP:'+dnum.strip() model.addIndividualToGraph(did, None) model.addSameIndividual(al_id, did) if 'clinvarAccessions' in al['allelicVariant']: # clinvarAccessions triple semicolon delimited # each >1 like RCV000020059;;; rcv_ids = \ re.split( r';;;', al['allelicVariant']['clinvarAccessions']) rcv_ids = [ (re.match(r'(RCV\d+);*', r)).group(1) for r in rcv_ids] for rnum in rcv_ids: rid = 'ClinVar:'+rnum model.addXref(al_id, rid) reference.addPage( al_id, "http://omim.org/entry/" + str(entry_num)+"#" + str(al_num).zfill(4)) elif re.search( r'moved', al['allelicVariant']['status']): # for both 'moved' and 'removed' moved_ids = None if 'movedTo' in al['allelicVariant']: moved_id = 'OMIM:'+al['allelicVariant']['movedTo'] moved_ids = [moved_id] model.addDeprecatedIndividual(al_id, moved_ids) else: logger.error('Uncaught alleleic variant status %s', al['allelicVariant']['status']) # end loop allelicVariantList return
def _process_genes(self, taxid, limit=None): if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) geno = Genotype(g) raw = '/'.join((self.rawdir, self.files[taxid]['file'])) line_counter = 0 logger.info("Processing Ensembl genes for tax %s", taxid) with open(raw, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter='\t') for row in filereader: if len(row) < 4: raise ValueError("Data error for file %s", raw) (ensembl_gene_id, external_gene_name, description, gene_biotype, entrezgene, peptide_id, uniprot_swissprot) = row[0:7] # in the case of human genes, we also get the hgnc id, # and is the last col if taxid == '9606': hgnc_id = row[7] else: hgnc_id = None if self.testMode and entrezgene != '' \ and int(entrezgene) not in self.gene_ids: continue line_counter += 1 gene_id = 'ENSEMBL:' + ensembl_gene_id peptide_curie = 'ENSEMBL:{}'.format(peptide_id) uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot) entrez_curie = 'NCBIGene:{}'.format(entrezgene) if description == '': description = None # gene_type_id = self._get_gene_type(gene_biotype) gene_type_id = None model.addClassToGraph( gene_id, external_gene_name, gene_type_id, description) model.addIndividualToGraph(peptide_curie, None, self._get_gene_type("polypeptide")) model.addIndividualToGraph(uniprot_curie, None, self._get_gene_type("polypeptide")) if entrezgene != '': model.addEquivalentClass(gene_id, entrez_curie) if hgnc_id is not None and hgnc_id != '': model.addEquivalentClass(gene_id, hgnc_id) geno.addTaxon('NCBITaxon:'+taxid, gene_id) if peptide_id != '': geno.addGeneProduct(gene_id, peptide_curie) if uniprot_swissprot != '': geno.addGeneProduct(gene_id, uniprot_curie) model.addXref(peptide_curie, uniprot_curie) if not self.testMode \ and limit is not None and line_counter > limit: break return
def _process_ortholog_classes(self, limit=None): """ This method add the KEGG orthology classes to the graph. If there's an embedded enzyme commission number, that is added as an xref. Triples created: <orthology_class_id> is a class <orthology_class_id> has label <orthology_symbols> <orthology_class_id> has description <orthology_description> :param limit: :return: """ LOG.info("Processing ortholog classes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (orthology_class_id, orthology_class_name) = row if self.test_mode and orthology_class_id \ not in self.test_ids['orthology_classes']: continue # The orthology class is essentially a KEGG gene ID # that is species agnostic. # Add the ID and label as a gene family class other_labels = re.split(r'[;,]', orthology_class_name) # the first one is the label we'll use orthology_label = other_labels[0] orthology_class_id = 'KEGG-'+orthology_class_id.strip() orthology_type = self.globaltt['gene_family'] model.addClassToGraph( orthology_class_id, orthology_label, orthology_type) if len(other_labels) > 1: # add the rest as synonyms # todo skip the first for s in other_labels: model.addSynonym(orthology_class_id, s.strip()) # add the last one as the description d = other_labels[len(other_labels)-1] model.addDescription(orthology_class_id, d) # add the enzyme commission number (EC:1.2.99.5)as an xref # sometimes there's two, like [EC:1.3.5.1 1.3.5.4] # can also have a dash, like EC:1.10.3.- ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d) if ec_matches is not None: for ecm in ec_matches: model.addXref(orthology_class_id, 'EC:' + ecm) if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with ortholog classes") return
def _process_qtls_genetic_location( self, raw, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ aql_curie = self.files[common_name + '_cm']['curie'] if self.test_mode: graph = self.testgraph else: graph = self.graph line_counter = 0 geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model_id, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.test_mode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = ':'.join((aql_curie, trait_id.strip())) # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref(qtl_id, dbsnp_id) gene_id = gene_id.replace('uncharacterized ', '').strip() if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_:' + re.sub( r':', '', gene_id) + '-' + peak_mark.strip() geno.addSequenceAlterationToVariantLocus( dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph(trait_id, trait_name) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with QTL genetic info") return
def _process_qtls_genetic_location( self, raw, src_key, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ aql_curie = self.files[src_key]['curie'] common_name = common_name.strip() if self.test_mode: graph = self.testgraph else: graph = self.graph geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') # no header in these files, so no header checking col = self.files[src_key]['columns'] col_len = len(col) for row in reader: if len(row) != col_len and ''.join(row[col_len:]) != '': LOG.warning( "Problem parsing %s line %i containing: \n%s\n" "got %i cols but expected %i", raw, reader.line_num, row, len(row), col_len) # LOG.info(row) continue qtl_id = row[col.index('QTL_ID')].strip() qtl_symbol = row[col.index('QTL_symbol')].strip() trait_name = row[col.index('Trait_name')].strip() # assotype = row[col.index('assotype')].strip() chromosome = row[col.index('Chromosome')].strip() position_cm = row[col.index('Position_cm')].strip() range_cm = row[col.index('range_cm')].strip() # flankmark_a2 = row[col.index('FlankMark_A2')].strip() # flankmark_a1 = row[col.index('FlankMark_A1')].strip() peak_mark = row[col.index('Peak_Mark')].strip() # flankmark_b1 = row[col.index('FlankMark_B1')].strip() # flankmark_b2 = row[col.index('FlankMark_B2')].strip() # exp_id = row[col.index('Exp_ID')].strip() # model_id = row[col.index('Model')].strip() # test_base = row[col.index('testbase')].strip() # sig_level = row[col.index('siglevel')].strip() # lod_score = row[col.index('LOD_score')].strip() # ls_mean = row[col.index('LS_mean')].strip() p_values = row[col.index('P_values')].strip() # f_statistics = row[col.index('F_Statistics')].strip() # variance = row[col.index('VARIANCE')].strip() # bayes_value = row[col.index('Bayes_value')].strip() # likelihood_ratio = row[col.index('LikelihoodR')].strip() trait_id = row[col.index('TRAIT_ID')].strip() # dom_effect = row[col.index('Dom_effect')].strip() # add_effect = row[col.index('Add_effect')].strip() pubmed_id = row[col.index('PUBMED_ID')].strip() gene_id = row[col.index('geneID')].strip() gene_id_src = row[col.index('geneIDsrc')].strip() # gene_id_type = row[col.index('geneIDtype')].strip() if self.test_mode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = ':'.join((aql_curie, trait_id.strip())) # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:' + common_name + '-linkage' build_label = common_name + ' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:' + peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref( qtl_id, dbsnp_id, xref_category=blv.terms['SequenceVariant']) gene_id = gene_id.replace('uncharacterized ', '').strip() gene_id = gene_id.strip(',') # for "100157483," in pig_QTLdata.txt if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id as a bnode vl_id = self.make_id(re.sub( r':', '', gene_id) + '-' + peak_mark.strip(), '_') geno.addSequenceAlterationToVariantLocus(dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph( trait_id, trait_name, class_category=blv.terms['PhenotypicFeature']) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:' + pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # off by one - the following actually gives us (limit + 1) records if not self.test_mode and limit is not None and reader.line_num > limit: break LOG.info("Done with QTL genetic info")
def _process_qtls_genetic_location( self, raw, txid, common_name, limit=None): """ This function processes Triples created: :param limit: :return: """ if self.testMode: graph = self.testgraph else: graph = self.graph line_counter = 0 geno = Genotype(graph) model = Model(graph) eco_id = self.globaltt['quantitative trait analysis evidence'] taxon_curie = 'NCBITaxon:' + txid LOG.info("Processing genetic location for %s from %s", taxon_curie, raw) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (qtl_id, qtl_symbol, trait_name, assotype, empty, chromosome, position_cm, range_cm, flankmark_a2, flankmark_a1, peak_mark, flankmark_b1, flankmark_b2, exp_id, model_id, test_base, sig_level, lod_score, ls_mean, p_values, f_statistics, variance, bayes_value, likelihood_ratio, trait_id, dom_effect, add_effect, pubmed_id, gene_id, gene_id_src, gene_id_type, empty2) = row if self.testMode and int(qtl_id) not in self.test_ids: continue qtl_id = common_name + 'QTL:' + qtl_id.strip() trait_id = 'AQTLTrait:' + trait_id.strip() # Add QTL to graph feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL']) feature.addTaxonToFeature(taxon_curie) # deal with the chromosome chrom_id = makeChromID(chromosome, taxon_curie, 'CHR') # add a version of the chromosome which is defined as # the genetic map build_id = 'MONARCH:'+common_name.strip()+'-linkage' build_label = common_name+' genetic map' geno.addReferenceGenome(build_id, build_label, taxon_curie) chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH') geno.addChromosomeInstance( chromosome, build_id, build_label, chrom_id) start = stop = None # range_cm sometimes ends in "(Mb)" (i.e pig 2016 Nov) range_mb = re.split(r'\(', range_cm) if range_mb is not None: range_cm = range_mb[0] if re.search(r'[0-9].*-.*[0-9]', range_cm): range_parts = re.split(r'-', range_cm) # check for poorly formed ranges if len(range_parts) == 2 and\ range_parts[0] != '' and range_parts[1] != '': (start, stop) = [ int(float(x.strip())) for x in re.split(r'-', range_cm)] else: LOG.info( "A cM range we can't handle for QTL %s: %s", qtl_id, range_cm) elif position_cm != '': match = re.match(r'([0-9]*\.[0-9]*)', position_cm) if match is not None: position_cm = match.group() start = stop = int(float(position_cm)) # FIXME remove converion to int for start/stop # when schema can handle floats add in the genetic location # based on the range feature.addFeatureStartLocation( start, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureEndLocation( stop, chrom_in_build_id, None, [self.globaltt['FuzzyPosition']]) feature.addFeatureToGraph() # sometimes there's a peak marker, like a rsid. # we want to add that as a variant of the gene, # and xref it to the qtl. dbsnp_id = None if peak_mark != '' and peak_mark != '.' and \ re.match(r'rs', peak_mark.strip()): dbsnp_id = 'dbSNP:'+peak_mark.strip() model.addIndividualToGraph( dbsnp_id, None, self.globaltt['sequence_alteration']) model.addXref(qtl_id, dbsnp_id) gene_id = gene_id.replace('uncharacterized ', '').strip() if gene_id is not None and gene_id != '' and gene_id != '.'\ and re.fullmatch(r'[^ ]*', gene_id) is not None: # we assume if no src is provided and gene_id is an integer, # then it is an NCBI gene ... (okay, lets crank that back a notch) if gene_id_src == '' and gene_id.isdigit() and \ gene_id in self.gene_info: # LOG.info( # 'Warm & Fuzzy saying %s is a NCBI gene for %s', # gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '' and gene_id.isdigit(): LOG.warning( 'Cold & Prickely saying %s is a NCBI gene for %s', gene_id, common_name) gene_id_src = 'NCBIgene' elif gene_id_src == '': LOG.error( ' "%s" is a NOT NCBI gene for %s', gene_id, common_name) gene_id_src = None if gene_id_src == 'NCBIgene': gene_id = 'NCBIGene:' + gene_id # we will expect that these will get labels elsewhere geno.addGene(gene_id, None) # FIXME what is the right relationship here? geno.addAffectedLocus(qtl_id, gene_id) if dbsnp_id is not None: # add the rsid as a seq alt of the gene_id vl_id = '_:' + re.sub( r':', '', gene_id) + '-' + peak_mark.strip() geno.addSequenceAlterationToVariantLocus( dbsnp_id, vl_id) geno.addAffectedLocus(vl_id, gene_id) # add the trait model.addClassToGraph(trait_id, trait_name) # Add publication reference = None if re.match(r'ISU.*', pubmed_id): pub_id = 'AQTLPub:'+pubmed_id.strip() reference = Reference(graph, pub_id) elif pubmed_id != '': pub_id = 'PMID:' + pubmed_id.strip() reference = Reference( graph, pub_id, self.globaltt['journal article']) if reference is not None: reference.addRefToGraph() # make the association to the QTL assoc = G2PAssoc( graph, self.name, qtl_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id as evidence # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) # international notation if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() # make the association to the dbsnp_id, if found if dbsnp_id is not None: # make the association to the dbsnp_id assoc = G2PAssoc( graph, self.name, dbsnp_id, trait_id, self.globaltt['is marker for']) assoc.add_evidence(eco_id) assoc.add_source(pub_id) # create a description from the contents of the file # desc = '' # assoc.addDescription(g, assoc_id, desc) # TODO add exp_id # if exp_id != '': # exp_id = 'AQTLExp:'+exp_id # gu.addIndividualToGraph(g, exp_id, None, eco_id) if p_values != '': scr = re.sub(r'<', '', p_values) scr = re.sub(r',', '.', scr) if scr.isnumeric(): score = float(scr) assoc.set_score(score) # todo add score type # TODO add LOD score? assoc.add_association_to_graph() if not self.testMode and limit is not None and line_counter > limit: break LOG.info("Done with QTL genetic info") return
def _process_genes(self, taxid, limit=None): if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files[taxid]['file'])) col = list(self.columns['bmq_attributes']) if taxid != '9606' and 'hgnc_id' in col: col.remove('hgnc_id') col_exp = [ self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)] for x in col] LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid) with open(raw, 'r', encoding="utf8") as csvfile: reader = csv.reader(csvfile, delimiter='\t') row = next(reader) if not self.check_fileheader(col_exp, row): pass for row in reader: ensembl_gene_id = row[col.index('ensembl_gene_id')] external_gene_name = row[col.index('external_gene_name')] description = row[col.index('description')].strip() gene_biotype = row[col.index('gene_biotype')].strip() entrezgene = row[col.index('entrezgene_id')].strip() ensembl_peptide_id = row[col.index('ensembl_peptide_id')].strip() uniprotswissprot = row[col.index('uniprotswissprot')].strip() hgnc_curie = None # in the case of human genes, we also get the hgnc id, if taxid == '9606' and 'hgnc_id' in col: hgnc_curie = row[col.index('hgnc_id')].strip() if self.test_mode and entrezgene != '' and \ entrezgene not in self.gene_ids: continue gene_id = 'ENSEMBL:' + ensembl_gene_id entrez_curie = 'NCBIGene:{}'.format(entrezgene) if description == '': description = None gene_type_id = self.resolve( gene_biotype, mandatory=False, default=self.globaltt['polypeptide']) model.addClassToGraph( gene_id, external_gene_name, gene_type_id, description) if entrezgene != '': if taxid == '9606': # Use HGNC for eq in human data model.addXref(gene_id, entrez_curie) else: model.addEquivalentClass(gene_id, entrez_curie) if hgnc_curie is not None and hgnc_curie != '': model.addEquivalentClass(gene_id, hgnc_curie) geno.addTaxon('NCBITaxon:' + taxid, gene_id) if ensembl_peptide_id is not None and ensembl_peptide_id != '': peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id) model.addIndividualToGraph(peptide_curie, None, gene_type_id) geno.addGeneProduct(gene_id, peptide_curie) if uniprotswissprot != '': uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot) model.addIndividualToGraph(uniprot_curie, None, gene_type_id) geno.addGeneProduct(gene_id, uniprot_curie) model.addXref(peptide_curie, uniprot_curie) if not self.test_mode and limit is not None and reader.line_num > limit: break
def _get_process_allelic_variants(self, entry, graph): model = Model(graph) reference = Reference(graph) geno = Genotype(graph) if entry is not None: # to hold the entry-specific publication mentions # for the allelic variants publist = {} entry_num = entry['mimNumber'] # process the ref list just to get the pmids ref_to_pmid = self._get_pubs(entry, graph) if 'allelicVariantList' in entry: for alv in entry['allelicVariantList']: al_num = alv['allelicVariant']['number'] al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill( 4) al_label = None al_description = None if alv['allelicVariant']['status'] == 'live': publist[al_id] = set() if 'mutations' in alv['allelicVariant']: al_label = alv['allelicVariant']['mutations'] if 'text' in alv['allelicVariant']: al_description = alv['allelicVariant']['text'] mch = re.findall(r'\{(\d+)\:', al_description) publist[al_id] = set(mch) geno.addAllele(al_id, al_label, self.globaltt['variant_locus'], al_description) geno.addAlleleOfGene(al_id, 'OMIM:' + str(entry_num), self.globaltt['is_allele_of']) for ref in publist[al_id]: pmid = ref_to_pmid[int(ref)] graph.addTriple(pmid, self.globaltt['is_about'], al_id) # look up the pubmed id in the list of references if 'dbSnps' in alv['allelicVariant']: dbsnp_ids = re.split( r',', alv['allelicVariant']['dbSnps']) for dnum in dbsnp_ids: did = 'dbSNP:' + dnum.strip() model.addIndividualToGraph(did, None) model.addSameIndividual(al_id, did) # Note that RCVs are variant to disease associations # in ClinVar, rather than variant entries # so we make these xrefs instead of equivalents if 'clinvarAccessions' in alv['allelicVariant']: # clinvarAccessions triple semicolon delimited # each >1 like RCV000020059;;; rcv_ids = \ alv['allelicVariant']['clinvarAccessions'].split(';;;') rcv_ids = [rcv[:12] for rcv in rcv_ids] # incase more cruft for rnum in rcv_ids: rid = 'ClinVar:' + rnum model.addXref(al_id, rid) reference.addPage( al_id, "http://omim.org/entry/" + '#'.join( (str(entry_num), str(al_num).zfill(4)))) elif re.search(r'moved', alv['allelicVariant']['status']): # for both 'moved' and 'removed' moved_ids = None if 'movedTo' in alv['allelicVariant']: moved_id = 'OMIM:' + alv['allelicVariant'][ 'movedTo'] moved_ids = [moved_id] model.addDeprecatedIndividual(al_id, moved_ids) else: LOG.error('Uncaught alleleic variant status %s', alv['allelicVariant']['status'])
def _process_straininfo(self, limit): src_key = 'straininfo' raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info('Processing measurementsfrom file: %s', raw) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) tax_id = self.globaltt['Mus musculus'] col = self.files[src_key]['columns'] with open(raw, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='\"') row = next(reader) if self.check_fileheader(col, row): pass for row in reader: if not row: continue # skip blank rows strain_name = row[col.index('strainname')] vendor = row[col.index('vendor')] stocknum = row[col.index('stocknum')] panel = row[col.index('panel')] mpd_strainid = str(row[col.index('mpd_strainid')]) # straintype = row[col.index('straintype')] # n_proj = row[col.index('n_proj')] # n_snp_datasets = row[col.index('n_snp_datasets')] mpdshortname = row[col.index('mpd_shortname')].strip() url = row[col.index('url')] # new? # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.test_mode and 'MPD:' + mpd_strainid not in self.test_ids: continue strain_id = 'MPD-strain:' + mpd_strainid model.addIndividualToGraph(strain_id, strain_name, tax_id) if mpdshortname != '': model.addSynonym(strain_id, mpdshortname) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:' + stocknum model.addSameIndividual(strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:' + stocknum model.addSameIndividual(strain_id, reiken_id) else: if url != '': model.addXref(strain_id, url, True) if vendor != '': model.addXref(strain_id, ':'.join( (vendor, stocknum)), True) # add the panel information if panel != '': desc = panel + ' [panel]' model.addDescription(strain_id, desc)
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an NCBITaxon ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', ''] # deal with the dbxrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for dbxref in dbxrefs.strip().split('|'): prefix = ':'.join( dbxref.split(':')[:-1]).strip() # restore nonterminal ':' if prefix in self.localtt: prefix = self.localtt[prefix] # skip some of these for now based on curie prefix if prefix in filter_out: continue if prefix == 'AnimalQTLdb' and taxon in self.informal_species: prefix = self.informal_species[taxon] + 'QTL' dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1])) if dbxref_curie is not None: if prefix == 'HPRD': # proteins are not == genes. model.addTriple(gene_id, self.globaltt['has gene product'], dbxref_curie) continue if prefix == 'ENSEMBL': model.addXref(gene_id, dbxref_curie) if prefix == 'OMIM': omim_num = dbxref_curie[5:] if omim_num in self.omim_replaced: repl = self.omim_replaced[omim_num] for omim in repl: if omim in self.omim_type and \ self.omim_type[omim] == self.globaltt['gene']: dbxref_curie = 'OMIM:' + omim model.addXref(gene_id, dbxref_curie) omim_num = omim # last wins elif omim_num in self.omim_type and\ self.omim_type[omim_num] == self.globaltt['gene']: model.addXref(gene_id, dbxref_curie) else: continue # no equivilance between ncbigene and omin-nongene # designate clique leaders # (perhaps premature as this ingest can't know what else exists) try: if self.class_or_indiv.get(gene_id) == 'C': model.addEquivalentClass(gene_id, dbxref_curie) if taxon in clique_map: if clique_map[taxon] == prefix: model.makeLeader(dbxref_curie) elif clique_map[taxon] == gene_id.split(':')[0]: model.makeLeader(gene_id) else: model.addSameIndividual(gene_id, dbxref_curie) except AssertionError as err: LOG.warning("Error parsing %s: %s", gene_id, err)
def _get_variants(self, limit): """ Currently loops through the variant_summary file. :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) geno = Genotype(g) f = Feature(g, None, None, None) # add the taxon and the genome tax_num = '9606' # HARDCODE tax_id = 'NCBITaxon:' + tax_num tax_label = 'Human' model.addClassToGraph(tax_id, None) geno.addGenome(tax_id, tax_label) # label gets added elsewhere # not unzipping the file logger.info("Processing Variant records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['variant_summary']['file'])) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue # AlleleID integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML) # Type character, the type of variation # Name character, the preferred name for the variation # GeneID integer, GeneID in NCBI's Gene database # GeneSymbol character, comma-separated list of GeneIDs overlapping the variation # ClinicalSignificance character, comma-separated list of values of clinical significance reported for this variation # for the mapping between the terms listed here and the integers in the .VCF files, see # http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/ # RS# (dbSNP) integer, rs# in dbSNP # nsv (dbVar) character, the NSV identifier for the region in dbVar # RCVaccession character, list of RCV accessions that report this variant # TestedInGTR character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR) # PhenotypeIDs character, list of db names and identifiers for phenotype(s) reported for this variant # Origin character, list of all allelic origins for this variation # Assembly character, name of the assembly on which locations are based # Chromosome character, chromosomal location # Start integer, starting location, in pter->qter orientation # Stop integer, end location, in pter->qter orientation # Cytogenetic character, ISCN band # ReviewStatus character, highest review status for reporting this measure. For the key to the terms, # and their relationship to the star graphics ClinVar displays on its web pages, # see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation # HGVS(c.) character, RefSeq cDNA-based HGVS expression # HGVS(p.) character, RefSeq protein-based HGVS expression # NumberSubmitters integer, number of submissions with this variant # LastEvaluated datetime, the latest time any submitter reported clinical significance # Guidelines character, ACMG only right now, for the reporting of incidental variation in a Gene # (NOTE: if ACMG, not a specific to the allele but to the Gene) # OtherIDs character, list of other identifiers or sources of information about this variant # VariantID integer, the value used to build the URL for the current default report, # e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/ # # a crude check that there's an expected number of cols. # if not, error out because something changed. num_cols = len(line.split('\t')) expected_numcols = 29 if num_cols != expected_numcols: logger.error( "Unexpected number of columns in raw file " + "(%d actual vs %d expected)", num_cols, expected_numcols) (allele_num, allele_type, allele_name, gene_num, gene_symbol, clinical_significance, dbsnp_num, dbvar_num, rcv_nums, tested_in_gtr, phenotype_ids, origin, assembly, chr, start, stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p, number_of_submitters, last_eval, guidelines, other_ids, variant_num, reference_allele, alternate_allele, categories, ChromosomeAccession) = line.split('\t') # ###set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and\ # (int(tax_num) not in self.tax_ids)) or\ # (self.filter == 'geneids' and\ # (int(gene_num) not in self.gene_ids))): # continue # #### end filter line_counter += 1 pheno_list = [] if phenotype_ids != '-': # trim any leading/trailing semicolons/commas phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids) phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids) pheno_list = re.split(r'[,;]', phenotype_ids) if self.testMode: # get intersection of test disease ids # and these phenotype_ids intersect = \ list( set([str(i) for i in self.disease_ids]) & set(pheno_list)) if int(gene_num) not in self.gene_ids and\ int(variant_num) not in self.variant_ids and\ len(intersect) < 1: continue # TODO may need to switch on assembly to create correct # assembly/build identifiers build_id = ':'.join(('NCBIGenome', assembly)) # make the reference genome build geno.addReferenceGenome(build_id, assembly, tax_id) allele_type_id = self._map_type_of_allele(allele_type) bandinbuild_id = None if str(chr) == '': # check cytogenic location if str(cytogenetic_loc).strip() != '': # use cytogenic location to get the apx location # oddly, they still put an assembly number even when # there's no numeric location if not re.search(r'-', str(cytogenetic_loc)): band_id = makeChromID( re.split(r'-', str(cytogenetic_loc)), tax_num, 'CHR') geno.addChromosomeInstance(cytogenetic_loc, build_id, assembly, band_id) bandinbuild_id = makeChromID( re.split(r'-', str(cytogenetic_loc)), assembly, 'MONARCH') else: # can't deal with ranges yet pass else: # add the human chromosome class to the graph, # and add the build-specific version of it chr_id = makeChromID(str(chr), tax_num, 'CHR') geno.addChromosomeClass(str(chr), tax_id, tax_label) geno.addChromosomeInstance(str(chr), build_id, assembly, chr_id) chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH') seqalt_id = ':'.join(('ClinVarVariant', variant_num)) gene_id = None # they use -1 to indicate unknown gene if str(gene_num) != '-1' and str(gene_num) != 'more than 10': if re.match(r'^Gene:', gene_num): gene_num = "NCBI" + gene_num else: gene_id = ':'.join(('NCBIGene', str(gene_num))) # FIXME there are some "variants" that are actually haplotypes # probably will get taken care of when we switch to processing # the xml for example, variant_num = 38562 # but there's no way to tell if it's a haplotype # in the csv data so the dbsnp or dbvar # should probably be primary, # and the variant num be the vslc, # with each of the dbsnps being added to it # TODO clinical significance needs to be mapped to # a list of terms # first, make the variant: f = Feature(seqalt_id, allele_name, allele_type_id) if start != '-' and start.strip() != '': f.addFeatureStartLocation(start, chrinbuild_id) if stop != '-' and stop.strip() != '': f.addFeatureEndLocation(stop, chrinbuild_id) f.addFeatureToGraph() f.addTaxonToFeature(tax_id) # make the ClinVarVariant the clique leader model.makeLeader(seqalt_id) if bandinbuild_id is not None: f.addSubsequenceOfFeature(bandinbuild_id) # CHECK - this makes the assumption that there is # only one affected chromosome per variant what happens with # chromosomal rearrangement variants? # shouldn't both chromosomes be here? # add the hgvs as synonyms if hgvs_c != '-' and hgvs_c.strip() != '': model.addSynonym(seqalt_id, hgvs_c) if hgvs_p != '-' and hgvs_p.strip() != '': model.addSynonym(seqalt_id, hgvs_p) # add the dbsnp and dbvar ids as equivalent if dbsnp_num != '-' and int(dbsnp_num) != -1: dbsnp_id = 'dbSNP:rs' + str(dbsnp_num) model.addIndividualToGraph(dbsnp_id, None) model.addSameIndividual(seqalt_id, dbsnp_id) if dbvar_num != '-': dbvar_id = 'dbVar:' + dbvar_num model.addIndividualToGraph(dbvar_id, None) model.addSameIndividual(seqalt_id, dbvar_id) # TODO - not sure if this is right... add as xref? # the rcv is like the combo of the phenotype with the variant if rcv_nums != '-': for rcv_num in re.split(r';', rcv_nums): rcv_id = 'ClinVar:' + rcv_num model.addIndividualToGraph(rcv_id, None) model.addXref(seqalt_id, rcv_id) if gene_id is not None: # add the gene model.addClassToGraph(gene_id, gene_symbol) # make a variant locus vl_id = '_' + gene_num + '-' + variant_num if self.nobnodes: vl_id = ':' + vl_id vl_label = allele_name model.addIndividualToGraph(vl_id, vl_label, geno.genoparts['variant_locus']) geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id) geno.addAlleleOfGene(vl_id, gene_id) else: # some basic reporting gmatch = re.search(r'\(\w+\)', allele_name) if gmatch is not None and len(gmatch.groups()) > 0: logger.info( "Gene found in allele label, but no id provided: %s", gmatch.group(1)) elif re.match(r'more than 10', gene_symbol): logger.info( "More than 10 genes found; " "need to process XML to fetch (variant=%d)", int(variant_num)) else: logger.info("No gene listed for variant %d", int(variant_num)) # parse the list of "phenotypes" which are diseases. # add them as an association # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374 # the list is both semicolon delimited and comma delimited, # but i don't know why! some are bad, like: # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000 if phenotype_ids != '-': for phenotype in pheno_list: m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype) if m is not None and len(m.groups()) > 0: phenotype = re.sub(m.group(1), 'Orphanet:', phenotype.strip()) elif re.match(r'ORPHA:\d+', phenotype): phenotype = re.sub(r'^ORPHA', 'Orphanet', phenotype.strip()) elif re.match(r'Human Phenotype Ontology', phenotype): phenotype = re.sub(r'^Human Phenotype Ontology', '', phenotype.strip()) elif re.match(r'SNOMED CT:\s?', phenotype): phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip()) elif re.match(r'^Gene:', phenotype): continue assoc = G2PAssoc(g, self.name, seqalt_id, phenotype.strip()) assoc.add_association_to_graph() if other_ids != '-': id_list = other_ids.split(',') # process the "other ids" ex: # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001 # TODO make more xrefs for xrefid in id_list: prefix = xrefid.split(':')[0].strip() if prefix == 'OMIM Allelic Variant': xrefid = 'OMIM:' + xrefid.split(':')[1] model.addIndividualToGraph(xrefid, None) model.addSameIndividual(seqalt_id, xrefid) elif prefix == 'HGMD': model.addIndividualToGraph(xrefid, None) model.addSameIndividual(seqalt_id, xrefid) elif prefix == 'dbVar' \ and dbvar_num == xrefid.split(':')[1].strip(): pass # skip over this one elif re.search(r'\s', prefix): pass # logger.debug( # 'xref prefix has a space: %s', xrefid) else: # should be a good clean prefix # note that HGMD variants are in here as Xrefs # because we can't resolve URIs for them # logger.info("Adding xref: %s", xrefid) # gu.addXref(g, seqalt_id, xrefid) # logger.info("xref prefix to add: %s", xrefid) pass if not self.testMode and limit is not None \ and line_counter > limit: break logger.info("Finished parsing variants") return
def _process_genes(self, taxid, limit=None): if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) geno = Genotype(g) raw = '/'.join((self.rawdir, self.files[taxid]['file'])) line_counter = 0 logger.info("Processing Ensembl genes for tax %s", taxid) with open(raw, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter='\t') for row in filereader: if len(row) < 4: raise ValueError("Data error for file %s", raw) (ensembl_gene_id, external_gene_name, description, gene_biotype, entrezgene, peptide_id, uniprot_swissprot) = row[0:7] # in the case of human genes, we also get the hgnc id, # and is the last col if taxid == '9606': hgnc_id = row[7] else: hgnc_id = None if self.testMode and entrezgene != '' \ and int(entrezgene) not in self.gene_ids: continue line_counter += 1 gene_id = 'ENSEMBL:' + ensembl_gene_id peptide_curie = 'ENSEMBL:{}'.format(peptide_id) uniprot_curie = 'UniProtKB:{}'.format(uniprot_swissprot) entrez_curie = 'NCBIGene:{}'.format(entrezgene) if description == '': description = None # gene_type_id = self._get_gene_type(gene_biotype) gene_type_id = None model.addClassToGraph(gene_id, external_gene_name, gene_type_id, description) model.addIndividualToGraph(peptide_curie, None, self._get_gene_type("polypeptide")) model.addIndividualToGraph(uniprot_curie, None, self._get_gene_type("polypeptide")) if entrezgene != '': model.addEquivalentClass(gene_id, entrez_curie) if hgnc_id is not None and hgnc_id != '': model.addEquivalentClass(gene_id, hgnc_id) geno.addTaxon('NCBITaxon:' + taxid, gene_id) if peptide_id != '': geno.addGeneProduct(gene_id, peptide_curie) if uniprot_swissprot != '': geno.addGeneProduct(gene_id, uniprot_curie) model.addXref(peptide_curie, uniprot_curie) if not self.testMode \ and limit is not None and line_counter > limit: break return
def _process_ortholog_classes(self, limit=None): """ This method add the KEGG orthology classes to the graph. If there's an embedded enzyme commission number, that is added as an xref. Triples created: <orthology_class_id> is a class <orthology_class_id> has label <orthology_symbols> <orthology_class_id> has description <orthology_description> :param limit: :return: """ LOG.info("Processing ortholog classes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: reader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in reader: (orthology_class_id, orthology_class_name) = row if self.test_mode and orthology_class_id \ not in self.test_ids['orthology_classes']: continue # The orthology class is essentially a KEGG gene ID # that is species agnostic. # Add the ID and label as a gene family class other_labels = re.split(r'[;,]', orthology_class_name) # the first one is the label we'll use orthology_label = other_labels[0] orthology_class_id = 'KEGG-'+orthology_class_id.strip() orthology_type = self.globaltt['gene_family'] model.addClassToGraph( orthology_class_id, orthology_label, orthology_type) if len(other_labels) > 1: # add the rest as synonyms # todo skip the first for s in other_labels: model.addSynonym(orthology_class_id, s.strip()) # add the last one as the description d = other_labels[len(other_labels)-1] model.addDescription(orthology_class_id, d) # add the enzyme commission number (EC:1.2.99.5)as an xref # sometimes there's two, like [EC:1.3.5.1 1.3.5.4] # can also have a dash, like EC:1.10.3.- ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d) if ec_matches is not None: for ecm in ec_matches: model.addXref(orthology_class_id, 'EC:' + ecm) if not self.test_mode and limit is not None and reader.line_num > limit: break LOG.info("Done with ortholog classes")
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon): """ Add equivalentClass and sameAs relationships Uses external resource map located in /resources/clique_leader.yaml to determine if an NCBITaxon ID space is a clique leader """ clique_map = self.open_and_parse_yaml(self.resources['clique_leader']) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '', None] # deal with the dbxrefs # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696 for dbxref in dbxrefs.strip().split('|'): dbxref = dbxref.strip() # de stutter dbxref (prefix, local_id) = dbxref.split(':')[-2:] prefix = prefix.strip() local_id = local_id.strip() # skip some of these based on curie prefix or malformatting if prefix is None or prefix in filter_out or \ local_id is None or local_id == '': continue if prefix in self.localtt: prefix = self.localtt[prefix] if prefix == 'AnimalQTLdb' and taxon in self.informal_species: prefix = self.informal_species[taxon] + 'QTL' elif prefix == 'AnimalQTLdb': LOG.warning('Unknown AnimalQTLdb species %s for %s:%s', taxon, prefix, local_id) # else: # taxon is not in informal species (not unexpected) dbxref_curie = ':'.join((prefix, local_id)) if dbxref_curie is not None: if prefix == 'HPRD': # proteins are not == genes. model.addTriple(gene_id, self.globaltt['has gene product'], dbxref_curie) continue if prefix == 'ENSEMBL': model.addXref(gene_id, dbxref_curie) # For Ensembl xrefs, don't proceed to equivalent class code # these are more loose xrefs than equivalent identifiers continue if prefix == 'OMIM': omim_num = dbxref_curie[5:] if omim_num in self.omim_replaced: repl = self.omim_replaced[omim_num] for omim in repl: if omim in self.omim_type and \ self.omim_type[omim] == self.globaltt['gene']: dbxref_curie = 'OMIM:' + omim omim_num = omim # last "gene" wins (is never > 2) if omim_num in self.omim_type and\ self.omim_type[omim_num] == self.globaltt['gene']: model.addXref(gene_id, dbxref_curie) else: # OMIM disease/phenotype is not considered a gene at all # no equivilance between ncbigene and omin-nongene # and ncbi is never a human clique leader in any case dbxref_curie = None continue # designate clique leaders and equivalentClass/sameAs triples # (perhaps premature as this ingest can't know what else exists) try: if self.class_or_indiv.get(gene_id) == 'C' and \ dbxref_curie is not None: model.addEquivalentClass(gene_id, dbxref_curie) if taxon in clique_map: if clique_map[taxon] == prefix: model.makeLeader(dbxref_curie) elif clique_map[taxon] == gene_id.split(':')[0]: model.makeLeader(gene_id) elif dbxref_curie is not None: model.addSameIndividual(gene_id, dbxref_curie) except AssertionError as err: LOG.warning("Error parsing %s: %s", gene_id, err)
def _process_trait_mappings(self, raw, limit=None): """ This method mapps traits from/to ... Triples created: :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph line_counter = 0 model = Model(g) # with open(raw, 'r') as csvfile: # filereader = csv.reader(csvfile, delimiter=',') # row_count = sum(1 for row in filereader) # row_count = row_count - 1 with open(raw, 'r') as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar='\"') next(filereader, None) # skip header line for row in filereader: line_counter += 1 # need to skip the last line if len(row) < 8: logger.info( "skipping line %d: %s", line_counter, '\t'.join(row)) continue (vto_id, pto_id, cmo_id, ato_column, species, trait_class, trait_type, qtl_count) = row ato_id = re.sub(r'ATO #', 'AQTLTrait:', re.sub(r'\].*', '', re.sub(r'\[', '', ato_column))) ato_id = ato_id.strip() ato_label = re.sub(r'.*\]\s*', '', ato_column) # if species == 'Cattle': # ato_id = re.sub(r'ATO:', 'AQTLTraitCattle:', ato_id) # elif species == 'Chicken': # ato_id = re.sub(r'ATO:', 'AQTLTraitChicken:', ato_id) # elif species == 'Sheep': # ato_id = re.sub(r'ATO:', 'AQTLTraitSheep:', ato_id) # elif species == 'Horse': # ato_id = re.sub(r'ATO:', 'AQTLTraitHorse:', ato_id) # elif species == 'Pig': # ato_id = re.sub(r'ATO:', 'AQTLTraitPig:', ato_id) # elif species == 'Rainbow trout': # ato_id = re.sub( # r'ATO:', 'AQTLTraitRainbowTrout:', ato_id) # else: # logger.warning( # 'Unknown species %s foufnd in trait mapping file.', # species) # continue # print(ato_label) model.addClassToGraph(ato_id, ato_label.strip()) if re.match(r'VT:.*', vto_id): model.addClassToGraph(vto_id, None) model.addEquivalentClass(ato_id, vto_id) if re.match(r'LPT:.*', pto_id): model.addClassToGraph(pto_id, None) model.addXref(ato_id, pto_id) if re.match(r'CMO:.*', cmo_id): model.addClassToGraph(cmo_id, None) model.addXref(ato_id, cmo_id) logger.info("Done with trait mappings") return
def _get_variants(self, limit): """ Currently loops through the variant_summary file. :param limit: :return: """ if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) geno = Genotype(g) f = Feature(g, None, None, None) # add the taxon and the genome tax_num = '9606' # HARDCODE tax_id = 'NCBITaxon:'+tax_num tax_label = 'Human' model.addClassToGraph(tax_id, None) geno.addGenome(tax_id, tax_label) # label gets added elsewhere # not unzipping the file logger.info("Processing Variant records") line_counter = 0 myfile = '/'.join((self.rawdir, self.files['variant_summary']['file'])) with gzip.open(myfile, 'rb') as f: for line in f: # skip comments line = line.decode().strip() if re.match(r'^#', line): continue # AlleleID integer value as stored in the AlleleID field in ClinVar (//Measure/@ID in the XML) # Type character, the type of variation # Name character, the preferred name for the variation # GeneID integer, GeneID in NCBI's Gene database # GeneSymbol character, comma-separated list of GeneIDs overlapping the variation # ClinicalSignificance character, comma-separated list of values of clinical significance reported for this variation # for the mapping between the terms listed here and the integers in the .VCF files, see # http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/ # RS# (dbSNP) integer, rs# in dbSNP # nsv (dbVar) character, the NSV identifier for the region in dbVar # RCVaccession character, list of RCV accessions that report this variant # TestedInGTR character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR) # PhenotypeIDs character, list of db names and identifiers for phenotype(s) reported for this variant # Origin character, list of all allelic origins for this variation # Assembly character, name of the assembly on which locations are based # Chromosome character, chromosomal location # Start integer, starting location, in pter->qter orientation # Stop integer, end location, in pter->qter orientation # Cytogenetic character, ISCN band # ReviewStatus character, highest review status for reporting this measure. For the key to the terms, # and their relationship to the star graphics ClinVar displays on its web pages, # see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation # HGVS(c.) character, RefSeq cDNA-based HGVS expression # HGVS(p.) character, RefSeq protein-based HGVS expression # NumberSubmitters integer, number of submissions with this variant # LastEvaluated datetime, the latest time any submitter reported clinical significance # Guidelines character, ACMG only right now, for the reporting of incidental variation in a Gene # (NOTE: if ACMG, not a specific to the allele but to the Gene) # OtherIDs character, list of other identifiers or sources of information about this variant # VariantID integer, the value used to build the URL for the current default report, # e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/ # # a crude check that there's an expected number of cols. # if not, error out because something changed. num_cols = len(line.split('\t')) expected_numcols = 29 if num_cols != expected_numcols: logger.error( "Unexpected number of columns in raw file " + "(%d actual vs %d expected)", num_cols, expected_numcols) (allele_num, allele_type, allele_name, gene_num, gene_symbol, clinical_significance, dbsnp_num, dbvar_num, rcv_nums, tested_in_gtr, phenotype_ids, origin, assembly, chr, start, stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p, number_of_submitters, last_eval, guidelines, other_ids, variant_num, reference_allele, alternate_allele, categories, ChromosomeAccession) = line.split('\t') # ###set filter=None in init if you don't want to have a filter # if self.filter is not None: # if ((self.filter == 'taxids' and\ # (int(tax_num) not in self.tax_ids)) or\ # (self.filter == 'geneids' and\ # (int(gene_num) not in self.gene_ids))): # continue # #### end filter line_counter += 1 pheno_list = [] if phenotype_ids != '-': # trim any leading/trailing semicolons/commas phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids) phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids) pheno_list = re.split(r'[,;]', phenotype_ids) if self.testMode: # get intersection of test disease ids # and these phenotype_ids intersect = \ list( set([str(i) for i in self.disease_ids]) & set(pheno_list)) if int(gene_num) not in self.gene_ids and\ int(variant_num) not in self.variant_ids and\ len(intersect) < 1: continue # TODO may need to switch on assembly to create correct # assembly/build identifiers build_id = ':'.join(('NCBIGenome', assembly)) # make the reference genome build geno.addReferenceGenome(build_id, assembly, tax_id) allele_type_id = self._map_type_of_allele(allele_type) bandinbuild_id = None if str(chr) == '': # check cytogenic location if str(cytogenetic_loc).strip() != '': # use cytogenic location to get the apx location # oddly, they still put an assembly number even when # there's no numeric location if not re.search(r'-', str(cytogenetic_loc)): band_id = makeChromID( re.split(r'-', str(cytogenetic_loc)), tax_num, 'CHR') geno.addChromosomeInstance( cytogenetic_loc, build_id, assembly, band_id) bandinbuild_id = makeChromID( re.split(r'-', str(cytogenetic_loc)), assembly, 'MONARCH') else: # can't deal with ranges yet pass else: # add the human chromosome class to the graph, # and add the build-specific version of it chr_id = makeChromID(str(chr), tax_num, 'CHR') geno.addChromosomeClass(str(chr), tax_id, tax_label) geno.addChromosomeInstance( str(chr), build_id, assembly, chr_id) chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH') seqalt_id = ':'.join(('ClinVarVariant', variant_num)) gene_id = None # they use -1 to indicate unknown gene if str(gene_num) != '-1' and str(gene_num) != 'more than 10': if re.match(r'^Gene:', gene_num): gene_num = "NCBI" + gene_num else: gene_id = ':'.join(('NCBIGene', str(gene_num))) # FIXME there are some "variants" that are actually haplotypes # probably will get taken care of when we switch to processing # the xml for example, variant_num = 38562 # but there's no way to tell if it's a haplotype # in the csv data so the dbsnp or dbvar # should probably be primary, # and the variant num be the vslc, # with each of the dbsnps being added to it # TODO clinical significance needs to be mapped to # a list of terms # first, make the variant: f = Feature(seqalt_id, allele_name, allele_type_id) if start != '-' and start.strip() != '': f.addFeatureStartLocation(start, chrinbuild_id) if stop != '-' and stop.strip() != '': f.addFeatureEndLocation(stop, chrinbuild_id) f.addFeatureToGraph() f.addTaxonToFeature(tax_id) # make the ClinVarVariant the clique leader model.makeLeader(seqalt_id) if bandinbuild_id is not None: f.addSubsequenceOfFeature(bandinbuild_id) # CHECK - this makes the assumption that there is # only one affected chromosome per variant what happens with # chromosomal rearrangement variants? # shouldn't both chromosomes be here? # add the hgvs as synonyms if hgvs_c != '-' and hgvs_c.strip() != '': model.addSynonym(seqalt_id, hgvs_c) if hgvs_p != '-' and hgvs_p.strip() != '': model.addSynonym(seqalt_id, hgvs_p) # add the dbsnp and dbvar ids as equivalent if dbsnp_num != '-' and int(dbsnp_num) != -1: dbsnp_id = 'dbSNP:rs'+str(dbsnp_num) model.addIndividualToGraph(dbsnp_id, None) model.addSameIndividual(seqalt_id, dbsnp_id) if dbvar_num != '-': dbvar_id = 'dbVar:'+dbvar_num model.addIndividualToGraph(dbvar_id, None) model.addSameIndividual(seqalt_id, dbvar_id) # TODO - not sure if this is right... add as xref? # the rcv is like the combo of the phenotype with the variant if rcv_nums != '-': for rcv_num in re.split(r';', rcv_nums): rcv_id = 'ClinVar:' + rcv_num model.addIndividualToGraph(rcv_id, None) model.addXref(seqalt_id, rcv_id) if gene_id is not None: # add the gene model.addClassToGraph(gene_id, gene_symbol) # make a variant locus vl_id = '_'+gene_num+'-'+variant_num if self.nobnodes: vl_id = ':'+vl_id vl_label = allele_name model.addIndividualToGraph( vl_id, vl_label, geno.genoparts['variant_locus']) geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id) geno.addAlleleOfGene(vl_id, gene_id) else: # some basic reporting gmatch = re.search(r'\(\w+\)', allele_name) if gmatch is not None and len(gmatch.groups()) > 0: logger.info( "Gene found in allele label, but no id provided: %s", gmatch.group(1)) elif re.match(r'more than 10', gene_symbol): logger.info( "More than 10 genes found; " "need to process XML to fetch (variant=%d)", int(variant_num)) else: logger.info( "No gene listed for variant %d", int(variant_num)) # parse the list of "phenotypes" which are diseases. # add them as an association # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374 # the list is both semicolon delimited and comma delimited, # but i don't know why! some are bad, like: # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000 if phenotype_ids != '-': for phenotype in pheno_list: m = re.match( r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype) if m is not None and len(m.groups()) > 0: phenotype = re.sub( m.group(1), 'Orphanet:', phenotype.strip()) elif re.match(r'ORPHA:\d+', phenotype): phenotype = re.sub( r'^ORPHA', 'Orphanet', phenotype.strip()) elif re.match(r'Human Phenotype Ontology', phenotype): phenotype = re.sub( r'^Human Phenotype Ontology', '', phenotype.strip()) elif re.match(r'SNOMED CT:\s?', phenotype): phenotype = re.sub( r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip()) elif re.match(r'^Gene:', phenotype): continue assoc = G2PAssoc( g, self.name, seqalt_id, phenotype.strip()) assoc.add_association_to_graph() if other_ids != '-': id_list = other_ids.split(',') # process the "other ids" ex: # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001 # TODO make more xrefs for xrefid in id_list: prefix = xrefid.split(':')[0].strip() if prefix == 'OMIM Allelic Variant': xrefid = 'OMIM:'+xrefid.split(':')[1] model.addIndividualToGraph(xrefid, None) model.addSameIndividual(seqalt_id, xrefid) elif prefix == 'HGMD': model.addIndividualToGraph(xrefid, None) model.addSameIndividual(seqalt_id, xrefid) elif prefix == 'dbVar' \ and dbvar_num == xrefid.split(':')[1].strip(): pass # skip over this one elif re.search(r'\s', prefix): pass # logger.debug( # 'xref prefix has a space: %s', xrefid) else: # should be a good clean prefix # note that HGMD variants are in here as Xrefs # because we can't resolve URIs for them # logger.info("Adding xref: %s", xrefid) # gu.addXref(g, seqalt_id, xrefid) # logger.info("xref prefix to add: %s", xrefid) pass if not self.testMode and limit is not None \ and line_counter > limit: break logger.info("Finished parsing variants") return
def _process_genes(self, taxid, limit=None): if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) geno = Genotype(graph) raw = '/'.join((self.rawdir, self.files[taxid]['file'])) line_counter = 0 LOG.info("Processing Ensembl genes for tax %s", taxid) with open(raw, 'r', encoding="utf8") as csvfile: filereader = csv.reader(csvfile, delimiter='\t') for row in filereader: if len(row) < 4: LOG.warning("Too few columns in: " + row) raise ValueError("Data error for file %s", raw) (ensembl_gene_id, external_gene_name, description, gene_biotype, entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7] # in the case of human genes, we also get the hgnc id, # and is the last col if taxid == '9606': hgnc_id = row[7] else: hgnc_id = None if self.test_mode and entrezgene != '' and \ int(entrezgene) not in self.gene_ids: continue line_counter += 1 gene_id = 'ENSEMBL:' + ensembl_gene_id peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id) uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot) entrez_curie = 'NCBIGene:{}'.format(entrezgene) if description == '': description = None gene_biotype = gene_biotype.strip() gene_type_id = self.resolve(gene_biotype, False) if gene_type_id == gene_biotype.strip(): # did not resolve gene_type_id = self.globaltt['polypeptide'] model.addClassToGraph( gene_id, external_gene_name, gene_type_id, description) model.addIndividualToGraph(peptide_curie, None, gene_type_id) model.addIndividualToGraph(uniprot_curie, None, gene_type_id) if entrezgene != '': if taxid == '9606': # Use HGNC for eq in human data model.addXref(gene_id, entrez_curie) else: model.addEquivalentClass(gene_id, entrez_curie) if hgnc_id is not None and hgnc_id != '': model.addEquivalentClass(gene_id, hgnc_id) geno.addTaxon('NCBITaxon:'+taxid, gene_id) if ensembl_peptide_id != '': geno.addGeneProduct(gene_id, peptide_curie) if uniprotswissprot != '': geno.addGeneProduct(gene_id, uniprot_curie) model.addXref(peptide_curie, uniprot_curie) if not self.test_mode and limit is not None and line_counter > limit: break return