def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
    """
    Record that a drug treats a disease, together with its therapy status.

    Three things are written to ``self.graph``:

    1. a direct triple ``drug -- substance_that_treats --> disease``
       (the original author's note gives this property as RO:0002606);
    2. an association node (id built with ``self.make_cgd_id``) whose
       subject, relationship and object repeat that statement;
    3. a triple hanging the therapy status off the association node.

    Supporting information such as the source is not implemented.

    :param drug_id: Id as curie of the drug
    :param disease_id: Id as curie of the disease
    :param therapy_status_id: identifier of the therapy approval status;
        used as the object of the status triple
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    treats = gu.object_properties['substance_that_treats']

    # the plain drug -> disease statement
    gu.addTriple(self.graph, drug_id, treats, disease_id)

    # the same statement again, this time as an identifiable association
    assoc_id = self.make_cgd_id("assoc{0}{1}".format(drug_id, disease_id))
    assoc = Assoc(self.name)
    assoc.set_subject(drug_id)
    assoc.set_relationship(treats)
    assoc.set_object(disease_id)
    assoc.set_association_id(assoc_id)
    assoc.add_association_to_graph(self.graph)

    # Placeholder relationship, note this does not exist in RO
    gu.addTriple(
        self.graph, assoc_id, "RO:has_approval_status", therapy_status_id)
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants of one OMIM entry to the graph.

    For every item of ``entry['allelicVariantList']``:

    * status ``live``: the variant is added as a variant locus of the
      entry's gene, linked to the publications cited in its text
      (``{<ref number>:`` markers are resolved through the reference map
      returned by ``self._get_pubs``), made equivalent to its dbSNP ids,
      cross-referenced to its ClinVar RCV accessions, and given a link to
      its OMIM page;
    * status containing ``moved`` (i.e. 'moved' and 'removed'): the
      variant is added as a deprecated individual, pointing at the
      replacement id when ``movedTo`` is present;
    * anything else is logged as an error.

    Citations of a reference number that is not in the reference list, and
    ClinVar accessions that do not look like ``RCV<digits>;;``, are logged
    and skipped instead of aborting the whole entry.

    :param entry: the OMIM entry (a dict with at least 'mimNumber'),
        or None, in which case nothing is processed
    :param g: the graph to add to
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    # constructed before the None-guard on purpose: the original always
    # created it, and creating it may touch the graph
    geno = Genotype(g)

    if entry is None:
        return

    entry_num = entry['mimNumber']
    gene_id = 'OMIM:' + str(entry_num)

    # process the ref list just to get the pmids
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    citation_pattern = re.compile(r'\{(\d+):')
    rcv_pattern = re.compile(r'(RCV\d+);;')

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_suffix = str(al_num).zfill(4)
        al_id = gene_id + '.' + al_suffix
        status = variant['status']

        if status == 'live':
            al_label = variant.get('mutations')
            al_description = variant.get('text')

            # reference numbers cited in the free text, e.g. "{12:Smith..."
            cited_refs = set()
            if al_description is not None:
                cited_refs = set(citation_pattern.findall(al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, gene_id,
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            for ref_num in cited_refs:
                pmid = ref_to_pmid.get(int(ref_num))
                if pmid is None:
                    logger.warning(
                        'Reference %s cited by %s is not in the '
                        'reference list', ref_num, al_id)
                    continue
                gu.addTriple(
                    g, pmid, gu.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    did = 'dbSNP:' + dnum.strip()
                    gu.addIndividualToGraph(g, did, None)
                    gu.addEquivalentClass(g, al_id, did)

            if 'clinvarAccessions' in variant:
                # clinvarAccessions are triple-semicolon delimited,
                # each like RCV000020059;;1
                for accession in variant['clinvarAccessions'].split(';;;'):
                    match = rcv_pattern.match(accession)
                    if match is None:
                        logger.warning(
                            'Unparseable ClinVar accession %s for %s',
                            accession, al_id)
                        continue
                    gu.addXref(g, al_id, 'ClinVar:' + match.group(1))

            gu.addPage(
                g, al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" + al_suffix)

        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            gu.addDeprecatedIndividual(g, al_id, moved_ids)

        else:
            logger.error('Uncaught alleleic variant status %s', status)
    # end loop allelicVariantList

    return
def _process_pathway_disease(self, limit):
    """
    Connect KEGG pathways to the diseases associated with them.

    Diseases are modelled as processes, so every (disease, pathway) row of
    the 'pathway_disease' file becomes one triple stating that the pathway
    is causally upstream of or within the disease process.

    In test mode only pathways listed in ``self.test_ids['pathway']`` are
    used and ``limit`` is ignored.

    :param limit: outside of test mode, reading stops right after the
        first row whose 1-based row number is greater than this value;
        None means no limit
    :return: None
    """
    logger.info("Processing KEGG pathways to disease ids")

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_disease']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(rows, start=1):
            (disease_id, kegg_pathway_num) = row

            # test mode: silently drop pathways we are not testing with
            if self.testMode and \
                    kegg_pathway_num not in self.test_ids['pathway']:
                continue

            # will look like KEGG-path:map04130 or KEGG-path:hsa04130
            pathway_id = 'KEGG-' + kegg_pathway_num
            disease_id = 'KEGG-' + disease_id

            relation = GraphUtils.object_properties[
                'causally_upstream_of_or_within']
            gu.addTriple(g, pathway_id, relation, disease_id)

            limit_reached = limit is not None and line_counter > limit
            if not self.testMode and limit_reached:
                break

    return
def _get_pubs(self, entry, g):
    """
    Extract mentioned publications from the reference list.

    Every item of ``entry['referenceList']`` is added to the graph as a
    Reference: a journal article identified by its PMID when a
    'pubmedID' is given, otherwise an internal node whose id is built from
    the OMIM number and the reference number. Title, author list and a
    short citation ("<first author> et al; <title>; <source>", using
    whichever parts are available) are attached, and the OMIM entry is
    linked to the publication with the 'mentions' property.

    :param entry: the OMIM entry (a dict with at least 'mimNumber')
    :param g: the graph to add to
    :return: dict mapping each reference number to the publication id
        that was used for it; empty when the entry has no reference list
    """
    ref_to_pmid = {}
    du = DipperUtil()
    entry_num = entry['mimNumber']
    gu = GraphUtils(curie_map.get())

    if 'referenceList' not in entry:
        return ref_to_pmid

    omim_id = 'OMIM:' + str(entry_num)
    first_author_split = re.compile(r'\.,')

    for r in entry['referenceList']:
        reference = r['reference']

        if 'pubmedID' in reference:
            pub_id = 'PMID:' + str(reference['pubmedID'])
            ref = Reference(pub_id, Reference.ref_types['journal_article'])
        else:
            # make blank node for internal reference
            pub_id = '_OMIM' + str(entry_num) + 'ref' + \
                str(reference['referenceNumber'])
            if self.nobnodes:
                pub_id = ':' + pub_id
            ref = Reference(pub_id)

        title = source = citation = None
        if 'title' in reference:
            title = reference['title']
            ref.setTitle(title)
        if 'authors' in reference:
            author_list = reference['authors']
            ref.setAuthorList(author_list)
            citation = first_author_split.split(author_list)[0] + ' et al'
        if 'source' in reference:
            source = reference['source']

        # Any of the three parts may be missing; joining a None would
        # raise a TypeError, so only the parts that exist are used.
        citation_parts = [
            part for part in du.flatten([citation, title, source])
            if part is not None]
        if citation_parts:
            ref.setShortCitation('; '.join(citation_parts))

        ref.addRefToGraph(g)
        ref_to_pmid[reference['referenceNumber']] = pub_id

        # the OMIM entry mentions the pub
        gu.addTriple(g, omim_id, gu.object_properties['mentions'], pub_id)

    return ref_to_pmid
def _process_pathway_pubmed(self, limit):
    """
    Link publications to the KEGG pathways they are annotated to.

    Every (pubmed id, pathway) row of the 'pathway_pubmed' file adds the
    publication to the graph as a journal article and states that the
    publication is about the pathway.

    In test mode only pathways listed in ``self.test_ids['pathway']`` are
    used and ``limit`` is ignored.

    :param limit: outside of test mode, reading stops right after the
        first row whose 1-based row number is greater than this value;
        None means no limit
    :return: None
    """
    logger.info("Processing KEGG pathways to pubmed ids")

    g = self.testgraph if self.testMode else self.graph
    gu = GraphUtils(curie_map.get())
    raw = '/'.join((self.rawdir, self.files['pathway_pubmed']['file']))

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for line_counter, row in enumerate(rows, start=1):
            (pubmed_id, kegg_pathway_num) = row

            # test mode: silently drop pathways we are not testing with
            if self.testMode and \
                    kegg_pathway_num not in self.test_ids['pathway']:
                continue

            pubmed_id = pubmed_id.upper()
            # will look like KEGG-path:map04130
            kegg_id = 'KEGG-' + kegg_pathway_num

            article = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            article.addRefToGraph(g)

            about = GraphUtils.object_properties['is_about']
            gu.addTriple(g, pubmed_id, about, kegg_id)

            limit_reached = limit is not None and line_counter > limit
            if not self.testMode and limit_reached:
                break

    return
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. Includes baseline phenotype
    data sets as well as studies of drug, diet, disease and aging effect.
    Also includes protocols, projects and publications, and SNP,
    variation and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provides measurements for particular assays for several strains.
    Each of these measurements is itself mapped to a MP or VT term
    as a phenotype. Therefore, we can create a strain-to-phenotype
    association based on those strains that lie outside of the "normal" range
    for the given measurements. We flag any measurement whose z-score
    (as supplied by MPD) lies beyond some threshold from the average.
    Our default threshold here is +/-2 standard deviations beyond the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.

    """

    mdpdl = 'http://phenomedoc.jax.org/MPD_downloads'
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': mdpdl+'/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': mdpdl+'/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': mdpdl+'/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': mdpdl+'/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': { #TEC does not seem to be used
        #    'file': 'mpd_datasets_metadata.xml.gz',
        #    'url': mdpdl+'/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849",  "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236", "MPD:1237"]

    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenotype Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self):
        Source.__init__(self, 'mpd')
        # @N, not sure if this step is required
        self.namespaces.update(curie_map.get())
        # |z-score| at or above which a measurement counts as abnormal
        self.stdevthreshold = 2
        self.nobnodes = True  # FIXME

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)

        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        # assay number -> {'ont_terms', 'description', 'assay_label',
        #                  'assay_type', 'assay_units'}
        self.assayhash = {}
        # strain id -> strain name
        self.idlabel_hash = {}
        # assay number -> sex -> list of the strain means for that measure
        self.score_means_by_measure = {}
        # strain number -> sex -> assay number -> {'mean', 'zscore'}
        self.strain_scores_by_measure = {}

        self.geno = Genotype(self.graph)
        self.gu = GraphUtils(curie_map.get())

        return

    def fetch(self, is_dl_forced=False):
        """
        Download the MPD files listed in ``files``.

        :param is_dl_forced: passed through to ``get_files``
        :return: None
        """
        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files
        (the xml metadata file is not currently used),
        which we process iteratively and write out as
        one large graph.

        :param limit: handed to every processing step
            (the steps of this class currently ignore it)
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
            self.geno = Genotype(self.testgraph)
        else:
            g = self.graph

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")

        self.load_bindings()
        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
        # NOTE(review): datatype properties are registered as object
        # properties here; looks unintended - confirm against GraphUtils
        gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.OBJPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

        # report on the graph that was actually written to
        logger.info("Found %d nodes", len(g))
        return

    def _process_ontology_mappings_file(self, limit):
        """
        Collect, per assay number, the MP/VT terms the assay is mapped to
        into ``self.assayhash[assay]['ont_terms']``.
        Rows that do not have exactly three columns are skipped.

        :param limit: currently ignored
        :return: None
        """
        logger.info("Processing ontology mappings...")
        raw = '/'.join(
            (self.rawdir, self.files['ontology_mappings']['file']))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            f.readline()
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                except ValueError:
                    continue
                assay_id = int(assay_id)
                if re.match(r'(MP|VT)', ont_term):
                    # add the mapping denovo
                    assay = self.assayhash.setdefault(assay_id, {})
                    assay.setdefault('ont_terms', set()).add(ont_term)

        return

    def _process_straininfo(self, limit):
        """
        Add every strain as an individual of the mouse taxon, with its
        short name as a synonym, vendor/stock based equivalences or xrefs,
        and its panel as a description. The strain names are remembered in
        ``self.idlabel_hash``.

        :param limit: currently ignored
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing strain info ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                # e.g.
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row

                if self.testMode and \
                        'MPD:'+str(mpd_strainid) not in self.test_ids:
                    continue

                # create the strain as an instance of the taxon
                strain_id = 'MPD-strain:'+str(mpd_strainid)
                gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)
                shortname = mpdshortname.strip()
                if shortname != '':
                    gu.addSynonym(g, strain_id, shortname)
                self.idlabel_hash[strain_id] = strain_name

                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        gu.addSameIndividual(g, strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        gu.addSameIndividual(g, strain_id, reiken_id)
                    else:
                        if url != '':
                            gu.addXref(g, strain_id, url, True)
                        if vendor != '':
                            gu.addXref(
                                g, strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    gu.addDescription(g, strain_id, desc)

        # TODO make the panels as a resource collection

        return

    def _process_measurements_file(self, limit):
        """
        Store label, units, type and a generated description for every
        assay in ``self.assayhash``. Rows that do not have exactly 16
        columns (such as the trailing '(4486 rows)' line) are skipped.

        :param limit: currently ignored
        :return: None
        """
        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['assay_metadata']['file']))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            header = f.readline()
            logger.info("HEADER: %s", header)
            for row in reader:
                # measnum,projsym,varname,descrip,units,cat1,cat2,cat3,
                # intervention,intparm,appmeth,panelsym,datatype,sextested,
                # nstrainstested,ageweeks
                # Again the last row has changed. contains: '(4486 rows)'
                if len(row) != 16:
                    continue
                assay_id = int(row[0])
                assay_label = row[3]
                assay_units = row[4]
                # compare by value; identity tests on strings are unreliable
                assay_type = row[10] if row[10] != '' else None

                assay = self.assayhash.setdefault(assay_id, {})
                assay['description'] = self.build_measurement_description(row)
                assay['assay_label'] = assay_label
                assay['assay_type'] = assay_type
                assay['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        This will store the entire set of strain means in a hash.
        Not the most efficient representation,
        but easy access.
        We will loop through this later to then apply cutoffs
        and add associations.
        Rows that do not have exactly 16 columns are skipped.

        :param limit: currently ignored
        :return: None
        """
        logger.info("Processing strain means ...")
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        score_means_by_measure = {}
        strain_scores_by_measure = {}

        with gzip.open(raw, 'rt') as f:
            reader = csv.reader(f)
            f.readline()  # read the header row; skip
            for row in reader:
                try:
                    (measnum, varname, strain, strainid, sex, mean, nmice, sd,
                     sem, cv, minval, maxval, logmean, logsd, zscore,
                     logzscore) = row
                except ValueError:
                    continue
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum.
                # note: some assays only tested one sex!
                # we split this here by sex
                score_means_by_measure \
                    .setdefault(assay_num, {}) \
                    .setdefault(sex, []) \
                    .append(float(mean))

                strain_scores_by_measure \
                    .setdefault(strain_num, {}) \
                    .setdefault(sex, {})[assay_num] = \
                    {'mean': float(mean), 'zscore': float(zscore)}

            # end loop over strainmeans
        self.score_means_by_measure = score_means_by_measure
        self.strain_scores_by_measure = strain_scores_by_measure

        return

    def _fill_provenance_graph(self, limit):
        """
        Walk all stored strain/sex/assay scores and, for every score whose
        absolute z-score reaches ``self.stdevthreshold`` and whose assay
        has ontology terms, add the assay and the genotype-to-phenotype
        associations to the graph. Summary counts are logged at the end.

        :param limit: currently ignored
        :return: None
        """
        logger.info("Building graph ...")
        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:'+str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    assay_id = 'MPD-assay:'+str(m)
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' not in measures[m]:
                        continue
                    zscore = measures[m]['zscore']
                    if abs(zscore) < self.stdevthreshold:
                        scores_not_passing_threshold_count += 1
                        continue

                    scores_passing_threshold_count += 1
                    # add the G2P assoc
                    prov = Provenance()
                    assay_label = self.assayhash[m]['assay_label']
                    if assay_label is not None:
                        assay_label += ' ('+str(m)+')'
                    assay_description = self.assayhash[m]['description']
                    assay_type_id = Provenance.prov_types['assay']
                    comment = ' '.join(
                        (assay_label, '(zscore='+str(zscore)+')'))
                    ont_term_ids = self.assayhash[m].get('ont_terms')
                    if ont_term_ids is not None:
                        scores_passing_threshold_with_ontologies_count += 1
                        prov.add_assay_to_graph(
                            g, assay_id, assay_label, assay_type_id,
                            assay_description)
                        self._add_g2p_assoc(
                            g, strain_id, sex, assay_id, ont_term_ids,
                            comment)

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        When the strain has no known label, the strain id is used in the
        genotype labels instead.

        :param g: the graph to add to
        :param strain_id: curie of the strain
        :param sex: sex key as used in the strain means file
            ('m' and 'f' get a specific genotype type)
        :param assay_id: curie of the assay, added as evidence
        :param phenotypes: a list of phenotypes to association with the strain
        :param comment: comment attached to every association
        :return: None
        """
        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)
        bare_strain_id = strain_id.replace(':', '')

        # strain genotype
        genotype_id = '_'+'-'.join((bare_strain_id, 'genotype'))
        sex_specific_genotype_id = \
            '_'+'-'.join((bare_strain_id, sex, 'genotype'))

        if strain_label is not None:
            genotype_label = '['+strain_label+']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            # unknown strain: fall back to the id rather than
            # failing on None + str
            genotype_label = '['+strain_id+']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        if self.nobnodes:
            genotype_id = ':'+genotype_id
            sex_specific_genotype_id = ':'+sex_specific_genotype_id

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        self.geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        self.gu.addTriple(
            g, strain_id,
            Genotype.object_properties['has_genotype'], genotype_id)

        self.geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        self.gu.addTriple(
            g, sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                self.gu.addComment(g, assoc_id, comment)

        return

    def getTestSuite(self):
        """
        :return: the unittest suite built from ``tests.test_mpd.MPDTestCase``
        """
        import unittest
        from tests.test_mpd import MPDTestCase

        test_suite = unittest.TestLoader().loadTestsFromTestCase(MPDTestCase)

        return test_suite

    @staticmethod
    def normalise_units(units):
        """
        Placeholder: returns the units unchanged.

        :param units: the units string
        :return: the same units string
        """
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        """
        Build a human readable description of an assay from one
        16-column row of the measurements file.

        :param row: sequence of 16 strings (measnum, projsym, varname,
            descrip, units, cat1, cat2, cat3, intervention, intparm,
            appmeth, panelsym, datatype, sextested, nstrainstested,
            ageweeks)
        :return: the description string
        """
        (assay_id, projsym, varname, descrip, units, cat1, cat2, cat3,
         intervention, intparm, appmeth, panelsym, datatype, sextested,
         nstrainstested, ageweeks) = row

        sex_names = {'f': 'female', 'm': 'male', 'fm': 'male and female'}
        if sextested in sex_names:
            sextested = sex_names[sextested]
        else:
            # the raw key is kept in the description
            logger.warning("Unknown sex tested key: %s", sextested)

        description = "This is an assay of [" + descrip + "] shown as a [" + \
                      datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        # NOTE(review): this tests intervention, not intparm, for emptiness;
        # possibly a copy-paste slip - left as is pending confirmation
        if intparm is not None and intervention != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" +\
                appmeth + "]"

        description += \
            ".  The overall experiment is entitled [" + projsym + "].  "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] of age in" + " [" + nstrainstested + \
            "] different mouse strains. "
        # compare by value; identity tests on strings are unreliable
        description += "Keywords: " + cat1 + \
                       ((", " + cat2) if cat2.strip() != "" else "") + \
                       ((", " + cat3) if cat3.strip() != "" else "") + "."
        return description
class Genotype(): """ These methods provide convenient methods to add items related to a genotype and it's parts to a supplied graph. They follow the patterns set out in GENO https://github.com/monarch-initiative/GENO-ontology. For specific sequence features, we use the GenomicFeature class to create them. """ # special genotype parts mapped to their GENO and SO classes that we explicitly reference here genoparts = { 'intrinsic_genotype': 'GENO:0000000', 'extrinsic_genotype': 'GENO:0000524', 'effective_genotype': 'GENO:0000525', 'genomic_background': 'GENO:0000611', 'genomic_variation_complement': 'GENO:0000009', 'karyotype_variation_complement': 'GENO:0000644', 'variant_single_locus_complement': 'GENO:0000030', 'variant_locus': 'GENO:0000002', 'reference_locus': 'GENO:0000036', 'allele': 'GENO:0000008', 'gene': 'SO:0000704', 'QTL': 'SO:0000771', 'transgene': 'SO:0000902', 'pseudogene': 'SO:0000336', 'cytogenetic marker': 'SO:0000341', 'sequence_feature': 'SO:0000110', 'sequence_alteration': 'SO:0001059', 'insertion': 'SO:0000667', 'deletion': 'SO:0000159', 'substitution': 'SO:1000002', 'duplication': 'SO:1000035', 'translocation': 'SO:0000199', 'inversion': 'SO:1000036', 'tandem_duplication': 'SO:1000173', 'point_mutation': 'SO:1000008', 'population': 'PCO:0000001', # population 'family': 'PCO:0000020', # family 'wildtype': 'GENO:0000511', 'reagent_targeted_gene': 'GENO:0000504', 'targeted_gene_subregion' : 'GENO:0000534', 'targeted_gene_complement' : 'GENO:0000527', 'biological_region' : 'SO:0001411', 'missense_variant': 'SO:0001583', 'transcript': 'SO:0000233', 'polypeptide': 'SO:0000104', 'cDNA': 'SO:0000756', 'sequence_variant_causing_loss_of_function_of_polypeptide': 'SO:1000118', 'sequence_variant_causing_gain_of_function_of_polypeptide': 'SO:1000125', 'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120', 'sequence_variant_affecting_polypeptide_function': 'SO:1000117', 'regulatory_transgene_feature': 'GENO:0000638', 'coding_transgene_feature': 
'GENO:0000637', 'protein_coding_gene': 'SO:0001217', 'ncRNA_gene': 'SO:0001263' } object_properties = { 'is_mutant_of': 'GENO:0000440', 'derives_from': 'RO:0001000', 'has_alternate_part': 'GENO:0000382', 'has_reference_part': 'GENO:0000385', 'in_taxon': 'RO:0002162', 'has_zygosity': 'GENO:0000608', 'is_sequence_variant_instance_of': 'GENO:0000408', # links a alternate locus (instance) to a gene (class) 'targets_instance_of': 'GENO:0000414', 'is_reference_instance_of': 'GENO:0000610', 'has_part': 'BFO:0000051', 'has_member_with_allelotype': 'GENO:0000225', # use this when relating populations 'is_allelotype_of': 'GENO:0000206', 'has_genotype': 'GENO:0000222', 'has_phenotype': 'RO:0002200', 'transcribed_to': 'RO:0002205', 'translates_to': 'RO:0002513', 'is_targeted_expression_variant_of' : 'GENO:0000443', 'is_transgene_variant_of': 'GENO:0000444', 'has_expression-variant_part' : 'GENO:0000532', 'targeted_by' : 'GENO:0000634', # between a (reagent-targeted gene) and a morpholino 'derives_sequence_from_gene': 'GENO:0000639', # FIXME should this just be subsequence of? 
'feature_to_gene_relation': 'GENO:0000418' } annotation_properties = { # TODO change properties with https://github.com/monarch-initiative/GENO-ontology/issues/21 'reference_nucleotide': 'GENO:reference_nucleotide', # Made up term 'reference_amino_acid': 'GENO:reference_amino_acid', # Made up term 'altered_nucleotide': 'GENO:altered_nucleotide', # Made up term 'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change' # Made up term } zygosity = { 'homoplasmic': 'GENO:0000602', 'heterozygous': 'GENO:0000135', 'indeterminate': 'GENO:0000137', 'heteroplasmic': 'GENO:0000603', 'hemizygous-y': 'GENO:0000604', 'hemizygous-x': 'GENO:0000605', 'homozygous': 'GENO:0000136', 'hemizygous': 'GENO:0000606', 'complex_heterozygous': 'GENO:0000402', 'simple_heterozygous': 'GENO:0000458' } properties = object_properties.copy() properties.update(annotation_properties) def __init__(self, graph): self.gu = GraphUtils(curie_map.get()) self.graph = graph self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP) return def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None): """ If a genotype_type is not supplied, we will default to 'intrinsic_genotype' :param genotype_id: :param genotype_label: :param genotype_type: :param genotype_description: :return: """ if genotype_type is None: genotype_type = self.genoparts['intrinsic_genotype'] self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label, genotype_type, genotype_description) return def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None): """ Make an allele object. If no allele_type is added, it will default to a geno:allele :param allele_id: curie for allele (required) :param allele_label: label for allele (required) :param allele_type: id for an allele type (optional, recommended SO or GENO class) :param allele_description: a free-text description of the allele :return: """ # TODO should we accept a list of allele types? 
if (allele_type is None): allele_type = self.genoparts['allele'] #TODO is this a good idea? self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description) return def addGene(self, gene_id, gene_label, gene_type=None, gene_description=None): if gene_type is None: gene_type = self.genoparts['gene'] # genes are classes self.gu.addClassToGraph(self.graph, gene_id, gene_label, gene_type, gene_description) return def addConstruct(self, construct_id, construct_label, construct_type=None, construct_description=None): # TODO add base type for construct # if (constrcut_type is None): # constrcut_type=self.construct_base_type self.gu.addIndividualToGraph(self.graph, construct_id, construct_label, construct_type, construct_description) return def addDerivesFrom(self, child_id, parent_id): """ We add a derives_from relationship between the child and parent id. Examples of uses include between: an allele and a construct or strain here, a cell line and it's parent genotype. Adding the parent and child to the graph should happen outside of this function call to ensure graph integrity. :param child_id: :param parent_id: :return: """ self.gu.addTriple(self.graph, child_id, self.properties['derives_from'], parent_id) return def addSequenceDerivesFrom(self, child_id, parent_id): self.gu.addTriple(self.graph, child_id, self.properties['derives_sequence_from_gene'], parent_id) return def addAlleleOfGene(self, allele_id, gene_id, rel_id=None): """ We make the assumption here that if the relationship is not provided, it is a GENO:is_sequence_variant_instance_of. Here, the allele should be a variant_locus, not a sequence alteration. 
:param allele_id: :param gene_id: :param rel_id: :return: """ if (rel_id is None): rel_id = self.properties['is_sequence_variant_instance_of'] self.gu.addTriple(self.graph, allele_id, rel_id, gene_id) return def addTranscript(self, variant_id, transcript_id, transcript_label=None, transcript_type=None): """ Add gene/variant/allele transcribes_to relationship :param variant_id: :param transcript_id: :param transcript_label: :param transcript_type: :return: """ self.gu.addIndividualToGraph(self.graph, transcript_id, transcript_label, transcript_type) self.gu.addTriple(self.graph, variant_id, self.properties['transcribed_to'], transcript_id) return def addPolypeptide(self, polypeptide_id, polypeptide_label=None, transcript_id=None, polypeptide_type=None, ): """ :param polypeptide_id: :param polypeptide_label: :param polypeptide_type: :param transcript_id: :return: """ if polypeptide_type is None: polypeptide_type = self.genoparts['polypeptide'] self.gu.addIndividualToGraph(self.graph, polypeptide_id, polypeptide_label, polypeptide_type) if transcript_id is not None: self.gu.addTriple(self.graph, transcript_id, self.properties['translates_to'], polypeptide_id) return def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None, allele1_rel=None, allele2_rel=None): """ Here we add the parts to the VSLC. While traditionally alleles (reference or variant loci) are traditionally added, you can add any node (such as sequence_alterations for unlocated variations) to a vslc if they are known to be paired. However, if a sequence_alteration's loci is unknown, it probably should be added directly to the GVC. 
:param vslc_id: :param allele1_id: :param allele2_id: :param zygosity_id: :param allele1_rel: :param allele2_rel: :return: """ # vslc has parts allele1/allele2 gu = self.gu vslc = gu.getNode(vslc_id) if allele1_id is not None: self.addParts(allele1_id, vslc_id, allele1_rel) if allele2_id is not None and allele2_id.strip() != '': self.addParts(allele2_id, vslc_id, allele2_rel) # figure out zygosity if it's not supplied if zygosity_id is None: if allele1_id == allele2_id: zygosity_id = self.zygosity['homozygous'] else: zygosity_id = self.zygosity['heterozygous'] if zygosity_id is not None: gu.addTriple(self.graph, vslc_id, self.properties['has_zygosity'], zygosity_id) return def addVSLCtoParent(self, vslc_id, parent_id): """ The VSLC can either be added to a genotype or to a GVC. The vslc is added as a part of the parent. :param vslc_id: :param parent_id: :return: """ self.addParts(vslc_id, parent_id, self.properties['has_alternate_part']) return def addParts(self, part_id, parent_id, part_relationship=None): """ This will add a has_part (or subproperty) relationship between a parent_id and the supplied part. By default the relationship will be BFO:has_part, but any relationship could be given here. 
:param part_id: :param parent_id: :param part_relationship: :return: """ if part_relationship is None: part_relationship = self.properties['has_part'] self.gu.addTriple(self.graph, parent_id, part_relationship, part_id) return def addSequenceAlteration(self, sa_id, sa_label, sa_type=None, sa_description=None): if sa_type is None: sa_type = self.genoparts['sequence_alteration'] self.gu.addIndividualToGraph(self.graph, sa_id, sa_label, sa_type, sa_description) return def addSequenceAlterationToVariantLocus(self, sa_id, vl_id): self.addParts(sa_id, vl_id, self.properties['has_alternate_part']) return def addGenomicBackground(self, background_id, background_label, background_type=None, background_description=None): if background_type is None: background_type = self.genoparts['genomic_background'] self.gu.addIndividualToGraph(self.graph, background_id, background_label, background_type, background_description) return def addGenomicBackgroundToGenotype(self, background_id, genotype_id): self.gu.addType(self.graph, background_id, self.genoparts['genomic_background']) self.addParts(background_id, genotype_id, self.object_properties['has_reference_part']) return def addTaxon(self, taxon_id, genopart_id): """ The supplied geno part will have the specified taxon added with RO:in_taxon relation. Generally the taxon is associated with a genomic_background, but could be added to any genotype part (including a gene, regulatory element, or sequence alteration). 
:param taxon_id: :param genopart_id: :return: """ in_taxon = self.gu.getNode(self.properties['in_taxon']) s = self.gu.getNode(genopart_id) self.graph.add((s, in_taxon, self.gu.getNode(taxon_id))) return def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id): # for example, add a morphant reagent thingy to the genotype, assuming it's a extrinsic_genotype p = self.object_properties['has_expression-variant_part'] self.gu.addTriple(self.graph, genotype_id, p, reagent_id) return def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type, gene_id, description=None): """ Here, a gene-targeting reagent is added. The actual targets of this reagent should be added separately. :param reagent_id: :param reagent_label: :param reagent_type: :return: """ # TODO add default type to reagent_type self.gu.addIndividualToGraph(self.graph, reagent_id, reagent_label, reagent_type, description) self.gu.addTriple(self.graph, reagent_id, self.object_properties['targets_instance_of'], gene_id) return def addReagentTargetedGene(self, reagent_id, gene_id, targeted_gene_id=None, targeted_gene_label=None, description=None): """ This will create the instance of a gene that is targeted by a molecular reagent (such as a morpholino or rnai). If an instance id is not supplied, we will create it as an anonymous individual which is of the type GENO:reagent_targeted_gene. We will also add the targets relationship between the reagent and gene class. 
<targeted_gene_id> a GENO:reagent_targeted_gene rdf:label targeted_gene_label dc:description description <reagent_id> GENO:targets_instance_of <gene_id> :param reagent_id: :param gene_id: :param targeted_gene_id: :return: """ # akin to a variant locus if (targeted_gene_id is None): targeted_gene_id = '_' + gene_id + '-' + reagent_id self.gu.addIndividualToGraph(self.graph, targeted_gene_id, targeted_gene_label, self.genoparts['reagent_targeted_gene'], description) self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['is_targeted_expression_variant_of'], gene_id) self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['targeted_by'], reagent_id) return def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None, tgs_description=None): if tgs_type is None: tgs_type = self.genoparts['targeted_gene_subregion'] self.gu.addIndividualToGraph(self.graph, tgs_id, tgs_label, tgs_type, tgs_description) def addMemberOfPopulation(self, member_id, population_id): self.gu.addTriple(self.graph, population_id, self.properties['has_member_with_allelotype'], member_id) return def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None, tgc_description=None): if tgc_type is None: tgc_type = self.genoparts['targeted_gene_complement'] self.gu.addIndividualToGraph(self.graph, tgc_id, tgc_label, tgc_type, tgc_description) return def addGenome(self, taxon_id, taxon_label=None): if taxon_label is None: taxon_label = taxon_id genome_label = taxon_label+' genome' genome_id = self.makeGenomeID(taxon_id) self.gu.addClassToGraph(self.graph, genome_id, genome_label, Feature.types['genome']) return def addReferenceGenome(self, build_id, build_label, taxon_id): genome_id = self.makeGenomeID(taxon_id) self.gu.addIndividualToGraph(self.graph, build_id, build_label, Feature.types['reference_genome']) self.gu.addType(self.graph, build_id, genome_id) self.addTaxon(taxon_id, build_id) return def makeGenomeID(self, taxon_id): # scrub off the taxon prefix. 
put it in base space genome_id = re.sub('.*\:', ':', taxon_id) + 'genome' return genome_id def addChromosome(self, chr, tax_id, tax_label=None, build_id=None, build_label=None): # if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome. # if a build is included, punn the chromosome as a subclass of SO:chromsome, and # make the build-specific chromosome an instance of the supplied chr. The chr then becomes part of the # build or genome. # first, make the chromosome class, at the taxon level chr_id = makeChromID(str(chr), tax_id) if tax_label is not None: chr_label = makeChromLabel(chr, tax_label) else: chr_label = makeChromLabel(chr) genome_id = self.makeGenomeID(tax_id) self.gu.addClassToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome']) self.addTaxon(tax_id, genome_id) # add the taxon to the genome if build_id is not None: chrinbuild_id = makeChromID(chr, build_id) # the build-specific chromosome if build_label is None: build_label = build_id chrinbuild_label = makeChromLabel(chr, build_label) # add the build-specific chromosome as an instance of the chr class self.gu.addIndividualToGraph(self.graph, chrinbuild_id, chrinbuild_label, chr_id) # add the build-specific chromosome as a member of the build (both ways) self.gu.addMember(self.graph, build_id, chrinbuild_id) self.gu.addMemberOf(self.graph, chrinbuild_id, build_id) return def addChromosomeClass(self, chrom_num, taxon_id, taxon_label): taxon = re.sub('NCBITaxon:', '', taxon_id) chrom_class_id = makeChromID(chrom_num, taxon, 'CHR') # the chrom class (generic) id chrom_class_label = makeChromLabel(chrom_num, taxon_label) self.gu.addClassToGraph(self.graph, chrom_class_id, chrom_class_label, Feature.types['chromosome']) return def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None): """ Add the supplied chromosome as an instance within the given reference :param chr: :param reference_id: for example, a build id like UCSC:hg19 
:param reference_label: :param chr_type: this is the class that this is an instance of. typically a genome-specific chr :return: """ chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH') chr_label = makeChromLabel(str(chr_num), reference_label) self.gu.addIndividualToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome']) self.gu.addType(self.graph, chr_id, chr_type) # add the build-specific chromosome as a member of the build (both ways) self.gu.addMember(self.graph, reference_id, chr_id) self.gu.addMemberOf(self.graph, chr_id, reference_id) return def make_variant_locus_label(self, gene_label, allele_label): if gene_label is None: gene_label = '' label = gene_label.strip()+'<' + allele_label.strip() + '>' return label def make_vslc_label(self, gene_label, allele1_label, allele2_label): """ Make a Variant Single Locus Complement (VSLC) in monarch-style. :param gene_label: :param allele1_label: :param allele2_label: :return: """ vslc_label = '' if (gene_label is None and allele1_label is None and allele2_label is None): logger.error("Not enough info to make vslc label") return None top = self.make_variant_locus_label(gene_label, allele1_label) bottom = '' if allele2_label is not None: bottom = self.make_variant_locus_label(gene_label, allele2_label) vslc_label = '/'.join((top, bottom)) return vslc_label
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence on statements.

    An association is a subject/relationship/object statement.
    When written to a graph, the direct triple is added, plus a
    reified OBAN:association node that points at the subject, object
    and predicate and carries any description, evidence, sources,
    provenance and score that were set on this object.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        # IAO identifiers have seven digits; the previous value
        # ('IAO:00000136') had an extra zero and named no real term.
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    # merged lookup over all three property kinds
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        """
        :param definedby: the (data) resource providing the association;
            it is the first component hashed into a generated id
        """
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """Return the merged annotation/object/datatype property map."""
        return self.properties

    def _is_valid(self):
        """
        Check that subject, object and relation have all been set.

        :return: True when all three are set
        :raises ValueError: naming the first missing part
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):
        """
        Add the direct triple and the reified association node
        (with description, evidence, sources, provenance and score)
        to the supplied graph.

        If any of subject, predicate or object can not be resolved to
        a graph node, an error is logged and nothing is added.

        :param g: the graph to add to
        :return: None
        :raises ValueError: if subject, object or relation is unset
        """
        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        sub_node = self.gu.getNode(self.sub)
        obj_node = self.gu.getNode(self.obj)
        rel_node = self.gu.getNode(self.rel)

        if sub_node is None:
            logger.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
        if rel_node is None:
            logger.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
        if obj_node is None:
            logger.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return

        g.add((sub_node, rel_node, obj_node))

        # fall back to a generated id when none was supplied
        if self.assoc_id is None:
            self.set_association_id()

        node = self.gu.getNode(self.assoc_id)
        g.add((node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_object'], self.obj)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        if self.evidence:
            for evidence_id in self.evidence:
                self.gu.addTriple(
                    g, self.assoc_id,
                    self.object_properties['has_evidence'], evidence_id)

        if self.source:
            for source_id in self.source:
                if source_id.startswith('http'):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    # urls are passed with the extra flag set
                    self.gu.addTriple(
                        g, self.assoc_id,
                        self.object_properties['has_source'],
                        source_id, True)
                else:
                    self.gu.addTriple(
                        g, self.assoc_id,
                        self.object_properties['has_source'], source_id)

        if self.provenance:
            for prov_id in self.provenance:
                self.gu.addTriple(
                    g, self.assoc_id,
                    self.object_properties['has_provenance'], prov_id)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):
        """
        Write this association to the graph.

        :param g: the graph to add to
        :return: None
        """
        self._add_basic_association_to_graph(g)

        return

    def set_subject(self, identifier):
        """Set the subject identifier of the association."""
        self.sub = identifier
        return

    def set_object(self, identifier):
        """Set the object identifier of the association."""
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """Set the relationship (predicate) identifier."""
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association identifier.

        With no argument, the id is generated from the internal parts
        of the association (definedby, subject, relationship, object).
        Supply assoc_id in cases where an external association
        identifier should be used instead.

        :param assoc_id: an externally-supplied identifier (optional)
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        """Return the association identifier (None until it is set)."""
        return self.assoc_id

    def set_description(self, description):
        """Set the free-text description of the association."""
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.  Only the score value is
        currently written to the graph; unit and type are stored.

        :param score:
        :param unit:
        :param score_type:
        :return: None
        """
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object
        (maintained as a list).  None and blank values are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence.append(identifier)

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list).
        None and blank values are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source.append(identifier)

        return

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank values are ignored.

        :param identifier:
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.provenance.append(identifier)

        return

    def load_all_properties(self, g):
        """
        Load this class's object, annotation and datatype properties
        into the graph, each declared with its OWL property type.

        :param g: the graph to add to
        :return: None
        """
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for prop_type, prop_map in props.items():
            self.gu.loadProperties(g, prop_map, prop_type)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url),
            or None (with an error logged) if it can not be resolved
        """
        source = None
        if pub_id.startswith('http'):
            source = URIRef(pub_id)
        else:
            u = self.gu.getNode(pub_id)
            if u is not None:
                source = URIRef(u)
            else:
                logger.error(
                    "An id we don't know how to deal with: %s", pub_id)

        return source

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association.
        If any of the items is None, it is converted to an empty string;
        any other non-string item is converted with str().
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes: optional iterable of extra values to hash
        :return: a curie of the form MONARCH:<md5 hex digest>
        """
        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.

        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        # str() keeps the join from failing on non-string attributes
        parts = ['' if val is None else str(val) for val in items_to_hash]
        byte_string = '+'.join(parts).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))
def process_catalog(self, limit=None):
    """
    Parse the tab-delimited catalog file and add each reported SNP,
    its location, its gene relationships and its trait associations
    to the graph.

    Rows with haplotypes, with no strongest risk allele, or with an
    unrecognised SNP identifier are skipped.  In test mode only rows
    whose snp gene ids intersect self.test_ids['gene'] are processed.

    :param limit: maximum number of rows to process
        (ignored in test mode); None for no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['catalog']['file']))
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())
    cu = CurieUtil(curie_map.get())

    if self.testMode:  # set the graph to build
        g = self.testgraph
        # build the test id lookup once rather than once per row
        test_gene_ids = {str(i) for i in self.test_ids['gene']}
    else:
        g = self.graph
        test_gene_ids = set()
    line_counter = 0
    geno = Genotype(g)
    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    tax_id = 'NCBITaxon:9606'  # hardcode
    genome_version = 'GRCh38'  # hardcode

    # build a hashmap of genomic location to identifiers,
    # to try to get the equivalences
    loc_to_id_hash = {}

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                continue
            line_counter += 1
            (date_added_to_catalog, pubmed_num, first_author, pub_date,
             journal, link, study_name, disease_or_trait,
             initial_sample_description, replicate_sample_description,
             region, chrom_num, chrom_pos, reported_gene_nums,
             mapped_gene, upstream_gene_num, downstream_gene_num,
             snp_gene_nums, upstream_gene_distance,
             downstream_gene_distance, strongest_snp_risk_allele, snps,
             merged, snp_id_current, context, intergenic_flag,
             risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
             or_or_beta, confidence_interval_95,
             platform_with_snps_passing_qc, cnv_flag, mapped_trait,
             mapped_trait_uri) = row

            # skip if no matches found in test set
            if self.testMode and \
                    test_gene_ids.isdisjoint(snp_gene_nums.split(',')):
                continue

            # 06-May-2015	25917933	Zai CC	20-Nov-2014	J Psychiatr Res
            # http://europepmc.org/abstract/MED/25917933
            # A genome-wide association study of suicide severity scores
            # in bipolar disorder.
            # Suicide in bipolar disorder
            # 959 European ancestry individuals	NA
            # 10p11.22	10	32704340	C10orf68, CCDC7, ITGB1	CCDC7
            # rs7079041-A	rs7079041	0	7079041	intron	0	2E-6	5.698970

            if chrom_num != '' and chrom_pos != '':
                loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
                loc_to_id_hash.setdefault(loc, set())
            else:
                loc = None

            risk_allele = strongest_snp_risk_allele.strip()
            # The identifier is everything before the first hyphen;
            # what follows is the alteration (e.g. the "A" in
            # rs7079041-A).  Splitting the raw value, rather than
            # stripping "-.*" from the finished id, keeps the hyphens
            # that the ":gwas-chr..." ids contain themselves.
            # TODO add the alteration (variation) to the snp
            snp_name = risk_allele.partition('-')[0].strip()

            if ' x ' in strongest_snp_risk_allele \
                    or ',' in strongest_snp_risk_allele:
                # TODO deal with haplotypes
                logger.warning(
                    "We can't deal with haplotypes yet: %s",
                    strongest_snp_risk_allele)
                continue
            elif risk_allele.startswith('rs'):
                rs_id = 'dbSNP:'+snp_name
            elif risk_allele.startswith('kgp'):
                # FIXME this isn't correct
                rs_id = 'dbSNP:'+snp_name
                # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                # for some information
                # They were created by Illumina for their genotyping
                # platform before some variants identified during the
                # pilot phase of the project had been assigned
                # rs numbers.
            elif risk_allele.startswith('chr'):
                # like: chr10:106180121-G
                rs_id = ':gwas-'+snp_name.replace(':', '-')
            elif risk_allele == '':
                # FIXME still consider adding in the EFO terms
                # for what the study measured?
                continue
            else:
                logger.warning(
                    "There's a snp id i can't manage: %s",
                    strongest_snp_risk_allele)
                continue

            if loc is not None:
                loc_to_id_hash[loc].add(rs_id)

            pubmed_id = 'PMID:'+pubmed_num

            r = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            r.addRefToGraph(g)

            # add the feature to the graph
            snp_description = None
            if risk_allele_frequency != '' and \
                    risk_allele_frequency != 'NR':
                snp_description = \
                    str(risk_allele_frequency) + \
                    ' [risk allele frequency]'

            f = Feature(
                rs_id, risk_allele, Feature.types['SNP'], snp_description)
            if loc is not None:
                # create the chromosome only when a location is known
                chrom_id = makeChromID(chrom_num, genome_version, 'CHR')
                f.addFeatureStartLocation(chrom_pos, chrom_id)
                f.addFeatureEndLocation(chrom_pos, chrom_id)
            f.addFeatureToGraph(g)
            f.addTaxonToFeature(g, tax_id)
            # TODO consider adding allele frequency as property;
            # but would need background info to do that

            # also want to add other descriptive info about
            # the variant from the context
            for c in context.split(';'):
                cid = self._map_variant_type(c.strip())
                if cid is not None:
                    gu.addType(g, rs_id, cid)

            # add deprecation information
            # (csv yields strings, so the flag is compared as text)
            current_snp = snp_id_current.strip()
            if merged.strip() == '1' and current_snp != '':
                # get the current rs_id
                if current_snp.startswith('rs'):
                    current_rs_id = 'dbSNP:'+current_snp
                else:
                    current_rs_id = 'dbSNP:rs'+current_snp
                if loc is not None:
                    loc_to_id_hash[loc].add(current_rs_id)
                gu.addDeprecatedIndividual(g, rs_id, current_rs_id)
                # TODO check on this
                # should we add the annotations to the current
                # or orig?
                gu.makeLeader(g, current_rs_id)
            else:
                gu.makeLeader(g, rs_id)

            # add the feature as a sequence alteration
            # affecting various genes
            # note that intronic variations don't necessarily list
            # the genes such as for rs10448080  FIXME
            for s in snp_gene_nums.split(','):
                s = s.strip()
                # still have to test for this,
                # because sometimes there's a leading comma
                if s != '':
                    geno.addAlleleOfGene(rs_id, 'NCBIGene:'+s)

            # add the up and downstream genes if they are available;
            # the snp is upstream of its downstream gene and vice versa,
            # and each guard tests the gene that is actually used
            if downstream_gene_num != '':
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties['upstream_of_sequence_of'],
                    'NCBIGene:'+downstream_gene_num)
            if upstream_gene_num != '':
                gu.addTriple(
                    g, rs_id,
                    Feature.object_properties['downstream_of_sequence_of'],
                    'NCBIGene:'+upstream_gene_num)

            # NOTE the description is not yet attached to the
            # association (see the disabled set_description below)
            description = 'A study of ' + disease_or_trait + \
                          ' in ' + initial_sample_description
            if replicate_sample_description != '':
                description = ' '.join(
                    (description, 'with', replicate_sample_description))
            if platform_with_snps_passing_qc != '':
                description = ' '.join(
                    (description, 'on platform',
                     platform_with_snps_passing_qc))
            description = ' '.join((description, '(p='+pvalue+')'))

            # make associations to the EFO terms; there can be >1
            if mapped_trait_uri.strip() != '':
                for t in mapped_trait_uri.split(','):
                    t = t.strip()
                    tid = cu.get_curie(t)
                    if tid is None:
                        logger.warning(
                            "Can't map trait uri to a curie: %s", t)
                        continue

                    assoc = G2PAssoc(
                        self.name, rs_id, tid,
                        gu.object_properties['contributes_to'])
                    assoc.add_source(pubmed_id)
                    # combinatorial evidence
                    # used in automatic assertion
                    eco_id = 'ECO:0000213'
                    assoc.add_evidence(eco_id)

                    # assoc.set_description(description)
                    # FIXME score should get added to provenance/study
                    # assoc.set_score(pvalue)
                    assoc.add_association_to_graph(g)

            if not self.testMode and \
                    (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)

    # loop through the location hash,
    # and make all snps at that location equivalent
    for loc, snp_ids in loc_to_id_hash.items():
        if len(snp_ids) > 1:
            logger.info("%s has >1 snp id: %s", loc, str(snp_ids))

    return
def add_disease_drug_variant_to_graph(self, table):
    """
    Add variant/disease/drug relationships to the graph.

    Each row of the table is an 11-item iterable
    (optional items can be Null):
    [[variant_key, variant_label, diagnoses_key, diagnoses,
      specific_diagnosis, organ, relationship,
      drug_key, drug, therapy_status (optional), pubmed_id(optional)]]

    For every row a variant-has_phenotype association is made to a
    phenotype instance of the disease, annotated with the drug, and a
    disease instance carrying the variant as biomarker is linked to
    the drug through _add_therapy_drug_association.

    See ongoing discussion of how to best model here:
    https://github.com/monarch-initiative/mckb/issues/9

    :param table: iterable of iterables, for example, a tuple of tuples
                  from _get_disease_drug_variant_relationship
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)

    for record in table:
        (variant_key, variant_label, diagnoses_key, diagnoses,
         specific_diagnosis, organ, relationship,
         drug_key, drug_label, therapy_status, pubmed_id) = record

        # the specific diagnosis wins over the general one when present
        diagnoses_label = diagnoses \
            if specific_diagnosis is None else specific_diagnosis

        # Arbitrary IDs to be replaced by ontology mappings
        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
        disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
        therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
        drug_id = self._get_drug_id(drug_key, drug_label)

        relationship_id = "RO:has_environment"
        has_quality_property = "BFO:0000159"
        marker_relation = "RO:has_biomarker"
        disease_quality = "CGD:{0}".format(relationship).replace(" ", "_")

        geno.addGenotype(variant_id, variant_label,
                         geno.genoparts['sequence_alteration'])

        disease_instance_id = self.make_cgd_id(
            'disease{0}{1}'.format(diagnoses_label, variant_key))
        phenotype_instance_id = self.make_cgd_id(
            'phenotype{0}{1}{2}'.format(
                diagnoses_label, variant_key, relationship))

        if relationship == "detrimental effect":
            phenotype_instance_label = \
                "{0} with therapeutic response {1} to health".format(
                    diagnoses_label, relationship)
        else:
            phenotype_instance_label = "{0} with {1} to therapy".format(
                diagnoses_label, relationship)

        # Reified association for disease caused_by genotype
        variant_disease_annot = self.make_cgd_id(
            "assoc{0}{1}".format(variant_key, diagnoses_label))

        # Add individuals/classes
        gu.addClassToGraph(self.graph, disease_id, diagnoses_label, 'DOID:4')
        gu.addClassToGraph(self.graph, drug_id, drug_label, 'CHEBI:23888')
        gu.addIndividualToGraph(self.graph, phenotype_instance_id,
                                phenotype_instance_label, disease_id)
        gu.loadObjectProperties(self.graph, {relationship: relationship_id})

        # a publication, when given, is both source and evidence basis
        has_publication = pubmed_id is not None
        if has_publication:
            source_id = "PMID:{0}".format(pubmed_id)
            ref = Reference(source_id,
                            Reference.ref_types['journal_article'])
            ref.addRefToGraph(self.graph)

        variant_phenotype_assoc = G2PAssoc(
            self.name, variant_id, phenotype_instance_id,
            gu.object_properties['has_phenotype'])
        variant_phenotype_assoc.set_association_id(variant_disease_annot)
        if has_publication:
            variant_phenotype_assoc.add_evidence('ECO:0000033')
            variant_phenotype_assoc.add_source(source_id)
        variant_phenotype_assoc.add_association_to_graph(self.graph)

        gu.addTriple(self.graph, variant_disease_annot,
                     relationship_id, drug_id)
        gu.addTriple(self.graph, phenotype_instance_id,
                     has_quality_property, disease_quality)

        # Add therapy-disease association and approval status
        disease_instance_label = "{0} with biomarker {1}".format(
            diagnoses_label, variant_label)
        gu.addIndividualToGraph(self.graph, disease_instance_id,
                                disease_instance_label, disease_id)
        gu.addTriple(self.graph, disease_instance_id,
                     marker_relation, variant_id)
        gu.addClassToGraph(self.graph, therapy_status_id, therapy_status)
        self._add_therapy_drug_association(
            drug_id, disease_instance_id, therapy_status_id)

    return
def _process_genes(self, limit=None):
    """
    Parse the tab-delimited HGNC genes file and add each gene to the
    graph as a class, together with its NCBIGene/ENSEMBL equivalents,
    its taxon, the publications about it, and its chromosome or
    chromosome band as the feature it is a subsequence of.

    In test mode, rows with an entrez id that is not in self.gene_ids
    are skipped.

    :param limit: maximum number of rows to process
        (ignored in test mode); None for no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    raw = '/'.join((self.rawdir, self.files['genes']['file']))
    line_counter = 0
    logger.info("Processing HGNC genes")

    # Compiled once, outside the row loop.
    # The chromosome must be followed by an arm (p/q) or by the end of
    # the location; inside a character class "$" is a literal dollar
    # sign, so the end-of-string case needs an explicit alternation.
    chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
    band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

    with open(raw, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in filereader:
            (hgnc_id, symbol, name, locus_group, locus_type, status,
             location, location_sortable, alias_symbol, alias_name,
             prev_symbol, prev_name, gene_family, gene_family_id,
             date_approved_reserved, date_symbol_changed,
             date_name_changed, date_modified, entrez_id,
             ensembl_gene_id, vega_id, ucsc_id, ena, refseq_accession,
             ccds_id, uniprot_ids, pubmed_id, mgd_id, rgd_id, lsdb,
             cosmic, omim_id, mirbase, homeodb, snornabase,
             bioparadigms_slc, orphanet, pseudogene_org, horde_id,
             merops, imgt, iuphar, kznf_gene_catalog, mamit_trnadb, cd,
             lncrnadb, enzyme_id, intermediate_filament_db) = row

            line_counter += 1

            # skip header
            if line_counter <= 1:
                continue

            if self.testMode and entrez_id != '' \
                    and int(entrez_id) not in self.gene_ids:
                continue

            if name == '':
                name = None
            gene_type_id = self._get_gene_type(locus_type)
            gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
            if locus_type == 'withdrawn':
                gu.addDeprecatedClass(g, hgnc_id)
            if entrez_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'NCBIGene:' + entrez_id)
            if ensembl_gene_id != '':
                gu.addEquivalentClass(
                    g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
            geno.addTaxon('NCBITaxon:9606', hgnc_id)

            # add pubs as "is about"
            if pubmed_id != '':
                for p in pubmed_id.strip().split('|'):
                    if str(p) != '':
                        gu.addTriple(
                            g, 'PMID:' + str(p.strip()),
                            gu.object_properties['is_about'], hgnc_id)

            # add chr location
            # sometimes two are listed, like: 10p11.2 or 17q25
            # -- there are only 2 of these FRA10A and MPFD
            # sometimes listed like "1 not on reference assembly"
            # sometimes listed like 10q24.1-q24.3
            # sometimes like 11q11 alternate reference locus
            chr_match = chr_regex.match(location)
            if chr_match is not None:
                chrom = chr_match.group(1)
                chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                band_match = band_regex.search(location)
                f = Feature(hgnc_id, None, None)
                if band_match is not None:
                    band = chrom + band_match.group(1)
                    # add the chr band as the parent to this gene
                    # as a feature but assume that the band is created
                    # as a class with properties elsewhere in Monochrom
                    # TEC Monoch? Monarchdom??
                    band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                    gu.addClassToGraph(g, band_id, None)
                    f.addSubsequenceOfFeature(g, band_id)
                else:
                    # no band available; attach to the whole chromosome
                    gu.addClassToGraph(g, chrom_id, None)
                    f.addSubsequenceOfFeature(g, chrom_id)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

        # end loop through file

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
class Pathway:
    """
    Convenience methods for recording genes, gene products and other
    components as participants in pathways.
    """

    pathway_parts = {
        'signal_transduction': 'GO:0007165',
        'cellular_process': 'GO:0009987',
        'pathway': 'PW:0000001',
        'gene_product': 'CHEBI:33695'  # bioinformation molecule
    }

    object_properties = {
        'involved_in': 'RO:0002331',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205'
    }

    properties = dict(object_properties)

    def __init__(self, graph, nobnodes=False):
        """
        :param graph: the graph that all methods write to
        :param nobnodes: when True, generated gene-product ids are
            prefixed with ':' (see addGeneToPathway)
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.nobnodes = nobnodes

        # declare this class's object properties in the graph up front
        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

    def addPathway(
            self, pathway_id, pathway_label, pathway_type=None,
            pathway_description=None):
        """
        Add a pathway to the graph as a class.
        When no type is given, "GO:cellular_process" is used as the
        type; in every case a subclass relationship with "PW:pathway"
        is also added.

        :param pathway_id:
        :param pathway_label:
        :param pathway_type:
        :param pathway_description:
        :return: None
        """
        parent_type = pathway_type
        if parent_type is None:
            parent_type = self.pathway_parts['cellular_process']

        self.gu.addClassToGraph(
            self.graph, pathway_id, pathway_label, parent_type,
            pathway_description)
        self.gu.addSubclass(
            self.graph, self.pathway_parts['pathway'], pathway_id)

    def addGeneToPathway(self, pathway_id, gene_id):
        """
        Link a gene to a pathway through an intermediate 'gene product'
        individual, whose id is derived from the gene id:

        gene_id RO:has_gene_product _gene_product
        _gene_product RO:involved_in pathway_id

        :param pathway_id:
        :param gene_id:
        :return: None
        """
        product_id = ''.join(('_', re.sub(r':', '', gene_id), 'product'))
        if self.nobnodes:
            product_id = ':' + product_id

        product_type = self.pathway_parts['gene_product']
        has_product = self.object_properties['has_gene_product']

        self.gu.addIndividualToGraph(
            self.graph, product_id, None, product_type)
        self.gu.addTriple(self.graph, gene_id, has_product, product_id)
        self.addComponentToPathway(pathway_id, product_id)

    def addComponentToPathway(self, pathway_id, component_id):
        """
        State that a component is involved in a pathway.
        Use this when the component itself takes part in the pathway;
        when a transforming event happens to the component first,
        use addGeneToPathway instead.

        :param pathway_id:
        :param component_id:
        :return: None
        """
        involved_in = self.object_properties['involved_in']
        self.gu.addTriple(
            self.graph, component_id, involved_in, pathway_id)
def _process_data(self, raw, limit=None):
    """
    Parse one gzipped, comma-delimited genotype-phenotype file and add
    its contents to the graph (the test graph when in test mode).

    The first row is skipped as a header; every following row must have
    exactly 28 columns. For each row we build:

    * the marker (gene), the allele (variant locus) and its
      sequence alteration;
    * the colony, with a genotype of indeterminate zygosity;
    * a sex-agnostic genotype (VSLC plus a phenotyping-center-specific
      genomic background) and a sex-qualified genotype;
    * a genotype-to-phenotype association, with a free-text description,
      between the sex-qualified genotype and the MP term.

    Rows without a phenotype id are logged and skipped.

    :param raw: path of the gzipped csv file to process
    :param limit: (Optional) stop once more than this many rows have
        been read; ignored in test mode
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)
    line_counter = 0
    gu.loadAllProperties(g)
    gu.loadObjectProperties(g, geno.object_properties)

    # Add the taxon as a class
    taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
    gu.addClassToGraph(g, taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1

            (marker_accession_id, marker_symbol, phenotyping_center,
             colony, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id, top_level_mp_term_name,
             mp_term_id, mp_term_name, p_value, percentage_change,
             effect_size, statistical_method, resource_name) = row

            if self.testMode and marker_accession_id not in self.test_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self._map_zygosity(zygosity)

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_'+re.sub(r'\W+', '_', colony)
            if self.nobnodes:
                colony_id = ':'+colony_id

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = \
                    '_IMPC-'+re.sub(r':', '', allele_accession_id)
                if self.nobnodes:
                    allele_accession_id = ':'+allele_accession_id
            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                strain_accession_id = '_'+strain_accession_id
                if self.nobnodes:
                    strain_accession_id = ':'+strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                logger.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:'+strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = \
                    re.match(r'.*<(.*)>', allele_symbol).group(1)
            else:
                sequence_alteration_name = allele_symbol

            # an empty marker column means the marker is unknown
            if marker_accession_id == '':
                logger.warning(
                    "Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = geno.genoparts['variant_locus']
                geno.addGene(marker_accession_id, marker_symbol,
                             geno.genoparts['gene'])
                geno.addAllele(variant_locus_id, variant_locus_name,
                               variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                sequence_alteration_id = \
                    '_seqalt'+re.sub(r':', '', allele_accession_id)
                if self.nobnodes:
                    sequence_alteration_id = ':'+sequence_alteration_id
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(sequence_alteration_id,
                                       sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = 'ERO:0002002'
            gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and its relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = \
                '_'+allele_accession_id+geno.zygosity['indeterminate']
            vslc_colony = re.sub(r':', '', vslc_colony)
            if self.nobnodes:
                vslc_colony = ':'+vslc_colony
            vslc_colony_label = allele_symbol+'/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(allele_accession_id, colony_genotype_id,
                          geno.object_properties['has_alternate_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'])
            gu.addTriple(
                g, colony_id,
                geno.object_properties['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = geno.object_properties['has_alternate_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                logger.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this abandons the rest of the file, not
                # just this row -- confirm that is intended
                break
            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            # (the marker is None when it was unspecified on this row,
            # so leave it out of the identifier rather than fail to join)
            vslc_id = '_' + '-'.join(
                part for part in
                (marker_accession_id, allele_accession_id, zygosity)
                if part is not None)
            vslc_id = re.sub(r':', '', vslc_id)
            if self.nobnodes:
                vslc_id = ':'+vslc_id
            gu.addIndividualToGraph(
                g, vslc_id, vslc_name,
                geno.genoparts['variant_single_locus_complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                geno.object_properties['has_alternate_part'],
                allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            gu.addType(
                g, vslc_id,
                Genotype.genoparts['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    geno.genoparts['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '/' + phenotyping_center
                pheno_center_strain_id = \
                    '-'.join((re.sub(r':', '', genomic_background_id),
                              re.sub(r'\s', '_', phenotyping_center)))
                if not re.match(r'^_', pheno_center_strain_id):
                    pheno_center_strain_id = '_'+pheno_center_strain_id
                if self.nobnodes:
                    pheno_center_strain_id = ':'+pheno_center_strain_id
                geno.addGenotype(pheno_center_strain_id,
                                 pheno_center_strain_label,
                                 geno.genoparts['genomic_background'])
                geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                            genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name+' ['+pheno_center_strain_label+']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                # taxon goes first, as in the other addTaxon calls below
                geno.addTaxon(taxon_id, pheno_center_strain_id)

            # this is redundant, but i'll keep it in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            genotype_name += '['+colony+']'
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id+sex))
            sex_qualified_genotype_label = genotype_name+' ('+sex+')'
            if sex == 'male':
                sq_type_id = geno.genoparts['male_genotype']
            elif sex == 'female':
                sq_type_id = geno.genoparts['female_genotype']
            else:
                sq_type_id = geno.genoparts['sex_qualified_genotype']

            geno.addGenotype(
                sex_qualified_genotype_id,
                sex_qualified_genotype_label, sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                geno.object_properties['has_alternate_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                logger.warning(
                    "No phenotype id specified for row %d: %s",
                    line_counter, str(row))
                continue
            # experimental_phenotypic_evidence This was used in ZFIN
            eco_id = "ECO:0000059"

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                             phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph(g)
            assoc_id = assoc.get_association_id()

            # add a free-text description
            description = \
                ' '.join((mp_term_name, 'phenotype determined by',
                          phenotyping_center, 'in an',
                          procedure_name, 'assay where',
                          parameter_name.strip(),
                          'was measured with an effect_size of',
                          str(round(float(effect_size), 5)),
                          '(p =', "{:.4e}".format(float(p_value)), ').'))

            gu.addDescription(g, assoc_id, description)

            # TODO add provenance information
            # resource_id = resource_name
            # assoc.addSource(g, assoc_id, resource_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
    gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

    return
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical
    descriptions in order to populate the definitions of the terms in the
    ontology. We define the GeneReviews items as classes that are either
    grouping classes over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington,
    Seattle, © 1993-2015. Permission is hereby granted to reproduce,
    distribute, and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
    and copyright (University of Washington, Seattle)
    are included with each copy;
    (ii) a link to the original material is provided whenever the material
    is published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
    copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBIBookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.

    """

    files = {
        'idmap': {'file': 'NBKid_shortname_OMIM.txt',
                  'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {'file': 'GRtitle_shortname_NBKid.txt',
                   'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
    }

    def __init__(self):
        Source.__init__(self, 'genereviews')

        self.load_bindings()

        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, 'http://www.ncbi.nlm.nih.gov/books/NBK138602/')
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.gu = GraphUtils(curie_map.get())

        # the NBK numbers seen in the titles file
        self.book_ids = set()
        # NBK number -> {'file': local relative path, 'url': bookshelf url}
        self.all_books = {}

        if 'test_ids' not in config.get_config() or\
                'disease' not in config.get_config()['test_ids']:
            logger.warning("not configured with disease test ids.")
            self.test_ids = list()
        else:
            # select only those test ids that are omim's.
            self.test_ids = config.get_config()['test_ids']['disease']

        return

    def fetch(self, is_dl_forced=False):
        """
        We fetch GeneReviews id-label map and id-omim mapping files from NCBI.
        :param is_dl_forced: passed straight through to get_files
        :return: None
        """

        self.get_files(is_dl_forced)

        return

    def parse(self, limit=None):
        """
        Process the titles file, then the OMIM mapping file, and finally
        any book html that is available locally.

        :param limit: (Optional) passed to each of the processing steps
        :return: None
        """

        if self.testOnly:
            self.testMode = True

        # titles must go first: this fills self.book_ids,
        # which every later step iterates over
        self._get_titles(limit)
        self._get_equivids(limit)

        self.create_books()
        self.process_nbk_html(limit)

        self.load_bindings()

        # no test subset for now; test == full graph
        self.testgraph = self.graph

        logger.info("Found %d nodes", len(self.graph))

        return

    def _get_equivids(self, limit):
        """
        The file processed here is of the format:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Where each of the rows represents a mapping between
        a gr id and an omim id. These are a 1:many relationship,
        and some of the omim ids are genes(not diseases).
        Therefore, we need to create a loose coupling here.
        We make the assumption that these NBKs are generally higher-level
        grouping classes; therefore the OMIM ids are treated as subclasses.
        (This assumption is poor for those omims that are actually genes,
        but we have no way of knowing what those are here...
        we will just have to deal with that for now.)
        :param limit: (Optional) stop reading the file once more than
            this many lines have been read; ignored in test mode
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        id_map = {}
        allomimids = set()
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:'+nbk_num
                omim_id = 'OMIM:'+omim_num
                # in test mode, keep only the rows for our test ids
                if not (
                        (self.testMode and
                         len(self.test_ids) > 0 and
                         omim_id in self.test_ids) or not
                        self.testMode):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted " +
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # build up a hashmap of the mappings; then process later
                if nbk_num not in id_map:
                    id_map[nbk_num] = set()
                id_map[nbk_num].add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                allomimids.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            # end looping through file

        # get the omim ids that are not genes
        entries_that_are_phenotypes = \
            omim.process_entries(
                list(allomimids), filter_keep_phenotype_entry_ids,
                None, None, limit)

        logger.info("Filtered out %d/%d entries that are genes or features",
                    len(allomimids)-len(entries_that_are_phenotypes),
                    len(allomimids))

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            if nbk_num in id_map:
                omim_ids = id_map.get(nbk_num)
                for omim_num in omim_ids:
                    omim_id = 'OMIM:'+omim_num
                    # add the gene reviews as a superclass to the omim id,
                    # but only if the omim id is not a gene
                    if omim_id in entries_that_are_phenotypes:
                        gu.addClassToGraph(self.graph, omim_id, None)
                        gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

        return

    def _get_titles(self, limit):
        """
        Read the tab-delimited titles file, skipping its first (header)
        row. Every following row is unpacked, in order, as a shortname,
        a title and an NBK number.

        Each NBK number is recorded in self.book_ids. The book is added
        to the graph as a class labelled with its title, with the
        shortname as a synonym.

        :param limit: (Optional) only rows read before this line count
            are added to the graph; all NBK numbers are still recorded
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r', encoding='latin-1') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter == 1:  # skip header
                    continue
                (shortname, title, nbk_num) = row
                gr_id = 'GeneReviews:'+nbk_num

                self.book_ids.add(nbk_num)  # a global set of the book nums

                if limit is None or line_counter < limit:
                    gu.addClassToGraph(self.graph, gr_id, title)
                    gu.addSynonym(self.graph, gr_id, shortname)

        return

    def create_books(self):
        """
        Fill self.all_books with, for every known NBK number, the relative
        path of its local html file and its url at the NCBI Bookshelf.

        :return: None
        """

        # note that although we put in the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        book_item = {'file': 'books/', 'url': ''}

        for nbk in self.book_ids:
            b = book_item.copy()
            b['file'] = '/'.join(('books', nbk+'.html'))
            b['url'] = 'http://www.ncbi.nlm.nih.gov/books/'+nbk
            self.all_books[nbk] = b

        return

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        For every book found we add the clinical summary as the definition
        of the book's class, and link each cited PubMed reference to
        the book.

        :param limit: (Optional) stop once more than this many books
            have been looked at
        :return: None
        """
        c = 0
        books_not_found = set()

        # the directory listing does not change while we parse,
        # so read it once instead of once per book
        book_dir = '/'.join((self.rawdir, 'books'))
        book_files = set(os.listdir(book_dir)) if self.book_ids else set()

        for nbk in self.book_ids:
            c += 1
            nbk_id = 'GeneReviews:'+nbk
            book_item = self.all_books.get(nbk)
            url = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            if ''.join((nbk, '.html')) not in book_files:
                # logger.warning("No book found locally for %s; skipping", nbk)
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # make sure the file handle is released once the book is read
            with open(url) as page:
                soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = \
                soup.find(
                    'div', id=re.compile(".*Summary.sec0"))
            # a summary without any paragraph gives us nothing to define
            first_par = None
            if clin_summary is not None:
                first_par = clin_summary.find('p')
            if first_par is not None:
                ptext = re.sub(r'\s+', ' ', first_par.text)

                ul = clin_summary.find('ul')
                if ul is not None:
                    item_text = [
                        re.sub(r'\s+', ' ', li.text)
                        for li in ul.find_all('li')]
                    ptext += ' '.join(item_text)

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pub_div = soup.find('div', id=re.compile(r".*Literature_Cited"))
            if pub_div is not None:
                ref_list = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for ref_div in ref_list:
                    for a in ref_div.find_all(
                            'a', attrs={'href': re.compile(r"pubmed")}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            # not every pubmed link ends in a numeric id
                            href_match = re.search(
                                r'\/pubmed\/(\d+)$', a['href'])
                            pmnum = None
                            if href_match is not None:
                                pmnum = href_match.group(1)
                        if pmnum is not None:
                            pmid = 'PMID:'+str(pmnum)
                            self.gu.addTriple(
                                self.graph, pmid,
                                self.gu.object_properties['is_about'],
                                nbk_id)
                            ref = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            ref.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and c > limit:
                break

            # finish looping through books

        num_missing = len(books_not_found)
        if num_missing > 0:
            if num_missing > 100:
                logger.warning("There were %d books not found.", num_missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    num_missing, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            c-num_missing)

        return

    def getTestSuite(self):
        """
        :return: the unittest suite built from GeneReviewsTestCase
        """
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        test_suite = \
            unittest.TestLoader().loadTestsFromTestCase(GeneReviewsTestCase)

        return test_suite
class OMIA(Source): """ This is the parser for the [Online Mendelian Inheritance in Animals (OMIA)](http://www.http://omia.angis.org.au), from which we process inherited disorders, other (single-locus) traits, and genes in >200 animal species (other than human and mouse and rats). We generate the omia graph to include the following information: * genes * animal taxonomy, and breeds as instances of those taxa (breeds are akin to "strains" in other taxa) * animal diseases, along with species-specific subtypes of those diseases * publications (and their mapping to PMIDs, if available) * gene-to-phenotype associations (via an anonymous variant-locus * breed-to-phenotype associations We make links between OMIA and OMIM in two ways: 1. mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM 2. mappings between a breed and OMIA disease are created to be a model for the mapped OMIM disease, IF AND ONLY IF it is a 1:1 mapping. there are some 1:many mappings, and these often happen if the OMIM item is a gene. Because many of these species are not covered in the PANTHER orthology datafiles, we also pull any orthology relationships from the gene_group files from NCBI. 
""" files = { 'data': { 'file': 'omia.xml.gz', 'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'}, } def __init__(self): Source.__init__(self, 'omia') self.load_bindings() self.dataset = Dataset( 'omia', 'Online Mendelian Inheritance in Animals', 'http://omia.angis.org.au', None, None, 'http://sydney.edu.au/disclaimer.shtml') self.id_hash = { 'article': {}, 'phene': {}, 'breed': {}, 'taxon': {}, 'gene': {} } self.label_hash = {} self.gu = GraphUtils(curie_map.get()) # used to store the omia to omim phene mappings self.omia_omim_map = {} # used to store the unique genes that have phenes # (for fetching orthology) self.annotated_genes = set() self.test_ids = { 'disease': [ 'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201', 'OMIA:000810', 'OMIA:001400'], 'gene': [ 492297, 434, 492296, 3430235, 200685834, 394659996, 200685845, 28713538, 291822383], 'taxon': [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825], # to be filled in during parsing of breed table # for lookup by breed-associations 'breed': [] } # to store a map of omia ids and any molecular info # to write a report for curation self.stored_omia_mol_gen = {} self.g = self.graph self.geno = Genotype(self.g) return def fetch(self, is_dl_forced=False): """ :param is_dl_forced: :return: """ self.get_files(is_dl_forced) ncbi = NCBIGene() # ncbi.fetch() gene_group = ncbi.files['gene_group'] self.fetch_from_url( gene_group['url'], '/'.join((ncbi.rawdir, gene_group['file'])), False) return def parse(self, limit=None): # names of tables to iterate - probably don't need all these: # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword, # Article_People, Article_Phene, Articles, Breed, Breed_Phene, # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords, # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People, # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms self.scrub() if limit is not None: logger.info("Only parsing first %d rows", limit) logger.info("Parsing 
files...") if self.testOnly: self.testMode = True if self.testMode: self.g = self.testgraph else: self.g = self.graph self.geno = Genotype(self.g) # we do three passes through the file # first process species (two others reference this one) self.process_species(limit) # then, process the breeds, genes, articles, and other static stuff self.process_classes(limit) # next process the association data self.process_associations(limit) # process the vertebrate orthology for genes # that are annotated with phenotypes ncbi = NCBIGene() ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes) self.load_core_bindings() self.load_bindings() logger.info("Done parsing.") self.write_molgen_report() return def scrub(self): """ The XML file seems to have mixed-encoding; we scrub out the control characters from the file for processing. :return: """ logger.info( "Scrubbing out the nasty characters that break our parser.") myfile = '/'.join((self.rawdir, self.files['data']['file'])) tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz')) t = gzip.open(tmpfile, 'wb') du = DipperUtil() with gzip.open(myfile, 'rb') as f: filereader = io.TextIOWrapper(f, newline="") for l in filereader: l = du.remove_control_characters(l) + '\n' t.write(l.encode('utf-8')) t.close() # move the temp file logger.info("Replacing the original data with the scrubbed file.") shutil.move(tmpfile, myfile) return # ###################### XML LOOPING FUNCTIONS ################## def process_species(self, limit): """ Loop through the xml file and process the species. We add elements to the graph, and store the id-to-label in the label_hash dict. :param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line for event, elem in ET.iterparse(filereader): # Species ids are == genbank species ids! 
self.process_xml_table( elem, 'Species_gb', self._process_species_table_row, limit) f.close() return def process_classes(self, limit): """ Loop through the xml file and process the articles, breed, genes, phenes, and phenotype-grouping classes. We add elements to the graph, and store the id-to-label in the label_hash dict, along with the internal key-to-external id in the id_hash dict. The latter are referenced in the association processing functions. :param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line parser = ET.XMLParser(encoding='utf-8') for event, elem in ET.iterparse(filereader, parser=parser): self.process_xml_table( elem, 'Articles', self._process_article_row, limit) self.process_xml_table( elem, 'Breed', self._process_breed_row, limit) self.process_xml_table( elem, 'Genes_gb', self._process_gene_row, limit) self.process_xml_table( elem, 'OMIA_Group', self._process_omia_group_row, limit) self.process_xml_table( elem, 'Phene', self._process_phene_row, limit) self.process_xml_table( elem, 'Omim_Xref', self._process_omia_omim_map, limit) f.close() # post-process the omia-omim associations to filter out the genes # (keep only phenotypes/diseases) self.clean_up_omim_genes() return def process_associations(self, limit): """ Loop through the xml file and process the article-breed, article-phene, breed-phene, phene-gene associations, and the external links to LIDA. 
:param limit: :return: """ myfile = '/'.join((self.rawdir, self.files['data']['file'])) f = gzip.open(myfile, 'rb') filereader = io.TextIOWrapper(f, newline="") filereader.readline() # remove the xml declaration line for event, elem in ET.iterparse(filereader): self.process_xml_table( elem, 'Article_Breed', self._process_article_breed_row, limit) self.process_xml_table( elem, 'Article_Phene', self._process_article_phene_row, limit) self.process_xml_table( elem, 'Breed_Phene', self._process_breed_phene_row, limit) self.process_xml_table( elem, 'Lida_Links', self._process_lida_links_row, limit) self.process_xml_table( elem, 'Phene_Gene', self._process_phene_gene_row, limit) self.process_xml_table( elem, 'Group_MPO', self._process_group_mpo_row, limit) f.close() return # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################ def _process_species_table_row(self, row): # gb_species_id, sci_name, com_name, added_by, date_modified tax_id = 'NCBITaxon:'+str(row['gb_species_id']) sci_name = row['sci_name'] com_name = row['com_name'] if self.testMode and \ (int(row['gb_species_id']) not in self.test_ids['taxon']): return self.gu.addClassToGraph(self.g, tax_id, sci_name) if com_name != '': self.gu.addSynonym(self.g, tax_id, com_name) self.label_hash[tax_id] = com_name # for lookup later else: self.label_hash[tax_id] = sci_name return def _process_breed_row(self, row): # in test mode, keep all breeds of our test species if self.testMode and \ (int(row['gb_species_id']) not in self.test_ids['taxon']): return # save the breed keys in the test_ids for later processing self.test_ids['breed'] += [int(row['breed_id'])] breed_id = self.make_breed_id(row['breed_id']) self.id_hash['breed'][row['breed_id']] = breed_id tax_id = 'NCBITaxon:'+str(row['gb_species_id']) breed_label = row['breed_name'] species_label = self.label_hash.get(tax_id) if species_label is not None: breed_label = breed_label + ' ('+species_label+')' self.gu.addIndividualToGraph(self.g, breed_id, 
breed_label, tax_id) self.label_hash[breed_id] = breed_label return def _process_phene_row(self, row): phenotype_id = None sp_phene_label = row['phene_name'] if sp_phene_label == '': sp_phene_label = None if 'omia_id' not in row: logger.info("omia_id not present for %s", row['phene_id']) omia_id = self._make_internal_id('phene', phenotype_id) else: omia_id = 'OMIA:'+str(row['omia_id']) if self.testMode and not\ (int(row['gb_species_id']) in self.test_ids['taxon'] and omia_id in self.test_ids['disease']): return # add to internal hash store for later lookup self.id_hash['phene'][row['phene_id']] = omia_id descr = row['summary'] if descr == '': descr = None # omia label omia_label = self.label_hash.get(omia_id) # add the species-specific subclass (TODO please review this choice) gb_species_id = row['gb_species_id'] if gb_species_id != '': sp_phene_id = '-'.join((omia_id, gb_species_id)) else: logger.error( "No species supplied in species-specific phene table for %s", omia_id) return species_id = 'NCBITaxon:'+str(gb_species_id) # use this instead species_label = self.label_hash.get('NCBITaxon:'+gb_species_id) if sp_phene_label is None and \ omia_label is not None and species_label is not None: sp_phene_label = ' '.join((omia_label, 'in', species_label)) self.gu.addClassToGraph( self.g, sp_phene_id, sp_phene_label, omia_id, descr) # add to internal hash store for later lookup self.id_hash['phene'][row['phene_id']] = sp_phene_id self.label_hash[sp_phene_id] = sp_phene_label # add each of the following descriptions, # if they are populated, with a tag at the end. 
for item in [ 'clin_feat', 'history', 'pathology', 'mol_gen', 'control']: if row[item] is not None and row[item] != '': self.gu.addDescription( self.g, sp_phene_id, row[item] + ' ['+item+']') # if row['symbol'] is not None: # species-specific # CHECK ME - sometimes spaces or gene labels # gu.addSynonym(g, sp_phene, row['symbol']) self.gu.addOWLPropertyClassRestriction( self.g, sp_phene_id, self.gu.object_properties['in_taxon'], species_id) # add inheritance as an association inheritance_id = self._map_inheritance_term_id(row['inherit']) if inheritance_id is not None: assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id) assoc.add_association_to_graph(self.g) if row['characterised'] == 'Yes': self.stored_omia_mol_gen[omia_id] = { 'mol_gen': row['mol_gen'], 'map_info': row['map_info'], 'species': row['gb_species_id']} return def write_molgen_report(self): import csv logger.info("Writing G2P report for OMIA") f = '/'.join((self.outdir, 'omia_molgen_report.txt')) with open(f, 'w', newline='\n') as csvfile: writer = csv.writer(csvfile, delimiter='\t') # write header h = ['omia_id', 'molecular_description', 'mapping_info', 'species'] writer.writerow(h) for phene in self.stored_omia_mol_gen: writer.writerow((str(phene), self.stored_omia_mol_gen[phene]['mol_gen'], self.stored_omia_mol_gen[phene]['map_info'], self.stored_omia_mol_gen[phene]['species'])) logger.info( "Wrote %d potential G2P descriptions for curation to %s", len(self.stored_omia_mol_gen), f) return def _process_article_row(self, row): # don't bother in test mode if self.testMode: return iarticle_id = self._make_internal_id('article', row['article_id']) self.id_hash['article'][row['article_id']] = iarticle_id rtype = None if row['journal'] != '': rtype = Reference.ref_types['journal_article'] r = Reference(iarticle_id, rtype) if row['title'] is not None: r.setTitle(row['title'].strip()) if row['year'] is not None: r.setYear(row['year']) r.addRefToGraph(self.g) if row['pubmed_id'] is not None: pmid = 
'PMID:'+str(row['pubmed_id']) self.id_hash['article'][row['article_id']] = pmid self.gu.addSameIndividual(self.g, iarticle_id, pmid) self.gu.addComment(self.g, pmid, iarticle_id) return def _process_omia_group_row(self, row): omia_id = 'OMIA:'+row['omia_id'] if self.testMode and omia_id not in self.test_ids['disease']: return group_name = row['group_name'] group_summary = row['group_summary'] disease_id = None group_category = row.get('group_category') disease_id = \ self.map_omia_group_category_to_ontology_id(group_category) if disease_id is not None: self.gu.addClassToGraph(self.g, disease_id, None) if disease_id == 'MP:0008762': # embryonic lethal # add this as a phenotype association # add embryonic onset assoc = D2PAssoc(self.name, omia_id, disease_id) assoc.add_association_to_graph(self.g) disease_id = None else: logger.info( "No disease superclass defined for %s: %s", omia_id, group_name) # default to general disease FIXME this may not be desired disease_id = 'DOID:4' if group_summary == '': group_summary = None if group_name == '': group_name = None self.gu.addClassToGraph( self.g, omia_id, group_name, disease_id, group_summary) self.label_hash[omia_id] = group_name return def _process_gene_row(self, row): if self.testMode and row['gene_id'] not in self.test_ids['gene']: return gene_id = 'NCBIGene:'+str(row['gene_id']) self.id_hash['gene'][row['gene_id']] = gene_id gene_label = row['symbol'] self.label_hash[gene_id] = gene_label tax_id = 'NCBITaxon:'+str(row['gb_species_id']) gene_type_id = NCBIGene.map_type_of_gene(row['gene_type']) self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id) self.geno.addTaxon(tax_id, gene_id) return def _process_article_breed_row(self, row): # article_id, breed_id, added_by # don't bother putting these into the test... too many! 
# and int(row['breed_id']) not in self.test_ids['breed']: if self.testMode: return article_id = self.id_hash['article'].get(row['article_id']) breed_id = self.id_hash['breed'].get(row['breed_id']) # there's some missing data (article=6038). in that case skip if article_id is not None: self.gu.addTriple( self.g, article_id, self.gu.object_properties['is_about'], breed_id) else: logger.warning("Missing article key %s", str(row['article_id'])) return def _process_article_phene_row(self, row): """ Linking articles to species-specific phenes. :param row: :return: """ # article_id, phene_id, added_by # look up the article in the hashmap phenotype_id = self.id_hash['phene'].get(row['phene_id']) article_id = self.id_hash['article'].get(row['article_id']) omia_id = self._get_omia_id_from_phene_id(phenotype_id) if self.testMode and omia_id not in self.test_ids['disease'] \ or phenotype_id is None or article_id is None: return # make a triple, where the article is about the phenotype self.gu.addTriple( self.g, article_id, self.gu.object_properties['is_about'], phenotype_id) return def _process_breed_phene_row(self, row): # Linking disorders/characteristic to breeds # breed_id, phene_id, added_by breed_id = self.id_hash['breed'].get(row['breed_id']) phene_id = self.id_hash['phene'].get(row['phene_id']) # get the omia id omia_id = self._get_omia_id_from_phene_id(phene_id) if (self.testMode and not ( omia_id in self.test_ids['disease'] and int(row['breed_id']) in self.test_ids['breed']) or breed_id is None or phene_id is None): return # FIXME we want a different relationship here assoc = G2PAssoc( self.name, breed_id, phene_id, self.gu.object_properties['has_phenotype']) assoc.add_association_to_graph(self.g) # add that the breed is a model of the human disease # use the omia-omim mappings for this # we assume that we have already scrubbed out the genes # from the omim list, so we can make the model associations here omim_ids = self.omia_omim_map.get(omia_id) eco_id = 
"ECO:0000214" # biological aspect of descendant evidence if omim_ids is not None and len(omim_ids) > 0: if len(omim_ids) > 1: logger.info( "There's 1:many omia:omim mapping: %s, %s", omia_id, str(omim_ids)) for i in omim_ids: assoc = G2PAssoc( self.name, breed_id, i, self.gu.object_properties['model_of']) assoc.add_evidence(eco_id) assoc.add_association_to_graph(self.g) aid = assoc.get_association_id() breed_label = self.label_hash.get(breed_id) if breed_label is None: breed_label = "this breed" m = re.search(r'\((.*)\)', breed_label) if m: sp_label = m.group(1) else: sp_label = '' phene_label = self.label_hash.get(phene_id) if phene_label is None: phene_label = "phenotype" elif phene_label.endswith(sp_label): # some of the labels we made already include the species; # remove it to make a cleaner desc phene_label = re.sub(r' in '+sp_label, '', phene_label) desc = ' '.join( ("High incidence of", phene_label, "in", breed_label, "suggests it to be a model of disease", i + ".")) self.gu.addDescription(self.g, aid, desc) return def _process_lida_links_row(self, row): # lidaurl, omia_id, added_by omia_id = 'OMIA:'+row['omia_id'] lidaurl = row['lidaurl'] if self.testMode and omia_id not in self.test_ids['disease']: return self.gu.addXref(self.g, omia_id, lidaurl, True) return def _process_phene_gene_row(self, row): gene_id = self.id_hash['gene'].get(row['gene_id']) phene_id = self.id_hash['phene'].get(row['phene_id']) omia_id = self._get_omia_id_from_phene_id(phene_id) if self.testMode and not ( omia_id in self.test_ids['disease'] and row['gene_id'] in self.test_ids['gene']) or\ gene_id is None or phene_id is None: return # occasionally some phenes are missing! 
(ex: 406) if phene_id is None: logger.warning("Phene id %s is missing", str(row['phene_id'])) return gene_label = self.label_hash[gene_id] # some variant of gene_id has phenotype d vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL' if self.nobnodes: vl = ':'+vl self.geno.addAllele(vl, 'some variant of ' + gene_label) self.geno.addAlleleOfGene(vl, gene_id) assoc = G2PAssoc(self.name, vl, phene_id) assoc.add_association_to_graph(self.g) # add the gene id to the set of annotated genes # for later lookup by orthology self.annotated_genes.add(gene_id) return def _process_omia_omim_map(self, row): """ Links OMIA groups to OMIM equivalents. :param row: :return: """ # omia_id, omim_id, added_by omia_id = 'OMIA:'+row['omia_id'] omim_id = 'OMIM:'+row['omim_id'] # also store this for use when we say that a given animal is # a model of a disease if omia_id not in self.omia_omim_map: self.omia_omim_map[omia_id] = set() self.omia_omim_map[omia_id].add(omim_id) if self.testMode and omia_id not in self.test_ids['disease']: return self.gu.addXref(self.g, omia_id, omim_id) return def map_omia_group_category_to_ontology_id(self, category_num): """ Using the category number in the OMIA_groups table, map them to a disease id. This may be superceeded by other MONDO methods. Platelet disorders will be more specific once https://github.com/obophenotype/human-disease-ontology/issues/46 is fulfilled. 
:param category_num: :return: """ category_map = { 1: 'DOID:0014667', # Inborn error of metabolism 2: 'MESH:D004392', # Dwarfism 3: 'DOID:1682', # congenital heart disease 4: 'DOID:74', # blood system disease 5: 'DOID:3211', # lysosomal storage disease 6: 'DOID:16', # integumentary system disease # --> retinal degeneration ==> OMIA:000830 7: 'DOID:8466', # progressive retinal atrophy 8: 'DOID:0050572', # Cone–rod dystrophy 9: 'MESH:C536122', # stationary night blindness 10: 'Orphanet:98553', # developmental retinal disorder 11: 'DOID:5679', # retinal disorder 12: 'Orphanet:90771', # Disorder of Sex Development # - what to do about this one? 13: 'MP:0008762', # embryonic lethal # - not sure what to do with this 14: None, # blood group # FIXME make me more specific 15: 'DOID:2218', # intrinsic platelet disorder # FIXME make me more specific 16: 'DOID:2218', # extrinsic platelet disorder 17: None # transgenic ??? } disease_id = None if category_num is not None and int(category_num) in category_map: disease_id = category_map.get(int(category_num)) logger.info( "Found %s for category %s", str(disease_id), str(category_num)) else: logger.info( "There's a group category I don't know anything about: %s", str(category_num)) return disease_id def _process_group_mpo_row(self, row): """ Make OMIA to MP associations :param row: :return: """ omia_id = 'OMIA:'+row['omia_id'] mpo_num = int(row['MPO_no']) mpo_id = 'MP:'+str(mpo_num).zfill(7) assoc = D2PAssoc(self.name, omia_id, mpo_id) assoc.add_association_to_graph(self.g) return def clean_up_omim_genes(self): omim = OMIM() # get all the omim ids allomimids = set() for omia in self.omia_omim_map: allomimids.update(self.omia_omim_map[omia]) entries_that_are_phenotypes = omim.process_entries( list(allomimids), filter_keep_phenotype_entry_ids, None, None) logger.info( "Filtered out %d/%d entries that are genes or features", len(allomimids)-len(entries_that_are_phenotypes), len(allomimids)) # now iterate again and remove those 
non-phenotype ids removed_count = 0 for omia in self.omia_omim_map: ids = self.omia_omim_map[omia] cleanids = set() for i in ids: if i in entries_that_are_phenotypes: cleanids.add(i) else: removed_count += 1 # keep track of how many we've removed self.omia_omim_map[omia] = cleanids logger.info( "Removed %d omim ids from the omia-to-omim map", removed_count) return def _make_internal_id(self, prefix, key): iid = '_'+''.join(('omia', prefix, 'key', str(key))) if self.nobnodes: iid = ':'+iid return iid def make_breed_id(self, key): breed_id = 'OMIA-breed:'+str(key) return breed_id @staticmethod def _get_omia_id_from_phene_id(phene_id): omia_id = None if phene_id is not None: m = re.match(r'OMIA:\d+', str(phene_id)) if m: omia_id = m.group(0) return omia_id @staticmethod def _map_inheritance_term_id(inheritance_symbol): inherit_map = { 'A': None, # Autosomal 'ACD': 'GENO:0000143', # Autosomal co-dominant 'ADV': None, # autosomal dominant with variable expressivity 'AID': 'GENO:0000259', # autosomal incompletely dominant 'ASD': 'GENO:0000145', # autosomal semi-dominant # autosomal recessive, semi-lethal # using generic autosomal recessive 'ASL': 'GENO:0000150', 'D': 'GENO:0000147', # autosomal dominant 'M': None, # multifactorial 'MAT': None, # Maternal # probably autosomal recessive # using generic autosomal recessive 'PR': 'GENO:0000150', 'R': 'GENO:0000150', # Autosomal Recessive # Recessive Embryonic Lethal # using plain recessive 'REL': 'GENO:0000148', # Autosomal Recessive Lethal # using plain autosomal recessive 'RL': 'GENO:0000150', 'S': 'GENO:0000146', # Sex-linked <--using allosomal dominant 'SLi': None, # Sex-limited 'UD': 'GENO:0000144', # Dominant 'X': None, # x-linked # HP:0001417 ? 
# X-linked Dominant <-- temp using allosomal dominant FIXME 'XLD': 'GENO:0000146', # X-linked Recessive <-- temp using allosomal recessive FIXME 'XLR': 'GENO:0000149', 'Y': None, # Y-linked 'Z': None, # Z-linked # Z-linked recessive <-- temp using allosomal recessive FIXME 'ZR': 'GENO:0000149', '999': None, # Z-linked incompletely dominant } inheritance_id = inherit_map.get(inheritance_symbol) if inheritance_id is None and inheritance_symbol is not None: logger.warning( "No inheritance id is mapped for %s", inheritance_symbol) return inheritance_id def getTestSuite(self): import unittest from tests.test_omia import OMIATestCase test_suite = unittest.TestLoader().loadTestsFromTestCase(OMIATestCase) return test_suite
def _get_gene2pubmed(self, limit):
    """
    Loop through the gene2pubmed file and add a simple triple stating
    that a given publication is_about a gene.
    Publications are added as NamedIndividuals, typed as journal articles.

    In test mode the rows are filtered on the gene ids in self.gene_ids;
    otherwise they are filtered on the taxon ids in self.tax_ids.
    Rows in which the gene or the publication is the '-' placeholder
    are skipped.

    :param limit: when not None (and not in test mode), processing stops
        once more than this many rows have been accepted
    :return: None

    """

    gu = GraphUtils(curie_map.get())
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
    logger.info("FILE: %s", myfile)
    assoc_counter = 0
    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip blank lines and comments
            if line == '' or line.startswith('#'):
                continue
            (tax_num, gene_num, pubmed_num) = line.split('\t')

            # The '-' placeholder has to be tested before any int()
            # conversion below, otherwise int('-') raises ValueError
            # instead of the row being skipped.
            if gene_num == '-' or pubmed_num == '-':
                continue

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if not self.testMode and int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1
            gene_id = ':'.join(('NCBIGene', gene_num))
            pubmed_id = ':'.join(('PMID', pubmed_num))

            # genes are modelled as classes or individuals depending on
            # what was recorded for them in self.class_or_indiv
            if self.class_or_indiv.get(gene_id) == 'C':
                gu.addClassToGraph(g, gene_id, None)
            else:
                gu.addIndividualToGraph(g, gene_id, None)

            # add the publication as a NamedIndividual
            # add type publication
            gu.addIndividualToGraph(g, pubmed_id, None, None)
            r = Reference(
                pubmed_id, Reference.ref_types['journal_article'])
            r.addRefToGraph(g)
            gu.addTriple(
                g, pubmed_id, gu.object_properties['is_about'], gene_id)
            assoc_counter += 1

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    logger.info(
        "Processed %d pub-gene associations", assoc_counter)

    return
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features.  At the moment,
    RO:has_subsequence is the default relationship between the regions,
    but this should be tested/verified.

    A feature carries an id, label, type and description, plus optional
    start/stop locations.  Each location is a dict of the form
    {'coordinate': ..., 'reference': ..., 'type': [...]}.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        # IAO numeric ids are seven digits wide
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        # was GENO:0000626 (staining_intensity),
        # but changing to has_sequence_attribute
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # union of all of the property maps above
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        # big P for Position type.  little p for position property
        'Position': 'faldo:Position',
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        # FIXME - score is not a good solution, too generic
        'score': 'SO:0001685',
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, id, label, type, description=None):
        """
        :param id: identifier of the feature
        :param label: label of the feature (may be None)
        :param type: type identifier of the feature (may be None)
        :param description: optional description of the feature
        """
        self.id = id
        self.label = label
        self.type = type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        self.start = None
        self.stop = None
        # only filled in by addTaxonToFeature()
        self.taxon = None
        self.nobnodes = True  # TODO remove this before official release
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand: one of '+', '-', '.' or None
        :param position_types: optional list of position type ids
        :return: None
        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand: one of '+', '-', '.' or None
        :param position_types: optional list of position type ids
        :return: None
        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return: the location dict
        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        # an explicitly empty type list gets the generic Position type
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol onto the matching faldo position type.
        :param strand: '+', '-', '.' or None
        :return: the faldo type id, or None if the strand is None
            or could not be mapped (the latter is logged)
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def _makeRegionId(self):
        """
        Build the default region identifier from the start/stop locations.
        At least one of self.start / self.stop must be set; an unknown
        coordinate is written as 'UN'.
        :return: the region id, prefixed with ':' when self.nobnodes is set
        """

        # take the reference from whichever end is known, so a feature
        # that only has an end location does not fail here
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and \
                self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(
                self.start['type'])
        if self.stop is not None and \
                self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            # assume that the strand is the same for both start and stop,
            # so the stop is only consulted when the start gave no strand.
            # this will need to be fixed in the future
            if strand is None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])

        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items += [strand]
        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
        rid = '_'+rid+"-Region"
        if self.nobnodes:
            rid = ':'+rid

        return rid

    def addFeatureToGraph(
            self, graph, add_region=True, region_id=None,
            feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param add_region: when True (and a location is known), a separate
            region node is created and the feature is located on it;
            otherwise the feature itself is typed as the faldo:Region
        :param region_id: id to use for the region; generated from the
            start/stop locations when None
        :param feature_as_class: add the feature as a class
            rather than as an individual
        :return: None
        """

        if feature_as_class:
            self.gu.addClassToGraph(graph, self.id, self.label, self.type,
                                    self.description)
        else:
            self.gu.addIndividualToGraph(graph, self.id, self.label,
                                         self.type, self.description)

        # without any location there is nothing to build a region from
        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                region_id = self._makeRegionId()
            self.gu.addTriple(graph, self.id, self.properties['location'],
                              region_id)
            self.gu.addIndividualToGraph(
                graph, region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.gu.addType(graph, region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(graph, self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])
        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(graph, self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(graph, region_id, beginp, endp)

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position type ids
        :return: 'plus', 'minus' or 'both' according to the first strand
            type found in the list, or None if it holds no strand type
        """
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: the position id, or None (after logging an error)
            if there is no reference
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        i = '_'
        if self.nobnodes:
            i = ':'+i
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        i += reference
        if coordinate is not None:
            # just in case it isn't a string already
            i = '-'.join((i, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                i = '-'.join((i, tstring))

        return i

    def addRegionPositionToGraph(
            self, graph, region_id, begin_position_id, end_position_id):
        """
        Link the region to its begin and end positions.
        A position id of None is silently left out.
        :param graph:
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return: None
        """

        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['begin'],
                              begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['end'],
                              end_position_id)

        return

    def addPositionToGraph(
            self, graph, reference_id, position,
            position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.

        Triples:
        my_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return: Identifier of the position created
        """

        iid = self._makePositionId(reference_id, position, position_types)
        n = self.gu.getNode(iid)
        pos = self.gu.getNode(self.properties['position'])
        ref = self.gu.getNode(self.properties['reference'])
        if position is not None:
            graph.add((n, pos, Literal(position, datatype=XSD['integer'])))
        graph.add((n, ref, self.gu.getNode(reference_id)))
        if position_types is not None:
            for t in position_types:
                graph.add((n, RDF['type'], self.gu.getNode(t)))
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']
        if s is not None:
            graph.add((n, RDF['type'], self.gu.getNode(s)))

        return iid

    def addSubsequenceOfFeature(self, graph, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph:
        :param parentid:
        :return: None
        """

        self.gu.addTriple(
            graph, self.id, self.properties['is_subsequence_of'], parentid)
        self.gu.addTriple(
            graph, parentid, self.properties['has_subsequence'], self.id)

        return

    def addTaxonToFeature(self, graph, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return: None
        """

        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.gu.addTriple(
            graph, self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def loadAllProperties(self, graph):
        """
        Load the annotation, object and data properties of this class
        into the graph, each under its Assoc property kind.
        :param graph:
        :return: None
        """

        assoc = Assoc(None)
        prop_dict = {
            assoc.ANNOTPROP: self.annotation_properties,
            assoc.OBJECTPROP: self.object_properties,
            assoc.DATAPROP: self.data_properties
        }

        for prop_kind, props in prop_dict.items():
            self.gu.loadProperties(graph, props, prop_kind)

        return

    def addFeatureProperty(self, graph, property_type, property):
        """
        Add the triple: feature property_type property
        :param graph:
        :param property_type: the predicate
        :param property: the object of the triple
        :return: None
        """
        self.gu.addTriple(graph, self.id, property_type, property)
        return

    def setNoBNodes(self, nobnodes):
        """
        :param nobnodes: when True, generated internal ids are prefixed
            with ':' instead of being left as '_'-prefixed ids
        :return: None
        """
        self.nobnodes = nobnodes
        return
def _get_gene_info(self, limit):
    """
    Loop through the gene_info file and create the genes as classes,
    typed with SO.  It will add their label, any alternate labels as
    synonyms, and alternate ids as equivalent classes.
    HPRD xrefs get added as gene products.
    The chromosome gets added as a chromosome class, and the gene is
    made is_subsequence_of the chromosome band when the map location
    looks like a regular band on that chromosome, and otherwise
    is_subsequence_of the chromosome itself.  Genes listed on more than
    one chromosome (other than X|Y) get no location.

    Only the first 15 columns of each row are read, so files that carry
    additional trailing columns are accepted as well.

    :param limit: when not None (and not in test mode), processing stops
        once more than this many rows have been accepted
    :return: None

    """
    gu = GraphUtils(curie_map.get())

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    geno = Genotype(g)

    # not unzipping the file
    logger.info("Processing Gene records")
    line_counter = 0
    myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
    logger.info("FILE: %s", myfile)

    # Add taxa and genome classes for those in our filter
    for tax_num in self.tax_ids:
        tax_id = ':'.join(('NCBITaxon', str(tax_num)))
        # tax label can get added elsewhere
        geno.addGenome(tax_id, str(tax_num))
        # label added elsewhere
        gu.addClassToGraph(g, tax_id, None)

    # number of leading columns that are understood here
    expected_cols = 15
    # a "regular" band, such as 1p36, Xq28 or 12q13.3
    # TODO we probably need a different regex per organism
    band_pattern = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')
    related_synonym = Assoc.annotation_properties['hasRelatedSynonym']
    # xref prefixes that are skipped for now
    skipped_xref_prefixes = ('Vega', 'IMGT/GENE-DB')

    with gzip.open(myfile, 'rb') as f:
        for line in f:
            line = line.decode().strip()
            # skip blank lines and comments
            if line == '' or line.startswith('#'):
                continue
            fields = line.split('\t')
            if len(fields) < expected_cols:
                logger.warning(
                    "Skipping gene_info row with %d columns (expected %d)",
                    len(fields), expected_cols)
                continue
            (tax_num, gene_num, symbol, locustag,
             synonyms, xrefs, chrom, map_loc, desc,
             gtype, authority_symbol, name,
             nomenclature_status, other_designations,
             modification_date) = fields[:expected_cols]

            if self.testMode and int(gene_num) not in self.gene_ids:
                continue

            if int(tax_num) not in self.tax_ids:
                continue

            line_counter += 1

            gene_id = ':'.join(('NCBIGene', gene_num))
            tax_id = ':'.join(('NCBITaxon', tax_num))
            gene_type_id = self._map_type_of_gene(gtype)

            if symbol == 'NEWENTRY':
                label = None
            else:
                label = symbol

            # TODO might have to figure out if things aren't genes,
            # and make them individuals
            # we have to do special things here for genes,
            # because they're classes not individuals
            gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

            if name != '-':
                gu.addSynonym(g, gene_id, name)
            if synonyms.strip() != '-':
                for s in synonyms.split('|'):
                    gu.addSynonym(g, gene_id, s.strip(), related_synonym)
            if other_designations.strip() != '-':
                for s in other_designations.split('|'):
                    gu.addSynonym(g, gene_id, s.strip(), related_synonym)

            # deal with the xrefs
            # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|
            # HPRD:11479|Vega:OTTHUMG00000020696
            if xrefs.strip() != '-':
                for r in xrefs.strip().split('|'):
                    fixedr = self._cleanup_id(r)
                    if fixedr is None or fixedr.strip() == '':
                        continue
                    if fixedr.startswith('HPRD'):
                        # proteins are not == genes.
                        gu.addTriple(
                            g, gene_id,
                            self.properties['has_gene_product'], fixedr)
                    elif fixedr.split(':')[0] not in skipped_xref_prefixes:
                        gu.addEquivalentClass(g, gene_id, fixedr)

            # edge cases of id | symbol | chr | map_loc:
            # 263 AMD1P2 X|Y with Xq28 and Yq12
            # 438 ASMT X|Y with Xp22.3 or Yp11.3    # in PAR
            # 419 ART3 4 with 4q21.1|4p15.1-p14
            #   no idea why there's two bands listed - possibly 2 assemblies
            # 28227 PPP2R3B X|Y Xp22.33; Yp11.3     # in PAR
            # 619538 OMS 10|19|3 10q26.3;19q13.42-q13.43;3p25.3
            #   this is of "unknown" type == susceptibility
            # 101928066 LOC101928066 1|Un -         # unlocated scaffold
            # 11435 Chrna1 2 2 C3|2 43.76 cM        # mouse --> 2C3
            # 11548 Adra1b 11 11 B1.1|11 25.81 cM   # mouse --> 11B1.1
            # 11717 Ampd3 7 7 57.85 cM|7 E2-E3      # mouse
            # 14421 B4galnt1 10 10 D3|10 74.5 cM    # mouse
            # 323212 wu:fb92e12 19|20 -             # fish
            # 323368 ints10 6|18 -                  # fish
            # 323666 wu:fc06e02 11|23 -             # fish

            # feel that the chr placement can't be trusted in this table
            # when there is > 1 listed; with the exception of human X|Y,
            # i will only take those that align to one chr

            # FIXME remove the chr mapping below
            # when we pull in the genomic coords
            if chrom not in ('-', ''):
                if '|' in chrom and chrom != 'X|Y':
                    # this means that there's uncertainty in the mapping.
                    # only the location is left out; the taxon below is
                    # still recorded for the gene.
                    # TODO we'll need to figure out
                    # how to deal with >1 loc mapping
                    logger.info(
                        '%s is non-uniquely mapped to %s. Skipping for now.',
                        gene_id, chrom)
                else:
                    # X|Y     Xp22.33;Yp11.3
                    if chrom == 'X; Y':
                        # rewrite the PAR regions for processing
                        chrom = 'X|Y'
                    is_regular_band = band_pattern.match(map_loc) is not None
                    # do this in a loop to allow PAR regions like X|Y
                    for c in chrom.split('|'):
                        # assume that the chromosome label
                        # will get added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use the taxnum
                        # for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        gu.addSynonym(g, mychrom, mychrom_syn)

                        # the band only belongs to this chromosome if the
                        # map location starts with the chromosome name
                        # (otherwise, e.g. chr Y with Xp22.33, the id
                        # would be a meaningless YXp22.33)
                        if is_regular_band and map_loc.startswith(c):
                            # the generic location (no coordinates);
                            # map_loc already carries the chromosome
                            maploc_id = makeChromID(map_loc, tax_num, 'CHR')
                            # Assume it's type will be added elsewhere
                            band = Feature(maploc_id, None, None)
                            band.addFeatureToGraph(g)
                            # add the band as the containing feature
                            gu.addTriple(
                                g, gene_id,
                                Feature.object_properties[
                                    'is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases
                            # examples are: 15q11-q22, Xp21.2-p11.23,
                            # 15q22-qter, 10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,
                            # 1p13.3|1p21.3-p13.1, 12cen-q21,
                            # 22q13.3|22q13.3
                            logger.debug(
                                'not regular band pattern for %s: %s',
                                gene_id, map_loc)
                            # add the gene as a subsequence
                            # of the chromosome
                            gu.addTriple(
                                g, gene_id,
                                Feature.object_properties[
                                    'is_subsequence_of'],
                                mychrom)

            geno.addTaxon(tax_id, gene_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break

    gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
    gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
    gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    return
def _process_phenotype_data(self, limit):
    """
    Parse the catalog file and add strains, their alleles/genes,
    allele-to-phenotype associations and strain genotypes to the graph.

    NOTE: If a Strain carries more than one mutation,
    then each Mutation description,
    i.e., the set: (
        Mutation Type - Chromosome - Gene Symbol -
        Gene Name - Allele Symbol - Allele Name)
    will require a separate line.

    Note that MMRRC curates phenotypes to alleles,
    even though they distribute only one file with the
    phenotypes appearing to be associated with a strain.

    So, here we process the allele-to-phenotype relationships separately
    from the strain-to-allele relationships.

    The first pass reads the file row by row, filling
    ``self.strain_hash`` (strain id -> sets of variants and genes) and
    ``self.id_label_hash`` (id -> label).  The second pass walks
    ``self.strain_hash`` to build variant loci, VSLCs, GVCs and a
    genotype (with an unspecified background) for each strain.

    :param limit: maximum number of lines to read when not in test mode;
                  None means no limit
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    line_counter = 0
    gu = GraphUtils(curie_map.get())
    fname = '/'.join((self.rawdir, self.files['catalog']['file']))

    self.strain_hash = {}
    self.id_label_hash = {}
    genes_with_no_ids = set()
    stem_cell_class = 'CL:0000034'
    mouse_taxon = 'NCBITaxon:10090'
    geno = Genotype(g)

    with open(fname, 'r', encoding="utf8") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # skip the first 3 lines which are header, etc.
            if line_counter < 4:
                continue

            (strain_id, strain_label, strain_type_symbol, strain_state,
             mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
             mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
             mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
             research_areas) = row

            if self.testMode and (strain_id not in self.test_ids):
                continue

            # strip off stuff after the dash -
            # is the holding center important?
            # MMRRC:00001-UNC --> MMRRC:00001
            strain_id = re.sub(r'-\w+$', '', strain_id)

            self.id_label_hash[strain_id] = strain_label

            # get the variant or gene to save for later building of
            # the genotype
            if strain_id not in self.strain_hash:
                self.strain_hash[strain_id] = {
                    'variants': set(), 'genes': set()}

            # clean up the bad one
            if mgi_allele_id == 'multiple mutation':
                logger.error("Erroneous gene id: %s", mgi_allele_id)
                mgi_allele_id = ''

            if mgi_allele_id != '':
                self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                # use the following if needing to add the
                # sequence alteration types
                # var_type =
                #   self._get_variant_type_from_abbrev(mutation_type)
                # make a sequence alteration for this variant locus,
                # and link the variation type to it
                # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                # if self.nobnodes:
                #     sa_id = ':'+sa_id
                # gu.addIndividualToGraph(g, sa_id, None, var_type)
                # geno.addSequenceAlterationToVariantLocus(sa_id,
                #                                          mgi_allele_id)

            # scrub out any spaces
            mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
            if mgi_gene_id.strip() != '':
                if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                    # substitute with the same case-insensitivity used
                    # for detection, so e.g. 'GENEID:123' is rewritten too
                    mgi_gene_id = re.sub(
                        r'Gene\s*ID:\s*', 'NCBIGene:', mgi_gene_id,
                        flags=re.I)
                elif not re.match(r'MGI', mgi_gene_id):
                    logger.info("Gene id not recognized: %s", mgi_gene_id)
                    if re.match(r'\d+$', mgi_gene_id):
                        # assume that if it's all numbers, then it's MGI
                        mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                        logger.info("Assuming numerics are MGI.")
                self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

            # catch some errors -
            # some things have gene labels, but no identifiers - report
            if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                logger.error(
                    "Gene label with no identifier for strain %s: %s",
                    strain_id, mgi_gene_symbol)
                genes_with_no_ids.add(mgi_gene_symbol.strip())
                # make a temp id for genes that aren't identified
                # tmp_gene_id = '_'+mgi_gene_symbol
                # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

            # split apart the mp ids
            # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
            # mp_ids are now a comma delimited list
            # with MP terms in brackets
            phenotype_ids = []
            if mp_ids != '':
                for term in mp_ids.split(','):
                    mps = re.search(r'\[(.*)\]', term.strip())
                    if mps is not None:
                        phenotype_ids.append(mps.group(1).strip())

            # pubmed ids are space delimited; str.split() with no
            # separator drops leading/trailing whitespace so no empty
            # 'PMID:' reference is created
            pubmed_ids = []
            for num in pubmed_nums.split():
                pmid = 'PMID:'+num
                pubmed_ids.append(pmid)
                r = Reference(pmid, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)

            # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
            # is a good example of 4 genotype parts

            gu.addClassToGraph(g, mouse_taxon, None)
            if research_areas.strip() == '':
                research_areas = None
            else:
                research_areas = 'Research Areas: '+research_areas
            strain_type = mouse_taxon
            if strain_state == 'ES':
                strain_type = stem_cell_class
            gu.addIndividualToGraph(
                g, strain_id, strain_label, strain_type,
                research_areas)  # an inst of mouse??
            gu.makeLeader(g, strain_id)

            # phenotypes are associated with the alleles
            for pid in phenotype_ids:
                # assume the phenotype label is in the ontology
                gu.addClassToGraph(g, pid, None)
                if mgi_allele_id is not None and mgi_allele_id != '':
                    assoc = G2PAssoc(
                        self.name, mgi_allele_id, pid,
                        gu.object_properties['has_phenotype'])
                    for p in pubmed_ids:
                        assoc.add_source(p)
                    assoc.add_association_to_graph(g)
                else:
                    logger.info("Phenotypes and no allele for %s",
                                strain_id)

            if not self.testMode and (
                    limit is not None and line_counter > limit):
                break

    # now that we've collected all of the variant information, build it
    # we don't know their zygosities
    for s in self.strain_hash:
        h = self.strain_hash.get(s)
        variants = h['variants']
        genes = h['genes']
        vl_set = set()
        # make variant loci for each gene
        if len(variants) > 0:
            for v in variants:
                vl_id = v
                vl_symbol = self.id_label_hash[vl_id]
                geno.addAllele(vl_id, vl_symbol,
                               geno.genoparts['variant_locus'])
                vl_set.add(vl_id)
                if len(variants) == 1 and len(genes) == 1:
                    for gene in genes:
                        geno.addAlleleOfGene(vl_id, gene)
                else:
                    geno.addAllele(vl_id, vl_symbol)
        else:  # len(vars) == 0
            # it's just anonymous variants in some gene
            for gene in genes:
                vl_id = '_'+gene+'-VL'
                vl_id = re.sub(r':', '', vl_id)
                if self.nobnodes:
                    vl_id = ':'+vl_id
                vl_symbol = self.id_label_hash[gene]+'<?>'
                self.id_label_hash[vl_id] = vl_symbol
                geno.addAllele(vl_id, vl_symbol,
                               geno.genoparts['variant_locus'])
                geno.addGene(gene, self.id_label_hash[gene])
                geno.addAlleleOfGene(vl_id, gene)
                vl_set.add(vl_id)

        # make the vslcs
        vl_list = sorted(vl_set)
        vslc_list = []
        for vl in vl_list:
            # for unknown zygosity
            vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
            vslc_id = re.sub(r':', '', vslc_id)
            if self.nobnodes:
                vslc_id = ':' + vslc_id
            vslc_label = self.id_label_hash[vl] + '/?'
            self.id_label_hash[vslc_id] = vslc_label
            vslc_list.append(vslc_id)
            geno.addPartsToVSLC(
                vslc_id, vl, None, geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'], None)
            gu.addIndividualToGraph(
                g, vslc_id, vslc_label,
                geno.genoparts['variant_single_locus_complement'])

        if len(vslc_list) > 0:
            if len(vslc_list) > 1:
                gvc_id = '-'.join(vslc_list)
                gvc_id = re.sub(r':', '', gvc_id)
                if self.nobnodes:
                    gvc_id = ':'+gvc_id
                gvc_label = \
                    '; '.join(self.id_label_hash[v] for v in vslc_list)
                gu.addIndividualToGraph(
                    g, gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])
                for vslc_id in vslc_list:
                    geno.addVSLCtoParent(vslc_id, gvc_id)
            else:
                # the GVC == VSLC, so don't have to make an extra piece
                gvc_id = vslc_list.pop()
                gvc_label = self.id_label_hash[gvc_id]

            genotype_label = gvc_label + ' [n.s.]'
            bkgd_id = \
                '_' + re.sub(r':', '', '-'.join(
                    (geno.genoparts['unspecified_genomic_background'],
                     s)))
            # the genotype id is built from the background id *before*
            # the nobnodes prefix is applied to the background id
            genotype_id = '-'.join((gvc_id, bkgd_id))
            if self.nobnodes:
                bkgd_id = ':'+bkgd_id
            geno.addTaxon(mouse_taxon, bkgd_id)
            geno.addGenomicBackground(
                bkgd_id, 'unspecified ('+s+')',
                geno.genoparts['unspecified_genomic_background'],
                "A placeholder for the " +
                "unspecified genetic background for "+s)
            geno.addGenomicBackgroundToGenotype(
                bkgd_id, genotype_id,
                geno.genoparts['unspecified_genomic_background'])
            geno.addParts(
                gvc_id, genotype_id,
                geno.object_properties['has_alternate_part'])
            geno.addGenotype(genotype_id, genotype_label)
            gu.addTriple(
                g, s, geno.object_properties['has_genotype'],
                genotype_id)
        else:
            # logger.debug(
            #   "Strain %s is not making a proper genotype.", s)
            pass

    gu.loadProperties(
        g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
    gu.loadProperties(
        g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
    gu.loadProperties(
        g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
    gu.loadAllProperties(g)

    logger.warning(
        "The following gene symbols did not list identifiers: %s",
        str(sorted(genes_with_no_ids)))

    return
def _get_var_citations(self, limit):
    """
    Read the variant-citations file and link each citation to its
    ClinVar variant with an ``is_about`` triple.

    The source file is generated weekly, the first of the week.
    It is a tab-delimited report of citations associated with data in
    ClinVar, connected to the AlleleID, the VariationID, and either
    rs# from dbSNP or nsv in dbVar.  Columns:

      AlleleID         integer value as stored in the AlleleID field
                       in ClinVar (//Measure/@ID in the XML)
      VariationID      the identifier ClinVar uses to anchor its default
                       display (in the XML, //MeasureSet/@ID)
      rs               rs identifier from dbSNP
      nsv              nsv identifier from dbVar
      citation_source  the source of the citation, either PubMed,
                       PubMedCentral, or the NCBI Bookshelf
      citation_id      the identifier used by that source

    Only PubMed and PubMedCentral citations are added; rows from any
    other source are read but produce no triples.

    :param limit: maximum number of data lines to process when not in
                  test mode; None means no limit
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    logger.info("Processing Citations for variants")
    line_counter = 0
    myfile = \
        '/'.join((self.rawdir, self.files['variant_citations']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    with open(myfile, 'r', encoding="utf8") as f:
        filereader = csv.reader(f, delimiter='\t', quotechar='\"')
        for line in filereader:
            # skip blank rows (csv.reader yields [] for them, which
            # would otherwise fail on line[0]) and comment lines
            if not line or line[0].startswith('#'):
                continue
            (allele_num, variant_num, rs_num, nsv_num,
             citation_source, citation_id) = line

            line_counter += 1

            if self.testMode:
                if int(variant_num) not in self.variant_ids:
                    continue

            if citation_id.strip() == '':
                logger.info(
                    "Skipping blank citation for ClinVarVariant:%s",
                    str(variant_num))
                continue

            # the citation for a variant is made to some kind of
            # combination of the ids here, but we don't know what the
            # citation is for exactly, other than the variant.
            # so use mentions
            var_id = 'ClinVarVariant:'+variant_num

            # citation source: PubMed | PubMedCentral | citation_source
            # format the citation id:
            ref_id = None
            if citation_source == 'PubMed':
                ref_id = 'PMID:'+str(citation_id)
            elif citation_source == 'PubMedCentral':
                ref_id = 'PMCID:'+str(citation_id)

            if ref_id is not None:
                r = Reference(
                    ref_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)
                gu.addTriple(
                    g, ref_id, self.properties['is_about'], var_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    logger.info("Finished processing citations for variants")

    return
def _process_data(self, raw, limit=None):
    """
    This function will process the data files from Coriell.
    We make the assumption that any alleles listed are variants
    (alternates to w.t.)

    Triples: (examples)

    :NIGMSrepository a CLO_0000008 #repository
    label : NIGMS Human Genetic Cell Repository
    foaf:page
     https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

    line_id a CL_0000057,  #fibroblast line
        derives_from patient_id
        part_of :NIGMSrepository
        RO:model_of OMIM:disease_id

    patient id a foaf:person,
        label: "fibroblast from patient 12345 with disease X"
        member_of family_id  #what is the right thing here?
        SIO:race EFO:caucasian  #subclass of EFO:0001799
        in_taxon NCBITaxon:9606
        dc:description Literal(remark)
        RO:has_phenotype OMIM:disease_id
        GENO:has_genotype genotype_id

    family_id a owl:NamedIndividual
        foaf:page
         "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

    genotype_id a intrinsic_genotype
        GENO:has_alternate_part allelic_variant_id
        we don't necessarily know much about the genotype,
        other than the allelic variant. also there's the sex here

    pub_id mentions cell_line_id

    :param raw: path of the Coriell CSV file to read
    :param limit: maximum number of non-empty data rows to process when
                  not in test mode; None means no limit
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    gu = GraphUtils(curie_map.get())

    if self.testMode:      # set the graph to build
        g = self.testgraph
    else:
        g = self.graph

    line_counter = 0
    geno = Genotype(g)
    du = DipperUtil()

    gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
    gu.loadAllProperties(g)

    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            if not row:
                # blank rows are ignored and not counted
                continue
            line_counter += 1

            (catalog_id, description, omim_number, sample_type,
             cell_line_available, dna_in_stock, dna_ref, gender, age,
             race, ethnicity, affected, karyotype, relprob, mutation,
             gene, family_id, collection, url, cat_remark, pubmed_ids,
             family_member, variant_id, dbsnp_id, species) = row

            # example:
            # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
            # parent,,,39,NIGMS Human Genetic Cell Repository,
            # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
            # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
            # 2,,18343,Homo sapiens

            if self.testMode and catalog_id not in self.test_lines:
                # skip rows not in our test lines, when in test mode
                continue

            # ###########    BUILD REQUIRED VARIABLES    ###########

            # Make the cell line ID
            cell_line_id = 'Coriell:'+catalog_id.strip()

            # Map the cell/sample type
            cell_type = self._map_cell_type(sample_type)

            # Make a cell line label
            line_label = \
                collection.partition(' ')[0]+'-'+catalog_id.strip()

            # Map the repository/collection
            repository = self._map_collection(collection)

            # patients are uniquely identified by one of:
            # dbsnp id (which is == an individual haplotype)
            # family id + family member (if present) OR
            # probands are usually family member zero
            # cell line id
            # since some patients have >1 cell line derived from them,
            # we must make sure that the genotype is attached to
            # the patient, and can be inferred to the cell line
            # examples of repeated patients are:
            #   famid=1159, member=1; fam=152,member=1

            # Make the patient ID

            # make an anonymous patient
            patient_id = '_person'
            if self.nobnodes:
                patient_id = ':'+patient_id
            if family_id != '':
                patient_id = \
                    '-'.join((patient_id, family_id, family_member))
            else:
                # make an anonymous patient
                patient_id = '-'.join((patient_id, catalog_id.strip()))

            # properties of the individual patients:  sex, family id,
            # member/relproband, description descriptions are
            # really long and ugly SCREAMING text, so need to clean up
            # the control cases are so odd with this labeling scheme;
            # but we'll deal with it as-is for now.
            short_desc = (description.split(';')[0]).capitalize()
            if affected == 'Yes':
                affected = 'affected'
            elif affected == 'No':
                affected = 'unaffected'
            gender = gender.lower()
            patient_label = ' '.join((affected, gender, relprob))
            if relprob == 'proband':
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'with', short_desc))
            else:
                patient_label = \
                    ' '.join(
                        (patient_label.strip(), 'of proband with',
                         short_desc))

            # #############    BUILD THE CELL LINE    #############

            # Adding the cell line as a typed individual.
            cell_line_reagent_id = 'CLO:0000031'

            gu.addIndividualToGraph(
                g, cell_line_id, line_label, cell_line_reagent_id)

            # add the equivalent id == dna_ref
            if dna_ref != '' and dna_ref != catalog_id:
                equiv_cell_line = 'Coriell:'+dna_ref
                # some of the equivalent ids are not defined
                # in the source data; so add them
                gu.addIndividualToGraph(
                    g, equiv_cell_line, None, cell_line_reagent_id)
                gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

            # Cell line derives from patient
            geno.addDerivesFrom(cell_line_id, patient_id)
            geno.addDerivesFrom(cell_line_id, cell_type)

            # Cell line a member of repository
            gu.addMember(g, repository, cell_line_id)

            if cat_remark != '':
                gu.addDescription(g, cell_line_id, cat_remark)

            # Cell age_at_sampling
            # TODO add the age nodes when modeled properly in #78
            # if (age != ''):
            #    this would give a BNode that is an instance of Age.
            #    but i don't know how to connect
            #    the age node to the cell line? we need to ask @mbrush
            #    age_id = '_'+re.sub('\s+','_',age)
            #    gu.addIndividualToGraph(
            #        g,age_id,age,self.terms['age'])
            #    gu.addTriple(
            #        g,age_id,self.properties['has_measurement'],age,
            #        True)

            # #############    BUILD THE PATIENT    #############

            # Add the patient ID as an individual.
            gu.addPerson(g, patient_id, patient_label)
            # TODO map relationship to proband as a class
            # (what ontology?)

            # Add race of patient
            # FIXME: Adjust for subcategories based on ethnicity field
            # EDIT: There are 743 different entries for ethnicity...
            # Too many to map?
            # Add ethnicity as literal in addition to the mapped race?
            # Adjust the ethnicity txt (if using)
            # to initial capitalization to remove ALLCAPS

            # TODO race should go into the individual's background
            # and abstracted out to the Genotype class punting for now.
            # if race != '':
            #    mapped_race = self._map_race(race)
            #    if mapped_race is not None:
            #        gu.addTriple(
            #           g,patient_id,self.terms['race'],mapped_race)
            #        gu.addSubclass(
            #           g,self.terms['ethnic_group'],mapped_race)

            # #############    BUILD THE FAMILY    #############

            # Add triples for family_id, if present.
            if family_id != '':
                family_comp_id = 'CoriellFamily:'+family_id

                family_label = \
                    ' '.join(('Family of proband with', short_desc))

                # Add the family ID as a named individual
                gu.addIndividualToGraph(
                    g, family_comp_id, family_label,
                    geno.genoparts['family'])

                # Add the patient as a member of the family
                gu.addMemberOf(g, patient_id, family_comp_id)

            # #############    BUILD THE GENOTYPE   #############

            # the important things to pay attention to here are:
            # karyotype = chr rearrangements  (somatic?)
            # mutation = protein-level mutation as a label,
            # often from omim
            # gene = gene symbol - TODO get id
            # variant_id = omim variant ids (; delimited)
            # dbsnp_id = snp individual ids = full genotype?

            # note GM00633 is a good example of chromosomal variation
            # - do we have enough to capture this?
            # GM00325 has both abnormal karyotype and variation

            # make an assumption that if the taxon is blank,
            # that it is human!
            if species is None or species == '':
                species = 'Homo sapiens'
            taxon = self._map_species(species)

            # if there's a dbSNP id,
            # this is actually the individual's genotype
            genotype_id = None
            genotype_label = None
            if dbsnp_id != '':
                genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

            omim_map = {}
            gvc_id = None
            # reset per row so a label can never leak from an earlier row
            gvc_label = None

            # some of the karyotypes are encoded
            # with terrible hidden codes. remove them here
            # i've seen a <98> character
            karyotype = du.remove_control_characters(karyotype)
            karyotype_id = None
            if karyotype.strip() != '':
                karyotype_id = \
                    '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                if self.nobnodes:
                    karyotype_id = ':'+karyotype_id
                # add karyotype as karyotype_variation_complement
                gu.addIndividualToGraph(
                    g, karyotype_id, karyotype,
                    geno.genoparts['karyotype_variation_complement'])
                # TODO break down the karyotype into parts
                # and map into GENO. depends on #77

                # place the karyotype in a location(s).
                karyo_chrs = \
                    self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                for c in karyo_chrs:
                    chr_id = makeChromID(c, taxon, 'CHR')
                    # add an anonymous sequence feature,
                    # each located on chr
                    karyotype_feature_id = '-'.join((karyotype_id, c))
                    karyotype_feature_label = \
                        'some karyotype alteration on chr'+str(c)
                    f = Feature(
                        karyotype_feature_id, karyotype_feature_label,
                        geno.genoparts['sequence_alteration'])
                    f.addFeatureStartLocation(None, chr_id)
                    f.addFeatureToGraph(g)
                    f.loadAllProperties(g)
                    geno.addParts(
                        karyotype_feature_id, karyotype_id,
                        geno.object_properties['has_alternate_part'])

            # variant label: "gene(mutation)" when a gene symbol is
            # given.  Always (re)defined per row: with no gene we fall
            # back to the bare mutation text, rather than leaving the
            # name unbound or reusing the previous row's label.
            if gene != '':
                vl = gene+'('+mutation+')'
            else:
                vl = mutation.strip()

            # fix the variant_id so it's always in the same order
            vids = variant_id.split(';')
            variant_id = ';'.join(sorted(set(vids)))

            if karyotype.strip() != '' \
                    and not self._is_normal_karyotype(karyotype):
                mutation = mutation.strip()
                gvc_id = karyotype_id
                if variant_id != '':
                    gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                        + re.sub(r'\w*:', '', karyotype_id)
                if mutation.strip() != '':
                    gvc_label = '; '.join((vl, karyotype))
                else:
                    gvc_label = karyotype
            elif variant_id.strip() != '':
                gvc_id = '_' + variant_id.replace(';', '-')
                gvc_label = vl
            else:
                # wildtype?
                pass

            if gvc_id is not None and gvc_id != karyotype_id \
                    and self.nobnodes:
                gvc_id = ':'+gvc_id

            # add the karyotype to the gvc.
            # use reference if normal karyotype
            karyo_rel = geno.object_properties['has_alternate_part']
            if self._is_normal_karyotype(karyotype):
                karyo_rel = \
                    geno.object_properties['has_reference_part']
            if karyotype_id is not None \
                    and not self._is_normal_karyotype(karyotype) \
                    and gvc_id is not None and karyotype_id != gvc_id:
                geno.addParts(karyotype_id, gvc_id, karyo_rel)

            if variant_id.strip() != '':
                # split the variants & add them as part of the genotype
                # we don't necessarily know their zygosity,
                # just that they are part of the genotype variant ids
                # are from OMIM, so prefix as such we assume that the
                # sequence alts will be defined in OMIM not here
                # TODO sort the variant_id list, if the omim prefix is
                # the same, then assume it's the locus make a hashmap
                # of the omim id to variant id list;
                # then build the genotype hashmap is also useful for
                # removing the "genes" from the list of "phenotypes"

                # will hold gene/locus id to variant list
                omim_map = {}

                for v in variant_id.split(';'):
                    # handle omim-style and odd var ids
                    # like 610661.p.R401X
                    m = re.match(r'(\d+)\.+(.*)', v.strip())
                    if m is None:
                        # an id we cannot split into locus/variant;
                        # skip it rather than filing it under the
                        # previous locus (or failing on the first one)
                        logger.warning(
                            "Unparseable variant id '%s' for %s",
                            v, catalog_id.strip())
                        continue
                    (locus_num, var_num) = m.groups()
                    omim_map.setdefault(locus_num, []).append(var_num)

                for o in omim_map:
                    # gene_id = 'OMIM:' + o  # TODO unused
                    vslc_id = \
                        '_' + '-'.join(
                            [o + '.' + a for a in omim_map.get(o)])
                    if self.nobnodes:
                        vslc_id = ':'+vslc_id
                    vslc_label = vl
                    # we don't really know the zygosity of
                    # the alleles at all.
                    # so the vslcs are just a pot of them
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts[
                            'variant_single_locus_complement'])
                    for v in omim_map.get(o):
                        # this is actually a sequence alt
                        allele1_id = 'OMIM:'+o+'.'+v
                        geno.addSequenceAlteration(allele1_id, None)

                        # assume that the sa -> var_loc -> gene
                        # is taken care of in OMIM
                        geno.addPartsToVSLC(
                            vslc_id, allele1_id, None,
                            geno.zygosity['indeterminate'],
                            geno.object_properties[
                                'has_alternate_part'])

                    if vslc_id != gvc_id:
                        geno.addVSLCtoParent(vslc_id, gvc_id)

            if affected == 'unaffected':
                # let's just say that this person is wildtype
                gu.addType(g, patient_id, geno.genoparts['wildtype'])
            elif genotype_id is None:
                # make an anonymous genotype id
                genotype_id = '_geno'+catalog_id.strip()
                if self.nobnodes:
                    genotype_id = ':'+genotype_id

            # add the gvc
            if gvc_id is not None:
                gu.addIndividualToGraph(
                    g, gvc_id, gvc_label,
                    geno.genoparts['genomic_variation_complement'])

                # add the gvc to the genotype
                if genotype_id is not None:
                    if affected == 'unaffected':
                        rel = \
                            geno.object_properties[
                                'has_reference_part']
                    else:
                        rel = \
                            geno.object_properties[
                                'has_alternate_part']
                    geno.addParts(gvc_id, genotype_id, rel)
                if karyotype_id is not None \
                        and self._is_normal_karyotype(karyotype):
                    if gvc_label is not None and gvc_label != '':
                        genotype_label = \
                            '; '.join((gvc_label, karyotype))
                    else:
                        genotype_label = karyotype
                    if genotype_id is None:
                        genotype_id = karyotype_id
                    else:
                        geno.addParts(
                            karyotype_id, genotype_id,
                            geno.object_properties[
                                'has_reference_part'])
                else:
                    genotype_label = gvc_label
                    # use the catalog id as the background
                genotype_label += ' ['+catalog_id.strip()+']'

            if genotype_id is not None and gvc_id is not None:
                # only add the genotype if it has some parts
                geno.addGenotype(
                    genotype_id, genotype_label,
                    geno.genoparts['intrinsic_genotype'])
                geno.addTaxon(taxon, genotype_id)
                # add that the patient has the genotype
                # TODO check if the genotype belongs to
                # the cell line or to the patient
                gu.addTriple(
                    g, patient_id,
                    geno.properties['has_genotype'], genotype_id)
            else:
                geno.addTaxon(taxon, patient_id)

            # TODO: Add sex/gender  (as part of the karyotype?)

            # #############    DEAL WITH THE DISEASES   #############

            # we associate the disease to the patient
            if affected == 'affected':
                if omim_number != '':
                    for d in omim_number.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno
                            if d not in omim_map:
                                disease_id = 'OMIM:'+d.strip()
                                # assume the label is taken care of
                                gu.addClassToGraph(g, disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(
                                    self.name, patient_id, disease_id)
                                assoc.add_association_to_graph(g)

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                gu.addTriple(
                                    g, cell_line_id,
                                    gu.properties['model_of'],
                                    disease_id)
                            else:
                                logger.info(
                                    'removing %s from disease list ' +
                                    'since it is a gene', d)

            # #############    ADD PUBLICATIONS   #############

            if pubmed_ids != '':
                for s in pubmed_ids.split(';'):
                    pubmed_id = 'PMID:'+s.strip()
                    ref = Reference(pubmed_id)
                    ref.setType(Reference.ref_types['journal_article'])
                    ref.addRefToGraph(g)
                    gu.addTriple(
                        g, pubmed_id, gu.properties['mentions'],
                        cell_line_id)

            if not self.testMode \
                    and (limit is not None and line_counter > limit):
                break

    Assoc(self.name).load_all_properties(g)

    return
def _add_variant_protein_variant_assoc_to_graph(self, row):
    """
    Generates relationships between variants and protein variants
    given a row of data

    Only the first 11 columns of the row are read.  The variant is
    added as a sequence alteration, linked to its transcript and to
    any polypeptide curies derived from the transcript id.  For
    missense variants whose amino acid change matches
    ``p.<ref><position><alt>``, the reference and altered amino acids
    and the position on the polypeptide are added as well.

    :param iterable: row of data, see add_variant_info_to_graph()
                     docstring for expected structure
    :return None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)
    is_missense = False
    is_literal = True

    (variant_key, variant_label, amino_acid_variant,
     amino_acid_position, transcript_id, transcript_priority,
     protein_variant_type, functional_impact, stop_gain_loss,
     transcript_gene, protein_variant_source) = row[0:11]

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    transcript_curie = self._make_transcript_curie(transcript_id)
    uniprot_curie = self._make_uniprot_polypeptide_curie(transcript_id)
    ncbi_protein_curie = self._make_ncbi_polypeptide_curie(transcript_id)

    geno.addGenotype(variant_id, variant_label,
                     geno.genoparts['sequence_alteration'])

    # Make fake amino acid sequence in case we
    # can't get a CCDS to Uniprot and/or NCBI Protein mapping.
    # It is only kept when neither mapping below is available;
    # a real polypeptide curie is never replaced by the fake one.
    aa_seq_id = self.make_cgd_id(
        'transcript{0}'.format(amino_acid_variant))

    # Add Transcript:
    geno.addTranscript(variant_id, transcript_curie, transcript_id,
                       geno.genoparts['transcript'])

    # Add polypeptide
    if ncbi_protein_curie is not None:
        geno.addPolypeptide(
            ncbi_protein_curie,
            self.transcript_xrefs['RefSeq'][transcript_id],
            transcript_curie)
        aa_seq_id = ncbi_protein_curie
    if uniprot_curie is not None:
        geno.addPolypeptide(
            uniprot_curie,
            self.transcript_xrefs['UniProt'][transcript_id],
            transcript_curie)
        # Overrides ncbi_protein_curie,
        # but we set them as equal individuals below
        aa_seq_id = uniprot_curie

    if ncbi_protein_curie is not None and uniprot_curie is not None:
        gu.addSameIndividual(
            self.graph, ncbi_protein_curie, uniprot_curie)

    if protein_variant_type == 'nonsynonymous - missense' \
            or re.search(r'missense', variant_label):
        is_missense = True
        geno.addGenotype(variant_id, variant_label,
                         geno.genoparts['missense_variant'])

    # Get gene ID from gene map
    self._add_variant_gene_relationship(variant_id, transcript_gene)

    amino_acid_regex = re.compile(
        r'^p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})$')

    # the amino acid change is only parsed for missense variants
    if is_missense:
        match = amino_acid_regex.match(amino_acid_variant.rstrip())
    else:
        match = None

    if match is not None:
        ref_amino_acid = match.group(1)
        position = match.group(2)
        altered_amino_acid = match.group(3)
    else:
        logger.debug(
            "Could not parse amino acid information"
            " from %s variant:"
            " %s type: %s",
            amino_acid_variant, variant_label, protein_variant_type)

    # Add amino acid change to model
    if is_missense is True and match is not None:
        gu.addTriple(self.graph, variant_id,
                     geno.properties['reference_amino_acid'],
                     ref_amino_acid, is_literal)
        gu.addTriple(self.graph, variant_id,
                     geno.properties['results_in_amino_acid_change'],
                     altered_amino_acid, is_literal)

        aa_region_id = ":_{0}{1}{2}Region".format(
            position, position, aa_seq_id)
        self._add_feature_with_coords(
            variant_id, position, position, aa_seq_id, aa_region_id)

    return
class Environment:
    """
    These methods provide convenient methods to add items related to
    an experimental environment and its parts to a supplied graph.

    This is a stub ready for expansion.
    """

    # special environment parts mapped to the ontology classes
    # that we explicitly reference here
    environment_parts = {
        'environmental_system': 'ENVO:01000254',
        'environmental_condition': 'XCO:0000000',
        # misspelled key kept for existing callers
        'morpholio_reagent': 'REO:0000042',
        # correctly spelled alias of the entry above
        'morpholino_reagent': 'REO:0000042',
        'talen_reagent': 'REO:0001022',
        'crispr_reagent': 'REO:crispr_TBD'
    }

    object_properties = {
        'has_part': 'BFO:0000051',
    }

    annotation_properties = {
    }

    # union of the object and annotation properties
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        :param graph: the graph that all add* methods write into
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        """
        Add an environment individual to the graph.

        :param env_id: identifier of the environment
        :param env_label: label of the environment
        :param env_type: class of the environment; defaults to
                         environment_parts['environmental_system']
        :param env_description: optional description
        :return: None
        """
        if env_type is None:
            env_type = self.environment_parts['environmental_system']

        self.gu.addIndividualToGraph(
            self.graph, env_id, env_label, env_type, env_description)

        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None,
            cond_description=None):
        """
        Add an environmental condition individual to the graph.

        :param cond_id: identifier of the condition
        :param cond_label: label of the condition
        :param cond_type: class of the condition; defaults to
                          environment_parts['environmental_condition']
        :param cond_description: optional description
        :return: None
        """
        if cond_type is None:
            cond_type = self.environment_parts['environmental_condition']

        self.gu.addIndividualToGraph(
            self.graph, cond_id, cond_label, cond_type, cond_description)

        return

    def addComponentToEnvironment(self, env_id, component_id):
        """
        Link a component to an environment with has_part.

        :param env_id: identifier of the environment
        :param component_id: identifier of the component
        :return: None
        """
        # use this class's own has_part (the one loaded in __init__)
        # instead of depending on GraphUtils defining the same key
        self.gu.addTriple(
            self.graph, env_id,
            self.object_properties['has_part'],
            component_id)

        return

    def addComponentAttributes(
            self, component_id, entity_id, value=None, unit=None):
        """
        Link an entity to a component with has_part.

        :param component_id: identifier of the component
        :param entity_id: identifier of the entity
        :param value: currently unused
        :param unit: currently unused
        :return: None
        """
        self.gu.addTriple(
            self.graph, component_id,
            self.object_properties['has_part'],
            entity_id)
        # TODO add value and units

        return
def _add_variant_cdna_variant_assoc_to_graph(self, row):
    """
    Generates relationships between variants and cDNA variants
    given a row of data

    The variant is linked to its gene, placed on the chromosome
    instance of the given genome build and on its transcript, annotated
    with the reference and altered nucleotide, and finally re-identified
    by its COSMIC and/or dbSNP identifier when one is supplied.

    :param iterable: row of data, see add_variant_info_to_graph()
                     docstring for expected structure.
                     Only applicable for structure 2.
    :return None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)
    is_literal = True

    (variant_key, variant_label, amino_acid_variant,
     amino_acid_position, transcript_id, transcript_priority,
     protein_variant_type, functional_impact, stop_gain_loss,
     transcript_gene, protein_variant_source, variant_gene, bp_pos,
     variant_cdna, cosmic_id, db_snp_id, genome_pos_start,
     genome_pos_end, ref_base, variant_base,
     primary_transcript_exons, primary_transcript_variant_sub_types,
     variant_type, chromosome, genome_build, build_version,
     build_date) = row

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    # Add gene
    self._add_variant_gene_relationship(variant_id, variant_gene)

    # Transcript reference for nucleotide position
    transcript_curie = self._make_transcript_curie(transcript_id)

    # Make region IDs
    cdna_region_id = ":_{0}Region".format(transcript_curie)
    chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    # Add the genome build
    genome_label = "Human"
    build_id = "UCSC:{0}".format(genome_build)
    taxon_id = 'NCBITaxon:9606'
    geno.addGenome(taxon_id, genome_label)
    geno.addReferenceGenome(build_id, genome_build, taxon_id)

    # Add chromosome
    # the chrom class (generic) id
    chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
    chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')

    # first, add the chromosome class (in the taxon)
    geno.addChromosomeClass(chromosome, taxon_id, 'Human')

    # then, add the chromosome instance (from the given build)
    geno.addChromosomeInstance(
        chromosome, build_id, genome_build, chrom_class_id)

    # Add variant coordinates in reference to chromosome
    self._add_feature_with_coords(
        variant_id, genome_pos_start, genome_pos_end,
        chrom_instance_id, chrom_region_id)

    # Add mutation coordinates in reference to gene
    self._add_feature_with_coords(
        variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

    # Add nucleotide mutation
    gu.addTriple(self.graph, variant_id,
                 geno.properties['reference_nucleotide'],
                 ref_base, is_literal)
    gu.addTriple(self.graph, variant_id,
                 geno.properties['altered_nucleotide'],
                 variant_base, is_literal)

    # Here we update any internal cgd variant IDS with a cosmic ID
    # or dbSNP ID.  Alternatively we could do this using sql rather
    # than a sparql update which may be safer

    # Add SNP xrefs
    cosmic_curies = []
    if cosmic_id is not None:
        for raw_cosmic in cosmic_id.split(', '):
            curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', raw_cosmic)
            cosmic_curies.append(curie)
            gu.addIndividualToGraph(
                self.graph, curie, raw_cosmic,
                geno.genoparts['missense_variant'])

        # If there are multiple ids set them equivalent to the first
        primary_cosmic = cosmic_curies[0]
        for other_cosmic in cosmic_curies[1:]:
            gu.addSameIndividual(
                self.graph, primary_cosmic, other_cosmic)

        self._replace_entity(
            self.graph, variant_id, primary_cosmic, self.bindings)

    if db_snp_id is not None:
        db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
        gu.addIndividualToGraph(
            self.graph, db_snp_curie, db_snp_id,
            geno.genoparts['missense_variant'])

        if cosmic_id is None:
            # no COSMIC id took over the variant, so dbSNP does
            self._replace_entity(
                self.graph, variant_id, db_snp_curie, self.bindings)
        else:
            # every COSMIC id is the same individual as the dbSNP id
            for curie in cosmic_curies:
                gu.addSameIndividual(self.graph, curie, db_snp_curie)

    return