def _process_article_row(self, row):
    """
    Add one article row to the graph as a Reference.

    The article's internal id is recorded in self.id_hash['article'] under
    the row's article_id. When the row carries a pubmed_id, that hash entry
    is overwritten with the PMID curie, the internal id and the PMID are
    passed to model.addSameIndividual, and the internal id (with any
    leading '_:' marker removed) is attached to the PMID as a comment.

    Nothing is done in test mode.

    :param row: mapping with the keys 'article_id', 'journal', 'title',
        'year' and 'pubmed_id'
    :return: None
    """
    # don't bother in test mode
    if self.testMode:
        return

    # Built only after the guard so test-mode calls do no graph work.
    model = Model(self.g)

    iarticle_id = self._make_internal_id('article', row['article_id'])
    self.id_hash['article'][row['article_id']] = iarticle_id

    # Only rows that name a journal are typed as journal articles.
    rtype = None
    if row['journal'] != '':
        rtype = Reference.ref_types['journal_article']
    reference = Reference(self.g, iarticle_id, rtype)

    if row['title'] is not None:
        reference.setTitle(row['title'].strip())
    if row['year'] is not None:
        reference.setYear(row['year'])
    reference.addRefToGraph()

    if row['pubmed_id'] is not None:
        pmid = 'PMID:' + str(row['pubmed_id'])
        # From here on the article is looked up by its PMID.
        self.id_hash['article'][row['article_id']] = pmid
        model.addSameIndividual(iarticle_id, pmid)
        model.addComment(pmid, iarticle_id.replace("_:", ''))

    return
def _process_article_row(self, row):
    """
    Turn one article row into a Reference in self.g.

    The generated internal id is stored in self.id_hash['article'] keyed by
    the row's article_id; if the row has a pubmed_id, the stored value is
    replaced by the PMID curie, and the internal id is passed together with
    the PMID to self.gu.addSameIndividual and self.gu.addComment.

    :param row: mapping with the keys 'article_id', 'journal', 'title',
        'year' and 'pubmed_id'
    :return: None
    """
    if self.testMode:
        # articles are not processed in test mode
        return None

    article_key = row['article_id']
    internal_id = self._make_internal_id('article', article_key)
    self.id_hash['article'][article_key] = internal_id

    # a non-empty journal field marks the record as a journal article
    ref_type = (
        Reference.ref_types['journal_article']
        if row['journal'] != '' else None)
    article_ref = Reference(internal_id, ref_type)

    title = row['title']
    if title is not None:
        article_ref.setTitle(title.strip())

    year = row['year']
    if year is not None:
        article_ref.setYear(year)

    article_ref.addRefToGraph(self.g)

    pubmed_id = row['pubmed_id']
    if pubmed_id is None:
        return None

    pmid = 'PMID:' + str(pubmed_id)
    self.id_hash['article'][article_key] = pmid
    self.gu.addSameIndividual(self.g, internal_id, pmid)
    self.gu.addComment(self.g, pmid, internal_id)
    return None
def _get_pubs(self, entry, g):
    """
    Extract mentioned publications from the reference list

    Every item of entry['referenceList'] is added to the graph as a
    Reference, and a 'mentions' triple is added from the OMIM entry to it.
    References that carry a 'pubmedID' are identified by a PMID curie and
    typed as journal articles. The others get a blank-node id built from
    the entry number and the reference number, and are given whatever
    title, author list and short citation the record provides.

    :param entry: OMIM entry dict; 'mimNumber' is required,
        'referenceList' is optional
    :param g: graph the references and triples are added to
    :return: dict mapping each referenceNumber to the publication id
        used for it in the graph (empty when there is no reference list)
    """
    ref_to_pmid = {}
    entry_num = entry['mimNumber']
    model = Model(g)

    if 'referenceList' not in entry:
        return ref_to_pmid

    # Subject of every 'mentions' triple; the same for all references.
    omim_id = 'OMIM:' + str(entry_num)

    for r in entry['referenceList']:
        record = r['reference']
        if 'pubmedID' in record:
            pub_id = 'PMID:' + str(record['pubmedID'])
            ref = Reference(
                g, pub_id, Reference.ref_types['journal_article'])
        else:
            # make blank node for internal reference
            pub_id = (
                '_:OMIM' + str(entry_num) + 'ref' +
                str(record['referenceNumber']))
            ref = Reference(g, pub_id)

            title = author_list = source = citation = None
            if 'title' in record:
                title = record['title']
                ref.setTitle(title)
            if 'authors' in record:
                author_list = record['authors']
                ref.setAuthorList(author_list)
                # Text before the first '.,' separator, i.e. first author.
                citation = re.split(r'\.\,', author_list)[0] + ' et al'
            if 'source' in record:
                source = record['source']

            # Drop only the missing (None) parts. An explicit identity test
            # is used because filtering with None.__ne__ depends on the
            # truth value of NotImplemented, which newer Pythons reject.
            citation = '; '.join(
                [tok for tok in (citation, title, source)
                 if tok is not None])
            ref.setShortCitation(citation)

        ref.addRefToGraph()
        ref_to_pmid[record['referenceNumber']] = pub_id

        # add is_about for the pub
        g.addTriple(omim_id, model.object_properties['mentions'], pub_id)

    return ref_to_pmid
def _get_pubs(self, entry, graph):
    """
    Collect the publications listed in an OMIM entry's reference list.

    Each listed reference is written to the graph as a Reference and
    linked from the OMIM entry with a 'mentions' triple. A reference with
    a 'pubmedID' is identified by its PMID curie and typed as a journal
    article; any other reference is given a blank-node id and receives
    the title, author list and short citation found in its record.

    :param entry: OMIM entry dict; 'mimNumber' is required,
        'referenceList' is optional
    :param graph: graph the references and triples are added to
    :return: dict mapping each referenceNumber to the publication id
        used for it in the graph
    """
    ref_to_pmid = {}
    entry_num = entry['mimNumber']

    if 'referenceList' not in entry:
        return ref_to_pmid

    for item in entry['referenceList']:
        details = item['reference']

        if 'pubmedID' in details:
            pub_id = 'PMID:{}'.format(details['pubmedID'])
            ref = Reference(graph, pub_id, self.globaltt['journal article'])
        else:
            # internal reference: identified by a blank node
            pub_id = '_:OMIM{}ref{}'.format(
                entry_num, details['referenceNumber'])
            ref = Reference(graph, pub_id)

            title = None
            author_list = None
            source = None
            citation = None

            if 'title' in details:
                title = details['title']
                ref.setTitle(title)

            if 'authors' in details:
                author_list = details['authors']
                ref.setAuthorList(author_list)
                lead_author = re.split(r'\.\,', author_list)[0]
                citation = lead_author + ' et al'

            if 'source' in details:
                source = details['source']

            # keep only the pieces that are actually present
            pieces = []
            for piece in (citation, title, source):
                if piece is not None:
                    pieces.append(piece)
            citation = '; '.join(pieces)
            ref.setShortCitation(citation)

        ref.addRefToGraph()
        ref_to_pmid[details['referenceNumber']] = pub_id

        # link the OMIM entry to the publication it mentions
        omim_id = 'OMIM:' + str(entry_num)
        graph.addTriple(omim_id, self.globaltt['mentions'], pub_id)

    return ref_to_pmid
def _get_pubs(self, entry, g):
    """
    Extract mentioned publications from the reference list

    Every item of entry['referenceList'] is added to the graph as a
    Reference, and a 'mentions' triple is added from the OMIM entry to it.
    References that carry a 'pubmedID' are identified by a PMID curie and
    typed as journal articles. The others get an internal id built from
    the entry number and the reference number (prefixed with ':' when
    self.nobnodes is set), and are given whatever title, author list and
    short citation the record provides.

    :param entry: OMIM entry dict; 'mimNumber' is required,
        'referenceList' is optional
    :param g: graph the references and triples are added to
    :return: dict mapping each referenceNumber to the publication id
        used for it in the graph (empty when there is no reference list)
    """
    ref_to_pmid = {}
    du = DipperUtil()
    entry_num = entry['mimNumber']
    gu = GraphUtils(curie_map.get())

    if 'referenceList' not in entry:
        return ref_to_pmid

    # Subject of every 'mentions' triple; the same for all references.
    omim_id = 'OMIM:' + str(entry_num)

    for r in entry['referenceList']:
        record = r['reference']
        if 'pubmedID' in record:
            pub_id = 'PMID:' + str(record['pubmedID'])
            ref = Reference(pub_id, Reference.ref_types['journal_article'])
        else:
            # make blank node for internal reference
            pub_id = (
                '_OMIM' + str(entry_num) + 'ref' +
                str(record['referenceNumber']))
            if self.nobnodes:
                pub_id = ':' + pub_id
            ref = Reference(pub_id)

            title = author_list = source = citation = None
            if 'title' in record:
                title = record['title']
                ref.setTitle(title)
            if 'authors' in record:
                author_list = record['authors']
                ref.setAuthorList(author_list)
                # Raw string: '\.' and '\,' are regex escapes, not string
                # escapes. Text before the first '.,' is the first author.
                citation = re.split(r'\.\,', author_list)[0] + ' et al'
            if 'source' in record:
                source = record['source']

            # NOTE(review): du.flatten presumably drops the None entries
            # before joining - confirm against DipperUtil.
            citation = '; '.join(du.flatten([citation, title, source]))
            ref.setShortCitation(citation)

        ref.addRefToGraph(g)
        ref_to_pmid[record['referenceNumber']] = pub_id

        # add is_about for the pub
        gu.addTriple(g, omim_id, gu.object_properties['mentions'], pub_id)

    return ref_to_pmid
def _process_allele_phenotype(self, limit):
    r"""
    Make allele to phenotype associations using derived_pheno_class
    and derived_pheno_manifest cvterm in the flybase db, an example entry is:

    FBal0257663    @FBcv0000351:lethal@ | @FBcv0000308:female limited@,
                   with @FBal0130657:Scer\GAL4<up>dome-PG14</up>@

    The first term is the phenotype, and all follow up terms are
    qualifiers, attached to the association with
    self.globaltt['has_qualifier']

    Our previous approach was to use the genotype id associated
    with FBal0257663/FBal0130657 , however, this required us to create
    blank nodes and was considered unnecessarily granular

    Note that sometimes identifiers do not exist for a term, eg
    @:heat sensitive | tetracycline conditional@

    derived_pheno_class - FBcv terms, these are phenotypes
    derived_pheno_manifest - Anatomy terms FBbt, we currently
    make phenotype IRI equivalents that end up in UPheno, but
    this is being developed and updated, see
    https://github.com/monarch-initiative/dipper/issues/770

    Rows are skipped when the allele is one of the foreign transgenic
    alleles, when no @id:label@ term is found in the description, when a
    derived_pheno_class row does not start with an FBcv term, and for all
    derived_pheno_manifest rows (not parsed for now).

    Adds triples to self.graph

    :param limit: number of rows to process
    :return: None
    :raises ValueError: if the first term id or a qualifier id cannot be
        split into a letter prefix and a number, or if the phenotype
        type is neither derived_pheno_class nor derived_pheno_manifest
    """
    src_key = 'allele_phenotype'
    raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
    LOG.info("processing allele phenotype associations")
    col = self.queries[src_key]['columns']

    transgenic_alleles = self._get_foreign_transgenic_alleles()

    # flybase terms - terms we prefix with FlyBase:
    fly_prefixes = ['FBal', 'FBti', 'FBab', 'FBba', 'FBtp']

    # an alphanumeric id followed by a colon then
    # any characters other than a colon or an @, bordered by @s
    term_regex = re.compile(r'@([\w]*):([^:@]*)@')
    # letters (the prefix) followed by digits (the local id)
    id_regex = re.compile(r'([a-zA-Z]+)(\d+)')

    # Column positions are the same for every row, so look them up once.
    allele_id_idx = col.index('allele_id')
    pheno_desc_idx = col.index('pheno_desc')
    pheno_type_idx = col.index('pheno_type')
    pub_id_idx = col.index('pub_id')
    pub_title_idx = col.index('pub_title')
    pmid_id_idx = col.index('pmid_id')

    with open(raw, 'r') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        row = next(reader)  # headers
        self.check_fileheader(col, row)

        for row in reader:
            allele_id = row[allele_id_idx]
            pheno_desc = row[pheno_desc_idx]
            pheno_type = row[pheno_type_idx]
            pub_id = row[pub_id_idx]
            pub_title = row[pub_title_idx]
            pmid_id = row[pmid_id_idx]

            # Don't get phenotypes for transgenic alleles
            if allele_id in transgenic_alleles:
                continue

            allele_curie = 'FlyBase:' + allele_id

            terms = term_regex.findall(pheno_desc)
            if not terms:
                LOG.warning(
                    'Could not @terms@ in description: %s', pheno_desc)
                continue

            # Only the ids are used below; the labels are discarded.
            term_ids, _term_labels = zip(*terms)

            id_match = id_regex.match(term_ids[0])
            if id_match is None:
                raise ValueError(
                    "Could not parse id {}".format(term_ids[0]))
            prefix, local_id = id_match.group(1, 2)

            # derived_pheno_class should all start with a FBcv term
            if pheno_type == 'derived_pheno_class' and prefix != 'FBcv':
                LOG.warning(
                    'derived_pheno_class does not '
                    'start with FBcv: %s', pheno_desc)
                continue

            # Create phenotype curie
            if pheno_type == 'derived_pheno_class':
                phenotype_curie = prefix + ':' + local_id
            elif pheno_type == 'derived_pheno_manifest':
                # These are not proper FBcv phenotype terms
                # but rather anatomical entities, go terms,
                # sometimes free text; skip parsing for now
                continue
            else:
                raise ValueError(
                    "Unexpected phenotype type: {}".format(pheno_type))

            # Prefer the PubMed id; fall back to the FlyBase publication.
            if pmid_id:
                ref_curie = 'PMID:' + pmid_id
            else:
                ref_curie = 'FlyBase:' + pub_id

            pub_ref = Reference(self.graph, ref_curie)
            pub_ref.setTitle(pub_title)
            pub_ref.addRefToGraph()

            assoc = G2PAssoc(
                self.graph, self.name, allele_curie, phenotype_curie,
                self.globaltt['has phenotype'])
            assoc.add_source(ref_curie)

            # Associations need to be disambiguated via their qualifiers
            # see http://flybase.org/reports/FBal0207398 as an example
            assoc.set_association_id(
                assoc.make_association_id(
                    self.name, allele_curie,
                    self.globaltt['has phenotype'],
                    phenotype_curie, term_ids[1:]))

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # add the rest as qualifiers
            for term in term_ids[1:]:
                if not term:
                    # There is not an id for a term,
                    # eg @:heat sensitive | tetracycline conditional@
                    continue

                # FBal, GO, FBti, FBab ...
                id_match = id_regex.match(term)
                if id_match is None:
                    raise ValueError(
                        "Could not parse id {}".format(term))

                prefix, local_id = id_match.group(1, 2)
                if prefix in fly_prefixes:
                    term_curie = 'FlyBase:' + term
                else:
                    term_curie = prefix + ':' + local_id

                self.graph.addTriple(
                    assoc_id, self.globaltt['has_qualifier'], term_curie)

            # Skipped rows never reach this check; the next processed
            # row does, because line_num keeps counting.
            if limit is not None and reader.line_num > limit:
                break