def _process_collection(self, collection_id, label, page):
    """
    Record a Coriell collection (the repository) using the data
    supplied internally about it.

    The same statements are written to both self.graph and
    self.testgraph.

    Triples:
        Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

    :param collection_id: local identifier; appended to the
        'CoriellCollection:' prefix to form the repository id
    :param label: label given to the repository individual
    :param page: page attached to the repository id
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    # Neither the id nor its type depends on the target graph.
    repo_id = 'CoriellCollection:' + collection_id
    collection_type = self.globaltt['collection']

    # TODO: How to devise a label for each repository?
    for graph in (self.graph, self.testgraph):
        model = Model(graph)
        reference = Reference(graph)
        model.addIndividualToGraph(repo_id, label, collection_type)
        reference.addPage(repo_id, page)
def _process_collection(self, collection_id, label, page):
    """
    Process the internally supplied description of a Coriell
    repository and write it to the main graph and the test graph.

    Triples:
        Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

    :param collection_id: suffix joined to 'CoriellCollection:' to
        build the repository identifier
    :param label: repository label
    :param page: repository page
    :return: None
    """
    # ############# BUILD THE CELL LINE REPOSITORY #############
    targets = [self.graph, self.testgraph]
    for target in targets:
        # TODO: How to devise a label for each repository?
        model = Model(target)
        reference = Reference(target)
        identifier = 'CoriellCollection:' + collection_id
        model.addIndividualToGraph(
            identifier, label, self.globaltt['collection'])
        reference.addPage(identifier, page)
    return None
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data.

    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Turtle:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            oboInOwl:has_related_synonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    TEC_note: URL are not literals.

    :param raw: path of the tab-delimited file to read; its first row
        is consumed as the header
    :param limit: optional; reading stops after the first row for which
        the csv reader's line number (header included) exceeds it
    :return: None
    :raises StopIteration: if the file has no header row
    :raises ValueError: if an expected column name is missing
        from the configured column list
    """
    src_key = 'tables'
    model = Model(self.graph)
    col = self.resources[src_key]['columns']

    # Columns of the table that are read here; the remaining ones
    # (morphology_term_num, terminology_category_*, subcategory,
    # e_uid, v_uid, v_uuid, v_lastmodified, v_status,
    # v_lastmodified_epoch) are currently unused.
    used_columns = (
        'morphology_term_id', 'morphology_term_label',
        'morphology_term_url', 'objective_definition',
        'subjective_definition', 'comments', 'synonyms', 'replaces',
        'small_figure_url', 'large_figure_url')
    # Resolve each column position once instead of once per cell.
    at = {name: col.index(name) for name in used_columns}

    def as_sentence(text):
        # str.endswith() is used instead of a regex such as r'.+\.$':
        # '.' does not match a newline, so multi-line text that already
        # ended with a period used to get a second one appended.
        if text and not text.endswith('.'):
            return text + '.'
        return text

    def split_terms(delimited):
        # ''.split(';') is [''], so empty pieces must be dropped
        # explicitly or they end up as empty synonyms.
        stripped = (piece.strip() for piece in delimited.split(';'))
        return [piece for piece in stripped if piece]

    with open(raw, 'r') as rawread:
        reader = csv.reader(rawread, delimiter='\t', quotechar='\"')
        row = next(reader)
        # The outcome of the header check is deliberately not acted on:
        # a mismatch does not stop processing.
        self.check_fileheader(col, row)

        for row in reader:
            morphology_term_id = row[at['morphology_term_id']].strip()
            morphology_term_label = row[
                at['morphology_term_label']].strip()
            morphology_term_url = row[at['morphology_term_url']].strip()
            objective_definition = row[
                at['objective_definition']].strip()
            subjective_definition = row[
                at['subjective_definition']].strip()
            comments = row[at['comments']].strip()
            synonyms = row[at['synonyms']].strip()
            replaces = row[at['replaces']].strip()
            small_figure_url = row[at['small_figure_url']].strip()
            large_figure_url = row[at['large_figure_url']].strip()

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text; the strip() removes the
            # stray blank left behind when either half is empty.
            definition = ' '.join((
                as_sentence(objective_definition),
                as_sentence(subjective_definition))).strip()
            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction
            # do we want both images?
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            if comments != '':
                model.addComment(morphology_term_id, comments)

            for syn in split_terms(synonyms):
                model.addSynonym(
                    morphology_term_id, syn,
                    self.globaltt['has_exact_synonym'])

            # morphology_term_id has_related_synonym replaces (; delimited)
            # skipped when it merely repeats the synonyms cell
            if replaces not in ['', synonyms]:
                for syn in split_terms(replaces):
                    model.addSynonym(
                        morphology_term_id, syn,
                        self.globaltt['has_related_synonym'])

            # <morphology_term_id> <foaf:page> morphology_term_url
            # (the id is always a str after strip(), never None)
            reference = Reference(
                self.graph, morphology_term_id, self.globaltt['web page'])
            # TEC 201905:
            # Not so sure we need explicit <eom_uri> <webpage> <eom_url>.
            # since <eom_uri> IS the <eom_url>.
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and reader.line_num > limit:
                break
def _get_process_allelic_variants(self, entry, g):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    For every item of entry['allelicVariantList'] the nested
    'allelicVariant' record is read:

    * status 'live': the variant is added as an allele of the entry,
      the references cited in its text (patterns like '{12:') are
      linked to it, its dbSNP ids are added as same-individuals, its
      ClinVar RCV accessions as xrefs, and its OMIM page is attached.
    * status containing 'moved' (covers 'moved' and 'removed'): the
      variant is added as deprecated, pointing at 'movedTo' if given.
    * anything else is logged as an error.

    :param entry: OMIM entry record (mapping), or None to do nothing
    :param g: graph the statements are added to
    :return: None
    """
    model = Model(g)
    reference = Reference(g)
    geno = Genotype(g)
    if entry is None:
        return

    entry_num = entry['mimNumber']
    entry_id = 'OMIM:' + str(entry_num)

    # process the ref list just to get the pmids
    # (presumably keyed by integer reference number -- see the lookup
    # below; confirm against _get_pubs)
    ref_to_pmid = self._get_pubs(entry, g)

    if 'allelicVariantList' not in entry:
        return

    for al in entry['allelicVariantList']:
        variant = al['allelicVariant']
        al_num = variant['number']
        al_anchor = str(al_num).zfill(4)
        al_id = entry_id + '.' + al_anchor
        status = variant['status']

        if status == 'live':
            al_label = variant.get('mutations')
            al_description = variant.get('text')

            # entry-specific publication mentions for this variant
            cited_refs = set()
            if al_description is not None:
                cited_refs = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, geno.genoparts['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, entry_id,
                geno.object_properties['is_sequence_variant_instance_of'])

            # look up the pubmed id in the list of references
            # NOTE: a cited number missing from ref_to_pmid still
            # raises KeyError, as before.
            for ref_num in cited_refs:
                pmid = ref_to_pmid[int(ref_num)]
                g.addTriple(
                    pmid, model.object_properties['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    dnum = dnum.strip()
                    if not dnum:
                        # avoid creating a bare 'dbSNP:' individual
                        continue
                    did = 'dbSNP:' + dnum
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            if 'clinvarAccessions' in variant:
                # clinvarAccessions triple semicolon delimited
                # each >1 like RCV000020059;;;
                for chunk in variant['clinvarAccessions'].split(';;;'):
                    found = re.match(r'(RCV\d+);*', chunk)
                    if found is None:
                        # re.match returns None for an empty or
                        # malformed chunk (e.g. after a trailing
                        # ';;;'); calling .group() on it used to
                        # abort the whole entry.
                        continue
                    model.addXref(al_id, 'ClinVar:' + found.group(1))

            reference.addPage(
                al_id,
                "http://omim.org/entry/" + str(entry_num) + "#" + al_anchor)

        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)
        else:
            logger.error('Uncaught alleleic variant status %s', status)
    # end loop allelicVariantList
    return
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.

    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-delimited file; its first line is
        skipped as the header
    :param limit: optional; reading stops after the first data row
        whose 1-based count exceeds it
    :return: None
    :raises ValueError: if a row does not have exactly 20 fields
    """
    model = Model(self.graph)
    # One Reference serves every row; it is only bound to the graph.
    reference = Reference(self.graph)

    def as_sentence(text):
        # Strip BEFORE testing for the period: testing the raw cell
        # turned 'foo. ' into 'foo..' and a blank cell into '.'.
        text = text.strip()
        if text and not text.endswith('.'):
            text += '.'
        return text

    def split_terms(delimited):
        # drop pieces that are empty once stripped (e.g. after 'a; ')
        stripped = (piece.strip() for piece in delimited.split(';'))
        return [piece for piece in stripped if piece]

    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        for line_counter, line in enumerate(filereader, start=1):
            # The unpacking doubles as a check on the column count.
            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = line

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text
            definition = ' '.join((
                as_sentence(objective_definition),
                as_sentence(subjective_definition))).strip()
            model.addDefinition(morphology_term_id, definition)

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction
            # do we want both images?
            if small_figure_url != '':
                model.addDepiction(morphology_term_id, small_figure_url)
            if large_figure_url != '':
                model.addDepiction(morphology_term_id, large_figure_url)

            # morphology_term_id has comment comments
            comment_text = comments.strip()
            if comment_text:
                model.addComment(morphology_term_id, comment_text)

            for syn in split_terms(synonyms):
                model.addSynonym(
                    morphology_term_id, syn,
                    model.annotation_properties['hasExactSynonym'])

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            # skipped when it merely repeats the synonyms cell
            if replaces != '' and replaces != synonyms:
                for syn in split_terms(replaces):
                    model.addSynonym(
                        morphology_term_id, syn,
                        model.annotation_properties['hasRelatedSynonym'])

            # morphology_term_id has page morphology_term_url
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return
def _get_process_allelic_variants(self, entry, graph):
    """
    Add the allelic variants listed in one OMIM entry to the graph.

    For every item of entry['allelicVariantList'] the nested
    'allelicVariant' record is read:

    * status 'live': the variant is added as an allele of the entry,
      the references cited in its text (patterns like '{12:') are
      linked to it, its dbSNP ids are added as same-individuals, its
      ClinVar RCV accessions as xrefs, and its OMIM page is attached.
    * status containing 'moved' (covers 'moved' and 'removed'): the
      variant is added as deprecated, pointing at 'movedTo' if given.
    * anything else is logged as an error.

    :param entry: OMIM entry record (mapping), or None to do nothing
    :param graph: graph the statements are added to
    :return: None
    """
    model = Model(graph)
    reference = Reference(graph)
    geno = Genotype(graph)
    if entry is None:
        return

    entry_num = entry['mimNumber']
    entry_id = 'OMIM:' + str(entry_num)

    # process the ref list just to get the pmids
    # (presumably keyed by integer reference number -- see the lookup
    # below; confirm against _get_pubs)
    ref_to_pmid = self._get_pubs(entry, graph)

    if 'allelicVariantList' not in entry:
        return

    for alv in entry['allelicVariantList']:
        variant = alv['allelicVariant']
        al_num = variant['number']
        al_anchor = str(al_num).zfill(4)
        al_id = entry_id + '.' + al_anchor
        status = variant['status']

        if status == 'live':
            al_label = variant.get('mutations')
            al_description = variant.get('text')

            # entry-specific publication mentions for this variant
            cited_refs = set()
            if al_description is not None:
                cited_refs = set(re.findall(r'\{(\d+)\:', al_description))

            geno.addAllele(
                al_id, al_label, self.globaltt['variant_locus'],
                al_description)
            geno.addAlleleOfGene(
                al_id, entry_id, self.globaltt['is_allele_of'])

            # look up the pubmed id in the list of references
            # NOTE: a cited number missing from ref_to_pmid still
            # raises KeyError, as before.
            for ref in cited_refs:
                pmid = ref_to_pmid[int(ref)]
                graph.addTriple(pmid, self.globaltt['is_about'], al_id)

            if 'dbSnps' in variant:
                for dnum in variant['dbSnps'].split(','):
                    dnum = dnum.strip()
                    if not dnum:
                        # avoid creating a bare 'dbSNP:' individual
                        continue
                    did = 'dbSNP:' + dnum
                    model.addIndividualToGraph(did, None)
                    model.addSameIndividual(al_id, did)

            # Note that RCVs are variant to disease associations
            # in ClinVar, rather than variant entries
            # so we make these xrefs instead of equivalents
            if 'clinvarAccessions' in variant:
                # clinvarAccessions triple semicolon delimited
                # each >1 like RCV000020059;;;
                for chunk in variant['clinvarAccessions'].split(';;;'):
                    # keep at most the first 12 characters
                    # in case of more cruft
                    rnum = chunk.strip()[:12]
                    if not rnum:
                        # an empty chunk (e.g. after a trailing ';;;')
                        # used to yield a bogus 'ClinVar:' xref
                        continue
                    model.addXref(al_id, 'ClinVar:' + rnum)

            reference.addPage(
                al_id,
                "http://omim.org/entry/" + '#'.join(
                    (str(entry_num), al_anchor)))

        elif 'moved' in status:
            # for both 'moved' and 'removed'
            moved_ids = None
            if 'movedTo' in variant:
                moved_ids = ['OMIM:' + variant['movedTo']]
            model.addDeprecatedIndividual(al_id, moved_ids)
        else:
            LOG.error('Uncaught alleleic variant status %s', status)
def _process_nlx_157874_1_view(self, raw, limit=None):
    """
    This table contains the Elements of Morphology data that has been
    screen-scraped into DISCO.

    Note that foaf:depiction is inverse of foaf:depicts relationship.

    Since it is bad form to have two definitions,
    we concatenate the two into one string.

    Triples:
        <eom id> a owl:Class
            rdf:label Literal(eom label)
            OIO:hasRelatedSynonym Literal(synonym list)
            IAO:definition Literal(objective_def. subjective def)
            foaf:depiction Literal(small_image_url),
                           Literal(large_image_url)
            foaf:page Literal(page_url)
            rdfs:comment Literal(long commented text)

    :param raw: path of the tab-delimited file; its first line is
        skipped as the header
    :param limit: optional; reading stops after the first data row
        whose 1-based count exceeds it
    :return: None
    :raises ValueError: if a row does not have exactly 20 fields
    """
    model = Model(self.graph)
    # Built once: the Reference depends only on the graph, not the row.
    reference = Reference(self.graph)
    exact_synonym = 'hasExactSynonym'
    related_synonym = 'hasRelatedSynonym'

    def close_sentence(text):
        # The cell is stripped first and only then checked for a final
        # period; checking the unstripped cell produced 'foo..' for
        # 'foo. ' and '.' for a whitespace-only cell.
        text = text.strip()
        if not text or text.endswith('.'):
            return text
        return text + '.'

    def add_synonyms(term_id, delimited, kind):
        # ';'-delimited list; pieces that are blank after stripping
        # are ignored rather than added as empty synonyms.
        for piece in delimited.split(';'):
            piece = piece.strip()
            if piece:
                model.addSynonym(
                    term_id, piece, model.annotation_properties[kind])

    with open(raw, 'r') as f1:
        f1.readline()  # read the header row; skip
        filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
        line_counter = 0
        for line in filereader:
            line_counter += 1
            # Unpacking all 20 columns also rejects malformed rows.
            # note:
            # e_uid v_uuid v_last_modified terminology_category_url
            # subcategory v_uid morphology_term_num
            # terminology_category_label hp_label notes
            # are currently unused.
            (morphology_term_id, morphology_term_num,
             morphology_term_label, morphology_term_url,
             terminology_category_label, terminology_category_url,
             subcategory, objective_definition, subjective_definition,
             comments, synonyms, replaces, small_figure_url,
             large_figure_url, e_uid, v_uid, v_uuid, v_last_modified,
             v_status, v_lastmodified_epoch) = line

            # Add morphology term to graph as a class
            # with label, type, and description.
            model.addClassToGraph(morphology_term_id, morphology_term_label)

            # Assemble the description text; the outer strip() drops
            # the blank left over when one of the halves is empty.
            halves = (
                close_sentence(objective_definition),
                close_sentence(subjective_definition))
            model.addDefinition(morphology_term_id, ' '.join(halves).strip())

            # <term id> FOAF:depicted_by literal url
            # <url> type foaf:depiction
            # do we want both images?
            for figure_url in (small_figure_url, large_figure_url):
                if figure_url != '':
                    model.addDepiction(morphology_term_id, figure_url)

            # morphology_term_id has comment comments
            comment_text = comments.strip()
            if comment_text:
                model.addComment(morphology_term_id, comment_text)

            add_synonyms(morphology_term_id, synonyms, exact_synonym)

            # morphology_term_id hasRelatedSynonym replaces (; delimited)
            # skipped when it merely repeats the synonyms cell
            if replaces != '' and replaces != synonyms:
                add_synonyms(morphology_term_id, replaces, related_synonym)

            # morphology_term_id has page morphology_term_url
            reference.addPage(morphology_term_id, morphology_term_url)

            if limit is not None and line_counter > limit:
                break
    return