def make_triples(self, source, package):
    """
    Write the triples for a single drug record ("package") into self.graph.

    :param source: which upstream resource produced the package;
        either 'drugbank' or 'drugcentral'
    :param package: dict holding the parsed record; always keyed with
        'unii' (the drug curie) plus source-specific lists
        ('targets' for drugbank; 'indications' and 'interactions'
        for drugcentral)
    :return: None
    """
    mdl = Model(self.graph)
    drug_curie = package['unii']
    subclass_of = self.globaltt['subclass_of']
    molecular_entity = self.globaltt['molecular entity']

    if source == 'drugbank':
        for tgt in package['targets']:
            # drug --(action)--> protein target, with the action itself
            # declared a subproperty of molecularly_interacts_with
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=tgt['action'],
                obj=tgt['uniprot'])
            mdl.addLabel(subject_id=tgt['uniprot'], label=tgt['name'])
            mdl.addTriple(
                subject_id=tgt['uniprot'],
                predicate_id=subclass_of,
                obj=self.globaltt['polypeptide'])
            mdl.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=self.globaltt['equivalent_class'],
                obj=drug_curie)
            mdl.addTriple(
                subject_id=tgt['action'],
                predicate_id=self.globaltt['subPropertyOf'],
                obj=self.globaltt['molecularly_interacts_with'])
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=subclass_of,
                obj=molecular_entity)

    if source == 'drugcentral':
        for ind in package['indications']:
            # drug treats disease (snomed term)
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=self.globaltt['is substance that treats'],
                obj=ind['snomed_id'])
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=subclass_of,
                obj=molecular_entity)
            mdl.addTriple(
                subject_id=ind['snomed_id'],
                predicate_id=subclass_of,
                obj=self.globaltt['disease'])
            mdl.addLabel(
                subject_id=ind['snomed_id'], label=ind['snomed_name'])
        for ixn in package['interactions']:
            # drug molecularly interacts with a protein
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=self.globaltt['molecularly_interacts_with'],
                obj=ixn['uniprot'])
            mdl.addLabel(
                subject_id=ixn['uniprot'], label=ixn['target_name'])
            mdl.addTriple(
                subject_id=drug_curie,
                predicate_id=subclass_of,
                obj=molecular_entity)
            mdl.addDescription(
                subject_id=ixn['uniprot'], description=ixn['target_class'])
            mdl.addTriple(
                subject_id=ixn['uniprot'],
                predicate_id=subclass_of,
                obj=self.globaltt['polypeptide'])
    return
def process_gene_desc(self, limit):
    """
    Parse the gzipped, tab-separated WormBase gene-description file and
    attach definitions/descriptions to WormBase gene curies.

    :param limit: maximum number of data rows to process
        (ignored in test mode); None means no limit
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Gene descriptions")
    line_counter = 0
    # geno = Genotype(g)  # TODO unused
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # skip comment rows (joined row starts with '#')
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            # first non-comment row is the header
            if line_counter == 1:
                continue
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue
            gene_id = 'WormBase:' + gene_num
            # the concise description becomes the formal definition
            if concise_description != 'none available':
                model.addDefinition(gene_id, concise_description)
            # remove the description if it's identical to the concise
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for d in descs:
                text = descs.get(d)
                # skip descriptions that duplicate the definition,
                # start with 'none', or are empty
                if text == concise_description \
                        or re.match(r'none', text) or text == '':
                    pass  # don't use it
                else:
                    # tag each description with its flavor, e.g. "[detailed]"
                    text = ' '.join((text, '[' + d + ']'))
                    descs[d] = text
                    model.addDescription(gene_id, text)
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def process_gene_desc(self, limit):
    """
    Read the gzipped WormBase gene-description dump and add a definition
    plus flavor-tagged descriptions for each gene.

    :param limit: stop after this many data rows (None = no limit;
        not applied in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing Gene descriptions")
    line_counter = 0
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # comment rows are skipped outright
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue  # header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            if self.testMode and gene_num not in self.test_ids['gene']:
                continue
            gene_id = 'WormBase:' + gene_num
            # the concise description serves as the formal definition
            if concise_description != 'none available':
                model.addDefinition(gene_id, concise_description)
            # remaining flavors become plain descriptions, unless they
            # duplicate the concise text, begin with 'none', or are empty
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for flavor, text in descs.items():
                unusable = (
                    text == concise_description
                    or re.match(r'none', text)
                    or text == '')
                if not unusable:
                    text = ' '.join((text, '[' + flavor + ']'))
                    descs[flavor] = text
                    model.addDescription(gene_id, text)
            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def _process_straininfo(self, limit):
    """
    Parse the MPD strain-info CSV and model each strain as an individual
    of mouse (NCBITaxon:10090), with synonyms, vendor cross-references,
    and panel membership noted as a description.

    :param limit: unused in this method (kept for interface symmetry
        with the other _process_* methods)
    :return: None
    """
    # line_counter = 0  # TODO unused
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing measurements ...")
    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))
    tax_id = 'NCBITaxon:10090'
    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        f.readline()  # read the header row; skip
        for row in reader:
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
            # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
            # create the strain as an instance of the taxon
            if self.testMode and \
                    'MPD:' + str(mpd_strainid) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:' + str(mpd_strainid)
            model.addIndividualToGraph(strain_id, strain_name, tax_id)
            if mpdshortname.strip() != '':
                model.addSynonym(strain_id, mpdshortname.strip())
            # remember the label for later lookups by other methods
            self.idlabel_hash[strain_id] = strain_name
            # make it equivalent to the vendor+stock
            if stocknum != '':
                if vendor == 'J':
                    # Jackson Laboratory stock number
                    jax_id = 'JAX:' + stocknum
                    model.addSameIndividual(strain_id, jax_id)
                elif vendor == 'Rbrc':
                    # reiken
                    reiken_id = 'RBRC:' + re.sub(r'RBRC', '', stocknum)
                    model.addSameIndividual(strain_id, reiken_id)
                else:
                    # NOTE(review): url/vendor xrefs are only emitted for
                    # unrecognized vendors AND only when stocknum is
                    # non-empty — confirm strains without a stock number
                    # are intentionally left without any xref
                    if url != '':
                        model.addXref(strain_id, url, True)
                    if vendor != '':
                        model.addXref(
                            strain_id, ':'.join((vendor, stocknum)), True)
            # add the panel information
            if panel != '':
                desc = panel + ' [panel]'
                model.addDescription(strain_id, desc)
                # TODO make the panels as a resource collection
    return
def _process_straininfo(self, limit):
    """
    Load the MPD strain-info CSV, validating the header first, and emit
    each strain as a mouse individual with synonyms, vendor sameAs /
    xref links, and panel descriptions.

    :param limit: unused here; retained for a uniform _process_* signature
    :return: None
    """
    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)
    logger.info("Processing measurements ...")
    raw = '/'.join((self.rawdir, self.files['straininfo']['file']))
    tax_id = 'NCBITaxon:10090'
    with open(raw, 'r') as f:
        reader = csv.reader(f, delimiter=',', quotechar='\"')
        # consume the header line and verify it matches expectations
        self.check_header(self.files['straininfo']['file'], f.readline())
        for row in reader:
            (strain_name, vendor, stocknum, panel, mpd_strainid,
             straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
            # e.g. C57BL/6J,J,000664,,7,IN,225,17,,http://...
            if self.testMode and \
                    'MPD:' + str(mpd_strainid) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:' + str(mpd_strainid)
            # the strain is an instance of the mouse taxon
            model.addIndividualToGraph(strain_id, strain_name, tax_id)
            short_name = mpdshortname.strip()
            if short_name != '':
                model.addSynonym(strain_id, short_name)
            self.idlabel_hash[strain_id] = strain_name
            # tie the strain to the vendor's stock identifier
            if stocknum != '':
                if vendor == 'J':
                    model.addSameIndividual(strain_id, 'JAX:' + stocknum)
                elif vendor == 'Rbrc':
                    # RIKEN BRC; stock numbers may carry an RBRC prefix
                    model.addSameIndividual(
                        strain_id, 'RBRC:' + re.sub(r'RBRC', '', stocknum))
                else:
                    # unknown vendor: fall back to plain xrefs
                    if url != '':
                        model.addXref(strain_id, url, True)
                    if vendor != '':
                        model.addXref(
                            strain_id, ':'.join((vendor, stocknum)), True)
            # record panel membership as a description
            if panel != '':
                model.addDescription(strain_id, panel + ' [panel]')
                # TODO make the panels as a resource collection
    return
def process_gene_desc(self, limit):
    """
    Parse the gzipped WormBase gene-description file using the declared
    column layout and attach definitions/descriptions to gene curies.

    :param limit: maximum number of file lines to process
        (compared against reader.line_num); None means no limit
    :return: None
    """
    # currently unsupported
    src_key = 'gene_desc'
    raw = '/'.join((self.rawdir, self.files[src_key]['file']))
    LOG.info("Processing: %s", self.files[src_key]['file'])
    graph = self.graph
    model = Model(graph)
    # declared column ordering for this file; fields are fetched by name
    col = self.files[src_key]['columns']
    with gzip.open(raw, 'rb') as csvfile:
        reader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                            delimiter='\t', quotechar='\"')
        # consume the header row
        # NOTE(review): raises StopIteration on an empty file — confirm
        # upstream guarantees at least a header line
        row = next(reader)
        for row in reader:
            # skip comment rows
            if re.match(r'\#', ''.join(row)):
                continue
            gene_num = row[col.index('gene_num')]
            # public_name = row[col.index('public_name')]
            # molecular_name = row[col.index('molecular_name')]
            concise_description = row[col.index('concise_description')]
            provisional_description = row[col.index(
                'provisional_description')]
            detailed_description = row[col.index('detailed_description')]
            automated_description = row[col.index('automated_description')]
            gene_class_description = row[col.index(
                'gene_class_description')]
            gene_id = 'WormBase:' + gene_num
            # the concise description becomes the formal definition
            if concise_description not in ('none available', '', None):
                model.addDefinition(gene_id, concise_description)
            # remove the description if it's identical to the concise
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for d in descs:
                text = descs.get(d)
                # keep only texts that differ from the definition, do not
                # begin with 'none', and are non-empty
                if text != concise_description and \
                        text[:4] != 'none' and text != '':
                    # tag with the flavor, e.g. "[detailed]"
                    text = ' '.join((text, '[' + d + ']'))
                    descs[d] = text
                    model.addDescription(gene_id, text)
            if limit is not None and reader.line_num > limit:
                break
def make_triples(self, source, package):
    """
    Emit the triples for one drug record ("package") into self.graph,
    using hard-coded ontology curies.

    :param source: 'drugbank' or 'drugcentral'
    :param package: parsed record dict keyed with 'unii' plus
        source-specific lists ('targets' for drugbank;
        'indications'/'interactions' for drugcentral)
    :return: None
    """
    model = Model(self.graph)
    subclass_of = Model.object_properties['subclass_of']
    drug_id = package['unii']

    if source == 'drugbank':
        for target in package['targets']:
            # drug --(action)--> protein target
            model.addTriple(
                subject_id=drug_id,
                predicate_id=target['action'],
                obj=target['uniprot'])
            model.addLabel(
                subject_id=target['uniprot'], label=target['name'])
            # target protein is a polypeptide (SO:0000104)
            model.addTriple(
                subject_id=target['uniprot'],
                predicate_id=subclass_of,
                obj='SO:0000104')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=Model.object_properties['equivalent_class'],
                obj=drug_id)
            # the action predicate specializes
            # molecularly_interacts_with (RO:0002436)
            model.addTriple(
                subject_id=target['action'],
                predicate_id='rdfs:subPropertyOf',
                obj='RO:0002436')
            # drug is a molecular entity (CHEBI:23367)
            model.addTriple(
                subject_id=drug_id,
                predicate_id=subclass_of,
                obj='CHEBI:23367')

    if source == 'drugcentral':
        for indication in package['indications']:
            # RO:0002606: is substance that treats
            model.addTriple(
                subject_id=drug_id,
                predicate_id='RO:0002606',
                obj=indication['snomed_id'])
            model.addTriple(
                subject_id=drug_id,
                predicate_id=subclass_of,
                obj='CHEBI:23367')
            # indication term is a disease (DOID:4)
            model.addTriple(
                subject_id=indication['snomed_id'],
                predicate_id=subclass_of,
                obj='DOID:4')
            model.addLabel(
                subject_id=indication['snomed_id'],
                label=indication['snomed_name'])
        for interaction in package['interactions']:
            model.addTriple(
                subject_id=drug_id,
                predicate_id='RO:0002436',
                obj=interaction['uniprot'])
            model.addLabel(
                subject_id=interaction['uniprot'],
                label=interaction['target_name'])
            model.addTriple(
                subject_id=drug_id,
                predicate_id=subclass_of,
                obj='CHEBI:23367')
            model.addDescription(
                subject_id=interaction['uniprot'],
                description=interaction['target_class'])
            model.addTriple(
                subject_id=interaction['uniprot'],
                predicate_id=subclass_of,
                obj='SO:0000104')
    return
def process_gene_desc(self, limit):
    """
    Walk the gzipped WormBase gene-description dump and attach a
    definition and flavor-tagged descriptions to each gene curie.

    :param limit: stop once this many data rows have been seen
        (None = process everything)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_desc']['file']))
    model = Model(self.graph)
    LOG.info("Processing: %s", self.files['gene_desc']['file'])
    line_counter = 0
    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in filereader:
            # ignore comment rows
            if re.match(r'\#', ''.join(row)):
                continue
            line_counter += 1
            if line_counter == 1:
                continue  # header row
            (gene_num, public_name, molecular_name, concise_description,
             provisional_description, detailed_description,
             automated_description, gene_class_description) = row
            gene_id = 'WormBase:' + gene_num
            # concise description doubles as the formal definition
            if concise_description not in ('none available', '', None):
                model.addDefinition(gene_id, concise_description)
            # other flavors become descriptions unless they repeat the
            # definition, start with 'none', or are empty
            descs = {
                'provisional': provisional_description,
                'automated': automated_description,
                'detailed': detailed_description,
                'gene class': gene_class_description
            }
            for flavor, text in descs.items():
                usable = (
                    text != concise_description
                    and text[:4] != 'none'
                    and text != '')
                if usable:
                    text = ' '.join((text, '[' + flavor + ']'))
                    descs[flavor] = text
                    model.addDescription(gene_id, text)
            if limit is not None and line_counter > limit:
                break
def make_triples(self, source, package):
    """
    Add the triples for one drug record ("package") to self.graph.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict of the parsed record; keyed with 'unii' plus
        'targets' (drugbank) or 'indications'/'interactions'
        (drugcentral)
    :return: None
    """
    model = Model(self.graph)
    if source == 'drugbank':
        for target in package['targets']:
            # drug --(action)--> protein target
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=target['action'],
                obj=target['uniprot'])
            model.addLabel(
                subject_id=target['uniprot'], label=target['name'])
            # SO:0000104: polypeptide
            model.addTriple(
                subject_id=target['uniprot'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='SO:0000104')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=Model.object_properties['equivalent_class'],
                obj=package['unii'])
            # RO:0002436: molecularly interacts with
            model.addTriple(
                subject_id=target['action'],
                predicate_id='rdfs:subPropertyOf',
                obj='RO:0002436')
            # CHEBI:23367: molecular entity
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='CHEBI:23367')
    if source == 'drugcentral':
        for indication in package['indications']:
            # RO:0002606: is substance that treats
            model.addTriple(
                subject_id=package['unii'],
                predicate_id='RO:0002606',
                obj=indication['snomed_id'])
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='CHEBI:23367')
            # DOID:4: disease
            model.addTriple(
                subject_id=indication['snomed_id'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='DOID:4')
            model.addLabel(
                subject_id=indication['snomed_id'],
                label=indication['snomed_name'])
        for interaction in package['interactions']:
            model.addTriple(
                subject_id=package['unii'],
                predicate_id='RO:0002436',
                obj=interaction['uniprot'])
            # model.addLabel(
            #     subject_id=interaction['uniprot'],
            #     label='Protein_{}'.format(interaction['uniprot']))
            model.addLabel(
                subject_id=interaction['uniprot'],
                label=interaction['target_name'])
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='CHEBI:23367')
            model.addDescription(
                subject_id=interaction['uniprot'],
                description=interaction['target_class'])
            model.addTriple(
                subject_id=interaction['uniprot'],
                predicate_id=Model.object_properties['subclass_of'],
                obj='SO:0000104')
    return
def process_gaf(self, file, limit, id_map=None):
    """
    Process a gzipped GO Annotation File (GAF): add gene classes with
    labels/synonyms/taxa, gene-to-GO-term associations with evidence and
    sources, and — for IMP annotations with 'with' entries — derived
    genotype-to-phenotype associations via reagent-targeted genes.

    :param file: path to the gzipped GAF file
    :param limit: maximum number of rows to process outside test mode
        (None = no limit)
    :param id_map: optional dict of UniProtKB accession -> list of gene
        curies; UniProtKB rows are skipped when no 1:1 mapping exists
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0
    # helpers for building reagent-targeted gene ids; only bound when
    # the corresponding taxon is configured
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    elif 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (db == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # BUGFIX: pass the row text as a logging argument rather
                # than concatenating it into the format string, where a
                # stray '%' in the data raised a formatting error
                logger.error(
                    "Missing required part of annotation on row %d:\n%s",
                    line_counter, '\t'.join(row))
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # BUGFIX: guard id_map before dereferencing; previously
                # id_map.get() was called first, raising AttributeError
                # whenever id_map was left at its default of None
                mapped_ids = id_map.get(gene_num) \
                    if id_map is not None else None
                if mapped_ids is not None:
                    if len(mapped_ids) == 1:
                        gene_id = mapped_ids[0]
                        uniprotid = ':'.join((db, gene_num))
                        gene_num = re.sub(r'\w+\:', '', gene_id)
                    elif len(mapped_ids) > 1:
                        # skip accessions mapped to more than one gene
                        continue
                else:
                    continue
            elif db == 'MGI':
                # normalize doubled MGI prefixes
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))

            if self.testMode and not (
                    re.match(r'NCBIGene', gene_id)
                    and int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(
                    ">1 taxon (%s) on line %d. skipping",
                    taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)
            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)

            refs = re.split(r'\|', ref)
            for r in refs:
                r = r.strip()
                if r != '':
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    ref = Reference(g, r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        ref.setType(ref_type)
                    ref.addRefToGraph()
                    assoc.add_source(r)
            # TODO add the source of the annotations from assigned by?

            # GAF aspect -> relationship predicate
            aspect_rel_map = {
                'P': model.object_properties['involved_in'],  # involved in
                'F': model.object_properties['enables'],      # enables
                'C': model.object_properties['part_of']       # part of
            }
            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            assoc.add_association_to_graph()

            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        logger.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)
                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, i, phenotypeid)
                    for r in refs:
                        r = r.strip()
                        if r != '':
                            prefix = re.split(r':', r)[0]
                            r = re.sub(
                                prefix, self.clean_db_prefix(prefix), r)
                            r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                            assoc.add_source(r)
                    # experimental phenotypic evidence
                    assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
    return
def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):
    """
    Process a gzipped GO Annotation File (GAF): add gene classes with
    labels/synonyms/taxa, gene-to-GO-term associations with evidence and
    sources, and — for IMP annotations with 'with' entries — derived
    genotype-to-phenotype associations via reagent-targeted genes.

    :param gaffile: path to the gzipped GAF file
    :param limit: maximum rows to process outside test mode (None = all)
    :param id_map: optional dict of UniProtKB accession -> gene curie;
        UniProtKB rows without a mapping are skipped and counted
    :param eco_map: dict of GO evidence code -> ECO curie
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0
    col = self.gaf_columns

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # comments start with exclamation
            if row[0][0] == '!':
                continue
            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected ... got:\n\t%s",
                    len(col), row)
                exit(1)

            dbase = row[col.index('DB')].strip()
            gene_num = row[col.index('DB_Object_ID')].strip()
            gene_symbol = row[col.index('DB_Object_Symbol')].strip()
            qualifier = row[col.index('Qualifier')]
            go_id = row[col.index('GO_ID')].strip()
            ref = row[col.index('DB:Reference')].strip()
            eco_symbol = row[col.index('Evidence Code')].strip()
            with_or_from = row[col.index('With (or) From')]
            aspect = row[col.index('Aspect')].strip()
            gene_name = row[col.index('DB_Object_Name')]
            gene_synonym = row[col.index('DB_Object_Synonym')]
            # object_type = row[col.index('DB_Object_Type')].strip()
            taxon = row[col.index('Taxon and Interacting taxon')].strip()
            # date = row[col.index('Date')].strip()
            # assigned_by = row[col.index('Assigned_By')].strip()
            # annotation_extension = row[col.index('Annotation_Extension')]
            # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

            # test for required fields
            # BUGFIX: the previous test ('' in [row[:10], row[12]])
            # compared the empty string against a *list* slice, so only
            # the taxon column was ever checked; test the GAF-required
            # columns individually instead
            if '' in (dbase, gene_num, gene_symbol, go_id, ref,
                      eco_symbol, aspect, taxon):
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    # swap the accession for its mapped gene curie
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #    "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #    gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                    gene_num not in self.test_ids:
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None:
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        model.addSynonym(gene_id, syn)
                    else:
                        model.addSynonym(gene_id, syn)

            for txid in taxon.split('|'):
                tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)
            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)
            # TODO add the source of the annotations from assigned by?

            if uniprotid is not None:
                # BUGFIX: set the description before the association is
                # serialized; it was previously assigned after
                # add_association_to_graph() and silently dropped
                assoc.set_description('Mapped from ' + uniprotid)

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # aspect did not resolve via the translation tables
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                    # BUGFIX: previously this branch set the relationship
                    # but never wrote the association to the graph
                    assoc.add_association_to_graph()
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            ########################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = with_or_from.split('|')
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in withitems:
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping %s from or with %s", uniprotid, itm)
                        continue
                    itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                    itm = re.sub(r'WB:', 'WormBase:', itm)
                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                    # experimental phenotypic evidence
                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
def process_gaf(self, file, limit, id_map=None):
    """
    Process a gzipped GO Annotation File (GAF): add gene classes with
    labels/synonyms/taxa, gene-to-GO-term associations with evidence and
    sources, and — for IMP annotations with 'with' entries — derived
    genotype-to-phenotype associations via reagent-targeted genes.

    :param file: path to the gzipped GAF file
    :param limit: maximum rows to process outside test mode (None = all)
    :param id_map: optional dict of UniProtKB accession -> list of gene
        curies; UniProtKB rows are skipped unless exactly one mapping
        exists.
        NOTE(review): id_map.get() is dereferenced below before the
        'id_map is not None' check, so calling with the default
        id_map=None raises AttributeError on the first UniProtKB row
        — confirm callers always pass a dict.
    :return: None
    """
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    logger.info("Processing Gene Associations from %s", file)
    line_counter = 0
    # helpers for reagent-targeted gene ids; only bound when the
    # matching taxon is configured
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    elif 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)
    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                delimiter='\t', quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            (db, gene_num, gene_symbol, qualifier, go_id, ref, eco_symbol,
             with_or_from, aspect, gene_name, gene_synonym, object_type,
             taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row
            # test for required fields
            if (db == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # NOTE(review): row text is concatenated into the logging
                # format string; a '%' in the data would raise here
                logger.error(
                    "Missing required part of annotation " +
                    "on row %d:\n" + '\t'.join(row),
                    line_counter)
                continue
            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue
            db = self.clean_db_prefix(db)
            uniprotid = None
            gene_id = None
            if db == 'UniProtKB':
                # NOTE(review): crashes when id_map is None (see docstring)
                mapped_ids = id_map.get(gene_num)
                if id_map is not None and mapped_ids is not None:
                    if len(mapped_ids) == 1:
                        gene_id = mapped_ids[0]
                        uniprotid = ':'.join((db, gene_num))
                        gene_num = re.sub(r'\w+\:', '', gene_id)
                    elif len(mapped_ids) > 1:
                        # logger.warning(
                        #    "Skipping gene id mapped for >1 gene %s -> %s",
                        #    gene_num, str(mapped_ids))
                        continue
                else:
                    continue
            elif db == 'MGI':
                # normalize doubled MGI prefixes
                gene_num = re.sub(r'MGI:', '', gene_num)
                gene_id = ':'.join((db, gene_num))
                gene_id = re.sub(r'MGI\:MGI\:', 'MGI:', gene_id)
            else:
                gene_id = ':'.join((db, gene_num))
            # in test mode only NCBIGene ids from the test set are kept
            if self.testMode \
                    and not(
                        re.match(r'NCBIGene', gene_id) and
                        int(gene_num) in self.test_ids):
                continue
            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for s in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, s.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                logger.info(">1 taxon (%s) on line %d. skipping",
                            taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)
            assoc = Assoc(g, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)
            eco_id = self.map_go_evidence_code_to_eco(eco_symbol)
            if eco_id is not None:
                assoc.add_evidence(eco_id)
            refs = re.split(r'\|', ref)
            for r in refs:
                r = r.strip()
                if r != '':
                    prefix = re.split(r':', r)[0]
                    r = re.sub(prefix, self.clean_db_prefix(prefix), r)
                    r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                    ref = Reference(g, r)
                    if re.match(r'PMID', r):
                        ref_type = Reference.ref_types['journal_article']
                        ref.setType(ref_type)
                    ref.addRefToGraph()
                    assoc.add_source(r)
            # TODO add the source of the annotations from assigned by?
            # GAF aspect -> relationship predicate
            aspect_rel_map = {
                'P': model.object_properties['involved_in'],  # involved in
                'F': model.object_properties['enables'],  # enables
                'C': model.object_properties['part_of']  # part of
            }
            if aspect not in aspect_rel_map:
                logger.error("Aspect not recognized: %s", aspect)
            rel = aspect_rel_map.get(aspect)
            if aspect == 'F' and re.search(r'contributes_to', qualifier):
                rel = model.object_properties['contributes_to']
            assoc.set_relationship(rel)
            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used
            assoc.add_association_to_graph()
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or \
                            re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        logger.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)
                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this:
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = \
                            wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            g, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(g, self.name, i, phenotypeid)
                    for r in refs:
                        r = r.strip()
                        if r != '':
                            prefix = re.split(r':', r)[0]
                            r = re.sub(
                                prefix, self.clean_db_prefix(prefix), r)
                            r = re.sub(r'MGI\:MGI\:', 'MGI:', r)
                            assoc.add_source(r)
                    # experimental phenotypic evidence
                    assoc.add_evidence("ECO:0000059")
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?
            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
    return
class Dataset:
    """
    This will produce the metadata about a dataset
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)
    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        """
        :param identifier: local identifier for the dataset (curie suffix)
        :param title: human readable title (becomes dct:title)
        :param url: landing page for the dataset (becomes foaf:page)
        :param description: optional free-text description
        :param license_url: optional license IRI (becomes dct:license)
        :param data_rights: optional rights statement (becomes dct:rights)
        :param graph_type: None | 'streamed_graph' | 'rdf_graph'
        :param file_handle: output handle, only used for streamed graphs
        """
        # choose the graph backend
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        else:
            # previously an unknown graph_type fell through silently and
            # failed later with an AttributeError on self.graph;
            # fail fast with a clear message instead
            raise ValueError("Unknown graph_type: {}".format(graph_type))
        self.model = Model(self.graph)
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None

        # The data_accesed value is later used as an literal of properties
        # such as dct:issued, which needs to conform xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license = license_url
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier', identifier,
            object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .

        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        #  can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights', data_rights,
                object_is_literal=True)
        else:
            logger.debug('No rights provided.')

        if description is not None:
            self.model.addDescription(self.identifier, description)
        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
        should use the other set_* for version and date

        as of 2016-10-20 used in:
        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py 99:
        dipper/sources/BioGrid.py 100:
        dipper/sources/MGI.py 255:
        dipper/sources/EOM.py 93:
        dipper/sources/Coriell.py 200:
        dipper/sources/MMRRC.py 77:

        # TODO set as deprecated

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is None and version_id is None:
            logger.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        # NOTE: the original code called set_version_by_num twice when only
        # version_id was supplied (duplicating version triples), and logged
        # "set version to ..." before the version was actually set
        # (always printing None); both fixed here.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            self.set_version_by_date(date_issued)
        logger.info("set version to %s", self.version)
        return

    def set_date_issued(self, date_issued):
        # record the issuance date both on the object and in the graph
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued,
            object_is_literal=True)
        logger.info("setting date to %s", date_issued)
        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """
        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")
        logger.info("setting version by date")
        self.set_version_by_num(d)
        return

    def set_version_by_num(self, version_num):
        # the version node is the dataset identifier plus the version string
        self.version = self.identifier+version_num
        self.graph.addTriple(self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(
            self.version, 'pav:version', version_num, object_is_literal=True)
        logger.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf', self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version', self.date_accessed,
                object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued', self.date_accessed,
                object_is_literal=True, literal_type="xsd:dateTime")
        return

    def setFileAccessUrl(self, url, is_object_literal=False):
        # record where the raw file(s) can be fetched from
        self.graph.addTriple(
            self.identifier, 'dcat:accessURL', url, is_object_literal)

    def getGraph(self):
        return self.graph

    def set_license(self, license):
        self.license = license
        return

    def get_license(self):
        return self.license

    def set_citation(self, citation_id):
        # citations accumulate in a set; nothing is written to the graph yet
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
        return
def process_omia_phenotypes(self, limit):
    """
    Walk the OMIA-disease-phenotype directory and turn each tab-separated
    curation file into disease-to-phenotype (D2P) associations.

    Each data row is expected to have 22 columns; rows with a different
    width are skipped with a log message, and rows with an empty
    phenotype id are counted and reported at the end of each file.

    :param limit: accepted for API symmetry with the other processors,
        but not applied inside this loop (the whole file is read)
    :return: None
    """
    # process the whole directory
    # TODO get the file listing
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    logger.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]

    for f in file_list:
        # NOTE: a stray debug print(f) was removed here;
        # the logger call below already reports the file name
        logger.info("Processing %s", f)
        line_counter = 0
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, f))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(
                csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                if line_counter <= 1:
                    continue  # skip header
                if len(row) != 22:
                    logger.info(
                        "Not enough cols (%d) in %s - please fix",
                        len(row), f)
                    continue
                (disease_num, species_id, breed_name, variant,
                 inheritance, phenotype_id, phenotype_name, entity_id,
                 entity_name, quality_id, quality_name,
                 related_entity_id, related_entity_name, abnormal_id,
                 abnormal_name, phenotype_description, assay, frequency,
                 pubmed_id, pub_description, curator_notes,
                 date_created) = row

                if phenotype_id == '':
                    # logger.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # zero-pad the disease number to the 6 digits OMIA uses
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:'+disease_num.strip()
                species_id = species_id.strip()
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(g, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    for p in re.split(r'[,;]', pubmed_id):
                        pmid = 'PMID:'+p.strip()
                        assoc.add_source(pmid)
                else:
                    # no publication; fall back to the OMIA page itself
                    assoc.add_source(
                        '/'.join((
                            'http://omia.angis.org.au/OMIA'
                            + disease_num.strip(),
                            species_id.strip())))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(
                        aid, breed_name.strip()+' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay.strip()+' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes.strip())

                if entity_id != '' or quality_id != '':
                    logger.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            logger.warning(
                "You are missing %d/%d D2P annotations from id %s",
                count_missing, line_counter-1, f)
            # comprehension instead of map() per the old pylint TODO,
            # and lazy %s so the join only happens when the log fires
            logger.warning(
                "Bad rows:\n%s", "\n".join(str(r) for r in bad_rows))
    # finish loop through all files
    return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: a dipper Graph to write triples into
        :param definedby: the (data) resource that defines this association
        :param sub: subject curie/IRI
        :param obj: object curie/IRI
        :param pred: predicate curie/IRI
        :raises ValueError: if graph is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None
        return

    def _is_valid(self):
        """
        Check that subject, predicate and object are all set and that
        subject and predicate are either known curies or IRIs.

        :return: True if valid
        :raises ValueError: on any missing or unresolvable part

        NOTE: the original passed logging-style '%s' args to ValueError,
        which are never interpolated; messages are now formatted properly.
        """
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <{}> <{}> <{}>'.format(
                    self.sub, self.rel, self.obj))
        if self.obj is None:
            raise ValueError(
                'No object set for this association <{}> <{}> <{}>'.format(
                    self.sub, self.rel, self.obj))
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <{}> <{}> <{}>'.format(
                    self.sub, self.rel, self.obj))
        # Are subject & predicate, either a curie or IRI
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Subject for this association <{}> <{}> <{}>'.format(
                    self.sub, self.rel, self.obj))
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Predicate for this association <{}> <{}> <{}>'.format(
                    self.sub, self.rel, self.obj))
        return True

    def add_association_to_graph(self):
        """
        Write the core triple plus the reified OBAN association node
        (subject/predicate/object links, description, evidence, sources,
        provenance, dates, and score) into the graph.
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        assert self.assoc_id is not None

        # consistently use self.globaltt (same table as self.model.globaltt)
        self.model.addType(self.assoc_id, self.globaltt['association'])
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'],
            self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        # these are always lists (initialized in __init__),
        # so the old "is not None and len() > 0" guards were redundant
        for evi in self.evidence:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has evidence'], evi)

        for src in self.source:
            # TODO assume that the source is a publication? use Reference class
            self.graph.addTriple(
                self.assoc_id, self.globaltt['source'], src)

        for prov in self.provenance:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has_provenance'], prov)

        for dat in self.date:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['created_on'], dat,
                object_is_literal=True)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type
        return

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        # attach an arbitrary predicate/object to the association node;
        # literals may optionally carry a datatype
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(
                self.assoc_id, predicate, object_node, False)
        return

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id:
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id
        return self.assoc_id

    def get_association_id(self):
        # lazily mint the id on first request
        if self.assoc_id is None:
            self.set_association_id()
        return self.assoc_id

    def set_description(self, description):
        self.description = description
        return

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type
        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]
        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]
        return

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]
        return

    def add_provenance(self, identifier):
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]
        return

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.
        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub:
        :param pred:
        :param obj:
        :param attributes:
        :return: a 'MONARCH:<digest>' curie
        """
        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes
        items_to_hash = [x for x in items_to_hash if x is not None]
        assoc_id = ':'.join(
            ('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))
        assert assoc_id is not None
        return assoc_id
def _transform_entry(self, e, graph):
    """
    Transform a single OMIM API entry into graph triples.

    Adds the OMIM record as a class (typed by its inferred OMIM type),
    attaches labels/synonyms/definition, optionally adds gene-map and
    chromosomal-location features, and handles 'removed'/'moved' statuses
    by deprecating the class.

    :param e: one OMIM API response record; assumes the JSON shape with a
        top-level 'entry' dict holding 'mimNumber', 'titles', 'status',
        and optionally 'geneMap*' keys — TODO confirm against the API
    :param graph: the graph to write into
    :return: None
    """
    g = graph
    model = Model(g)
    geno = Genotype(graph)
    # constants for the human build/taxon this source is scoped to
    tax_num = '9606'
    tax_id = 'NCBITaxon:9606'
    tax_label = 'Human'
    build_num = "GRCh38"
    build_id = "NCBIGenome:"+build_num

    # get the numbers, labels, and descriptions
    omimnum = e['entry']['mimNumber']
    titles = e['entry']['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # add synonyms of alternate labels
    # preferredTitle": "PFEIFFER SYNDROME",
    # "alternativeTitles":
    # "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
    # "includedTitles":
    # "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    # and add it as a synonym
    abbrev = None
    if len(re.split(r';', label)) > 1:
        abbrev = (re.split(r';', label)[1].strip())
    newlabel = self._cleanup_label(label)

    description = self._get_description(e['entry'])
    omimid = 'OMIM:'+str(omimnum)

    if e['entry']['status'] == 'removed':
        model.addDeprecatedClass(omimid)
    else:
        omimtype = self._get_omimtype(e['entry'])
        nodelabel = newlabel
        # this uses our cleaned-up label
        if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
            if abbrev is not None:
                nodelabel = abbrev
            # in this special case,
            # make it a disease by not declaring it as a gene/marker
            model.addClassToGraph(omimid, nodelabel, None, newlabel)
        elif omimtype == Genotype.genoparts['gene']:
            if abbrev is not None:
                nodelabel = abbrev
            model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
        else:
            model.addClassToGraph(omimid, newlabel, omimtype)

        # add the original screaming-caps OMIM label as a synonym
        model.addSynonym(omimid, label)

        # add the alternate labels and includes as synonyms
        for l in other_labels:
            model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')

        # for OMIM, we're adding the description as a definition
        model.addDefinition(omimid, description)
        if abbrev is not None:
            model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

        # if this is a genetic locus (but not sequenced)
        # then add the chrom loc info
        # but add it to the ncbi gene identifier,
        # not to the omim id (we reserve the omim id to be the phenotype)
        feature_id = None
        feature_label = None
        if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
            genemap = e['entry']['geneMap']
            is_gene = False

            if omimtype == \
                    Genotype.genoparts['heritable_phenotypic_marker']:
                # get the ncbigene ids
                ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                if len(ncbifeature) == 1:
                    feature_id = 'NCBIGene:'+str(ncbifeature[0])
                    # add this feature as a cause for the omim disease
                    # TODO SHOULD I EVEN DO THIS HERE?
                    assoc = G2PAssoc(g, self.name, feature_id, omimid)
                    assoc.add_association_to_graph()
                elif len(ncbifeature) > 1:
                    logger.info(
                        "Its ambiguous when %s maps to >1 gene id: %s",
                        omimid, str(ncbifeature))
                else:
                    # no ncbi feature, make an anonymous one
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    feature_label = abbrev

            elif omimtype == Genotype.genoparts['gene']:
                feature_id = omimid
                is_gene = True

            else:
                # 158900 falls into this category
                feature_id = self._make_anonymous_feature(str(omimnum))
                if abbrev is not None:
                    feature_label = abbrev
                # NOTE: omimtype is reassigned here so the Feature below
                # is typed as a marker rather than the original type
                omimtype = \
                    Genotype.genoparts[
                        'heritable_phenotypic_marker']

            if feature_id is not None:
                if 'comments' in genemap:
                    # add a comment to this feature
                    comment = genemap['comments']
                    if comment.strip() != '':
                        model.addDescription(feature_id, comment)
                if 'cytoLocation' in genemap:
                    cytoloc = genemap['cytoLocation']
                    # parse the cytoloc.
                    # add this omim thing as
                    # a subsequence of the cytofeature
                    # 18p11.3-p11.2
                    # FIXME
                    # add the other end of the range,
                    # but not sure how to do that
                    # not sure if saying subsequence of feature
                    # is the right relationship
                    f = Feature(g, feature_id, feature_label, omimtype)
                    if 'chromosomeSymbol' in genemap:
                        chrom_num = str(genemap['chromosomeSymbol'])
                        chrom = makeChromID(chrom_num, tax_num, 'CHR')
                        geno.addChromosomeClass(
                            chrom_num, tax_id, tax_label)

                        # add the positional information, if available
                        fstart = fend = -1
                        if 'chromosomeLocationStart' in genemap:
                            fstart = genemap['chromosomeLocationStart']
                        if 'chromosomeLocationEnd' in genemap:
                            fend = genemap['chromosomeLocationEnd']
                        if fstart >= 0:
                            # make the build-specific chromosome
                            chrom_in_build = makeChromID(
                                chrom_num, build_num, 'MONARCH')
                            # then, add the chromosome instance
                            # (from the given build)
                            geno.addChromosomeInstance(
                                chrom_num, build_id, build_num, chrom)
                            # markers get fuzzy positions since they
                            # are not precisely sequenced
                            if omimtype == \
                                    Genotype.genoparts[
                                        'heritable_phenotypic_marker']:
                                postypes = [Feature.types['FuzzyPosition']]
                            else:
                                postypes = None
                            # NOTE that no strand information
                            # is available in the API
                            f.addFeatureStartLocation(
                                fstart, chrom_in_build, None, postypes)
                            if fend >= 0:
                                f.addFeatureEndLocation(
                                    fend, chrom_in_build, None, postypes)
                            if fstart > fend:
                                logger.info(
                                    "start>end (%d>%d) for %s",
                                    fstart, fend, omimid)
                        # add the cytogenic location too
                        # for now, just take the first one
                        cytoloc = cytoloc.split('-')[0]
                        loc = makeChromID(cytoloc, tax_num, 'CHR')
                        model.addClassToGraph(loc, None)
                        f.addSubsequenceOfFeature(loc)
                        f.addFeatureToGraph(True, None, is_gene)

            # end adding causative genes/features

        # check if moved, if so,
        # make it deprecated and
        # replaced consider class to the other thing(s)
        # some entries have been moved to multiple other entries and
        # use the joining raw word "and"
        # 612479 is movedto: "603075 and 603029" OR
        # others use a comma-delimited list, like:
        # 610402 is movedto: "609122,300870"
        if e['entry']['status'] == 'moved':
            if re.search(r'and', str(e['entry']['movedTo'])):
                # split the movedTo entry on 'and'
                newids = re.split(r'and', str(e['entry']['movedTo']))
            elif len(str(e['entry']['movedTo']).split(',')) > 0:
                # split on the comma
                newids = str(e['entry']['movedTo']).split(',')
            else:
                # make a list of one
                newids = [str(e['entry']['movedTo'])]
            # cleanup whitespace and add OMIM prefix to numeric portion
            fixedids = []
            for i in newids:
                fixedids.append('OMIM:'+i.strip())

            model.addDeprecatedClass(omimid, fixedids)

        self._get_phenotypicseries_parents(e['entry'], g)
        self._get_mappedids(e['entry'], g)
        self._get_mapped_gene_ids(e['entry'], g)
        self._get_pubs(e['entry'], g)
        self._get_process_allelic_variants(e['entry'], g)  # temp gag

    return
def process_feature_loc(self, limit): src_key = 'feature_loc' raw = '/'.join((self.rawdir, self.files[src_key]['file'])) graph = self.graph model = Model(graph) geno = Genotype(graph) LOG.info("Processing: %s", self.files[src_key]['file']) strain_to_variant_map = {} build_num = self.version_num build_id = 'WormBase:' + build_num col = self.files[src_key]['columns'] with gzip.open(raw, 'rb') as csvfile: reader = csv.reader(io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"') for row in reader: if re.match(r'\#', ''.join(row)): continue chrom = row[col.index('seqid')] # db = row[col.index('source')] feature_type_label = row[col.index('type')] start = row[col.index('start')] # end = row[col.index('end')] # score = row[col.index('score')] strand = row[col.index('strand')] # phase = row[col.index('phase')] attributes = row[col.index('attributes')] ''' I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM) I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6 I absolute_pmap_position gene 4119 10230 . . . 
ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM) ''' # dbs = re.split( # r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA') # # if db not in dbs: # continue if feature_type_label not in [ 'gene', 'point_mutation', 'deletion', 'RNAi_reagent', 'duplication', 'enhancer', 'binding_site', 'biological_region', 'complex_substitution', 'substitution', 'insertion', 'inverted_repeat' ]: # note biological_regions include balancers # other options here: promoter, regulatory_region, reagent continue attribute_dict = {} if attributes != '': attributes.replace('"', '') attribute_dict = dict( tuple(atv.split('=')) for atv in attributes.split(";")) fid = flabel = desc = None if 'ID' in attribute_dict: fid = attribute_dict['ID'] if re.search(r'WB(Gene|Var|sf)', fid): fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid) elif re.match(r'(gmap|landmark)', fid): continue else: LOG.info('other identifier %s', fid) fid = None elif 'variation' in attribute_dict: fid = 'WormBase:' + attribute_dict['variation'] flabel = attribute_dict.get('public_name') sub = attribute_dict.get('substitution') ins = attribute_dict.get('insertion') # if it's a variation: # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T desc = '' if sub is not None: desc = 'substitution=' + sub if ins is not None: desc = 'insertion=' + ins # keep track of the strains with this variation, # for later processing strain_list = attribute_dict.get('strain') if strain_list is not None: for strn in strain_list.split(','): strn = strn.strip() if strn not in strain_to_variant_map: strain_to_variant_map[strn] = set() strain_to_variant_map[strn].add(fid) # if feature_type_label == 'RNAi_reagent': # Target=WBRNAi00096030 1 4942 # this will tell us where the RNAi is 
actually binding # target = attribute_dict.get('Target') # TODO unused # rnai_num = re.split(r' ', target)[0] # TODO unused # it will be the reagent-targeted-gene that has a position, # (i think) # TODO finish the RNAi binding location name = attribute_dict.get('Name') polymorphism = attribute_dict.get('polymorphism') if fid is None: if name is not None and re.match(r'WBsf', name): fid = 'WormBase:' + name name = None else: continue # these really aren't that interesting if polymorphism is not None: continue if name is not None and not re.search(name, fid): if flabel is None: flabel = name else: model.addSynonym(fid, name) if desc is not None and desc != '': model.addDescription(fid, desc) alias = attribute_dict.get('Alias') biotype = attribute_dict.get('biotype') note = attribute_dict.get('Note') other_name = attribute_dict.get('other_name') for n in [alias, other_name]: if n is not None: model.addSynonym(fid, other_name) if feature_type_label == 'gene': ftype_id = self.resolve(biotype) else: # so far, they all come with SO label syntax. resolve if need be. ftype_id = self.globaltt[feature_type_label] chr_id = makeChromID(chrom, build_id, 'CHR') geno.addChromosomeInstance(chrom, build_id, build_num) feature = Feature(graph, fid, flabel, ftype_id) feature.addFeatureStartLocation(start, chr_id, strand) feature.addFeatureEndLocation(start, chr_id, strand) feature_is_class = False if feature_type_label == 'gene': feature_is_class = True feature.addFeatureToGraph(True, None, feature_is_class) if note is not None and note != '': model.addDescription(fid, note) if limit is not None and reader.line_num > limit: break # RNAi reagents: ''' I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10 I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052 I RNAi_primary RNAi_reagent 5693 9391 . + . 
Target=WBRNAi00066135 1 3699 +;laboratory=CH ''' # TODO TF binding sites and network: '''
def _process_data(self, src_key, limit=None): """ This function will process the data files from Coriell. We make the assumption that any alleles listed are variants (alternates to w.t.) Triples: (examples) :NIGMSrepository a CLO_0000008 #repository label : NIGMS Human Genetic Cell Repository foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8 line_id a CL_0000057, #fibroblast line derives_from patient_id part_of :NIGMSrepository RO:model_of OMIM:disease_id patient id a foaf:person, label: "fibroblast from patient 12345 with disease X" member_of family_id #what is the right thing here? SIO:race EFO:caucasian #subclass of EFO:0001799 in_taxon NCBITaxon:9606 dc:description Literal(remark) RO:has_phenotype OMIM:disease_id GENO:has_genotype genotype_id family_id a owl:NamedIndividual foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM" genotype_id a intrinsic_genotype GENO:has_alternate_part allelic_variant_id we don't necessarily know much about the genotype, other than the allelic variant. 
also there's the sex here pub_id mentions cell_line_id :param raw: :param limit: :return: """ raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info("Processing Data from %s", raw) if self.test_mode: # set the graph to build graph = self.testgraph else: graph = self.graph family = Family(graph) model = Model(graph) line_counter = 1 geno = Genotype(graph) diputil = DipperUtil() col = self.files[src_key]['columns'] # affords access with # x = row[col.index('x')].strip() with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"') # we can keep a close watch on changing file formats fileheader = next(filereader, None) fileheader = [c.lower() for c in fileheader] if col != fileheader: # assert LOG.error('Expected %s to have columns: %s', raw, col) LOG.error('But Found %s to have columns: %s', raw, fileheader) raise AssertionError('Incomming data headers have changed.') for row in filereader: line_counter += 1 if len(row) != len(col): LOG.warning( 'Expected %i values but find %i in row %i', len(col), len(row), line_counter) continue # (catalog_id, description, omim_number, sample_type, # cell_line_available, dna_in_stock, dna_ref, gender, age, # race, ethnicity, affected, karyotype, relprob, mutation, # gene, family_id, collection, url, cat_remark, pubmed_ids, # family_member, variant_id, dbsnp_id, species) = row # example: # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No, # ,Female,26 YR,Caucasian,,,, # parent,,,39,NIGMS Human Genetic Cell Repository, # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003, # 46;XX; clinically normal mother of a child with Hurler syndrome; # proband not in Repository,, # 2,,18343,H**o sapiens catalog_id = row[col.index('catalog_id')].strip() if self.test_mode and catalog_id not in self.test_lines: # skip rows not in our test lines, when in test mode continue # ########### BUILD REQUIRED VARIABLES ########### # Make the cell line ID cell_line_id = 
'Coriell:' + catalog_id # Map the cell/sample type cell_type = self.resolve(row[col.index('sample_type')].strip()) # on fail cell_type = self.globaltt['cell'] ? # Make a cell line label collection = row[col.index('collection')].strip() line_label = collection.partition(' ')[0] + '-' + catalog_id # Map the repository/collection repository = self.localtt[collection] # patients are uniquely identified by one of: # dbsnp id (which is == an individual haplotype) # family id + family member (if present) OR # probands are usually family member zero # cell line id # since some patients have >1 cell line derived from them, # we must make sure that the genotype is attached to # the patient, and can be inferred to the cell line # examples of repeated patients are: # famid=1159, member=1; fam=152,member=1 # Make the patient ID # make an anonymous patient patient_id = '_:person' fam_id = row[col.index('fam')].strip() fammember = row[col.index('fammember')].strip() if fam_id != '': patient_id = '-'.join((patient_id, fam_id, fammember)) else: # make an anonymous patient patient_id = '-'.join((patient_id, catalog_id)) # properties of the individual patients: sex, family id, # member/relproband, description descriptions are # really long and ugly SCREAMING text, so need to clean up # the control cases are so odd with this labeling scheme; # but we'll deal with it as-is for now. 
description = row[col.index('description')].strip() short_desc = (description.split(';')[0]).capitalize() gender = row[col.index('gender')].strip().lower() affected = row[col.index('affected')].strip() relprob = row[col.index('relprob')].strip() if affected == '': affected = 'unspecified' elif affected in self.localtt: affected = self.localtt[affected] else: LOG.warning( 'Novel Affected status %s at row: %i of %s', affected, line_counter, raw) patient_label = ' '.join((affected, gender, relprob)) if relprob == 'proband': patient_label = ' '.join(( patient_label.strip(), 'with', short_desc)) else: patient_label = ' '.join(( patient_label.strip(), 'of proband with', short_desc)) # ############# BUILD THE CELL LINE ############# # Adding the cell line as a typed individual. cell_line_reagent_id = self.globaltt['cell line'] model.addIndividualToGraph( cell_line_id, line_label, cell_line_reagent_id) # add the equivalent id == dna_ref dna_ref = row[col.index('dna_ref')].strip() if dna_ref != '' and dna_ref != catalog_id: equiv_cell_line = 'Coriell:' + dna_ref # some of the equivalent ids are not defined # in the source data; so add them model.addIndividualToGraph( equiv_cell_line, None, cell_line_reagent_id) model.addSameIndividual(cell_line_id, equiv_cell_line) # Cell line derives from patient geno.addDerivesFrom(cell_line_id, patient_id) geno.addDerivesFrom(cell_line_id, cell_type) # Cell line a member of repository family.addMember(repository, cell_line_id) cat_remark = row[col.index('cat_remark')].strip() if cat_remark != '': model.addDescription(cell_line_id, cat_remark) # Cell age_at_sampling # TODO add the age nodes when modeled properly in #78 # if (age != ''): # this would give a BNode that is an instance of Age. # but i don't know how to connect # the age node to the cell line? 
we need to ask @mbrush # age_id = '_'+re.sub('\s+','_',age) # gu.addIndividualToGraph( # graph,age_id,age,self.globaltt['age']) # gu.addTriple( # graph,age_id,self.globaltt['has measurement value'],age, # True) # ############# BUILD THE PATIENT ############# # Add the patient ID as an individual. model.addPerson(patient_id, patient_label) # TODO map relationship to proband as a class # (what ontology?) # Add race of patient # FIXME: Adjust for subcategories based on ethnicity field # EDIT: There are 743 different entries for ethnicity... # Too many to map? # Add ethnicity as literal in addition to the mapped race? # Adjust the ethnicity txt (if using) # to initial capitalization to remove ALLCAPS # TODO race should go into the individual's background # and abstracted out to the Genotype class punting for now. # if race != '': # mapped_race = self.resolve(race) # if mapped_race is not None: # gu.addTriple( # g,patient_id,self.globaltt['race'], mapped_race) # model.addSubClass( # mapped_race,self.globaltt['ethnic_group']) # ############# BUILD THE FAMILY ############# # Add triples for family_id, if present. if fam_id != '': family_comp_id = 'CoriellFamily:' + fam_id family_label = ' '.join(('Family of proband with', short_desc)) # Add the family ID as a named individual model.addIndividualToGraph( family_comp_id, family_label, self.globaltt['family']) # Add the patient as a member of the family family.addMemberOf(patient_id, family_comp_id) # ############# BUILD THE GENOTYPE ############# # the important things to pay attention to here are: # karyotype = chr rearrangements (somatic?) # mutation = protein-level mutation as a label, # often from omim # gene = gene symbol - TODO get id # variant_id = omim variant ids (; delimited) # dbsnp_id = snp individual ids = full genotype? # note GM00633 is a good example of chromosomal variation # - do we have enough to capture this? 
# GM00325 has both abnormal karyotype and variation # make an assumption that if the taxon is blank, # that it is human! species = row[col.index('species')].strip() if species is None or species == '': species = 'H**o sapiens' taxon = self.resolve(species) # if there's a dbSNP id, # this is actually the individual's genotype genotype_id = None genotype_label = None dbsnp_id = row[col.index('dbsnp_id')].strip() if dbsnp_id != '': genotype_id = 'dbSNPIndividual:' + dbsnp_id omim_map = {} gvc_id = None # some of the karyotypes are encoded # with terrible hidden codes. remove them here # i've seen a <98> character karyotype = row[col.index('karyotype')].strip() karyotype = diputil.remove_control_characters(karyotype) karyotype_id = None if karyotype.strip() != '': karyotype_id = '_:'+re.sub( 'MONARCH:', '', self.make_id(karyotype)) # add karyotype as karyotype_variation_complement model.addIndividualToGraph( karyotype_id, karyotype, self.globaltt['karyotype_variation_complement']) # TODO break down the karyotype into parts # and map into GENO. depends on #77 # place the karyotype in a location(s). 
karyo_chrs = self._get_affected_chromosomes_from_karyotype( karyotype) for chrom in karyo_chrs: chr_id = makeChromID(chrom, taxon, 'CHR') # add an anonymous sequence feature, # each located on chr karyotype_feature_id = '-'.join((karyotype_id, chrom)) karyotype_feature_label = \ 'some karyotype alteration on chr' + str(chrom) feat = Feature( graph, karyotype_feature_id, karyotype_feature_label, self.globaltt['sequence_alteration']) feat.addFeatureStartLocation(None, chr_id) feat.addFeatureToGraph() geno.addParts( karyotype_feature_id, karyotype_id, self.globaltt['has_variant_part']) gene = row[col.index('gene')].strip() mutation = row[col.index('mutation')].strip() if gene != '': varl = gene + '(' + mutation + ')' # fix the variant_id so it's always in the same order variant_id = row[col.index('variant_id')].strip() vids = variant_id.split(';') variant_id = ';'.join(sorted(list(set(vids)))) if karyotype.strip() != '' and not self._is_normal_karyotype( karyotype): gvc_id = karyotype_id if variant_id != '': gvc_id = '_:' + variant_id.replace(';', '-') + '-' \ + re.sub(r'\w*:', '', karyotype_id) if mutation.strip() != '': gvc_label = '; '.join((varl, karyotype)) else: gvc_label = karyotype elif variant_id.strip() != '': gvc_id = '_:' + variant_id.replace(';', '-') gvc_label = varl else: # wildtype? pass # add the karyotype to the gvc. 
# use reference if normal karyotype karyo_rel = self.globaltt['has_variant_part'] if self._is_normal_karyotype(karyotype): karyo_rel = self.globaltt['has_reference_part'] if karyotype_id is not None \ and not self._is_normal_karyotype(karyotype) \ and gvc_id is not None and karyotype_id != gvc_id: geno.addParts(karyotype_id, gvc_id, karyo_rel) if variant_id.strip() != '': # split the variants & add them as part of the genotype # we don't necessarily know their zygosity, # just that they are part of the genotype variant ids # are from OMIM, so prefix as such we assume that the # sequence alts will be defined in OMIM not here # TODO sort the variant_id list, if the omim prefix is # the same, then assume it's the locus make a hashmap # of the omim id to variant id list; # then build the genotype hashmap is also useful for # removing the "genes" from the list of "phenotypes" # will hold gene/locus id to variant list omim_map = {} locus_num = None for var in variant_id.split(';'): # handle omim-style and odd var ids # like 610661.p.R401X mch = re.match(r'(\d+)\.+(.*)', var.strip()) if mch is not None and len(mch.groups()) == 2: (locus_num, var_num) = mch.groups() if locus_num is not None and locus_num not in omim_map: omim_map[locus_num] = [var_num] else: omim_map[locus_num] += [var_num] for omim in omim_map: # gene_id = 'OMIM:' + omim # TODO unused vslc_id = '_:' + '-'.join( [omim + '.' + a for a in omim_map.get(omim)]) vslc_label = varl # we don't really know the zygosity of # the alleles at all. # so the vslcs are just a pot of them model.addIndividualToGraph( vslc_id, vslc_label, self.globaltt['variant single locus complement']) for var in omim_map.get(omim): # this is actually a sequence alt allele1_id = 'OMIM:' + omim + '.' 
+ var geno.addSequenceAlteration(allele1_id, None) # assume that the sa -> var_loc -> gene # is taken care of in OMIM geno.addPartsToVSLC( vslc_id, allele1_id, None, self.globaltt['indeterminate'], self.globaltt['has_variant_part']) if vslc_id != gvc_id: geno.addVSLCtoParent(vslc_id, gvc_id) if affected == 'unaffected': # let's just say that this person is wildtype model.addType(patient_id, self.globaltt['wildtype']) elif genotype_id is None: # make an anonymous genotype id (aka blank node) genotype_id = '_:geno' + catalog_id.strip() # add the gvc if gvc_id is not None: model.addIndividualToGraph( gvc_id, gvc_label, self.globaltt['genomic_variation_complement']) # add the gvc to the genotype if genotype_id is not None: if affected == 'unaffected': rel = self.globaltt['has_reference_part'] else: rel = self.globaltt['has_variant_part'] geno.addParts(gvc_id, genotype_id, rel) if karyotype_id is not None \ and self._is_normal_karyotype(karyotype): if gvc_label is not None and gvc_label != '': genotype_label = '; '.join((gvc_label, karyotype)) elif karyotype is not None: genotype_label = karyotype if genotype_id is None: genotype_id = karyotype_id else: geno.addParts( karyotype_id, genotype_id, self.globaltt['has_reference_part']) else: genotype_label = gvc_label # use the catalog id as the background genotype_label += ' ['+catalog_id.strip()+']' if genotype_id is not None and gvc_id is not None: # only add the genotype if it has some parts geno.addGenotype( genotype_id, genotype_label, self.globaltt['intrinsic_genotype']) geno.addTaxon(taxon, genotype_id) # add that the patient has the genotype # TODO check if the genotype belongs to # the cell line or to the patient graph.addTriple( patient_id, self.globaltt['has_genotype'], genotype_id) else: geno.addTaxon(taxon, patient_id) # TODO: Add sex/gender (as part of the karyotype?) 
# = row[col.index('')].strip() # ############# DEAL WITH THE DISEASES ############# omim_num = row[col.index('omim_num')].strip() # we associate the disease to the patient if affected == 'affected' and omim_num != '': for disease in omim_num.split(';'): if disease is not None and disease != '': # if the omim number is in omim_map, # then it is a gene not a pheno # TEC - another place to use the mimTitle omim # classifier omia & genereviews are using if disease not in omim_map: disease_id = 'OMIM:' + disease.strip() # assume the label is taken care of in OMIM model.addClassToGraph(disease_id, None) # add the association: # the patient has the disease assoc = G2PAssoc( graph, self.name, patient_id, disease_id) assoc.add_association_to_graph() # this line is a model of this disease # TODO abstract out model into # it's own association class? graph.addTriple( cell_line_id, self.globaltt['is model of'], disease_id) else: LOG.info('drop gene %s from disease list', disease) # ############# ADD PUBLICATIONS ############# pubmed_ids = row[col.index('pubmed_ids')].strip() if pubmed_ids != '': for pmid in pubmed_ids.split(';'): pubmed_id = 'PMID:' + pmid.strip() ref = Reference(graph, pubmed_id) ref.setType(self.globaltt['journal article']) ref.addRefToGraph() graph.addTriple( pubmed_id, self.globaltt['mentions'], cell_line_id) if not self.test_mode and ( limit is not None and line_counter > limit): break return
def _process_phene_row(self, row):
    """
    Ingest one row of the OMIA species-specific phene table.

    Adds the species-specific phenotype class (``<omia_id>-<gb_species_id>``)
    to the graph with its label, description blocks, a taxon restriction,
    and (when resolvable) an inheritance association.  Also populates the
    internal ``id_hash``/``label_hash`` lookups used by later passes, and
    stashes molecular-genetics text for characterised phenes in
    ``self.stored_omia_mol_gen``.

    :param row: dict-like record; keys used here include 'phene_name',
        'phene_id', 'omia_id', 'gb_species_id', 'summary', 'inherit',
        'characterised', 'mol_gen', 'map_info', 'symbol' and the
        description-section keys listed below.
        (NOTE(review): exact schema comes from the upstream table parse —
        confirm against the caller.)
    :return: None
    """
    model = Model(self.graph)
    phenotype_id = None
    sp_phene_label = row['phene_name']
    # normalize empty label to None so a fallback label can be built below
    if sp_phene_label == '':
        sp_phene_label = None
    if 'omia_id' not in row:
        LOG.info("omia_id not present for %s", row['phene_id'])
        # NOTE(review): phenotype_id is always None here, so the internal
        # id is minted from a None key — confirm this is intended
        omia_id = self._make_internal_id('phene', phenotype_id)
    else:
        omia_id = 'OMIA:' + str(row['omia_id'])

    # in test mode, only process rows whose taxon AND disease are test ids
    if self.test_mode and not (  # demorgan this
            row['gb_species_id'] in self.test_ids['taxon'] and
            omia_id in self.test_ids['disease']):
        return

    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = omia_id
    descr = row['summary']
    if descr == '':
        descr = None

    # omia label
    omia_label = self.label_hash.get(omia_id)

    # add the species-specific subclass (TODO please review this choice)
    gb_species_id = row['gb_species_id']

    if gb_species_id != '':
        sp_phene_id = '-'.join((omia_id, gb_species_id))
    else:
        LOG.error(
            "No species supplied in species-specific phene table for %s",
            omia_id)
        return

    species_id = 'NCBITaxon:' + str(gb_species_id)
    # use this instead
    species_label = self.label_hash.get('NCBITaxon:' + gb_species_id)
    # fall back to "<omia label> in <species>" when no phene name was given
    if sp_phene_label is None and omia_label is not None \
            and species_label is not None:
        sp_phene_label = ' '.join((omia_label, 'in', species_label))
    model.addClassToGraph(
        sp_phene_id, sp_phene_label, omia_id, descr,
        class_category=blv.terms['PhenotypicFeature'])
    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = sp_phene_id
    self.label_hash[sp_phene_id] = sp_phene_label
    # add each of the following descriptions,
    # if they are populated, with a tag at the end.
    for item in [
            'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
        if row[item] is not None and row[item] != '':
            model.addDescription(
                sp_phene_id, row[item] + ' [' + item + ']',
                subject_category=blv.terms['PhenotypicFeature'])
    # if row['symbol'] is not None:  # species-specific
    # CHECK ME - sometimes spaces or gene labels
    #     gu.addSynonym(g, sp_phene, row['symbol'])

    model.addOWLPropertyClassRestriction(
        sp_phene_id, self.globaltt['in taxon'], species_id,
        class_category=blv.terms['PhenotypicFeature'])

    # add inheritance as an association
    inheritance_id = None
    if row['inherit'] is not None and row['inherit'] in self.localtt:
        inheritance_id = self.resolve(row['inherit'])
    elif row['inherit'] is not None and row['inherit'] != '':
        # an inheritance string we have no translation for; log and skip
        LOG.info('Unhandled inheritance type:\t%s', row['inherit'])

    if inheritance_id is not None:
        # observable related to genetic disposition
        assoc = D2PAssoc(  # JR: not sure we should be using D2PAssoc for this
            self.graph, self.name, sp_phene_id, inheritance_id,
            rel=self.globaltt['has disposition'],
            disease_category=blv.terms['PhenotypicFeature'])
        assoc.add_association_to_graph()

    if row['characterised'] == 'Yes':
        # keep the molecular-genetics text for a later processing pass
        self.stored_omia_mol_gen[omia_id] = {
            'mol_gen': row['mol_gen'],
            'map_info': row['map_info'],
            'species': row['gb_species_id']}
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence on statements.

    An association reifies a (subject, predicate, object) statement as an
    identified node so that evidence, sources, provenance, dates, and
    scores can be attached to the statement itself.
    """

    assoc_types = {'association': 'OBAN:association'}

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # merged view over all three property maps
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: a dipper Graph to write triples into
        :param definedby: identifier of the (data) resource defining
            this association; used when minting the association id
        :param sub: optional subject curie
        :param obj: optional object curie
        :param pred: optional predicate curie
        :raises ValueError: if ``graph`` is not a Graph instance
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # BUGFIX: was `"{} is not a graph".graph`, which raised
            # AttributeError instead of the intended ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

    def get_properties(self):
        """Return the merged property-curie map."""
        return self.properties

    def _is_valid(self):
        """
        Verify that subject, object, and relation are all set.

        :raises ValueError: naming the first missing part
        :return: True when all three are present
        """
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')
        return True

    def _add_basic_association_to_graph(self):
        """
        Write the core triple plus the reified OBAN association node,
        with any accumulated description/evidence/source/provenance/
        date/score attached.
        """
        if not self._is_valid():
            return

        # the direct statement itself
        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        self.model.addType(self.assoc_id, self.assoc_types['association'])

        # reify subject/object/predicate onto the association node
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    # http(s) sources are emitted as literals
                    self.graph.addTriple(
                        self.assoc_id,
                        self.object_properties['has_source'], s, True)
                else:
                    self.graph.addTriple(
                        self.assoc_id,
                        self.object_properties['has_source'], s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.graph.addTriple(
                    self.assoc_id,
                    self.object_properties['has_provenance'], p)

        if self.date is not None and len(self.date) > 0:
            for d in self.date:
                self.graph.addTriple(
                    object_is_literal=True,
                    subject_id=self.assoc_id,
                    predicate_id=self.datatype_properties['created_on'],
                    obj=d)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.properties['has_measurement'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_association_to_graph(self):
        """Public entry point; subclasses may extend."""
        self._add_basic_association_to_graph()

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Attach an arbitrary (predicate, object) pair to the association node.

        :param object_type: 'Literal' to emit a literal object,
            anything else for a resource
        :param datatype: optional xsd datatype for literal objects
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(
                self.assoc_id, predicate, object_node, False)

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        self.sub = identifier

    def set_object(self, identifier):
        self.obj = identifier

    def set_relationship(self, identifier):
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id: optional externally supplied identifier
        :return: None
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

    def get_association_id(self):
        return self.assoc_id

    def set_description(self, description):
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)

        :param identifier: evidence curie; blank/None values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)

        TODO we need to greatly expand this function!

        :param identifier: source curie; blank/None values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]

    def add_provenance(self, identifier):
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

    @staticmethod
    def make_association_id(
            definedby, subject, predicate, object, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        Note: the parameter name ``object`` shadows the builtin, but is
        kept for backward compatibility with keyword callers.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return: 'MONARCH:' + first 16 hex chars of the sha1 digest
        """
        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes
        for i, val in enumerate(items_to_hash):
            if val is None:
                items_to_hash[i] = ''
        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(
            ('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
def _process_ortholog_classes(self, limit=None):
    """
    This method adds the KEGG orthology classes to the graph.

    If there's an embedded enzyme commission number,
    that is added as an xref.

    Triples created:
    <orthology_class_id> is a class
    <orthology_class_id> has label <orthology_symbols>
    <orthology_class_id> has description <orthology_description>

    :param limit: maximum number of data rows to process (None = all)
    :return: None
    """
    LOG.info("Processing ortholog classes")
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
    with open(raw, 'r', encoding="iso-8859-1") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
        for row in reader:
            (orthology_class_id, orthology_class_name) = row

            if self.test_mode and orthology_class_id \
                    not in self.test_ids['orthology_classes']:
                continue

            # The orthology class is essentially a KEGG gene ID
            # that is species agnostic.
            # Add the ID and label as a gene family class
            other_labels = re.split(r'[;,]', orthology_class_name)
            # the first one is the label we'll use
            orthology_label = other_labels[0]

            orthology_class_id = 'KEGG-' + orthology_class_id.strip()

            orthology_type = self.globaltt['gene_family']
            model.addClassToGraph(
                orthology_class_id, orthology_label, orthology_type)
            if len(other_labels) > 1:
                # FIXED: skip the first label (it is already the class
                # label), rather than re-adding it as its own synonym
                for s in other_labels[1:]:
                    model.addSynonym(orthology_class_id, s.strip())

                # add the last one as the description
                d = other_labels[-1]
                model.addDescription(orthology_class_id, d)

                # add the enzyme commission number (EC:1.2.99.5) as an xref
                # sometimes there's two, like [EC:1.3.5.1 1.3.5.4]
                # can also have a dash, like EC:1.10.3.-
                # (re.findall never returns None, so no None-check needed)
                ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d)
                for ecm in ec_matches:
                    model.addXref(orthology_class_id, 'EC:' + ecm)

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break
    LOG.info("Done with ortholog classes")
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Parse a gzipped GAF (Gene Association File) and add gene-to-GO-term
    associations to the graph, plus derived genotype-to-phenotype
    associations for IMP annotations with 'with' entries.

    :param file: path to a gzipped GAF file
    :param limit: stop after this many lines (None = all); only applied
        outside of test mode
    :param id_map: optional dict mapping UniProtKB accessions to
        entrez/ensembl curies; UniProtKB rows without a mapping are skipped
    :param eco_map: dict mapping GAF evidence codes (e.g. 'IMP') to ECO ids
        (NOTE(review): a None eco_map would raise TypeError on lookup —
        confirm callers always pass one)
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0
    # NOTE(review): zfin/wbase are only bound when the matching taxon id
    # is configured; a MRPHLNO/CRISPR/TALEN or WBRNAi 'with' entry seen
    # while the taxon is absent would raise NameError below — confirm
    # tax_ids always includes 7955/6239 when such rows can occur
    if 7955 in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if 6239 in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue

            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue

            # pad GAF 1.0-style rows (15 cols) out to the 2.x 17 columns
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))

            (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             object_type, taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (dbase == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                LOG.error(
                    "Missing required part of annotation on row %d:\n" +
                    '\t'.join(row), line_counter)
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is not None and gene_num in id_map:
                    # rewrite the UniProt row in terms of the mapped gene
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #    "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #    gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not(
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d. skipping", taxon, line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            # NOTE: this loop shadows the GAF 'ref' column variable
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]
                    # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # aspect did not translate; the only recognized case is
                # an 'F' aspect qualified with contributes_to
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            #######################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id+'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)

                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)

                    # experimental phenotypic evidence
                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])

                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    # summarize how well the UniProt -> gene id mapping performed
    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %f.2%% of %i benifited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
    return
class Dataset:
    """
    This class produces metadata about a dataset that is compliant with the
    HCLS dataset specification:
    https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4

    Summary level: The summary level provides a description of a dataset that
    is independent of a specific version or format.
    (e.g. the Monarch ingest of CTD)
    CURIE for this is something like MonarchData:[SOURCE IDENTIFIER]

    Version level: The version level captures version-specific characteristics
    of a dataset. (e.g. the 01-02-2018 ingest of CTD)
    CURIE for this is something like
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP]

    Distribution level: The distribution level captures metadata about a
    specific form and version of a dataset
    (e.g. turtle file for 01-02-2018 ingest of CTD). There is a
    [distribution level resource] for each different downloadable file
    we emit, i.e. one for the TTL file, one for the ntriples file, etc.
    CURIE for this is like
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt
    or
    MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format]

    We write out at least the following triples:

    SUMMARY LEVEL TRIPLES:
    [summary level resource] - rdf:type -> dctypes:Dataset
    [summary level resource] - dc:title -> title (literal)
    [summary level resource] - dc:description -> description (literal)
                                                (use docstring from Source class)
    [summary level resource] - dc:source -> [source web page, e.g. omim.org]
    [summary level resource] - schema:logo -> [source logo IRI]
    [summary level resource] - dc:publisher -> monarchinitiative.org

    n.b: about summary level resource triples:
    -- HCLS spec says we "should" link to our logo and web page, but I'm not,
    because it would confuse the issue of whether we are pointing to our
    logo/page or the logo/page of the data source for this ingest. Same below
    for [version level resource] and [distribution level resource] - I'm not
    linking to our page/logo down there either.
    - spec says we "should" include summary level triples describing Update
    frequency and SPARQL endpoint but I'm omitting this for now, because
    these are not clearly defined at the moment

    VERSION LEVEL TRIPLES:
    [version level resource] - rdf:type -> dctypes:Dataset
    [version level resource] - dc:title -> version title (literal)
    [version level resource] - dc:description -> version description (literal)
    [version level resource] - dc:created -> ingest timestamp [ISO 8601 compliant]
    [version level resource] - pav:version -> ingest timestamp (same one above)
    [version level resource] - dc:creator	-> monarchinitiative.org
    [version level resource] - dc:publisher -> monarchinitiative.org
    [version level resource] - dc:isVersionOf -> [summary level resource]
    [version level resource] - dc:source -> [source file 1 IRI]
    [version level resource] - dc:source -> [source file 2 IRI]
    ...

    [source file 1 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 2 IRI] - pav:version -> [source version (if set, optional)]
    [source file 2 IRI] - pav:retrievedOn -> [download date timestamp]
    [source file 2 IRI] - pav:version -> [source version (if set, optional)]
    ...

    [version level resource] - pav:createdWith -> [Dipper github URI]
    [version level resource] - void:dataset -> [distribution level resource]

    [version level resource] - cito:citesAsAuthority -> [citation id 1]
    [version level resource] - cito:citesAsAuthority -> [citation id 2]
    [version level resource] - cito:citesAsAuthority -> [citation id 3]

    n.b: about version level resource triples:
    - spec says we "should" include Date of issue/dc:issued triple, but I'm not
    because it is redundant with this triple above:
    [version level resource] - dc:created -> time stamp
    and would introduce ambiguity and confusion if the two disagree. Same
    below for [distribution level resource] - dc:created -> time stamp below

    Also omitting:
    - triples linking to our logo and page, see above.
    - License/dc:license triple, because we will make this triple via the
    [distribution level resource] below
    - Language/dc:language triple b/c it seems superfluous. Same below for
    [distribution level resource] - no language triple.
    - [version level resource] - pav:version triple is also a bit redundant
    with the pav:version triple below, but the spec requires both these
    triples
    - I'm omitting the [version level resource] -> pav:previousVersion because
    Dipper doesn't know this info for certain at run time. Same below for
    [distribution level resource] - pav:previousVersion.

    DISTRIBUTION LEVEL TRIPLES:
    [distribution level resource] - rdf:type -> dctypes:Dataset
    [distribution level resource] - rdf:type -> dcat:Distribution
    [distribution level resource] - dc:title -> distribution title (literal)
    [distribution level resource] - dc:description -> distribution description (lit.)
    [distribution level resource] - dc:created -> ingest timestamp[ISO 8601 compliant]
    [distribution level resource] - pav:version -> ingest timestamp (same as above)
    [distribution level resource] - dc:creator -> monarchinitiative.org
    [distribution level resource] - dc:publisher -> monarchinitiative.org
    [distribution level resource] - dc:license -> [license info, if available
                                    otherwise indicate unknown]
    [distribution level resource] - dc:rights -> [data rights IRI]
    [distribution level resource] - pav:createdWith -> [Dipper github URI]
    [distribution level resource] - dc:format -> [IRI of ttl|nt|whatever spec]
    [distribution level resource] - dcat:downloadURL -> [ttl|nt URI]
    [distribution level resource] - void:triples -> [triples count (literal)]
    [distribution level resource] - void:entities -> [entities count (literal)]
    [distribution level resource] - void:distinctSubjects -> [subject count (literal)]
    [distribution level resource] - void:distinctObjects -> [object count (literal)]
    [distribution level resource] - void:properties -> [properties count (literal)]
    ...

    n.b: about distribution level resource triples:
    - omitting Vocabularies used/void:vocabulary and Standards
    used/dc:conformTo triples, because they are described in the ttl file
    - also omitting Example identifier/idot:exampleIdentifier and Example
    resource/void:exampleResource, because we don't really have one canonical
    example of either - they're all very different.
    - [distribution level resource] - dc:created should have the exact same
    time stamp as this triple above:
    [version level resource] - dc:created -> time stamp
    - this [distribution level resource] - pav:version triple should have the
    same object as [version level resource] - pav:version triple above
    - Data source provenance/dc:source triples are above in the
    [version level resource]
    - omitting Byte size/dc:byteSize, RDF File URL/void:dataDump, and
    Linkset/void:subset triples because they probably aren't necessary for
    MI right now
    - these triples "should" be emitted, but we will do this in a later
    iteration:
    # of classes	void:classPartition	IRI
    # of literals	void:classPartition	IRI
    # of RDF graphs	void:classPartition	IRI

    Note: Do not use blank nodes in the dataset graph. This dataset graph is
    added to the main Dipper graph in Source.write() like so

    $ mainGraph = mainGraph + datasetGraph

    which apparently in theory could lead to blank node ID collisions
    between the two graphs.

    Note also that this implementation currently does not support producing
    metadata for StreamedGraph graphs (see dipper/graph/StreamedGraph.py).
    StreamedGraph is currently not being used for any ingests, so this isn't
    a problem. There was talk of using StreamedGraph for a rewrite/refactor
    of the Clinvar ingest, which would probably require adding support here
    for StreamedGraph's.
    """

    def __init__(
            self,
            identifier,
            data_release_version,
            ingest_name,
            ingest_title,
            ingest_url,
            ingest_logo=None,
            ingest_description=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',     # rdf_graph, streamed_graph
            file_handle=None,
            distribution_type='ttl',
            dataset_curie_prefix='MonarchArchive'):
        # pick the graph implementation; fail fast on unsupported values
        # instead of leaving self.graph unset and triggering a confusing
        # AttributeError later on
        if graph_type is None:
            self.graph = RDFGraph(None, ":".join([dataset_curie_prefix, identifier]))
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, ":".join(
                [dataset_curie_prefix, identifier]), file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, ':'.join([dataset_curie_prefix, identifier]))
        else:
            raise ValueError("unsupported graph_type: " + str(graph_type))

        # default the release version to today's date (YYYYMMDD) if not given
        if data_release_version is not None:
            self.data_release_version = data_release_version
        else:
            self.data_release_version = datetime.today().strftime("%Y%m%d")

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.identifier = ':'.join([dataset_curie_prefix, identifier])
        self.citation = set()
        self.ingest_name = ingest_name
        self.ingest_title = ingest_title
        if self.ingest_title is None:
            self.ingest_title = ":".join([dataset_curie_prefix, identifier])
        self.ingest_url = ingest_url
        # BUG FIX: ingest_logo defaults to None; the original code
        # unconditionally concatenated it to the logo-repo prefix, which
        # raised TypeError for every caller relying on the default
        if ingest_logo is None:
            self.ingest_logo = None
        else:
            self.ingest_logo = self.curie_map.get('MonarchLogoRepo') + ingest_logo
        self.ingest_description = ingest_description
        self.date_issued = None
        self.license_url = license_url
        self.data_rights = data_rights
        self.distribution_type = distribution_type

        # set HCLS resource CURIEs
        self.summary_level_curie = ':'.join(
            [dataset_curie_prefix, '#' + identifier])
        self.version_level_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/#' + identifier
        self.distribution_level_turtle_curie = \
            dataset_curie_prefix + ':' + \
            self.data_release_version + \
            '/rdf/' + \
            identifier + "." + self.distribution_type

        # The following might seem a little odd, but we need to set
        # downloadURLs this way in order for them to point to where they will
        # end up in archive.MI.org as of Sept 2019. URL is:
        # https://archive.MI.org/[release version]/[dist type]/[source].[dist type]
        self.download_url = \
            self.curie_map.get("MonarchArchive") + self.data_release_version + \
            "/rdf/" + self.ingest_name + "." + self.distribution_type

        self._set_summary_level_triples()
        self._set_version_level_triples()
        self._set_distribution_level_triples()

    def _set_summary_level_triples(self):
        """Emit the version-independent (summary level) HCLS triples."""
        self.model.addType(self.summary_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(self.summary_level_curie, self.globaltt['title'],
                             self.ingest_title, True)
        self.model.addTriple(self.summary_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))
        self.model.addTriple(self.summary_level_curie, "schema:logo",
                             self.ingest_logo)
        self.graph.addTriple(self.summary_level_curie,
                             self.globaltt['identifier'],
                             self.summary_level_curie)
        if self.ingest_url is not None:
            self.graph.addTriple(self.summary_level_curie,
                                 self.globaltt["Source"], self.ingest_url)
        if self.ingest_description is not None:
            self.model.addDescription(self.summary_level_curie,
                                      self.ingest_description)

    def _set_version_level_triples(self):
        """Emit the version level HCLS triples for this ingest run."""
        self.model.addType(self.version_level_curie, self.globaltt['Dataset'])
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['title'],
            self.ingest_title + " Monarch version " +
            self.data_release_version, True)
        if self.ingest_description is not None:
            self.model.addDescription(self.version_level_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['isVersionOf'],
                             self.summary_level_curie,
                             object_is_literal=False)
        self.graph.addTriple(self.version_level_curie,
                             self.globaltt['distribution'],
                             self.distribution_level_turtle_curie,
                             object_is_literal=False)

    def _set_distribution_level_triples(self):
        """Emit the distribution level HCLS triples (turtle distribution)."""
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Dataset'])
        self.model.addType(self.distribution_level_turtle_curie,
                           self.globaltt['Distribution'])
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['title'],
            self.ingest_title + " distribution " + self.distribution_type,
            True)
        if self.ingest_description is not None:
            self.model.addDescription(self.distribution_level_turtle_curie,
                                      self.ingest_description)
        self.graph.addTriple(
            self.distribution_level_turtle_curie, self.globaltt['version'],
            Literal(self.data_release_version, datatype=XSD.date))
        self.graph.addTriple(
            self.distribution_level_turtle_curie,
            self.globaltt['Date Created'],
            Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date))
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['creator'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['Publisher'],
                             self.curie_map.get(""))  # eval's to MI.org
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['created_with'],
                             "https://github.com/monarch-initiative/dipper")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['format'],
                             "https://www.w3.org/TR/turtle/")
        self.graph.addTriple(self.distribution_level_turtle_curie,
                             self.globaltt['downloadURL'],
                             self.download_url)
        if self.license_url is None:
            self.graph.addTriple(
                self.distribution_level_turtle_curie, self.globaltt['license'],
                'https://project-open-data.cio.gov/unknown-license/')
        else:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['license'], self.license_url)
        if self.data_rights is not None:
            self.graph.addTriple(self.distribution_level_turtle_curie,
                                 self.globaltt['rights'], self.data_rights)
        self._declare_as_ontology()

    def set_ingest_source_file_version_num(self, file_iri, version):
        """
        This method sets the version of a remote file or resource that is
        used in the ingest. It writes this triple:

        file_iri - 'pav:version' -> version

        Version is an untyped literal

        Note: if your version is a date or timestamp, use
        set_ingest_source_file_version_date() instead

        :param file_iri: a remote file or resource used in ingest
        :param version: a number or string (e.g. v1.2.3) that the source
        (OMIM, CTD) uses to refer to this version of the file/resource used
        during the ingest
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['version'], version,
                             object_is_literal=True)

    def set_ingest_source_file_version_date(
            self, file_iri, date, datatype=XSD.date):
        """
        This method sets the version that the source (OMIM, CTD, whatever)
        uses to refer to this version of the remote file/resource that was
        used in the ingest. It writes this triple:

        file_iri - 'pav:version' -> date or timestamp

        Version is added as a literal of datatype XSD date

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format that the source (OMIM, CTD)
        uses to refer to this version of the file/resource used during the
        ingest. You can add a timestamp as a version by using a different
        datatype (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['version'], date,
                             object_is_literal=True, literal_type=datatype)

    def set_ingest_source_file_version_retrieved_on(
            self, file_iri, date, datatype=XSD.date):
        """
        This method sets the date on which a remote file/resource (from OMIM,
        CTD, etc) was retrieved. It writes this triple:

        file_iri - 'pav:retrievedOn' -> date or timestamp

        Version is added as a literal of datatype XSD date by default

        Note: if file_iri was retrieved using get_files(), then the following
        triple was created and you might not need this method:

        file_iri - 'pav:retrievedOn' -> download date

        :param file_iri: a remote file or resource used in ingest
        :param date: a date in YYYYMMDD format on which the file/resource was
        retrieved. You can add a timestamp by using a different datatype
        (below)
        :param datatype: an XSD literal datatype, default is XSD.date
        :return: None
        """
        self.graph.addTriple(file_iri, self.globaltt['retrieved_on'], date,
                             object_is_literal=True, literal_type=datatype)

    def set_ingest_source(self, url, predicate=None, is_object_literal=False):
        """
        This method writes a triple to the dataset graph indicating that the
        ingest used a file or resource at [url] during the ingest.

        Triple emitted is version_level_curie dc:source [url]

        This triple is likely to be redundant if Source.get_files() is used
        to retrieve the remote files/resources, since this triple should also
        be emitted as files/resources are being retrieved. This method is
        provided as a convenience method for sources that do their own
        downloading of files.

        :param url: a remote resource used as a source during ingest
        :param predicate: the predicate to use for the triple ["dc:source"]
                from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/)
                "Use dc:source when the source dataset was used in whole or
                in part. Use pav:retrievedFrom when the source dataset was
                used in whole and was not modified from its original
                distribution. Use prov:wasDerivedFrom when the source dataset
                was in whole or in part and was modified from its original
                distribution."
        :return: None
        """
        if predicate is None:
            predicate = self.globaltt["Source"]
        self.graph.addTriple(self.version_level_curie, predicate, url,
                             object_is_literal=is_object_literal,
                             subject_category=blv.terms['DataSetVersion'])

    def get_graph(self):
        """
        This method returns the dataset graph
        :param
        :return: dataset graph
        """
        return self.graph

    def get_license(self):
        """
        This method returns the license info
        :param
        :return: license info
        """
        return self.license_url

    def set_citation(self, citation_id):
        """
        This method adds [citation_id] argument to the set of citations, and
        also adds a triple indicating that version level
        cito:citesAsAuthority [citation_id]
        :param: citation_id
        :return: none
        """
        self.citation.add(citation_id)
        self.graph.addTriple(
            self.version_level_curie, self.globaltt['citesAsAuthority'],
            citation_id)

    def _declare_as_ontology(self, version_info=None):
        """
        Declare the distribution level IRI as an ontology, and also make
        triple distribution level IRI - version_iri -> version level IRI

        TEC: I am not convinced dipper reformatting external data as RDF
        triples makes an OWL ontology (nor that it should be considered a
        goal). Proper ontologies are built by ontologists. Dipper reformats
        data and annotates/decorates it with a minimal set of carefully
        arranged terms drawn from from multiple proper ontologies. Which
        allows the whole (dipper's RDF triples and parent ontologies) to
        function as a single ontology we can reason over when combined in a
        store such as SciGraph.

        Including more than the minimal ontological terms in dipper's RDF
        output constitutes a liability as it allows greater divergence
        between dipper artifacts and the proper ontologies.

        :param version_info: a string describing version info for the ontology
        :return:
        """
        model = Model(self.graph)
        model.addOntologyDeclaration(self.summary_level_curie)
        model.addOWLVersionIRI(self.summary_level_curie,
                               self.version_level_curie)
        if version_info is not None:
            model.addOWLVersionInfo(self.distribution_level_turtle_curie,
                                    version_info)

    @staticmethod
    def make_id(long_string, prefix='MONARCH'):
        """
        A method to create DETERMINISTIC identifiers
        based on a string's digest. currently implemented with sha1

        Duplicated from Source.py to avoid circular imports.

        :param long_string: string to use to generate identifier
        :param prefix: prefix to prepend to identifier [Monarch]
        :return: a Monarch identifier
        """
        return ':'.join((prefix, Dataset.hash_id(long_string)))

    @staticmethod
    def hash_id(word):  # same as graph/GraphUtils.digest_id(wordage)
        """
        Given a string, make a hash

        Duplicated from Source.py.

        :param word: str string to be hashed
        :return: hash of id
        """
        return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
def process_omia_phenotypes(self, limit):
    """
    Parse curated OMIA disease-to-phenotype annotation files and add
    D2P associations (with PubMed provenance and free-text descriptions)
    to the graph.

    Reads every *.txt file under [rawdir]/OMIA-disease-phenotype, each a
    tab-separated table whose columns are declared in
    self.files['omia_d2p']['columns'].

    NOTE(review): the `limit` parameter is accepted but never applied in
    this method body — confirm whether row limiting was intended here.

    :param limit: unused in this method (see note above)
    :return: None
    """
    # process the whole directory
    # TODO get the file listing
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    LOG.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")
    src_key = 'omia_d2p'
    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]
    col = self.files[src_key]['columns']
    # reusable initial code generator
    # for c in col:
    #   print(
    #    '# '+str.lower(c.replace(" ",""))+" = row[col.index('"+c+"')].strip()")
    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            # first row is the header; verify it matches the declared columns
            row = next(filereader)
            if self.check_fileheader(col, row):
                pass
            for row in filereader:
                # skip short rows rather than raising IndexError below
                if len(row) != len(col):
                    LOG.info(
                        "Not enough cols %d in %s - please fix",
                        len(row), filename)
                    continue
                disease_num = row[col.index('Disease ID')].strip()
                species_id = row[col.index('Species ID')].strip()
                breed_name = row[col.index('Breed Name')].strip()
                # variant = row[col.index('Variant')]
                # inheritance = row[col.index('Inheritance')]
                phenotype_id = row[col.index('Phenotype ID')].strip()
                # phenotype_name = row[col.index('Phenotype Name')]
                entity_id = row[col.index('Entity ID')].strip()
                entity_name = row[col.index('Entity Name')]
                quality_id = row[col.index('Quality ID')].strip()
                quality_name = row[col.index('Quality Name')]
                # related_entity_id = row[col.index('Related Entity ID')]
                # related_entity_name = row[col.index('Related Entity Name')]
                # abnormal_id = row[col.index('Abnormal ID')]
                # abnormal_name = row[col.index('Abnormal Name')]
                # phenotype_desc = row[col.index('Phenotype Desc')]
                assay = row[col.index('Assay')].strip()
                # frequency = row[col.index('Frequency')]
                pubmed_id = row[col.index('Pubmed ID')].strip()
                phenotype_description = row[col.index('Pub Desc')].strip()
                curator_notes = row[col.index('Curator Notes')].strip()
                # date_created = row[col.index('Date Created')]

                # rows without a phenotype cannot become associations;
                # count and report them at the end of the file
                if phenotype_id == '':
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue
                # OMIA disease numbers are zero-padded to six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:' + disease_num
                # species-specific diseases get an OMIA:nnnnnn-taxon suffix
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(graph, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    # the Pubmed ID column may hold several ids,
                    # separated by commas or semicolons
                    for pnum in re.split(r'[,;]', pubmed_id):
                        pnum = re.sub(r'[^0-9]', '', pnum)
                        pmid = 'PMID:' + pnum
                        assoc.add_source(pmid)
                else:
                    # no publication: fall back to the OMIA page as source
                    assoc.add_source('/'.join(
                        (self.curie_map['OMIA'] + disease_num, species_id)))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                # decorate the association with the free-text columns
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(aid, breed_name + ' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay + ' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes)
                # EQ columns are not yet modeled; just surface them in the log
                if entity_id != '' or quality_id != '':
                    LOG.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num - 1, filename)
            LOG.warning(
                "Bad rows:\n%s", '\n'.join([str(x) for x in bad_rows]))
    # finish loop through all files
    return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
        'inchi_key': 'CHEBI:InChIKey',
        'probabalistic_quantifier': 'GENO:0000867'
    }

    object_properties = {
        'has disposition': 'RO:0000091',
        'has_phenotype': 'RO:0002200',
        'expressed_in': 'RO:0002206',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_predicate',
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance',
        'causes_or_contributes': 'RO:0003302'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004',
        'has_quantifier': 'GENO:0000866',
        'created_on': 'pav:createdOn'
    }

    # merged view of all property CURIEs, keyed by short name
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    def __init__(self, graph, definedby, sub=None, obj=None, pred=None):
        """
        :param graph: the Graph the association triples will be written to
        :param definedby: the (data) resource that provides the annotation
        :param sub: subject curie
        :param obj: object curie
        :param pred: predicate (relation) curie
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            # BUG FIX: original read `"{} is not a graph".graph`, which
            # raised AttributeError instead of the intended ValueError
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """Return the merged short-name -> CURIE property map."""
        return self.properties

    def _is_valid(self):
        """
        Raise ValueError unless subject, object, and relation are all set.
        :return: True when the association is complete
        """
        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')
        return True

    def _add_basic_association_to_graph(self):
        """
        Write the core triple plus the reified OBAN association node with
        its subject/object/predicate, evidence, source, provenance, date,
        and score triples.
        """
        if not self._is_valid():
            return

        self.graph.addTriple(self.sub, self.rel, self.obj)

        if self.assoc_id is None:
            self.set_association_id()

        self.model.addType(self.assoc_id, self.assoc_types['association'])

        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_subject'], self.sub)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_object'], self.obj)
        self.graph.addTriple(
            self.assoc_id, self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.model.addDescription(self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    self.graph.addTriple(
                        self.assoc_id, self.object_properties['has_source'],
                        s, True)
                else:
                    self.graph.addTriple(
                        self.assoc_id, self.object_properties['has_source'],
                        s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.object_properties['has_provenance'],
                    p)

        if self.date is not None and len(self.date) > 0:
            for d in self.date:
                self.graph.addTriple(
                    object_is_literal=True,
                    subject_id=self.assoc_id,
                    predicate_id=self.datatype_properties['created_on'],
                    obj=d)

        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.properties['has_measurement'],
                self.score, True, 'xsd:float')
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self):
        self._add_basic_association_to_graph()
        return

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """Attach an extra predicate/object pair to the association node."""
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype)
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(
                self.assoc_id, predicate, object_node, False)
        return

    # This isn't java, but if we must,
    # prefer use of property decorator
    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id:
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        return self.assoc_id

    def set_description(self, description):
        self.description = description
        return

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type
        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]

        return

    def add_provenance(self, identifier):
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return:
        """
        # note others available:
        # md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        # (although the point of a digest is to render such details moot).
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes
        for i, val in enumerate(items_to_hash):
            if val is None:
                items_to_hash[i] = ''
        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.sha1(byte_string).hexdigest()[0:16]))
def make_triples(self, source, package): model = Model(self.graph) if source == 'drugbank': for target in package['targets']: model.addTriple( subject_id=package['unii'], predicate_id=target['action'], obj=target['uniprot']) model.addLabel(subject_id=target['uniprot'], label=target['name']) model.addTriple( subject_id=target['uniprot'], predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['polypeptide']) model.addTriple( subject_id=package['drugbank_id'], predicate_id=self.globaltt['equivalent_class'], obj=package['unii']) model.addTriple( subject_id=target['action'], predicate_id=self.globaltt['subPropertyOf'], obj=self.globaltt['molecularly_interacts_with']) model.addTriple( subject_id=package['unii'], predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['molecular entity']) if source == 'drugcentral': for indication in package['indications']: model.addTriple( subject_id=package['unii'], predicate_id=self.globaltt['is substance that treats'], obj=indication['snomed_id']) model.addTriple( subject_id=package['unii'], predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['molecular entity']) model.addTriple( subject_id=indication['snomed_id'], predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['disease']) model.addLabel( subject_id=indication['snomed_id'], label=indication['snomed_name']) for interaction in package['interactions']: model.addTriple( subject_id=package['unii'], predicate_id=self.globaltt['molecularly_interacts_with'], obj=interaction['uniprot']) # model.addLabel( # subject_id=interaction['uniprot'], # label='Protein_{}'.format(interaction['uniprot'])) model.addLabel( subject_id=interaction['uniprot'], label=interaction['target_name']) model.addTriple( subject_id=package['unii'], predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['molecular entity']) model.addDescription( subject_id=interaction['uniprot'], description=interaction['target_class']) model.addTriple( subject_id=interaction['uniprot'], 
predicate_id=self.globaltt['subclass_of'], obj=self.globaltt['polypeptide']) return
def _process_straininfo(self, limit): src_key = 'straininfo' raw = '/'.join((self.rawdir, self.files[src_key]['file'])) LOG.info('Processing measurementsfrom file: %s', raw) if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) tax_id = self.globaltt['Mus musculus'] col = self.files[src_key]['columns'] with open(raw, 'r') as csvfile: reader = csv.reader(csvfile, delimiter=',', quotechar='\"') row = next(reader) if self.check_fileheader(col, row): pass for row in reader: if not row: continue # skip blank rows strain_name = row[col.index('strainname')] vendor = row[col.index('vendor')] stocknum = row[col.index('stocknum')] panel = row[col.index('panel')] mpd_strainid = str(row[col.index('mpd_strainid')]) # straintype = row[col.index('straintype')] # n_proj = row[col.index('n_proj')] # n_snp_datasets = row[col.index('n_snp_datasets')] mpdshortname = row[col.index('mpd_shortname')].strip() url = row[col.index('url')] # new? # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html # create the strain as an instance of the taxon if self.test_mode and 'MPD:' + mpd_strainid not in self.test_ids: continue strain_id = 'MPD-strain:' + mpd_strainid model.addIndividualToGraph(strain_id, strain_name, tax_id) if mpdshortname != '': model.addSynonym(strain_id, mpdshortname) self.idlabel_hash[strain_id] = strain_name # make it equivalent to the vendor+stock if stocknum != '': if vendor == 'J': jax_id = 'JAX:' + stocknum model.addSameIndividual(strain_id, jax_id) elif vendor == 'Rbrc': # reiken reiken_id = 'RBRC:' + stocknum model.addSameIndividual(strain_id, reiken_id) else: if url != '': model.addXref(strain_id, url, True) if vendor != '': model.addXref(strain_id, ':'.join( (vendor, stocknum)), True) # add the panel information if panel != '': desc = panel + ' [panel]' model.addDescription(strain_id, desc)
def _process_ortholog_classes(self, limit=None): """ This method add the KEGG orthology classes to the graph. If there's an embedded enzyme commission number, that is added as an xref. Triples created: <orthology_class_id> is a class <orthology_class_id> has label <orthology_symbols> <orthology_class_id> has description <orthology_description> :param limit: :return: """ LOG.info("Processing ortholog classes") if self.test_mode: graph = self.testgraph else: graph = self.graph model = Model(graph) line_counter = 0 raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file'])) with open(raw, 'r', encoding="iso-8859-1") as csvfile: filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"') for row in filereader: line_counter += 1 (orthology_class_id, orthology_class_name) = row if self.test_mode and orthology_class_id \ not in self.test_ids['orthology_classes']: continue # The orthology class is essentially a KEGG gene ID # that is species agnostic. # Add the ID and label as a gene family class other_labels = re.split(r'[;,]', orthology_class_name) # the first one is the label we'll use orthology_label = other_labels[0] orthology_class_id = 'KEGG-'+orthology_class_id.strip() orthology_type = self.globaltt['gene_family'] model.addClassToGraph( orthology_class_id, orthology_label, orthology_type) if len(other_labels) > 1: # add the rest as synonyms # todo skip the first for s in other_labels: model.addSynonym(orthology_class_id, s.strip()) # add the last one as the description d = other_labels[len(other_labels)-1] model.addDescription(orthology_class_id, d) # add the enzyme commission number (EC:1.2.99.5)as an xref # sometimes there's two, like [EC:1.3.5.1 1.3.5.4] # can also have a dash, like EC:1.10.3.- ec_matches = re.findall(r'((?:\d+|\.|-){5,7})', d) if ec_matches is not None: for ecm in ec_matches: model.addXref(orthology_class_id, 'EC:' + ecm) if not self.test_mode and limit is not None and line_counter > limit: break LOG.info("Done with ortholog 
classes") return
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param source: which Coriell collection file to process
            (a key into self.files)
        :param limit: stop after this many rows (None = no limit)
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))
        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning(
                        'Expected %i values but find %i in row %i',
                        len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row
                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,Homo sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning(
                        'Novel Affected status %s at row: %i of %s',
                        affected, line_counter, raw)

                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(
                    cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(
                        equiv_cell_line, None, cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                #     this would give a BNode that is an instance of Age.
                #     but i don't know how to connect
                #     the age node to the cell line? we need to ask @mbrush
                #     age_id = '_'+re.sub('\s+','_',age)
                #     gu.addIndividualToGraph(
                #         graph,age_id,age,self.globaltt['age'])
                #     gu.addTriple(
                #         graph,age_id,self.globaltt['has measurement value'],
                #         age, True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(
                        family_comp_id, family_label, self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'Homo sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub(
                        'MONARCH:', '', self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = \
                        self._get_affected_chromosomes_from_karyotype(
                            karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(
                            graph, karyotype_feature_id,
                            karyotype_feature_label,
                            self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'
                # NOTE(review): `vl` is only bound when gene != '', but is
                # read below when a variant_id is present; a row with a
                # variant but no gene would raise NameError — confirm
                # whether that combination occurs in the data.

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):
                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None \
                                and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                        genotype_label += ' [' + catalog_id.strip() + ']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            self.globaltt['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        graph.addTriple(
                            patient_id, self.globaltt['has_genotype'],
                            genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()

                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(
                                    graph, self.name, patient_id, disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(
                                    cell_line_id,
                                    self.globaltt['is model of'],
                                    disease_id)
                            else:
                                LOG.info(
                                    'drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(
                            pubmed_id, self.globaltt['mentions'],
                            cell_line_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break
        return
class Dataset:
    """
    this will produce the metadata about a dataset
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)
    """

    def __init__(self, identifier, title, url, description=None,
                 license_url=None, data_rights=None, graph_type=None,
                 file_handle=None):
        # default and 'rdf_graph' both yield an in-memory RDFGraph;
        # 'streamed_graph' writes triples through file_handle as they come
        if graph_type is None:
            self.graph = RDFGraph()
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph()
        self.model = Model(self.graph)
        # identifier is stored as a curie with an empty prefix
        self.identifier = ':' + identifier
        self.version = None
        self.date_issued = None
        # The data_accesed value is later used as an object literal of
        # properties such as dct:issued, which needs to conform
        # xsd:dateTime format.
        # self.date_accessed = datetime.now().strftime('%Y-%m-%d-%H-%M')
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
        self.citation = set()
        self.license = license_url
        # emit the core dataset-description triples up front
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dct:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dct:identifier', identifier,
            object_is_literal=True)
        self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <http://www.ebi.ac.uk/rdf/sites/ebi.ac.uk.rdf/files/resize/images/rdf/chembl_service_logo-146x48.gif> .
        # TODO add the licence info
        # FIXME:Temporarily making this in IF statement,
        # can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dct:license', license_url)
        else:
            logger.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dct:rights', data_rights,
                object_is_literal=True)
        else:
            logger.debug('No rights provided.')
        if description is not None:
            self.model.addDescription(self.identifier, description)
        return

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
        should use the other set_* for version and date

        as of 2016-10-20  used in:

        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py 99:
        dipper/sources/BioGrid.py 100:
        dipper/sources/MGI.py 255:
        dipper/sources/EOM.py 93:
        dipper/sources/Coriell.py 200:
        dipper/sources/MMRRC.py 77:

        # TODO set as deprecated

        :param date_issued:
        :param version_id:
        :return:
        """
        if date_issued is not None:
            self.set_date_issued(date_issued)
        elif version_id is not None:
            self.set_version_by_num(version_id)
        else:
            logger.error("date or version not set!")
            # TODO throw error
            return

        # NOTE(review): when only version_id is given, the elif above and
        # this branch both call set_version_by_num — the version triples
        # are emitted twice; preserved as-is in this legacy path.
        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            logger.info("set version to %s", self.version)
            self.set_version_by_date(date_issued)
        logger.info("set version to %s", self.version)
        return

    def set_date_issued(self, date_issued):
        # Record the issued date and emit it as a dct:issued literal.
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dct:issued', date_issued,
            object_is_literal=True)
        logger.info("setting date to %s", date_issued)
        return

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """
        if date_issued is not None:
            d = date_issued
        elif self.date_issued is not None:
            d = self.date_issued
        else:
            # fall back to the timestamp captured in __init__
            d = self.date_accessed
            logger.info(
                "No date supplied for setting version; "
                "using download timestamp for date_issued")
        logger.info("setting version by date")
        self.set_version_by_num(d)
        return

    def set_version_by_num(self, version_num):
        # The versioned resource id is the dataset id + the version token.
        self.version = self.identifier + version_num
        self.graph.addTriple(
            self.version, 'dct:isVersionOf', self.identifier)
        self.graph.addTriple(
            self.version, 'pav:version', version_num,
            object_is_literal=True)
        logger.info("setting version to %s", self.version)
        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dct:isVersionOf', self.version)
            self.graph.addTriple(
                dipperized_version, 'pav:version', self.date_accessed,
                object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dct:issued', self.date_accessed,
                object_is_literal=True, literal_type="xsd:dateTime")
        return

    def setFileAccessUrl(self, url, is_object_literal=False):
        # Emit a dcat:accessURL triple for a source file of this dataset.
        self.graph.addTriple(
            self.identifier, 'dcat:accessURL', url, is_object_literal)

    def getGraph(self):
        # Accessor for the underlying graph object.
        return self.graph

    def set_license(self, license):
        # NOTE: parameter name shadows the builtin `license`;
        # kept as-is for API compatibility.
        self.license = license
        return

    def get_license(self):
        return self.license

    def set_citation(self, citation_id):
        # Citations are collected in a set; not yet written to the graph.
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
        return
def process_feature_loc(self, limit): raw = '/'.join((self.rawdir, self.files['feature_loc']['file'])) if self.testMode: g = self.testgraph else: g = self.graph model = Model(g) logger.info("Processing Feature location and attributes") line_counter = 0 geno = Genotype(g) strain_to_variant_map = {} build_num = self.version_num build_id = 'WormBase:' + build_num with gzip.open(raw, 'rb') as csvfile: filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"') for row in filereader: if re.match(r'\#', ''.join(row)): continue (chrom, db, feature_type_label, start, end, score, strand, phase, attributes) = row # I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM) # I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6 # I absolute_pmap_position gene 4119 10230 . . . ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM) # dbs = re.split( # r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA') # # if db not in dbs: # continue if feature_type_label not in [ 'gene', 'point_mutation', 'deletion', 'RNAi_reagent', 'duplication', 'enhancer', 'binding_site', 'biological_region', 'complex_substitution', 'substitution', 'insertion', 'inverted_repeat' ]: # note biological_regions include balancers # other options here: promoter, regulatory_region, reagent continue line_counter += 1 attribute_dict = {} if attributes != '': attribute_dict = dict( item.split("=") for item in re.sub(r'"', '', attributes).split(";")) fid = flabel = desc = None if 'ID' in attribute_dict: fid = attribute_dict.get('ID') if re.search(r'WB(Gene|Var|sf)', 
fid): fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid) elif re.match(r'(gmap|landmark)', fid): continue else: logger.info('other identifier %s', fid) fid = None elif 'variation' in attribute_dict: fid = 'WormBase:' + attribute_dict.get('variation') flabel = attribute_dict.get('public_name') sub = attribute_dict.get('substitution') ins = attribute_dict.get('insertion') # if it's a variation: # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T desc = '' if sub is not None: desc = 'substitution=' + sub if ins is not None: desc = 'insertion=' + ins # keep track of the strains with this variation, # for later processing strain_list = attribute_dict.get('strain') if strain_list is not None: for s in re.split(r',', strain_list): if s.strip() not in strain_to_variant_map: strain_to_variant_map[s.strip()] = set() strain_to_variant_map[s.strip()].add(fid) # if feature_type_label == 'RNAi_reagent': # Target=WBRNAi00096030 1 4942 # this will tell us where the RNAi is actually binding # target = attribute_dict.get('Target') # TODO unused # rnai_num = re.split(r' ', target)[0] # TODO unused # it will be the reagent-targeted-gene that has a position, # (i think) # TODO finish the RNAi binding location name = attribute_dict.get('Name') polymorphism = attribute_dict.get('polymorphism') if fid is None: if name is not None and re.match(r'WBsf', name): fid = 'WormBase:' + name name = None else: continue if self.testMode \ and re.sub(r'WormBase:', '', fid) \ not in self.test_ids['gene']+self.test_ids['allele']: continue # these really aren't that interesting if polymorphism is not None: continue if name is not None and not re.search(name, fid): if flabel is None: flabel = name else: model.addSynonym(fid, name) if desc is not None: model.addDescription(fid, desc) alias = attribute_dict.get('Alias') biotype = attribute_dict.get('biotype') note = attribute_dict.get('Note') other_name = attribute_dict.get('other_name') for n in [alias, other_name]: if n is not None: 
model.addSynonym(fid, other_name) ftype = self.get_feature_type_by_class_and_biotype( feature_type_label, biotype) chr_id = makeChromID(chrom, build_id, 'CHR') geno.addChromosomeInstance(chrom, build_id, build_num) feature = Feature(g, fid, flabel, ftype) feature.addFeatureStartLocation(start, chr_id, strand) feature.addFeatureEndLocation(start, chr_id, strand) feature_is_class = False if feature_type_label == 'gene': feature_is_class = True feature.addFeatureToGraph(True, None, feature_is_class) if note is not None: model.addDescription(fid, note) if not self.testMode \ and limit is not None and line_counter > limit: break # RNAi reagents: # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10 # I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052 # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH # TODO TF bindiing sites and network: # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16 # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1 return
class Dataset:
    """
    Produce the metadata about a dataset (title, license, rights,
    version, issued date, access URLs) as RDF triples,
    following the example laid out here:
    http://htmlpreview.github.io/?
    https://github.com/joejimbo/HCLSDatasetDescriptions/blob/master/Overview.html#appendix_1
    (mind the wrap)
    """

    def __init__(
            self,
            identifier,       # name? should be Archive url via Source
            title,
            url,
            ingest_desc=None,
            license_url=None,
            data_rights=None,
            graph_type='rdf_graph',     # rdf_graph, streamed_graph
            file_handle=None):
        """
        :param identifier: curie-style identifier for the dataset;
            also used as the graph identifier
        :param title: human-readable title; `identifier` is used if None
        :param url: dataset home page (foaf:page), may be None
        :param ingest_desc: free-text description of the ingest, may be None
        :param license_url: dcterms:license target, may be None
        :param data_rights: dcterms:rights literal, may be None
        :param graph_type: 'rdf_graph', 'streamed_graph', or None
        :param file_handle: output handle, used only for streamed graphs
        """
        # choose the backing graph implementation
        if graph_type is None:
            self.graph = RDFGraph(None, identifier)
        elif graph_type == 'streamed_graph':
            self.graph = StreamedGraph(True, identifier, file_handle=file_handle)
        elif graph_type == 'rdf_graph':
            self.graph = RDFGraph(True, identifier)
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        # TODO: move hard coded curies to translation table calls
        self.identifier = identifier
        if title is None:
            self.title = identifier
        else:
            self.title = title
        self.version = None
        self.date_issued = None

        # The data_accesed value is later used as an literal of properties
        # such as dcterms:issued, which needs to conform xsd:dateTime format.
        # TODO ... we need to have a talk about typed literals and SPARQL
        self.date_accessed = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        self.citation = set()
        self.license_url = license_url

        # declare the dataset node and its core metadata
        self.model.addType(self.identifier, 'dctypes:Dataset')
        self.graph.addTriple(self.identifier, 'dcterms:title', title, True)
        self.graph.addTriple(
            self.identifier, 'dcterms:identifier', identifier, True)
        if url is not None:
            self.graph.addTriple(self.identifier, 'foaf:page', url)
        # maybe in the future add the logo here:
        # schemaorg:logo <uri>

        # TODO add the license info
        # FIXME:Temporarily making this in IF statement,
        # can revert after all current resources are updated.
        if license_url is not None:
            self.graph.addTriple(
                self.identifier, 'dcterms:license', license_url)
        else:
            LOG.debug('No license provided.')
        if data_rights is not None:
            self.graph.addTriple(
                self.identifier, 'dcterms:rights',
                data_rights, object_is_literal=True)
        else:
            LOG.debug('No rights provided.')

        if ingest_desc is not None:
            self.model.addDescription(self.identifier, ingest_desc)

    def setVersion(self, date_issued, version_id=None):
        """
        Legacy function...
        should use the other set_* for version and date

        as of 2016-10-20  used in:
        dipper/sources/HPOAnnotations.py 139:
        dipper/sources/CTD.py 99:
        dipper/sources/BioGrid.py 100:
        dipper/sources/MGI.py 255:
        dipper/sources/EOM.py 93:
        dipper/sources/Coriell.py 200:
        dipper/sources/MMRRC.py 77:

        # TODO set as deprecated

        :param date_issued:
        :param version_id:
        :return:
        """
        # FIX: the previous version called set_version_by_num() twice when
        # only version_id was supplied (once in its elif branch and once in
        # the follow-up if), duplicating triples and log lines.
        # Restructured so each setter runs at most once per call.
        if date_issued is None and version_id is None:
            LOG.error("date or version not set!")
            # TODO throw error
            return

        if date_issued is not None:
            self.set_date_issued(date_issued)

        if version_id is not None:
            self.set_version_by_num(version_id)
        else:
            LOG.info("set version to %s", self.version)
            self.set_version_by_date(date_issued)
        LOG.info("set version to %s", self.version)

    def set_date_issued(self, date_issued):
        """Record `date_issued` on the dataset as dcterms:issued."""
        self.date_issued = date_issued
        self.graph.addTriple(
            self.identifier, 'dcterms:issued', date_issued,
            object_is_literal=True)
        LOG.info("setting date to %s", date_issued)

    def set_version_by_date(self, date_issued=None):
        """
        This will set the version by the date supplied,
        the date already stored in the dataset description,
        or by the download date (today)
        :param date_issued:
        :return:
        """
        if date_issued is not None:
            dat = date_issued
        elif self.date_issued is not None:
            dat = self.date_issued
        else:
            dat = self.date_accessed
            LOG.info(
                "No date supplied, using download timestamp for date_issued")
        LOG.info("setting version by date to: %s", dat)
        self.set_version_by_num(dat)

    def set_version_by_num(self, version_num):
        """
        Mint a version-level node (identifier + version_num), link it to
        the dataset, and — when version_num differs from the download
        date — also emit a monarch "dipperized" version node stamped
        with the download timestamp.
        """
        self.version = self.identifier + version_num
        self.graph.addTriple(
            self.version, 'dcterms:isVersionOf', self.identifier)
        self.graph.addTriple(
            self.version, 'pav:version', version_num, object_is_literal=True)
        LOG.info("setting version to %s", self.version)

        # set the monarch-generated-version of the resource-version
        # TODO sync this up with the ontology version
        if version_num != self.date_accessed:
            dipperized_version = ':' + str(self.date_accessed)
            self.graph.addTriple(
                dipperized_version, 'dcterms:isVersionOf',
                "MonarchData:" + self.identifier + ".ttl")  # fix suffix
            self.graph.addTriple(
                dipperized_version, 'pav:version',
                self.date_accessed, object_is_literal=True)
            self.graph.addTriple(
                dipperized_version, 'dcterms:issued', self.date_accessed,
                object_is_literal=True, literal_type="xsd:dateTime")

    def setFileAccessUrl(self, url, is_object_literal=False):
        """Add a dcat:accessURL for a file belonging to this dataset."""
        self.graph.addTriple(
            self.identifier, 'dcat:accessURL', url, is_object_literal)

    def getGraph(self):
        """Return the backing graph object."""
        return self.graph

    def set_license(self, license_url):
        self.license_url = license_url

    def get_license(self):
        return self.license_url

    def set_citation(self, citation_id):
        """Accumulate a citation id (not yet serialized to the graph)."""
        self.citation.add(citation_id)
        # TODO
        # model.addTriple(self.identifier, 'cito:citeAsAuthority', citation_id)
def _process_data(self, raw, limit=None):
    """
    Parse the gzipped IMPC csv and emit genotype-to-phenotype
    associations (plus study/evidence provenance) into the graph.

    Per data row this builds: the marker/allele/sequence alteration,
    the ES-cell colony the animals derive from, a sex-agnostic genotype
    with its VSLC, a sex-qualified genotype, and finally a G2P
    association to the MP phenotype term with a free-text description.

    :param raw: path to the gzipped csv source file
    :param limit: optional row cap; processing stops past this line count
    :return: None (side effects on self.graph / self.testgraph)
    """
    LOG.info("Processing Data from %s", raw)

    # test mode writes into the test graph instead of the main graph
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    col = self.files['all']['columns']
    with gzip.open(raw, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        row = next(reader)  # presumed header
        # NOTE(review): a header mismatch is deliberately ignored here
        # (no raise/skip); confirm that is intended
        if not self.check_fileheader(col, row):
            pass

        for row in reader:
            # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
            marker_accession_id = row[col.index('marker_accession_id')].strip()
            marker_symbol = row[col.index('marker_symbol')].strip()
            phenotyping_center = row[col.index('phenotyping_center')].strip()
            colony_raw = row[col.index('colony_id')].strip()
            sex = row[col.index('sex')].strip()
            zygosity = row[col.index('zygosity')].strip()
            allele_accession_id = row[col.index('allele_accession_id')].strip()
            allele_symbol = row[col.index('allele_symbol')].strip()
            # allele_name = row[col.index('allele_name')]
            strain_accession_id = row[col.index('strain_accession_id')].strip()
            strain_name = row[col.index('strain_name')].strip()
            # project_name = row[col.index('project_name')]
            project_fullname = row[col.index('project_fullname')].strip()
            pipeline_name = row[col.index('pipeline_name')].strip()
            pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
            procedure_stable_id = row[col.index('procedure_stable_id')].strip()
            procedure_name = row[col.index('procedure_name')].strip()
            parameter_stable_id = row[col.index('parameter_stable_id')].strip()
            parameter_name = row[col.index('parameter_name')].strip()
            # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
            # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
            mp_term_id = row[col.index('mp_term_id')].strip()
            mp_term_name = row[col.index('mp_term_name')].strip()
            p_value = row[col.index('p_value')].strip()
            percentage_change = row[col.index('percentage_change')].strip()
            effect_size = row[col.index('effect_size')].strip()
            statistical_method = row[col.index('statistical_method')].strip()
            resource_name = row[col.index('resource_name')].strip()

            # in test mode only process the preselected gene ids
            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity = zygosity.strip()
            zygosity_id = self.resolve(zygosity)
            # resolve() returns its input unchanged when unmapped
            if zygosity_id == zygosity:
                # NOTE(review): 'detting' below looks like a typo
                # for 'setting' (message text left unchanged here)
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            # non-MGI alleles become IMPC-scoped blank nodes
            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-'+re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:'+strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = re.match(
                    r'.*<(.*)>', allele_symbol)
                if sequence_alteration_name is not None:
                    sequence_alteration_name = sequence_alteration_name.group(1)
            else:
                sequence_alteration_name = allele_symbol

            # NOTE(review): the `is not None` guard is redundant —
            # marker_accession_id is always a str after .strip()
            if marker_accession_id is not None and marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", reader.line_num)
                marker_accession_id = None

            if marker_accession_id is not None:
                # known marker: the allele is a variant locus of that gene
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol, self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)
                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                # no marker: treat the allele itself as the seq alteration
                sequence_alteration_id = allele_accession_id
                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.

            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # ############# BUILD THE COLONY #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            # used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            # ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele, with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']

            if colony_id is None:
                print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
            model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:'+re.sub(
                r':', '', allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'], colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                # unknown zygosity aborts the whole parse loop (not just
                # this row) — NOTE(review): confirm `break` vs `continue`
                LOG.warning("found unknown zygosity %s", zygosity)
                break

            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            vslc_id = '-'.join(
                (marker_accession_id, allele_accession_id, zygosity))
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:'+vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                    + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id((
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id + sex))
            sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)
            # resolve() echoes its input when the sex value is unmapped
            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id,
                sex_qualified_genotype_label, sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            # sometimes phenotype ids are missing.  (about 711 early 2020)
            if mp_term_id is None or mp_term_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d", reader.line_num)
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, mp_term_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            # NOTE(review): reaches into a private Model method
            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description
            # falls back to unformatted numbers when the p-value or
            # effect size is not parseable as float
            try:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(round(float(effect_size), 5)),
                    '(p =', "{:.4e}".format(float(p_value)), ').'))
            except ValueError:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(effect_size),
                    '(p =', "{0}".format(p_value), ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None and reader.line_num > limit:
                break
def _process_breed_phene_row(self, row):
    """
    Link a breed to a phene (disorder/characteristic), and — via the
    OMIA-to-OMIM map — assert the breed as a model of each mapped OMIM
    disease, with a generated free-text rationale.

    :param row: dict-like with keys 'breed_id' and 'phene_id'
    :return: None (side effects on self.graph)
    """
    model = Model(self.graph)
    # Linking disorders/characteristic to breeds
    # breed_id, phene_id, added_by
    breed_id = self.id_hash['breed'].get(row['breed_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])

    # get the omia id
    omia_id = self._get_omia_id_from_phene_id(phene_id)

    # skip rows we can't resolve, or that are out of the test set
    if breed_id is None or phene_id is None or (
            self.test_mode and (
                omia_id not in self.test_ids['disease'] or
                row['breed_id'] not in self.test_ids['breed'])):
        return

    # FIXME we want a different relationship here
    assoc = G2PAssoc(
        self.graph, self.name, breed_id, phene_id,
        self.globaltt['has phenotype'])
    assoc.add_association_to_graph()

    # add that the breed is a model of the human disease
    # use the omia-omim mappings for this
    # we assume that we have already scrubbed out the genes
    # from the omim list, so we can make the model associations here
    omim_ids = self.omia_omim_map.get(omia_id)
    eco_id = self.globaltt['biological aspect of descendant evidence']
    if omim_ids is not None and omim_ids:
        # if len(omim_ids) > 1:
        #    LOG.info(
        #        "There's 1:many omia:omim mapping: %s, %s", omia_id, str(omim_ids))
        # else:
        #    oid = list(omim_ids)[0]
        #    LOG.info("OMIA %s is mapped to OMIM %s", omia_id, oid)

        for oid in omim_ids:
            assoc = G2PAssoc(
                self.graph, self.name, breed_id, oid,
                self.globaltt['is model of'])
            assoc.add_evidence(eco_id)
            assoc.add_association_to_graph()
            aid = assoc.get_association_id()

            breed_label = self.label_hash.get(breed_id)
            if breed_label is None:  # get taxon label?
                breed_label = "this breed"

            # breed labels often carry the species in parentheses;
            # pull it out so it can be stripped from the phene label
            mch = re.search(r'\((.*)\)', breed_label)
            if mch:
                sp_label = mch.group(1)
            else:
                sp_label = ''

            phene_label = self.label_hash.get(phene_id)
            if phene_label is None:
                phene_label = "phenotype"
            elif phene_label.endswith(sp_label):
                # some of the labels we made already include the species;
                # remove it to make a cleaner desc
                phene_label = re.sub(r' in ' + sp_label, '', phene_label)

            desc = ' '.join(
                ("High incidence of", phene_label, "in", breed_label,
                 "suggests it to be a model of disease", oid + "."))
            model.addDescription(aid, desc)
    else:
        LOG.warning("No OMIM Disease associated with %s", omia_id)
def _process_phene_row(self, row):
    """
    Create the species-specific phene class (a subclass of the OMIA
    phene), attach its descriptions, taxon restriction, and inheritance
    association, and stash lookups for later processing.

    :param row: dict-like OMIA phene record
    :return: None (side effects on self.g and internal hashes)
    """
    model = Model(self.g)
    phenotype_id = None
    sp_phene_label = row['phene_name']
    if sp_phene_label == '':
        sp_phene_label = None
    if 'omia_id' not in row:
        logger.info("omia_id not present for %s", row['phene_id'])
        # NOTE(review): phenotype_id is always None here — looks like
        # this was meant to pass row['phene_id']; confirm
        omia_id = self._make_internal_id('phene', phenotype_id)
    else:
        omia_id = 'OMIA:'+str(row['omia_id'])

    # in test mode, only keep phenes for the chosen taxa and diseases
    if self.testMode and not\
            (int(row['gb_species_id']) in self.test_ids['taxon'] and
             omia_id in self.test_ids['disease']):
        return

    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = omia_id

    descr = row['summary']
    if descr == '':
        descr = None

    # omia label
    omia_label = self.label_hash.get(omia_id)

    # add the species-specific subclass (TODO please review this choice)
    gb_species_id = row['gb_species_id']

    if gb_species_id != '':
        sp_phene_id = '-'.join((omia_id, gb_species_id))
    else:
        logger.error(
            "No species supplied in species-specific phene table for %s",
            omia_id)
        return

    species_id = 'NCBITaxon:'+str(gb_species_id)
    # use this instead
    species_label = self.label_hash.get('NCBITaxon:'+gb_species_id)
    if sp_phene_label is None and \
            omia_label is not None and species_label is not None:
        # synthesize a label like "<omia label> in <species>"
        sp_phene_label = ' '.join((omia_label, 'in', species_label))
    model.addClassToGraph(
        sp_phene_id, sp_phene_label, omia_id, descr)
    # add to internal hash store for later lookup
    self.id_hash['phene'][row['phene_id']] = sp_phene_id
    self.label_hash[sp_phene_id] = sp_phene_label
    # add each of the following descriptions,
    # if they are populated, with a tag at the end.
    for item in [
            'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
        if row[item] is not None and row[item] != '':
            model.addDescription(
                sp_phene_id, row[item] + ' ['+item+']')
    # if row['symbol'] is not None:  # species-specific
    # CHECK ME - sometimes spaces or gene labels
    #     gu.addSynonym(g, sp_phene, row['symbol'])

    model.addOWLPropertyClassRestriction(
        sp_phene_id, model.object_properties['in_taxon'],
        species_id)

    # add inheritance as an association
    inheritance_id = self._map_inheritance_term_id(row['inherit'])
    if inheritance_id is not None:
        assoc = DispositionAssoc(
            self.g, self.name, sp_phene_id, inheritance_id)
        assoc.add_association_to_graph()

    if row['characterised'] == 'Yes':
        # remember molecular-genetics info for later processing
        self.stored_omia_mol_gen[omia_id] = {
            'mol_gen': row['mol_gen'],
            'map_info': row['map_info'],
            'species': row['gb_species_id']}
    return
def _transform_entry(self, ent, graph):
    """
    Transform a single OMIM API entry into graph triples: the OMIM
    class itself (disease/phenotype vs gene), any causative gene or
    anonymous feature with its chromosomal/cytogenetic location, plus
    phenotypic-series parents, mapped ids, gene ids, pubs, and allelic
    variants via the self._get_* helpers.

    :param ent: dict with an 'entry' key, as returned by the OMIM API
    :param graph: target graph (also stored on self.graph)
    :return: None (side effects on graph)
    """
    self.graph = graph
    model = Model(graph)
    geno = Genotype(graph)
    tax_label = 'H**o sapiens'
    tax_id = self.globaltt[tax_label]
    build_num = "GRCh38"
    asm_curie = ':'.join(('NCBIAssembly', build_num))

    # get the numbers, labels, and descriptions
    omim_num = str(ent['entry']['mimNumber'])
    titles = ent['entry']['titles']
    label = titles['preferredTitle']

    other_labels = []
    if 'alternativeTitles' in titles:
        other_labels += self._get_alt_labels(titles['alternativeTitles'])
    if 'includedTitles' in titles:
        other_labels += self._get_alt_labels(titles['includedTitles'])

    # remove the abbreviation (comes after the ;) from the preferredTitle,
    abbrev = None
    lab_lst = label.split(';')
    if len(lab_lst) > 1:
        abbrev = lab_lst[1].strip()
    newlabel = self._cleanup_label(label)

    omim_curie = 'OMIM:' + omim_num
    omimtype = self.omim_type[omim_num]
    nodelabel = newlabel
    # this uses our cleaned-up label
    if omimtype == self.globaltt['heritable_phenotypic_marker']:
        if abbrev is not None:
            nodelabel = abbrev
        # in this special case,
        # make it a disease by not declaring it as a gene/marker
        # ??? and if abbrev is None?
        model.addClassToGraph(omim_curie, nodelabel, description=newlabel)
        # class_type=self.globaltt['disease or disorder'],
    elif omimtype in [
            self.globaltt['gene'], self.globaltt['has_affected_feature']]:
        omimtype = self.globaltt['gene']
        if abbrev is not None:
            nodelabel = abbrev
        # omim is subclass_of gene (provide type term)
        model.addClassToGraph(
            omim_curie, nodelabel, self.globaltt['gene'], newlabel)
    else:
        # omim is NOT subclass_of D|P|or ?...
        model.addClassToGraph(omim_curie, newlabel)

    # KS: commenting out, we will get disease descriptions
    # from MONDO, and gene descriptions from the mygene API

    # if this is a genetic locus (not sequenced) then
    #  add the chrom loc info to the ncbi gene identifier,
    # not to the omim id (we reserve the omim id to be the phenotype)
    #################################################################
    # the above makes no sense to me. (TEC)
    # For Monarch, OMIM is authoritative for disease / phenotype
    #   if they say a phenotype is associated with a locus
    #   that is what dipper should report.
    # OMIM is not authoritative for NCBI gene locations, locus or otherwise.
    # and dipper should not be reporting gene locations via OMIM.

    feature_id = None
    feature_label = None
    if 'geneMapExists' in ent['entry'] and ent['entry']['geneMapExists']:
        genemap = ent['entry']['geneMap']
        is_gene = False

        if omimtype == self.globaltt['heritable_phenotypic_marker']:
            # get the ncbigene ids
            ncbifeature = self._get_mapped_gene_ids(ent['entry'], graph)
            if len(ncbifeature) == 1:
                feature_id = 'NCBIGene:' + str(ncbifeature[0])
                # add this feature as a cause for the omim disease
                # TODO SHOULD I EVEN DO THIS HERE?
                assoc = G2PAssoc(graph, self.name, feature_id, omim_curie)
                assoc.add_association_to_graph()
            else:
                LOG.info(
                    "Its ambiguous when %s maps to not one gene id: %s",
                    omim_curie, str(ncbifeature))
        elif omimtype in [
                self.globaltt['gene'], self.globaltt['has_affected_feature']]:
            feature_id = omim_curie
            is_gene = True
            omimtype = self.globaltt['gene']
        else:
            # 158900 falls into this category
            feature_id = self._make_anonymous_feature(omim_num)
            if abbrev is not None:
                feature_label = abbrev
            omimtype = self.globaltt['heritable_phenotypic_marker']

        if feature_id is not None:
            if 'comments' in genemap:
                # add a comment to this feature
                comment = genemap['comments']
                if comment.strip() != '':
                    model.addDescription(feature_id, comment)
            if 'cytoLocation' in genemap:
                cytoloc = genemap['cytoLocation']
                # parse the cytoloc.
                # add this omim thing as
                # a subsequence of the cytofeature
                # 18p11.3-p11.2
                # FIXME
                # add the other end of the range,
                # but not sure how to do that
                # not sure if saying subsequence of feature
                # is the right relationship

                feat = Feature(graph, feature_id, feature_label, omimtype)
                if 'chromosomeSymbol' in genemap:
                    chrom_num = str(genemap['chromosomeSymbol'])
                    chrom = makeChromID(chrom_num, tax_id, 'CHR')
                    geno.addChromosomeClass(
                        chrom_num, self.globaltt['H**o sapiens'], tax_label)

                    # add the positional information, if available
                    fstart = fend = -1
                    if 'chromosomeLocationStart' in genemap:
                        fstart = genemap['chromosomeLocationStart']
                    if 'chromosomeLocationEnd' in genemap:
                        fend = genemap['chromosomeLocationEnd']
                    if fstart >= 0:
                        # make the build-specific chromosome
                        chrom_in_build = makeChromID(
                            chrom_num, build_num, 'MONARCH')
                        # then, add the chromosome instance
                        # (from the given build)
                        geno.addChromosomeInstance(
                            chrom_num, asm_curie, build_num, chrom)
                        if omimtype == self.globaltt[
                                'heritable_phenotypic_marker']:
                            postypes = [self.globaltt['FuzzyPosition']]
                        else:
                            postypes = None
                        # NOTE that no strand information
                        # is available in the API
                        feat.addFeatureStartLocation(
                            fstart, chrom_in_build, None, postypes)
                        if fend >= 0:
                            feat.addFeatureEndLocation(
                                fend, chrom_in_build, None, postypes)
                        if fstart > fend:
                            LOG.info(
                                "start>end (%d>%d) for %s",
                                fstart, fend, omim_curie)
                # add the cytogenic location too
                # for now, just take the first one
                cytoloc = cytoloc.split('-')[0]
                loc = makeChromID(cytoloc, tax_id, 'CHR')
                model.addClassToGraph(loc, None)
                feat.addSubsequenceOfFeature(loc)
                feat.addFeatureToGraph(True, None, is_gene)

    # end adding causative genes/features

    if ent['entry']['status'] in ['moved', 'removed']:
        LOG.warning('UNEXPECTED! not expecting obsolete record %s', omim_curie)

    self._get_phenotypicseries_parents(ent['entry'], graph)
    self._get_mappedids(ent['entry'], graph)
    self._get_mapped_gene_ids(ent['entry'], graph)
    self._get_pubs(ent['entry'], graph)
    self._get_process_allelic_variants(ent['entry'], graph)
def _process_data(self, raw, limit=None):
    """
    Parse the IMPC all-genotype-phenotype CSV (gzipped) and emit triples.

    For each row this builds, in order:
      1. the marker/variant-locus/sequence-alteration trio,
      2. the ES-cell colony and its indeterminate-zygosity VSLC,
      3. the sex-agnostic genotype (VSLC + genomic background),
      4. the sex-qualified genotype, and
      5. a G2P association with study/evidence provenance.

    :param raw: path to the gzipped comma-separated source file
    :param limit: optional row limit (ignored in test mode)
    :return: None
    """
    logger.info("Processing Data from %s", raw)
    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    geno = Genotype(g)
    line_counter = 0
    # two external lookup maps: a local YAML file and a fetched JSON blob
    impc_map = self.open_and_parse_yaml(self.map_files['impc_map'])
    impress_map = json.loads(
        self.fetch_from_url(
            self.map_files['impress_map']).read().decode('utf-8'))

    # Add the taxon as a class
    taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1
            (marker_accession_id, marker_symbol, phenotyping_center,
             colony, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id, top_level_mp_term_name,
             mp_term_id, mp_term_name, p_value, percentage_change,
             effect_size, statistical_method, resource_name) = row

            if self.testMode and marker_accession_id not in self.test_ids:
                continue

            # ##### cleanup some of the identifiers ######
            zygosity_id = self._map_zygosity(zygosity)

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = \
                    '_:IMPC-'+re.sub(r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                strain_accession_id = '_:' + strain_accession_id
            elif not re.match(r'MGI', strain_accession_id):
                logger.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:' + strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = \
                    re.match(r'.*<(.*)>', allele_symbol).group(1)
            else:
                sequence_alteration_name = allele_symbol

            # NOTE(review): `is not None and == ''` can only be True for
            # the empty string; the None guard looks redundant — confirm.
            if marker_accession_id is not None and \
                    marker_accession_id == '':
                logger.warning(
                    "Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = geno.genoparts['variant_locus']
                geno.addGene(marker_accession_id, marker_symbol,
                             geno.genoparts['gene'])
                geno.addAllele(variant_locus_id, variant_locus_name,
                               variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)
                sequence_alteration_id = \
                    '_:seqalt'+re.sub(r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(sequence_alteration_id,
                                       sequence_alteration_name)

            # ############# BUILD THE COLONY #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = 'ERO:0002002'
            model.addIndividualToGraph(colony_id, colony, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = \
                '_:'+re.sub(
                    r':', '',
                    allele_accession_id+geno.zygosity['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(allele_accession_id, colony_genotype_id,
                          geno.object_properties['has_alternate_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                geno.zygosity['indeterminate'],
                geno.object_properties['has_alternate_part'])
            g.addTriple(colony_id,
                        geno.object_properties['has_genotype'],
                        colony_genotype_id)

            # ########## BUILD THE ANNOTATED GENOTYPE ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = geno.object_properties['has_alternate_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                logger.warning("found unknown zygosity %s", zygosity)
                # NOTE(review): this `break` aborts the whole file on the
                # first unknown zygosity; `continue` may be intended —
                # confirm before changing.
                break

            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            vslc_id = '-'.join(
                (marker_accession_id, allele_accession_id, zygosity))
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:' + vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                geno.genoparts['variant_single_locus_complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                geno.object_properties['has_alternate_part'],
                allele2_rel)

            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id,
                Genotype.genoparts['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    geno.genoparts['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center + '-' + colony
                pheno_center_strain_id = \
                    '-'.join((re.sub(r':', '', genomic_background_id),
                              re.sub(r'\s', '_', phenotyping_center),
                              re.sub(r'\W+', '', colony)))
                if not re.match(r'^_', pheno_center_strain_id):
                    pheno_center_strain_id = '_:' + pheno_center_strain_id
                geno.addGenotype(pheno_center_strain_id,
                                 pheno_center_strain_label,
                                 geno.genoparts['genomic_background'])
                geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                            genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name+' ['+pheno_center_strain_label+']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id(
                    (colony_id + phenotyping_center + zygosity +
                     strain_accession_id+sex))
            sex_qualified_genotype_label = \
                genotype_name + ' (' + sex + ')'
            if sex == 'male':
                sq_type_id = geno.genoparts['male_genotype']
            elif sex == 'female':
                sq_type_id = geno.genoparts['female_genotype']
            else:
                sq_type_id = geno.genoparts['sex_qualified_genotype']

            geno.addGenotype(sex_qualified_genotype_id,
                             sex_qualified_genotype_label, sq_type_id)
            geno.addParts(genotype_id, sex_qualified_genotype_id,
                          geno.object_properties['has_alternate_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # ############# BUILD THE G2P ASSOC #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender
            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                logger.warning(
                    "No phenotype id specified for row %d: %s",
                    line_counter, str(row))
                continue

            # hard coded ECO code
            eco_id = "ECO:0000015"

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested
            assoc = G2PAssoc(g, self.name, sex_qualified_genotype_id,
                             phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # add a free-text description; fall back to raw strings when
            # effect_size / p_value do not parse as floats
            try:
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)),
                              ').'))
            except ValueError:
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(effect_size),
                              '(p =', "{0}".format(p_value), ').'))

            study_bnode = \
                self._add_study_provenance(
                    impc_map, impress_map, phenotyping_center, colony,
                    project_fullname, pipeline_name, pipeline_stable_id,
                    procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name,
                    statistical_method, resource_name)

            evidence_line_bnode = \
                self._add_evidence(
                    assoc_id, eco_id, impc_map, p_value,
                    percentage_change, effect_size, study_bnode)

            self._add_assertion_provenance(
                assoc_id, evidence_line_bnode, impc_map)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(g, assoc_id, resource_id)

            if not self.testMode and \
                    limit is not None and line_counter > limit:
                break
    return
def process_omia_phenotypes(self, limit):
    """
    Process the Monarch-curated OMIA disease-to-phenotype TSV files found
    under ``<rawdir>/OMIA-disease-phenotype/*.txt``, creating one D2PAssoc
    per row with sources, descriptions, and curator comments attached.

    Rows lacking a phenotype id are counted and reported at the end of
    each file rather than processed.

    :param limit: unused here — TODO honor it or drop the parameter
    :return: None
    """
    # process the whole directory
    # TODO get the file listing
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)

    LOG.info(
        "Processing Monarch OMIA Animal disease-phenotype associations")

    src_key = 'omia_d2p'

    # get file listing
    mypath = '/'.join((self.rawdir, 'OMIA-disease-phenotype'))
    file_list = [
        f for f in listdir(mypath)
        if isfile(join(mypath, f)) and re.search(r'.txt$', f)]

    col = self.files[src_key]['columns']
    # reusable initial code generator
    # for c in col:
    #    print(
    #        '# '+str.lower(c.replace(" ",""))+" = row[col.index('"+c+"')].strip()")

    for filename in file_list:
        LOG.info("Processing %s", filename)
        count_missing = 0
        bad_rows = list()
        fname = '/'.join((mypath, filename))
        with open(fname, 'r') as csvfile:
            filereader = csv.reader(
                csvfile, delimiter='\t', quotechar='\"')
            # hard fail if the curated file's header drifts from the
            # configured column list
            fileheader = next(filereader)
            if fileheader != col:
                LOG.error('Expected %s to have columns: %s', fname, col)
                LOG.error(
                    'But Found %s to have columns: %s', fname, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                if len(row) != len(col):
                    LOG.info(
                        "Not enough cols %d in %s - please fix",
                        len(row), filename)
                    continue
                disease_num = row[col.index('Disease ID')].strip()
                species_id = row[col.index('Species ID')].strip()
                breed_name = row[col.index('Breed Name')].strip()
                # variant = row[col.index('Variant')]
                # inheritance = row[col.index('Inheritance')]
                phenotype_id = row[col.index('Phenotype ID')].strip()
                # phenotype_name = row[col.index('Phenotype Name')]
                entity_id = row[col.index('Entity ID')].strip()
                entity_name = row[col.index('Entity Name')]
                quality_id = row[col.index('Quality ID')].strip()
                quality_name = row[col.index('Quality Name')]
                # related_entity_id = row[col.index('Related Entity ID')]
                # related_entity_name = row[col.index('Related Entity Name')]
                # abnormal_id = row[col.index('Abnormal ID')]
                # abnormal_name = row[col.index('Abnormal Name')]
                # phenotype_desc = row[col.index('Phenotype Desc')]
                assay = row[col.index('Assay')].strip()
                # frequency = row[col.index('Frequency')]
                pubmed_id = row[col.index('Pubmed ID')].strip()
                phenotype_description = row[col.index('Pub Desc')].strip()
                curator_notes = row[col.index('Curator Notes')].strip()
                # date_created = row[col.index('Date Created')]

                if phenotype_id == '':
                    # LOG.warning('Missing phenotype in row:\n%s', row)
                    count_missing += 1
                    bad_rows.append(row)
                    continue

                # OMIA disease numbers are zero-padded to six digits
                if len(str(disease_num)) < 6:
                    disease_num = str(disease_num).zfill(6)
                disease_id = 'OMIA:' + disease_num
                if species_id != '':
                    disease_id = '-'.join((disease_id, species_id))
                assoc = D2PAssoc(
                    graph, self.name, disease_id, phenotype_id)
                if pubmed_id != '':
                    # multiple pubmed ids may be separated by , or ;
                    for pnum in re.split(r'[,;]', pubmed_id):
                        pnum = re.sub(r'[^0-9]', '', pnum)
                        pmid = 'PMID:' + pnum
                        assoc.add_source(pmid)
                else:
                    # no pub; fall back to the species-specific OMIA url
                    assoc.add_source(
                        '/'.join((
                            self.curie_map['OMIA'] + disease_num,
                            species_id)))
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()
                if phenotype_description != '':
                    model.addDescription(aid, phenotype_description)
                if breed_name != '':
                    model.addDescription(
                        aid, breed_name + ' [observed in]')
                if assay != '':
                    model.addDescription(aid, assay + ' [assay]')
                if curator_notes != '':
                    model.addComment(aid, curator_notes)

                if entity_id != '' or quality_id != '':
                    LOG.info(
                        "EQ not empty for %s: %s + %s",
                        disease_id, entity_name, quality_name)
        if count_missing > 0:
            LOG.warning(
                "We are missing %d of %d D2P annotations from id %s",
                count_missing, filereader.line_num-1, filename)
            LOG.warning(
                "Bad rows:\n%s",
                '\n'.join([str(x) for x in bad_rows]))
    # finish loop through all files
    return
class Assoc:
    """
    A base class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    def __init__(
            self,
            graph,
            definedby,
            sub=None,
            obj=None,
            pred=None,
            subject_category=None,
            object_category=None
    ):
        """
        :param graph: a dipper Graph instance the association writes into
        :param definedby: the (data) resource asserting the association
        :param sub: subject curie/IRI
        :param obj: object curie/IRI
        :param pred: predicate curie/IRI
        :param subject_category: optional biolink category for the subject
        :param object_category: optional biolink category for the object
        :raises ValueError: if graph is not a Graph
        """
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        # core parts of the association
        self.definedby = definedby
        self.sub = sub
        self.obj = obj
        self.rel = pred
        self.subject_category = subject_category
        self.object_category = object_category
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        self.date = []

        # this is going to be used for the refactored evidence/provenance
        self.provenance = []
        self.score = None
        self.score_type = None
        self.score_unit = None

    def _is_valid(self):
        """
        Sanity-check subject/predicate/object before serializing.

        :return: True when all three parts are set and sub/pred use a
                 known curie prefix (or are bnodes/IRIs)
        :raises ValueError: on any missing or unresolvable part
        """
        # check if sub/obj/rel are none...raise error
        # NOTE: messages are %-interpolated eagerly; ValueError (unlike
        # logging) does not format its arguments for us.
        if self.sub is None:
            raise ValueError(
                'No subject set for this association <%s> <%s> <%s>' %
                (self.sub, self.rel, self.obj)
            )
        if self.obj is None:
            raise ValueError(
                'No object set for this association <%s> <%s> <%s>' %
                (self.sub, self.rel, self.obj)
            )
        if self.rel is None:
            raise ValueError(
                'No predicate set for this association <%s> <%s> <%s>' %
                (self.sub, self.rel, self.obj)
            )
        # Are subject & predicate, either a curie or IRI
        pfx = self.sub.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Subject for this association <%s> <%s> <%s>' %
                (self.sub, self.rel, self.obj)
            )
        pfx = self.rel.split(':')[0]
        if pfx not in self.curie_map.keys() and \
                pfx not in ['_', 'http', 'https', 'ftp']:
            raise ValueError(
                'Invalid Predicate for this association <%s> <%s> <%s>' %
                (self.sub, self.rel, self.obj)
            )
        return True

    def add_association_to_graph(self, association_category=None):
        """
        Write the core triple plus the reified OBAN association node
        (subject/predicate/object links, description, evidence, sources,
        provenance, dates, and score when present).

        :param association_category: optional biolink category for the
               association node itself
        :return: None
        """
        # Assume null and iri checks happen downstream
        # if not self._is_valid():
        #     return

        self.graph.addTriple(
            self.sub, self.rel, self.obj,
            subject_category=self.subject_category,
            object_category=self.object_category)

        if self.assoc_id is None:
            self.set_association_id()

        # assert self.assoc_id is not None
        self.model.addType(self.assoc_id, self.model.globaltt['association'])
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has subject'], self.sub
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has object'], self.obj
        )
        self.graph.addTriple(
            self.assoc_id, self.globaltt['association has predicate'], self.rel
        )
        if association_category is not None:
            self.graph.addTriple(
                self.assoc_id, blv.terms['category'], association_category
            )
        if self.description:
            self.model.addDescription(self.assoc_id, self.description)
        if self.evidence:
            for evi in self.evidence:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has evidence'], evi)
        if self.source:
            for src in self.source:
                # TODO assume that the source is a publication?
                # use Reference class
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['Source'], src)
        if self.provenance:
            for prov in self.provenance:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['has_provenance'], prov)
        if self.date:
            for dat in self.date:
                self.graph.addTriple(
                    self.assoc_id, self.globaltt['created_on'], dat,
                    object_is_literal=True
                )
        if self.score is not None:
            self.graph.addTriple(
                self.assoc_id, self.globaltt['has measurement value'],
                self.score, True, 'xsd:float'
            )
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

    def add_predicate_object(
            self, predicate, object_node, object_type=None, datatype=None):
        """
        Attach an arbitrary predicate/object to the association node.

        :param predicate: predicate curie/IRI
        :param object_node: object curie/IRI or literal value
        :param object_type: 'Literal' to store as a literal, else a node
        :param datatype: optional xsd datatype for literals
        :return: None
        """
        if object_type == 'Literal':
            if datatype is not None:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True, datatype
                )
            else:
                self.graph.addTriple(
                    self.assoc_id, predicate, object_node, True)
        else:
            self.graph.addTriple(
                self.assoc_id, predicate, object_node, False)

    # This isn't java, but predecessors favored the use of property decorators
    # and CamelCase and ...
    def set_subject(self, identifier):
        self.sub = identifier

    def set_object(self, identifier):
        self.obj = identifier

    def set_relationship(self, identifier):
        self.rel = identifier

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
        of the association.
        To be used in cases where an external association identifier
        should be used.

        :param assoc_id: optional externally-supplied association id
        :return: the association id
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(
                self.definedby, self.sub, self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return self.assoc_id

    def get_association_id(self):
        # lazily mint the id on first access
        if self.assoc_id is None:
            self.set_association_id()

        return self.assoc_id

    def set_description(self, description):
        self.description = description

    def set_score(self, score, unit=None, score_type=None):
        self.score = score
        self.score_unit = unit
        self.score_type = score_type

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)

        :param identifier: evidence curie; blank/None values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier: source curie; blank/None values are ignored
        :return: None
        """
        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

    def add_date(self, date):
        if date is not None and date.strip() != '':
            self.date += [date]

    def add_provenance(self, identifier):
        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

    @staticmethod
    def make_association_id(definedby, sub, pred, obj, attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively digests the string of concatenated values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be appended to the ID.
        Note this is equivalent to a RDF blank node

        :param definedby: The (data) resource that provided the annotation
        :param sub: subject curie/IRI
        :param pred: predicate curie/IRI
        :param obj: object curie/IRI
        :param attributes: optional extra strings folded into the digest
        :return: a MONARCH-prefixed digest curie
        """
        items_to_hash = [definedby, sub, pred, obj]
        if attributes is not None and len(attributes) > 0:
            items_to_hash += attributes

        items_to_hash = [x for x in items_to_hash if x is not None]

        assoc_id = ':'.join(
            ('MONARCH', GraphUtils.digest_id('+'.join(items_to_hash))))
        # assert assoc_id is not None
        return assoc_id
def process_gene_interaction(self, limit):
    """
    The gene interaction file includes identified interactions,
    that are between two or more gene (products).
    In the case of interactions with >2 genes, this requires creating
    groups of genes that are involved in the interaction.
    From the wormbase help list: In the example WBInteraction000007779
    it would likely be misleading to suggest that lin-12 interacts with
    (suppresses in this case) smo-1 ALONE or that lin-12 suppresses
    let-60 ALONE; the observation in the paper; see Table V in paper
    PMID:15990876 was that a lin-12 allele (heterozygous lin-12(n941/+))
    could suppress the "multivulva" phenotype induced synthetically by
    simultaneous perturbation of BOTH smo-1 (by RNAi) AND let-60
    (by the n2021 allele).  So this is necessarily a three-gene
    interaction.

    Therefore, we can create groups of genes based on their "status" of
    Effector | Effected.

    Status:  IN PROGRESS

    :param limit: optional row limit (ignored in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    logger.info("Processing gene interaction associations")
    line_counter = 0

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar="'")
        for row in filereader:
            line_counter += 1
            # skip comment rows (lines starting with '#')
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[0:5]
            # print(row)
            interaction_id = 'WormBase:' + interaction_num

            # TODO deal with subtypes
            interaction_type_id = None
            if interaction_type == 'Genetic':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'genetically_interacts_with']
            elif interaction_type == 'Physical':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'molecularly_interacts_with']
            elif interaction_type == 'Regulatory':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'regulates']
            else:
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)

            # each interactor occupies 3 trailing columns
            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_id = 'WormBase:' + row[5]
            gene_b_id = 'WormBase:' + row[8]

            if self.testMode \
                    and gene_a_id not in self.test_ids['gene'] \
                    and gene_b_id not in self.test_ids['gene']:
                continue

            assoc = InteractionAssoc(
                g, self.name, gene_a_id, gene_b_id, interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            # citation is not a pmid or WBref - get this some other way
            model.addDescription(assoc_id, summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

    return
def process_feature_loc(self, limit):
    """
    Parse the WormBase GFF3 feature file (gzipped) and add genes,
    variations, and other selected feature types to the graph with their
    genomic (faldo) start/end locations on build-specific chromosomes.

    Fixes applied in this revision:
      * the feature END location previously re-used ``start``, producing
        zero-length features — it now uses ``end``;
      * the alias/other_name synonym loop previously ignored its loop
        variable and always added ``other_name`` — it now adds each
        non-None name once.

    :param limit: optional row limit (ignored in test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['feature_loc']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    logger.info("Processing Feature location and attributes")
    line_counter = 0
    geno = Genotype(g)
    strain_to_variant_map = {}
    build_num = self.version_num
    build_id = 'WormBase:'+build_num

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            # skip comment rows
            if re.match(r'\#', ''.join(row)):
                continue
            (chrom, db, feature_type_label, start, end, score, strand,
             phase, attributes) = row

            # example rows:
            # I interpolated_pmap_position gene 1 559768 . . . ID=gmap:spe-13;gmap=spe-13;status=uncloned;Note=-21.3602 cM (+/- 1.84 cM)
            # I WormBase gene 3747 3909 . - . ID=Gene:WBGene00023193;Name=WBGene00023193;interpolated_map_position=-21.9064;sequence_name=Y74C9A.6;biotype=snoRNA;Alias=Y74C9A.6
            # I absolute_pmap_position gene 4119 10230 . . . ID=gmap:homt-1;gmap=homt-1;status=cloned;Note=-21.8252 cM (+/- 0.00 cM)

            # dbs = re.split(
            #     r' ', 'assembly_component expressed_sequence_match Coding_transcript Genomic_canonical Non_coding_transcript Orfeome Promoterome Pseudogene RNAi_primary RNAi_secondary Reference Transposon Transposon_CDS cDNA_for_RNAi miRanda ncRNA operon polyA_signal_sequence polyA_site snlRNA')
            #
            # if db not in dbs:
            #     continue

            if feature_type_label not in [
                    'gene', 'point_mutation', 'deletion', 'RNAi_reagent',
                    'duplication', 'enhancer', 'binding_site',
                    'biological_region', 'complex_substitution',
                    'substitution', 'insertion', 'inverted_repeat']:
                # note biological_regions include balancers
                # other options here: promoter, regulatory_region, reagent
                continue
            line_counter += 1

            # parse the col-9 attributes into a dict (quotes stripped)
            attribute_dict = {}
            if attributes != '':
                attribute_dict = dict(
                    item.split("=")
                    for item in re.sub(r'"', '', attributes).split(";"))

            fid = flabel = desc = None
            if 'ID' in attribute_dict:
                fid = attribute_dict.get('ID')
                if re.search(r'WB(Gene|Var|sf)', fid):
                    fid = re.sub(r'^\w+:WB', 'WormBase:WB', fid)
                elif re.match(r'(gmap|landmark)', fid):
                    continue
                else:
                    logger.info('other identifier %s', fid)
                    fid = None
            elif 'variation' in attribute_dict:
                fid = 'WormBase:'+attribute_dict.get('variation')
                flabel = attribute_dict.get('public_name')
                sub = attribute_dict.get('substitution')
                ins = attribute_dict.get('insertion')
                # if it's a variation:
                # variation=WBVar00604246;public_name=gk320600;strain=VC20384;substitution=C/T
                desc = ''
                if sub is not None:
                    desc = 'substitution='+sub
                if ins is not None:
                    desc = 'insertion='+ins

                # keep track of the strains with this variation,
                # for later processing
                strain_list = attribute_dict.get('strain')
                if strain_list is not None:
                    for s in re.split(r',', strain_list):
                        if s.strip() not in strain_to_variant_map:
                            strain_to_variant_map[s.strip()] = set()
                        strain_to_variant_map[s.strip()].add(fid)

            # if feature_type_label == 'RNAi_reagent':
            #     Target=WBRNAi00096030 1 4942
            #     this will tell us where the RNAi is actually binding
            #     target = attribute_dict.get('Target')  # TODO unused
            #     rnai_num = re.split(r' ', target)[0]  # TODO unused
            #     it will be the reagent-targeted-gene that has a position,
            #     (i think)
            # TODO finish the RNAi binding location

            name = attribute_dict.get('Name')
            polymorphism = attribute_dict.get('polymorphism')

            if fid is None:
                if name is not None and re.match(r'WBsf', name):
                    fid = 'WormBase:'+name
                    name = None
                else:
                    continue

            if self.testMode \
                    and re.sub(r'WormBase:', '', fid) \
                    not in self.test_ids['gene']+self.test_ids['allele']:
                continue

            # these really aren't that interesting
            if polymorphism is not None:
                continue

            if name is not None and not re.search(name, fid):
                if flabel is None:
                    flabel = name
                else:
                    model.addSynonym(fid, name)

            if desc is not None:
                model.addDescription(fid, desc)

            alias = attribute_dict.get('Alias')
            biotype = attribute_dict.get('biotype')
            note = attribute_dict.get('Note')
            other_name = attribute_dict.get('other_name')
            for n in [alias, other_name]:
                if n is not None:
                    # BUGFIX: previously always added `other_name`,
                    # skipping the alias and sometimes adding None
                    model.addSynonym(fid, n)

            ftype = self.get_feature_type_by_class_and_biotype(
                feature_type_label, biotype)

            chr_id = makeChromID(chrom, build_id, 'CHR')
            geno.addChromosomeInstance(chrom, build_id, build_num)

            feature = Feature(g, fid, flabel, ftype)
            feature.addFeatureStartLocation(start, chr_id, strand)
            # BUGFIX: end location previously passed `start`
            feature.addFeatureEndLocation(end, chr_id, strand)

            feature_is_class = False
            if feature_type_label == 'gene':
                feature_is_class = True

            feature.addFeatureToGraph(True, None, feature_is_class)

            if note is not None:
                model.addDescription(fid, note)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break

            # RNAi reagents:
            # I RNAi_primary RNAi_reagent 4184 10232 . + . Target=WBRNAi00001601 1 6049 +;laboratory=YK;history_name=SA:yk326e10
            # I RNAi_primary RNAi_reagent 4223 10147 . + . Target=WBRNAi00033465 1 5925 +;laboratory=SV;history_name=MV_SV:mv_G_YK5052
            # I RNAi_primary RNAi_reagent 5693 9391 . + . Target=WBRNAi00066135 1 3699 +;laboratory=CH

            # TODO TF binding sites and network:
            # I TF_binding_site_region TF_binding_site 1861 2048 . + . Name=WBsf292777;tf_id=WBTranscriptionFactor000025;tf_name=DAF-16
            # I TF_binding_site_region TF_binding_site 3403 4072 . + . Name=WBsf331847;tf_id=WBTranscriptionFactor000703;tf_name=DPL-1

    return
def process_gaf(self, file, limit, id_map=None, eco_map=None):
    """
    Parse a gzipped GO Annotation File (GAF, 15- or 17-column format) and
    add gene-to-GO-term associations to the graph.

    Also derives genotype-to-phenotype associations from IMP-evidence rows
    whose 'with/from' column names a reagent (morpholino/CRISPR/TALEN/RNAi),
    delegating reagent-targeted-gene id construction to ZFIN or WormBase.

    :param file: path to the gzipped GAF file
    :param limit: int or None; stop after this many data rows (ignored in
        test mode)
    :param id_map: optional dict mapping a UniProtKB accession to a
        prefixed gene id (e.g. 'NCBIGene:123'); UniProtKB rows without a
        mapping are skipped and counted as misses
    :param eco_map: optional dict mapping GAF evidence codes (e.g. 'IMP')
        to ECO ids; unmapped codes are logged and the association gets no
        evidence
    :return: None
    """
    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", file)
    line_counter = 0
    uniprot_hit = 0
    uniprot_miss = 0
    # NOTE(review): zfin/wbase are bound only when the matching taxon is
    # configured; a MRPHLNO/CRISPR/TALEN or WBRNAi 'with' entry reached
    # without the corresponding tax id would raise NameError below —
    # confirm the inputs guarantee this cannot happen.
    if '7955' in self.tax_ids:
        zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
    if '6239' in self.tax_ids:
        wbase = WormBase(self.graph_type, self.are_bnodes_skized)

    with gzip.open(file, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""), delimiter='\t',
            quotechar='\"')
        for row in filereader:
            line_counter += 1
            # comments start with exclamation
            if re.match(r'!', ''.join(row)):
                continue
            if len(row) > 17 or len(row) < 15:
                LOG.warning(
                    "Wrong number of columns %i, expected 15 or 17\n%s",
                    len(row), row)
                continue
            # pad 15/16-column (GAF 2.0) rows out to the 17-column shape
            if 17 > len(row) >= 15:
                row += [""] * (17 - len(row))
            (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
             eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
             object_type, taxon, date, assigned_by, annotation_extension,
             gene_product_form_id) = row

            # test for required fields
            if (dbase == '' or gene_num == '' or gene_symbol == '' or
                    go_id == '' or ref == '' or eco_symbol == '' or
                    aspect == '' or object_type == '' or taxon == '' or
                    date == '' or assigned_by == ''):
                # NOTE(review): the row text is concatenated into the
                # format string; a '%' inside the row data would break
                # logging's lazy %-formatting — consider passing the row
                # as an argument instead.
                LOG.error(
                    "Missing required part of annotation on row %d:\n"
                    + '\t'.join(row),
                    line_counter)
                continue

            # deal with qualifier NOT, contributes_to, colocalizes_with
            if re.search(r'NOT', qualifier):
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                # translate UniProt accessions to entrez/ensembl gene ids
                # via the supplied id_map; unmapped accessions are skipped
                if id_map is not None and gene_num in id_map:
                    gene_id = id_map[gene_num]
                    uniprotid = ':'.join((dbase, gene_num))
                    (dbase, gene_num) = gene_id.split(':')
                    uniprot_hit += 1
                else:
                    # LOG.warning(
                    #    "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                    #    gene_num)
                    uniprot_miss += 1
                    continue
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and not (
                    re.match(r'NCBIGene', gene_id) and
                    int(gene_num) in self.test_ids):
                continue

            model.addClassToGraph(gene_id, gene_symbol)
            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in re.split(r'\|', gene_synonym):
                    model.addSynonym(gene_id, syn.strip())
            if re.search(r'\|', taxon):
                # TODO add annotations with >1 taxon
                LOG.info(
                    ">1 taxon (%s) on line %d.  skipping", taxon,
                    line_counter)
            else:
                tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                geno.addTaxon(tax_id, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = eco_map[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            refs = re.split(r'\|', ref)
            for ref in refs:
                ref = ref.strip()
                if ref != '':
                    prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                    if prefix in self.localtt:
                        prefix = self.localtt[prefix]
                    ref = ':'.join((prefix, ref.split(':')[-1]))
                    refg = Reference(graph, ref)
                    if prefix == 'PMID':
                        ref_type = self.globaltt['journal article']
                        refg.setType(ref_type)
                    refg.addRefToGraph()
                    assoc.add_source(ref)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # aspect did not resolve to a predicate
                if aspect == 'F' and re.search(r'contributes_to', qualifier):
                    assoc.set_relationship(self.globaltt['contributes to'])
                    # NOTE(review): on this path
                    # assoc.add_association_to_graph() is never called, so
                    # the 'contributes to' association is silently dropped
                    # — confirm whether that is intended.
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s  is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)
            # object_type should be one of:
            # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
            # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
            # If the precise product type is unknown,
            # gene_product should be used

            #######################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                withitems = re.split(r'\|', with_or_from)
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for i in withitems:
                    if i == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                        LOG.warning(
                            "Don't know what having a uniprot id " +
                            "in the 'with' column means of %s", uniprotid)
                        continue
                    i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                    i = re.sub(r'WB:', 'WormBase:', i)
                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', i):
                        targeted_gene_id = zfin.make_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        # TODO PYLINT why is this needed?
                        # Redefinition of assoc type from
                        # dipper.models.assoc.Association.Assoc to
                        # dipper.models.assoc.G2PAssoc.G2PAssoc
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', i):
                        targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                            gene_id, i)
                        geno.addReagentTargetedGene(
                            i, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                    # re-attach the (prefix-normalized) publication refs
                    for ref in refs:
                        ref = ref.strip()
                        if ref != '':
                            prefix = ref.split(':')[0]
                            if prefix in self.localtt:
                                prefix = self.localtt[prefix]
                            ref = ':'.join((prefix, ref.split(':')[-1]))
                            assoc.add_source(ref)
                    # experimental phenotypic evidence
                    assoc.add_evidence(self.globaltt[
                        'experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be
                    # the evidence for the GO assoc?

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break

    uniprot_tot = (uniprot_hit + uniprot_miss)
    uniprot_per = 0.0
    if uniprot_tot != 0:
        uniprot_per = 100.0 * uniprot_hit / uniprot_tot
    LOG.info(
        "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
        uniprot_per, uniprot_tot)
    return
def process_gene_interaction(self, limit):
    """
    The gene interaction file includes identified interactions,
    that are between two or more gene (products).
    In the case of interactions with >2 genes,
    this requires creating groups of genes that are involved
    in the interaction.
    From the wormbase help list: In the example WBInteraction000007779
    it would likely be misleading to suggest that lin-12 interacts with
    (suppresses in this case) smo-1 ALONE or that lin-12 suppresses let-60
    ALONE; the observation in the paper; see Table V in paper
    PMID:15990876 was that a lin-12 allele (heterozygous lin-12(n941/+))
    could suppress the "multivulva" phenotype induced synthetically by
    simultaneous perturbation of BOTH smo-1 (by RNAi) AND let-60
    (by the n2021 allele).
    So this is necessarily a three-gene interaction.

    Therefore, we can create groups of genes based on their "status" of
    Effector | Effected.

    Status: IN PROGRESS

    :param limit: int or None; stop after this many lines (ignored in
        test mode)
    :return: None
    """
    raw = '/'.join((self.rawdir, self.files['gene_interaction']['file']))

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)

    logger.info("Processing gene interaction associations")
    line_counter = 0

    with gzip.open(raw, 'rb') as csvfile:
        filereader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar="'")
        for row in filereader:
            line_counter += 1
            # header/comment lines start with '#'
            if re.match(r'#', ''.join(row)):
                continue

            (interaction_num, interaction_type, interaction_subtype,
             summary, citation) = row[0:5]
            # print(row)
            interaction_id = 'WormBase:'+interaction_num

            # TODO deal with subtypes
            interaction_type_id = None
            if interaction_type == 'Genetic':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'genetically_interacts_with']
            elif interaction_type == 'Physical':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'molecularly_interacts_with']
            elif interaction_type == 'Regulatory':
                interaction_type_id = \
                    InteractionAssoc.interaction_object_properties[
                        'regulates']
            else:
                # NOTE(review): processing falls through here with
                # interaction_type_id still None, so an association with a
                # None predicate may be created below — confirm intended.
                logger.info(
                    "An interaction type I don't understand %s",
                    interaction_type)

            # interactors come as triples of columns after the first five
            num_interactors = (len(row) - 5) / 3
            if num_interactors != 2:
                logger.info(
                    "Skipping interactions with !=2 participants:\n %s",
                    str(row))
                continue

            gene_a_id = 'WormBase:'+row[5]
            gene_b_id = 'WormBase:'+row[8]

            if self.testMode \
                    and gene_a_id not in self.test_ids['gene'] \
                    and gene_b_id not in self.test_ids['gene']:
                continue

            assoc = InteractionAssoc(
                g, self.name, gene_a_id, gene_b_id, interaction_type_id)
            assoc.set_association_id(interaction_id)
            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()
            # citation is not a pmid or WBref - get this some other way
            model.addDescription(assoc_id, summary)

            if not self.testMode \
                    and limit is not None and line_counter > limit:
                break
    return
def _process_breed_phene_row(self, row):
    """
    Link a breed to a disorder/characteristic (phene), and — via the
    OMIA-to-OMIM mapping — assert the breed as a model of the human
    disease, attaching a free-text description to each model association.

    :param row: dict with at least 'breed_id' and 'phene_id' keys
    :return: None
    """
    model = Model(self.g)
    # Linking disorders/characteristic to breeds
    # breed_id, phene_id, added_by
    breed_id = self.id_hash['breed'].get(row['breed_id'])
    phene_id = self.id_hash['phene'].get(row['phene_id'])
    omia_id = self._get_omia_id_from_phene_id(phene_id)

    # in test mode, only keep rows whose disease AND breed are in the
    # test-id sets; always skip rows with unresolvable breed/phene ids
    if self.testMode:
        wanted = (
            omia_id in self.test_ids['disease'] and
            int(row['breed_id']) in self.test_ids['breed'])
        if not wanted:
            return
    if breed_id is None or phene_id is None:
        return

    # FIXME we want a different relationship here
    breed_assoc = G2PAssoc(
        self.g, self.name, breed_id, phene_id,
        model.object_properties['has_phenotype'])
    breed_assoc.add_association_to_graph()

    # add that the breed is a model of the human disease, using the
    # omia-omim mappings (genes assumed already scrubbed from the list)
    omim_ids = self.omia_omim_map.get(omia_id)
    eco_id = "ECO:0000214"  # biological aspect of descendant evidence
    if not omim_ids:
        return
    if len(omim_ids) > 1:
        logger.info(
            "There's 1:many omia:omim mapping: %s, %s",
            omia_id, str(omim_ids))

    for omim_id in omim_ids:
        model_assoc = G2PAssoc(
            self.g, self.name, breed_id, omim_id,
            model.object_properties['model_of'])
        model_assoc.add_evidence(eco_id)
        model_assoc.add_association_to_graph()
        aid = model_assoc.get_association_id()

        breed_label = self.label_hash.get(breed_id)
        breed_label = "this breed" if breed_label is None else breed_label

        # the species name, when present, sits in parentheses in the label
        species_match = re.search(r'\((.*)\)', breed_label)
        sp_label = species_match.group(1) if species_match else ''

        phene_label = self.label_hash.get(phene_id)
        if phene_label is None:
            phene_label = "phenotype"
        elif phene_label.endswith(sp_label):
            # some of the labels we made already include the species;
            # remove it to make a cleaner desc
            phene_label = re.sub(r' in '+sp_label, '', phene_label)

        desc = ' '.join(
            ("High incidence of", phene_label, "in", breed_label,
             "suggests it to be a model of disease", omim_id + "."))
        model.addDescription(aid, desc)
    return
def _process_data(self, raw, limit=None):
    """
    Parse the gzipped IMPC statistical-results CSV and build, per row:
    the marker/allele/sequence-alteration, the ES-cell colony, the
    sex-agnostic genotype with its VSLC and genomic background, the
    sex-qualified genotype, and finally the genotype-to-phenotype
    association with study/evidence provenance.

    :param raw: path to the gzipped CSV file
    :param limit: int or None; stop after this many rows (ignored in
        test mode)
    :return: None
    """
    LOG.info("Processing Data from %s", raw)

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph

    model = Model(graph)
    geno = Genotype(graph)
    line_counter = 0

    # Add the taxon as a class
    taxon_id = self.globaltt['Mus musculus']
    model.addClassToGraph(taxon_id, None)

    # with open(raw, 'r', encoding="utf8") as csvfile:
    with gzip.open(raw, 'rt') as csvfile:
        filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        next(filereader, None)  # skip the header row
        for row in filereader:
            line_counter += 1
            (marker_accession_id, marker_symbol, phenotyping_center,
             colony_raw, sex, zygosity, allele_accession_id, allele_symbol,
             allele_name, strain_accession_id, strain_name, project_name,
             project_fullname, pipeline_name, pipeline_stable_id,
             procedure_stable_id, procedure_name, parameter_stable_id,
             parameter_name, top_level_mp_term_id, top_level_mp_term_name,
             mp_term_id, mp_term_name, p_value, percentage_change,
             effect_size, statistical_method, resource_name) = row

            if self.test_mode and marker_accession_id not in self.gene_ids:
                continue

            # #####cleanup some of the identifiers ######
            zygosity = zygosity.strip()
            zygosity_id = self.resolve(zygosity)
            if zygosity_id == zygosity:
                # NOTE(review): "detting" is presumably a typo for
                # "setting" in this log message.
                LOG.warning(
                    "Zygosity '%s' unmapped. detting to indeterminate",
                    zygosity)
                zygosity_id = self.globaltt['indeterminate']

            # colony ids sometimes have <> in them, spaces,
            # or other non-alphanumerics and break our system;
            # replace these with underscores
            colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

            if not re.match(r'MGI', allele_accession_id):
                allele_accession_id = '_:IMPC-'+re.sub(
                    r':', '', allele_accession_id)

            if re.search(r'EUROCURATE', strain_accession_id):
                # the eurocurate links don't resolve at IMPC
                # TODO blank nodes do not maintain identifiers
                strain_accession_id = '_:' + strain_accession_id

            elif not re.match(r'MGI', strain_accession_id):
                LOG.info(
                    "Found a strange strain accession...%s",
                    strain_accession_id)
                strain_accession_id = 'IMPC:'+strain_accession_id

            ######################
            # first, add the marker and variant to the graph as with MGI,
            # the allele is the variant locus.  IF the marker is not known,
            # we will call it a sequence alteration.  otherwise,
            # we will create a BNode for the sequence alteration.
            sequence_alteration_id = variant_locus_id = None
            variant_locus_name = sequence_alteration_name = None

            # extract out what's within the <> to get the symbol
            if re.match(r'.*<.*>', allele_symbol):
                sequence_alteration_name = re.match(
                    r'.*<(.*)>', allele_symbol).group(1)
            else:
                sequence_alteration_name = allele_symbol

            # NOTE(review): the 'is not None' test is always true for a
            # csv-read string, so this reduces to marker_accession_id == ''
            if marker_accession_id is not None and marker_accession_id == '':
                LOG.warning("Marker unspecified on row %d", line_counter)
                marker_accession_id = None

            if marker_accession_id is not None:
                variant_locus_id = allele_accession_id
                variant_locus_name = allele_symbol
                variant_locus_type = self.globaltt['variant_locus']
                geno.addGene(
                    marker_accession_id, marker_symbol,
                    self.globaltt['gene'])
                geno.addAllele(
                    variant_locus_id, variant_locus_name,
                    variant_locus_type, None)
                geno.addAlleleOfGene(variant_locus_id, marker_accession_id)
                # TAG bnode
                sequence_alteration_id = '_:seqalt' + re.sub(
                    r':', '', allele_accession_id)
                geno.addSequenceAlterationToVariantLocus(
                    sequence_alteration_id, variant_locus_id)
            else:
                sequence_alteration_id = allele_accession_id

            # IMPC contains targeted mutations with either gene traps,
            # knockouts, insertion/intragenic deletions.
            # but I don't really know what the SeqAlt is here,
            # so I don't add it.
            geno.addSequenceAlteration(
                sequence_alteration_id, sequence_alteration_name)

            # #############    BUILD THE COLONY    #############
            # First, let's describe the colony that the animals come from
            # The Colony ID refers to the ES cell clone
            #   used to generate a mouse strain.
            # Terry sez: we use this clone ID to track
            #   ES cell -> mouse strain -> mouse phenotyping.
            # The same ES clone maybe used at multiple centers,
            # so we have to concatenate the two to have a unique ID.
            # some useful reading about generating mice from ES cells:
            # http://ki.mit.edu/sbc/escell/services/details

            # here, we'll make a genotype
            # that derives from an ES cell with a given allele.
            # the strain is not really attached to the colony.

            # the colony/clone is reflective of the allele,
            # with unknown zygosity
            stem_cell_class = self.globaltt['embryonic stem cell line']

            if colony_id is None:
                print(colony_raw, stem_cell_class, "\nline:\t", line_counter)

            model.addIndividualToGraph(
                colony_id, colony_raw, stem_cell_class)

            # vslc of the colony has unknown zygosity
            # note that we will define the allele
            # (and it's relationship to the marker, etc.) later
            # FIXME is it really necessary to create this vslc
            # when we always know it's unknown zygosity?
            vslc_colony = '_:'+re.sub(
                r':', '', allele_accession_id + self.globaltt['indeterminate'])
            vslc_colony_label = allele_symbol + '/<?>'
            # for ease of reading, we make the colony genotype variables.
            # in the future, it might be desired to keep the vslcs
            colony_genotype_id = vslc_colony
            colony_genotype_label = vslc_colony_label
            geno.addGenotype(colony_genotype_id, colony_genotype_label)
            geno.addParts(
                allele_accession_id, colony_genotype_id,
                self.globaltt['has_variant_part'])
            geno.addPartsToVSLC(
                vslc_colony, allele_accession_id, None,
                self.globaltt['indeterminate'],
                self.globaltt['has_variant_part'])
            graph.addTriple(
                colony_id, self.globaltt['has_genotype'],
                colony_genotype_id)

            # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
            # now, we'll build the genotype of the individual that derives
            # from the colony/clone genotype that is attached to
            # phenotype = colony_id + strain + zygosity + sex
            # (and is derived from a colony)

            # this is a sex-agnostic genotype
            genotype_id = self.make_id(
                (colony_id + phenotyping_center + zygosity +
                 strain_accession_id))
            geno.addSequenceDerivesFrom(genotype_id, colony_id)

            # build the VSLC of the sex-agnostic genotype
            # based on the zygosity
            allele1_id = allele_accession_id
            allele2_id = allele2_rel = None
            allele1_label = allele_symbol
            allele2_label = '<?>'
            # Making VSLC labels from the various parts,
            # can change later if desired.
            if zygosity == 'heterozygote':
                allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                allele2_id = None
            elif zygosity == 'homozygote':
                allele2_label = allele1_label
                allele2_id = allele1_id
                allele2_rel = self.globaltt['has_variant_part']
            elif zygosity == 'hemizygote':
                allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                allele2_id = None
            elif zygosity == 'not_applicable':
                allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                allele2_id = None
            else:
                # NOTE(review): this break aborts processing of the WHOLE
                # file on the first unknown zygosity, rather than skipping
                # the row ('continue') — confirm intended.
                LOG.warning("found unknown zygosity %s", zygosity)
                break

            vslc_name = '/'.join((allele1_label, allele2_label))

            # Add the VSLC
            vslc_id = '-'.join(
                (marker_accession_id, allele_accession_id, zygosity))
            vslc_id = re.sub(r':', '', vslc_id)
            vslc_id = '_:'+vslc_id
            model.addIndividualToGraph(
                vslc_id, vslc_name,
                self.globaltt['variant single locus complement'])
            geno.addPartsToVSLC(
                vslc_id, allele1_id, allele2_id, zygosity_id,
                self.globaltt['has_variant_part'], allele2_rel)
            # add vslc to genotype
            geno.addVSLCtoParent(vslc_id, genotype_id)

            # note that the vslc is also the gvc
            model.addType(
                vslc_id, self.globaltt['genomic_variation_complement'])

            # Add the genomic background
            # create the genomic background id and name
            if strain_accession_id != '':
                genomic_background_id = strain_accession_id
            else:
                genomic_background_id = None

            genotype_name = vslc_name
            if genomic_background_id is not None:
                geno.addGenotype(
                    genomic_background_id, strain_name,
                    self.globaltt['genomic_background'])

                # make a phenotyping-center-specific strain
                # to use as the background
                pheno_center_strain_label = \
                    strain_name + '-' + phenotyping_center \
                    + '-' + colony_raw
                pheno_center_strain_id = '-'.join((
                    re.sub(r':', '', genomic_background_id),
                    re.sub(r'\s', '_', phenotyping_center),
                    re.sub(r'\W+', '', colony_raw)))
                if not re.match(r'^_', pheno_center_strain_id):
                    # Tag bnode
                    pheno_center_strain_id = '_:' + pheno_center_strain_id

                geno.addGenotype(
                    pheno_center_strain_id, pheno_center_strain_label,
                    self.globaltt['genomic_background'])
                geno.addSequenceDerivesFrom(
                    pheno_center_strain_id, genomic_background_id)

                # Making genotype labels from the various parts,
                # can change later if desired.
                # since the genotype is reflective of the place
                # it got made, should put that in to disambiguate
                genotype_name = \
                    genotype_name + ' [' + pheno_center_strain_label + ']'
                geno.addGenomicBackgroundToGenotype(
                    pheno_center_strain_id, genotype_id)
                geno.addTaxon(taxon_id, pheno_center_strain_id)
            # this is redundant, but i'll keep in in for now
            geno.addSequenceDerivesFrom(genotype_id, colony_id)
            geno.addGenotype(genotype_id, genotype_name)

            # Make the sex-qualified genotype,
            # which is what the phenotype is associated with
            sex_qualified_genotype_id = \
                self.make_id((
                    colony_id + phenotyping_center + zygosity +
                    strain_accession_id + sex))
            sex_qualified_genotype_label = \
                genotype_name + ' (' + sex + ')'

            sq_type_id = self.resolve(sex, False)

            if sq_type_id == sex:
                sq_type_id = self.globaltt['intrinsic_genotype']
                LOG.warning(
                    "Unknown sex qualifier %s, adding as intrinsic_genotype",
                    sex)

            geno.addGenotype(
                sex_qualified_genotype_id, sex_qualified_genotype_label,
                sq_type_id)
            geno.addParts(
                genotype_id, sex_qualified_genotype_id,
                self.globaltt['has_variant_part'])

            if genomic_background_id is not None and \
                    genomic_background_id != '':
                # Add the taxon to the genomic_background_id
                geno.addTaxon(taxon_id, genomic_background_id)
            else:
                # add it as the genomic background
                geno.addTaxon(taxon_id, genotype_id)

            # #############    BUILD THE G2P ASSOC    #############
            # from an old email dated July 23 2014:
            # Phenotypes associations are made to
            # imits colony_id+center+zygosity+gender

            phenotype_id = mp_term_id

            # it seems that sometimes phenotype ids are missing.
            # indicate here
            if phenotype_id is None or phenotype_id == '':
                LOG.warning(
                    "No phenotype id specified for row %d: %s",
                    line_counter, str(row))
                continue
            # hard coded ECO code
            eco_id = self.globaltt['mutant phenotype evidence']

            # the association comes as a result of a g2p from
            # a procedure in a pipeline at a center and parameter tested

            assoc = G2PAssoc(
                graph, self.name, sex_qualified_genotype_id, phenotype_id)
            assoc.add_evidence(eco_id)
            # assoc.set_score(float(p_value))

            # TODO add evidence instance using
            # pipeline_stable_id +
            # procedure_stable_id +
            # parameter_stable_id

            assoc.add_association_to_graph()
            assoc_id = assoc.get_association_id()

            # NOTE(review): calls a leading-underscore (private) method on
            # Model from outside the class — consider a public API.
            model._addSexSpecificity(assoc_id, self.resolve(sex))

            # add a free-text description
            # (ValueError falls back when p_value/effect_size aren't
            # parseable as floats)
            try:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(round(float(effect_size), 5)),
                    '(p =', "{:.4e}".format(float(p_value)), ').'))
            except ValueError:
                description = ' '.join((
                    mp_term_name, 'phenotype determined by',
                    phenotyping_center, 'in an', procedure_name,
                    'assay where', parameter_name.strip(),
                    'was measured with an effect_size of',
                    str(effect_size),
                    '(p =', "{0}".format(p_value), ').'))

            study_bnode = self._add_study_provenance(
                phenotyping_center, colony_raw, project_fullname,
                pipeline_name, pipeline_stable_id, procedure_stable_id,
                procedure_name, parameter_stable_id, parameter_name,
                statistical_method, resource_name, line_counter)

            evidence_line_bnode = self._add_evidence(
                assoc_id, eco_id, p_value, percentage_change, effect_size,
                study_bnode)

            self._add_assertion_provenance(assoc_id, evidence_line_bnode)

            model.addDescription(evidence_line_bnode, description)

            # resource_id = resource_name
            # assoc.addSource(graph, assoc_id, resource_id)

            if not self.test_mode and limit is not None \
                    and line_counter > limit:
                break
    return