def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Relate a gene to the identifiers listed in its dbxref string.

    The clique-leader map is loaded from ``self.resources['clique_leader']``
    and is consulted, per taxon, to decide which ID space leads a clique.

    :param xrefs: pipe-delimited ``prefix:local_id`` references, e.g.
        ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479``
    :param gene_id: curie of the gene the references belong to
    :param taxon: key looked up in the clique-leader map
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    graph = self.testgraph if self.test_mode else self.graph
    model = Model(graph)
    skipped_prefixes = ('Vega', 'IMGT/GENE-DB', 'Araport')

    for dbxref in xrefs.strip().split('|'):
        # everything before the last colon is the prefix ('HGNC:HGNC:16851')
        raw_prefix, _, local_id = dbxref.rpartition(':')
        prefix = raw_prefix.strip()
        prefix = self.localtt.get(prefix, prefix)

        if prefix == '':
            # no usable prefix, nothing to link
            continue
        dbxref_curie = ':'.join((prefix, local_id))

        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], dbxref_curie)
            continue

        # skip some of these for now based on curie prefix
        if prefix in skipped_prefixes:
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, dbxref_curie)
        elif prefix == 'OMIM':
            # swap a replaced OMIM id for a replacement typed as a gene;
            # when several qualify the last one wins
            for omim in self.omim_replaced.get(dbxref_curie, ()):
                if (omim in self.omim_type
                        and self.omim_type[omim] == self.globaltt['gene']):
                    dbxref_curie = omim
            # OMIM entries known to be something other than a gene are dropped
            if (dbxref_curie in self.omim_type
                    and self.omim_type[dbxref_curie] != self.globaltt['gene']):
                continue

        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, dbxref_curie)
            else:
                model.addEquivalentClass(gene_id, dbxref_curie)
                if taxon in clique_map:
                    leader_prefix = clique_map[taxon]
                    if leader_prefix == prefix:
                        model.makeLeader(dbxref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)
def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
    """
    Add equivalentClass and sameAs relationships for a gene's xrefs.

    The clique-leader map is loaded from ``self.resources['clique_leader']``
    and is keyed by the integer form of ``taxon``.

    :param xrefs: pipe-delimited references, e.g.
        ``MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479``
    :param gene_id: curie of the gene the references belong to
    :param taxon: taxon number as a string (compared against string keys
        of the per-taxon filter and converted with ``int`` for the
        clique-leader map)
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    graph = self.testgraph if self.testMode else self.graph

    filter_out = {'Vega', 'IMGT/GENE-DB', 'Araport'}
    # additional prefixes to ignore for specific taxa
    taxon_spec_filters = {
        '10090': ['ENSEMBL']
    }
    filter_out.update(taxon_spec_filters.get(taxon, ()))

    model = Model(graph)

    for ref in xrefs.strip().split('|'):
        xref_curie = self._cleanup_id(ref)
        if xref_curie is None or xref_curie.strip() == '':
            continue

        if xref_curie.startswith('HPRD'):
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.properties['has_gene_product'], xref_curie)
            continue

        xref_prefix = xref_curie.split(':')[0]
        # skip some of these for now
        if xref_prefix in filter_out:
            continue

        if xref_curie.startswith('OMIM') \
                and DipperUtil.is_omim_disease(xref_curie):
            continue

        try:
            if self.class_or_indiv.get(gene_id) == 'C':
                model.addEquivalentClass(gene_id, xref_curie)
                # converted only on this path, as before, so a non-numeric
                # taxon still only fails when a class xref is reached
                taxon_num = int(taxon)
                if taxon_num in clique_map:
                    leader_prefix = clique_map[taxon_num]
                    if leader_prefix == xref_prefix:
                        model.makeLeader(xref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
            else:
                model.addSameIndividual(gene_id, xref_curie)
        except AssertionError as err:
            # Logger.warn is deprecated; use warning() with lazy arguments
            logger.warning("Error parsing %s: %s", gene_id, err)
def make_association(self, record):
    """
    Model one phenotype record as a gene-to-phenotype association.

    The record is enriched in place with two derived keys:
    ``experiment_type`` (list of ``{'id', 'term'}`` dicts) and
    ``pheno_obj`` (entity/quality terms with their APO ids).

    :param record: mapping with at least the keys 'Experiment Type',
        'Phenotype', 'SGDID', 'Gene Name' and 'Reference'
    :return: None
    """
    # ---- prepare the record ----
    # drop the parenthesised description and map each experiment type
    # to its apo term
    raw_types = record['Experiment Type'].split('(')[0].split(',')
    evidence_terms = record['experiment_type'] = []
    for raw_type in raw_types:
        term = raw_type.strip()
        evidence_terms.append({
            'id': self.apo_term_id[term],
            'term': term,
        })

    phenotype = record['Phenotype']
    entity = {'term': None, 'apo_id': None}
    quality = {'term': None, 'apo_id': None}
    # no quality means the phenotype was purely descriptive
    has_quality = ':' in phenotype
    pheno_obj = {
        'entity': entity,
        'quality': quality,
        'has_quality': has_quality,
    }
    if has_quality:
        pieces = phenotype.split(': ')
        entity_term = pieces[0]
        quality_term = pieces[1]
        entity['term'] = entity_term
        entity['apo_id'] = self.apo_term_id[entity_term]
        quality['term'] = quality_term
        quality['apo_id'] = self.apo_term_id[quality_term]
    else:
        entity['term'] = phenotype
        entity['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # ---- begin modeling ----
    model = Model(self.graph)

    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']

    if has_quality:
        pheno_label = '{0}:{1}'.format(entity['term'], quality['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            entity['apo_id'].replace(':', '_'),
            quality['apo_id'].replace(':', '_')
        )
    else:
        pheno_label = entity['term']
        pheno_id = entity['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
    if not has_quality:
        # NOTE(review): the explicit association id is set only for
        # quality-less phenotypes -- confirm this is intended
        assoc_id = g2p_assoc.make_association_id(
            definedby='yeastgenome.org', subject=gene,
            predicate=relation, object=pheno_id)
        g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # the association triple itself
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # ---- references ----
    references = record['Reference'].replace(' ', '').split('|')
    if references:
        # the first reference in the list is the source
        primary_ref = references[0]
        g2p_assoc.add_source(identifier=primary_ref)
        ref_model = Reference(
            self.graph, primary_ref, Reference.ref_types['publication'])
        ref_model.addRefToGraph()

    # any further references are declared equivalent to the first
    for other_ref in references[1:]:
        model.addSameIndividual(sub=references[0], obj=other_ref)

    # ---- experiment types as evidence ----
    for evidence in record['experiment_type']:
        g2p_assoc.add_evidence(evidence['id'])
        model.addLabel(subject_id=evidence['id'], label=evidence['term'])

    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
def _build_gene_disease_model(
        self,
        gene_id,
        relation_id,
        disease_id,
        variant_label,
        consequence_predicate=None,
        consequence_id=None,
        allelic_requirement=None,
        pmids=None):
    """
    Build the gene (or gene variant) to disease association.

    When both ``consequence_predicate`` and ``consequence_id`` are given,
    the subject of the association is a variant node derived from
    ``variant_label``; otherwise the subject is the gene itself.

    :param gene_id: curie of the gene
    :param relation_id: predicate linking subject and disease
    :param disease_id: curie of the disease
    :param variant_label: label of the variant, also hashed into its id
    :param consequence_predicate: predicate from variant to consequence
    :param consequence_id: identifier of the molecular consequence
    :param allelic_requirement: identifier of the allelic requirement
    :param pmids: publication ids used as the association source
    :return: None
    """
    model = Model(self.graph)
    geno = Genotype(self.graph)

    if pmids is None:
        pmids = []

    variant_bnode = self.make_id(variant_label, "_")
    is_variant = (
        consequence_predicate is not None and consequence_id is not None)

    if is_variant:
        model.addTriple(
            variant_bnode, consequence_predicate, consequence_id)
        # Hack to add labels to terms that
        # don't exist in an ontology
        if consequence_id.startswith(':'):
            model.addLabel(
                consequence_id,
                consequence_id.strip(':').replace('_', ' '))

        # Typically we would type the variant using the
        # molecular consequence, but these are not specific
        # enough for us to make mappings (see translation table)
        model.addIndividualToGraph(
            variant_bnode, variant_label, self.globaltt['variant_locus'])
        geno.addAffectedLocus(variant_bnode, gene_id)
        model.addBlankNodeAnnotation(variant_bnode)
        subject_id = variant_bnode
    else:
        subject_id = gene_id

    assoc = G2PAssoc(
        self.graph, self.name, subject_id, disease_id, relation_id)
    assoc.source = pmids
    assoc.add_association_to_graph()

    # the allelic requirement is only attached to gene-level associations
    if allelic_requirement is not None and not is_variant:
        model.addTriple(
            assoc.assoc_id,
            self.globaltt['has_allelic_requirement'],
            allelic_requirement)
        if allelic_requirement.startswith(':'):
            model.addLabel(
                allelic_requirement,
                allelic_requirement.strip(':').replace('_', ' '))
def make_triples(self, source, package):
    """
    Write the triples for one drug package from the given source.

    :param source: 'drugbank' or 'drugcentral'; any other value adds
        nothing
    :param package: mapping holding the drug's 'unii' plus, for drugbank,
        'drugbank_id' and 'targets', or, for drugcentral, 'indications'
        and 'interactions'
    :return: None
    """
    model = Model(self.graph)

    def add(subject, predicate, target):
        # thin keyword-argument wrapper around Model.addTriple
        model.addTriple(subject_id=subject, predicate_id=predicate, obj=target)

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            add(drug, action, protein)
            model.addLabel(subject_id=protein, label=target['name'])
            add(protein,
                self.globaltt['subclass_of'],
                self.globaltt['polypeptide'])
            add(package['drugbank_id'],
                self.globaltt['equivalent_class'],
                drug)
            add(action,
                self.globaltt['subPropertyOf'],
                self.globaltt['molecularly_interacts_with'])
            add(drug,
                self.globaltt['subclass_of'],
                self.globaltt['molecular entity'])

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            disease = indication['snomed_id']

            add(drug, self.globaltt['is substance that treats'], disease)
            add(drug,
                self.globaltt['subclass_of'],
                self.globaltt['molecular entity'])
            add(disease,
                self.globaltt['subclass_of'],
                self.globaltt['disease'])
            model.addLabel(
                subject_id=disease, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            add(drug, self.globaltt['molecularly_interacts_with'], protein)
            model.addLabel(
                subject_id=protein, label=interaction['target_name'])
            add(drug,
                self.globaltt['subclass_of'],
                self.globaltt['molecular entity'])
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            add(protein,
                self.globaltt['subclass_of'],
                self.globaltt['polypeptide'])
def make_triples(self, source, package):
    """
    Write the triples for one drug package from the given source.

    :param source: 'drugbank' or 'drugcentral'; any other value adds
        nothing
    :param package: mapping holding the drug's 'unii' plus, for drugbank,
        'drugbank_id' and 'targets', or, for drugcentral, 'indications'
        and 'interactions'
    :return: None
    """
    # CURIEs used as fixed objects/predicates below
    so_class = 'SO:0000104'
    chebi_class = 'CHEBI:23367'
    interacts_with = 'RO:0002436'

    model = Model(self.graph)

    if source == 'drugbank':
        for target in package['targets']:
            drug = package['unii']
            action = target['action']
            protein = target['uniprot']

            model.addTriple(
                subject_id=drug, predicate_id=action, obj=protein)
            model.addLabel(subject_id=protein, label=target['name'])
            model.addTriple(
                subject_id=protein,
                predicate_id=Model.object_properties['subclass_of'],
                obj=so_class)
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=Model.object_properties['equivalent_class'],
                obj=drug)
            # every target action is declared a sub-property of RO:0002436
            model.addTriple(
                subject_id=action,
                predicate_id='rdfs:subPropertyOf',
                obj=interacts_with)
            model.addTriple(
                subject_id=drug,
                predicate_id=Model.object_properties['subclass_of'],
                obj=chebi_class)

    elif source == 'drugcentral':
        for indication in package['indications']:
            drug = package['unii']
            condition = indication['snomed_id']

            model.addTriple(
                subject_id=drug, predicate_id='RO:0002606', obj=condition)
            model.addTriple(
                subject_id=drug,
                predicate_id=Model.object_properties['subclass_of'],
                obj=chebi_class)
            model.addTriple(
                subject_id=condition,
                predicate_id=Model.object_properties['subclass_of'],
                obj='DOID:4')
            model.addLabel(
                subject_id=condition, label=indication['snomed_name'])

        for interaction in package['interactions']:
            drug = package['unii']
            protein = interaction['uniprot']

            model.addTriple(
                subject_id=drug, predicate_id=interacts_with, obj=protein)
            model.addLabel(
                subject_id=protein, label=interaction['target_name'])
            model.addTriple(
                subject_id=drug,
                predicate_id=Model.object_properties['subclass_of'],
                obj=chebi_class)
            model.addDescription(
                subject_id=protein, description=interaction['target_class'])
            model.addTriple(
                subject_id=protein,
                predicate_id=Model.object_properties['subclass_of'],
                obj=so_class)
def process_gaf(self, gaffile, limit, id_map=None):
    """
    Read a gzipped, tab-delimited gene association file and add its
    gene-to-GO associations (and, for IMP evidence with a with/from
    value, derived gene-to-phenotype associations) to the graph.

    Rows whose qualifier contains 'NOT' are skipped. UniProtKB rows are
    re-keyed through ``id_map``; rows without a mapping (or all UniProtKB
    rows when no map is supplied) are counted as misses and skipped.

    :param gaffile: path of the gzipped association file
    :param limit: stop after this reader line number (ignored in test
        mode or when None)
    :param id_map: optional mapping from UniProtKB accession to a
        ``prefix:local_id`` gene curie
    :return: None
    :raises SystemExit: when a data row has an unexpected column count
    """
    import sys  # local: only needed for the fatal column-count exit

    graph = self.testgraph if self.test_mode else self.graph

    model = Model(graph)
    geno = Genotype(graph)
    LOG.info("Processing Gene Associations from %s", gaffile)
    uniprot_hit = 0
    uniprot_miss = 0
    col = self.gaf_columns
    # resolve column positions once instead of once per field per row
    pos = {
        name: col.index(name) for name in (
            'DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID',
            'DB:Reference', 'Evidence Code', 'With (or) From', 'Aspect',
            'DB_Object_Name', 'DB_Object_Synonym',
            'Taxon and Interacting taxon')
    }

    with gzip.open(gaffile, 'rb') as csvfile:
        reader = csv.reader(
            io.TextIOWrapper(csvfile, newline=""),
            delimiter='\t', quotechar='\"')
        for row in reader:
            # blank lines yield an empty row; comments start with '!'
            if not row or row[0].startswith('!'):
                continue

            if len(row) != len(col):
                LOG.error(
                    "Wrong number of columns %i, expected ... got:\n\t%s",
                    len(col), row)
                sys.exit(1)

            dbase = row[pos['DB']].strip()
            gene_num = row[pos['DB_Object_ID']].strip()
            gene_symbol = row[pos['DB_Object_Symbol']].strip()
            qualifier = row[pos['Qualifier']]
            go_id = row[pos['GO_ID']].strip()
            ref = row[pos['DB:Reference']].strip()
            eco_symbol = row[pos['Evidence Code']].strip()
            with_or_from = row[pos['With (or) From']]
            aspect = row[pos['Aspect']].strip()
            gene_name = row[pos['DB_Object_Name']]
            gene_synonym = row[pos['DB_Object_Synonym']]
            taxon = row[pos['Taxon and Interacting taxon']].strip()

            # test for required fields
            # NOTE(review): row[:10] is a list, so it never equals '';
            # this effectively only checks row[12]. Flattening the slice
            # would also reject rows whose optional columns are empty,
            # so the original test is kept until the intended set of
            # required columns is confirmed.
            if '' in [row[:10], row[12]]:
                LOG.error(
                    "Missing required part of annotation on row %i:\n%s",
                    reader.line_num, str(row[:-4]))
                continue

            # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
            if 'NOT' in qualifier:
                continue

            if dbase in self.localtt:
                dbase = self.localtt[dbase]
            uniprotid = None
            gene_id = None
            if dbase == 'UniProtKB':
                if id_map is None:
                    # without a map there is no gene to attach to; this
                    # used to fall through with gene_id left as None
                    uniprot_miss += 1
                    continue
                # try/except much faster than checking
                # for dict key membership
                try:
                    gene_id = id_map[gene_num]
                except KeyError:
                    uniprot_miss += 1
                    continue
                uniprotid = ':'.join((dbase, gene_num))
                (dbase, gene_num) = gene_id.split(':')
                uniprot_hit += 1
            else:
                gene_num = gene_num.split(':')[-1]  # last
                gene_id = ':'.join((dbase, gene_num))

            if self.test_mode and gene_id[:9] != 'NCBIGene:' and \
                    gene_num not in self.test_ids:
                continue

            model.addLabel(gene_id, gene_symbol)
            model.addType(gene_id, self.globaltt['gene'])

            if gene_name != '':
                model.addDescription(gene_id, gene_name)
            if gene_synonym != '':
                for syn in gene_synonym.split('|'):
                    syn = syn.strip()
                    if syn[:10] == 'UniProtKB:':
                        model.addTriple(
                            gene_id, self.globaltt['has gene product'], syn)
                    elif re.fullmatch(graph.curie_regexp, syn) is not None \
                            and syn.split(':')[0] not in self.wont_prefix:
                        LOG.warning(
                            'possible curie "%s" as a literal synomym for %s',
                            syn, gene_id)
                        if syn != '':
                            model.addSynonym(gene_id, syn)
                    elif syn != '':
                        model.addSynonym(gene_id, syn)

            # First taxon is for the gene, after the pipe are interacting taxa
            tax_curie = taxon.split('|')[0].replace('taxon', 'NCBITaxon')
            # this is a required field but good to safe
            if tax_curie:
                geno.addTaxon(tax_curie, gene_id)

            assoc = Assoc(graph, self.name)
            assoc.set_subject(gene_id)
            assoc.set_object(go_id)

            try:
                eco_id = self.gaf_eco[eco_symbol]
                assoc.add_evidence(eco_id)
            except KeyError:
                LOG.error("Evidence code (%s) not mapped", eco_symbol)

            # normalise each reference once; the curies are reused for
            # the derived phenotype associations further down
            ref_curies = []
            for raw_ref in ref.split('|'):
                raw_ref = raw_ref.strip()
                if raw_ref == '':
                    continue
                ref_parts = raw_ref.split(':')
                prefix = ref_parts[-2]  # sidestep 'MGI:MGI:'
                if prefix in self.localtt:
                    prefix = self.localtt[prefix]
                ref_curie = ':'.join((prefix, ref_parts[-1]))
                refg = Reference(graph, ref_curie)
                if prefix == 'PMID':
                    refg.setType(self.globaltt['journal article'])
                refg.addRefToGraph()
                assoc.add_source(ref_curie)
                ref_curies.append(ref_curie)

            # TODO add the source of the annotations from assigned by?

            rel = self.resolve(aspect, mandatory=False)
            if rel is not None and aspect == rel:
                # resolve() handed the aspect back unchanged
                if aspect == 'F' and 'contributes_to' in qualifier:
                    assoc.set_relationship(self.globaltt['contributes to'])
                else:
                    LOG.error(
                        "Aspect: %s with qualifier: %s is not recognized",
                        aspect, qualifier)
            elif rel is not None:
                assoc.set_relationship(rel)
                assoc.add_association_to_graph()
            else:
                LOG.warning("No predicate for association \n%s\n", str(assoc))

            if uniprotid is not None:
                assoc.set_description('Mapped from ' + uniprotid)

            ################################################################
            # Derive G2P Associations from IMP annotations
            # in version 2.1 Pipe will indicate 'OR'
            # and Comma will indicate 'AND'.
            # in version 2.0, multiple values are separated by pipes
            # where the pipe has been used to mean 'AND'
            if eco_symbol == 'IMP' and with_or_from != '':
                phenotypeid = go_id + 'PHENOTYPE'
                # create phenotype associations
                for itm in with_or_from.split('|'):
                    if itm == '' or re.match(
                            r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                        LOG.warning(
                            "Skipping %s from or with %s", uniprotid, itm)
                        continue

                    # sanity check/conversion on go curie prefix
                    (pfx, lclid) = itm.split(':')[-2:]  # last prefix wins
                    if pfx in self.localtt:
                        pfx = self.localtt[pfx]
                    itm = ':'.join((pfx, lclid))

                    # for worms and fish, they might give a RNAi or MORPH
                    # in these cases make a reagent-targeted gene
                    if re.search('MRPHLNO|CRISPR|TALEN', itm):
                        targeted_gene_id = self.zfin.make_targeted_gene_id(
                            gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    elif re.search(r'WBRNAi', itm):
                        targeted_gene_id = \
                            self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                        geno.addReagentTargetedGene(
                            itm, gene_id, targeted_gene_id)
                        assoc = G2PAssoc(
                            graph, self.name, targeted_gene_id, phenotypeid)
                    else:
                        assoc = G2PAssoc(graph, self.name, itm, phenotypeid)

                    for ref_curie in ref_curies:
                        assoc.add_source(ref_curie)
                        # experimental phenotypic evidence
                        assoc.add_evidence(
                            self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
                    # TODO should the G2PAssoc be the evidence for the GO assoc?

            if not self.test_mode and limit is not None and \
                    reader.line_num > limit:
                break

        uniprot_tot = uniprot_hit + uniprot_miss
        uniprot_per = 0.0
        if uniprot_tot != 0:
            uniprot_per = 100.0 * uniprot_hit / uniprot_tot
        LOG.info(
            "Uniprot: %.2f%% of %i benefited from the mapping download",
            uniprot_per, uniprot_tot)
def _add_study_provenance(
        self,
        phenotyping_center,
        colony,
        project_fullname,
        pipeline_name,
        pipeline_stable_id,
        procedure_stable_id,
        procedure_name,
        parameter_stable_id,
        parameter_name,
        statistical_method,
        resource_name,
        row_num
):
    """
    Add a study node and the nodes describing its provenance.

    The study is a blank node whose id is derived from the concatenation
    of its parts; it is linked to its procedure and statistical method
    (parts), its measured parameter, its phenotyping center (agent) and
    its pipeline and project (part_of).

    :param phenotyping_center: str, from self.files['all']
    :param colony: str, from self.files['all']
    :param project_fullname: str, from self.files['all']
    :param pipeline_name: str, from self.files['all']
    :param pipeline_stable_id: str, from self.files['all']
    :param procedure_stable_id: str, from self.files['all']
    :param procedure_name: str, from self.files['all']
    :param parameter_stable_id: str, from self.files['all']
    :param parameter_name: str, from self.files['all']
    :param statistical_method: str, from self.files['all']
    :param resource_name: str, from self.files['all']
    :param row_num: accepted for the caller's benefit; not used here
    :return: study bnode
    """
    provenance_model = Provenance(self.graph)
    model = Model(self.graph)

    # A study is a blank node equal to its parts
    study_key = "".join(str(part) for part in (
        phenotyping_center, colony, project_fullname, pipeline_stable_id,
        procedure_stable_id, parameter_stable_id, statistical_method,
        resource_name))
    study_bnode = self.make_id(study_key, '_')

    model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

    # Each identifier is resolved once, at the point it was first
    # resolved before, and the result is reused.

    # study parts: nodes linked to the study with the has_part property
    procedure_id = self.resolve(procedure_stable_id)
    model.addIndividualToGraph(procedure_id, procedure_name)
    study_parts = [procedure_id, self.resolve(statistical_method)]
    provenance_model.add_study_parts(study_bnode, study_parts)

    # parameter/measure statement: study measures parameter
    parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

    logging.info("Adding Provenance")
    parameter_id = self.resolve(parameter_stable_id)
    model.addIndividualToGraph(parameter_id, parameter_label)
    provenance_model.add_study_measure(study_bnode, parameter_id)

    # colony
    colony_bnode = self.make_id(str(colony), '_')
    model.addIndividualToGraph(colony_bnode, colony)

    # study agent
    center_id = self.resolve(phenotyping_center)
    model.addIndividualToGraph(
        center_id, phenotyping_center, self.globaltt['organization'])
    model.addTriple(study_bnode, self.globaltt['has_agent'], center_id)

    # pipeline and project
    pipeline_id = self.resolve(pipeline_stable_id)
    model.addIndividualToGraph(pipeline_id, pipeline_name)
    model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_id)

    project_id = self.resolve(project_fullname)
    model.addIndividualToGraph(
        project_id, project_fullname, self.globaltt['project'])
    model.addTriple(study_bnode, self.globaltt['part_of'], project_id)

    return study_bnode
class Dataset: """ This class produces metadata about a dataset that is compliant with the HCLS dataset specification: https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/#s4_4 Summary level: The summary level provides a description of a dataset that is independent of a specific version or format. (e.g. the Monarch ingest of CTD) CURIE for this is something like MonarchData:[SOURCE IDENTIFIER] Version level: The version level captures version-specific characteristics of a dataset. (e.g. the 01-02-2018 ingest of CTD) CURIE for this is something like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP] Distribution level: The distribution level captures metadata about a specific form and version of a dataset (e.g. turtle file for 01-02-2018 ingest of CTD). There is a [distribution level resource] for each different downloadable file we emit, i.e. one for the TTL file, one for the ntriples file, etc. CURIE for this is like MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].ttl or MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].nt or MonarchData:[SOURCE IDENTIFIER_INGESTTIMESTAMP].[whatever file format] We write out at least the following triples: SUMMARY LEVEL TRIPLES: [summary level resource] - rdf:type -> dctypes:Dataset [summary level resource] - dct:title -> title (literal) [summary level resource] - dct:description -> description (literal) (use docstring from Source class) [summary level resource] - dcterms:source -> [source web page, e.g. omim.org] [summary level resource] - schema:logo -> [source logo IRI] [summary level resource] - dct:publisher -> monarchinitiative.org n.b: about summary level resource triples: -- HCLS spec says we "should" link to our logo and web page, but I'm not, because it would confuse the issue of whether we are pointing to our logo/page or the logo/page of the data source for this ingest. Same below for [version level resource] and [distibution level resource] - I'm not linking to our page/logo down there either. 
- spec says we "should" include summary level triples describing Update frequency and SPARQL endpoint but I'm omitting this for now, because these are not clearly defined at the moment VERSION LEVEL TRIPLES: [version level resource] - rdf:type -> dctypes:Dataset [version level resource] - dct:title -> version title (literal) [version level resource] - dct:description -> version description (literal) [version level resource] - dct:created -> ingest timestamp [ISO 8601 compliant] [version level resource] - pav:version -> ingest timestamp (same one above) [version level resource] - dct:creator -> monarchinitiative.org [version level resource] - dct:publisher -> monarchinitiative.org [version level resource] - dct:isVersionOf -> [summary level resource] [version level resource] - dcterms:source -> [source file 1 IRI] [version level resource] - dcterms:source -> [source file 2 IRI] ... [source file 1 IRI] - pav:retrievedOn -> [download date timestamp] [source file 1 IRI] - pav:version -> [source version (if set, optional)] [source file 2 IRI] - pav:retrievedOn -> [download date timestamp] [source file 2 IRI] - pav:version -> [source version (if set, optional)] ... [version level resource] - pav:createdWith -> [Dipper github URI] [version level resource] - void:dataset -> [distribution level resource] [version level resource] - cito:citesAsAuthority -> [citation id 1] [version level resource] - cito:citesAsAuthority -> [citation id 2] [version level resource] - cito:citesAsAuthority -> [citation id 3] n.b: about version level resource triples: - spec says we "should" include Date of issue/dct:issued triple, but I'm not because it is redundant with this triple above: [version level resource] - dct:created -> time stamp and would introduce ambiguity and confusion if the two disagree. Same below for [distribution level resource] - dct:created -> time stamp below Also omitting: - triples linking to our logo and page, see above. 
- License/dct:license triple, because we will make this triple via the [distribution level resource] below - Language/dct:language triple b/c it seems superfluous. Same below for [distribution level resource] - no language triple. - [version level resource] - pav:version triple is also a bit redundant with the pav:version triple below, but the spec requires both these triples - I'm omitting the [version level resource] -> pav:previousVersion because Dipper doesn't know this info for certain at run time. Same below for [distribution level resource] - pav:previousVersion. DISTRIBUTION LEVEL TRIPLES: [distribution level resource] - rdf:type -> dctypes:Dataset [distribution level resource] - rdf:type -> dcat:Distribution [distribution level resource] - dct:title -> distribution title (literal) [distribution level resource] - dct:description -> distribution description (lit.) [distribution level resource] - dct:created -> ingest timestamp[ISO 8601 compliant] [distribution level resource] - pav:version -> ingest timestamp (same as above) [distribution level resource] - dct:creator -> monarchinitiative.org [distribution level resource] - dct:publisher -> monarchinitiative.org [distribution level resource] - dct:license -> [license info, if available otherwise indicate unknown] [distribution level resource] - dcterms:rights -> [data rights IRI] [distribution level resource] - pav:createdWith -> [Dipper github URI] [distribution level resource] - dct:format -> [IRI of ttl|nt|whatever spec] [distribution level resource] - dct:downloadURL -> [ttl|nt URI] [distribution level resource] - void:triples -> [triples count (literal)] [distribution level resource] - void:entities -> [entities count (literal)] [distribution level resource] - void:distinctSubjects -> [subject count (literal)] [distribution level resource] - void:distinctObjects -> [object count (literal)] [distribution level resource] - void:properties -> [properties count (literal)] ... 
n.b: about distribution level resource triples: - omitting Vocabularies used/void:vocabulary and Standards used/dct:conformTo triples, because they are described in the ttl file - also omitting Example identifier/idot:exampleIdentifier and Example resource/void:exampleResource, because we don't really have one canonical example of either - they're all very different. - [distribution level resource] - dct:created should have the exact same time stamp as this triple above: [version level resource] - dct:created -> time stamp - this [distribution level resource] - pav:version triple should have the same object as [version level resource] - pav:version triple above - Data source provenance/dct:source triples are above in the [version level resource] - omitting Byte size/dct:byteSize, RDF File URL/void:dataDump, and Linkset/void:subset triples because they probably aren't necessary for MI right now - these triples "should" be emitted, but we will do this in a later iteration: # of classes void:classPartition IRI # of literals void:classPartition IRI # of RDF graphs void:classPartition IRI Note: Do not use blank nodes in the dataset graph. This dataset graph is added to the main Dipper graph in Source.write() like so $ mainGraph = mainGraph + datasetGraph which apparently in theory could lead to blank node ID collisions between the two graphs. Note also that this implementation currently does not support producing metadata for StreamedGraph graphs (see dipper/graph/StreamedGraph.py). StreamedGraph is currently not being used for any ingests, so this isn't a problem. There was talk of using StreamedGraph for a rewrite/refactor of the Clinvar ingest, which would probably require adding support here for StreamedGraph's. 
""" def __init__( self, identifier, data_release_version, ingest_name, ingest_title, ingest_url, ingest_logo=None, ingest_description=None, license_url=None, data_rights=None, graph_type='rdf_graph', # rdf_graph, streamed_graph file_handle=None, distribution_type='ttl', dataset_curie_prefix='MonarchArchive'): if graph_type is None: self.graph = RDFGraph(None, ":".join([dataset_curie_prefix, identifier])) elif graph_type == 'streamed_graph': self.graph = StreamedGraph(True, ":".join( [dataset_curie_prefix, identifier]), file_handle=file_handle) elif graph_type == 'rdf_graph': self.graph = RDFGraph(True, ':'.join([dataset_curie_prefix, identifier])) if data_release_version is not None: self.data_release_version = data_release_version else: self.data_release_version = datetime.today().strftime("%Y%m%d") self.model = Model(self.graph) self.globaltt = self.graph.globaltt self.globaltcid = self.graph.globaltcid self.curie_map = self.graph.curie_map self.identifier = ':'.join([dataset_curie_prefix, identifier]) self.citation = set() self.ingest_name = ingest_name self.ingest_title = ingest_title if self.ingest_title is None: self.ingest_title = ":".join([dataset_curie_prefix, identifier]) self.ingest_url = ingest_url self.ingest_logo = self.curie_map.get('MonarchLogoRepo') + ingest_logo self.ingest_description = ingest_description self.date_issued = None self.license_url = license_url self.data_rights = data_rights self.distribution_type = distribution_type # set HCLS resource CURIEs self.summary_level_curie = ':'.join( [dataset_curie_prefix, '#' + identifier]) self.version_level_curie = \ dataset_curie_prefix + ':' + \ self.data_release_version + \ '/#' + identifier self.distribution_level_turtle_curie = \ dataset_curie_prefix + ':' + \ self.data_release_version + \ '/rdf/' + \ identifier + "." 
+ self.distribution_type # The following might seem a little odd, but we need to set downloadURLs this # way in order for them to point to where they will end up in archive.MI.org as # of Sept 2019. URL is: # https://archive.MI.org/[release version]/[dist type]/[source].[dist type] self.download_url = \ self.curie_map.get("MonarchArchive") + self.data_release_version + \ "/rdf/" + self.ingest_name + "." + self.distribution_type self._set_summary_level_triples() self._set_version_level_triples() self._set_distribution_level_triples() def _set_summary_level_triples(self): self.model.addType(self.summary_level_curie, self.globaltt['Dataset']) self.graph.addTriple(self.summary_level_curie, self.globaltt['title'], self.ingest_title, True) self.model.addTriple(self.summary_level_curie, self.globaltt['Publisher'], self.curie_map.get("")) self.model.addTriple(self.summary_level_curie, "schema:logo", self.ingest_logo) self.graph.addTriple(self.summary_level_curie, self.globaltt['identifier'], self.summary_level_curie) if self.ingest_url is not None: self.graph.addTriple(self.summary_level_curie, self.globaltt["Source"], self.ingest_url) if self.ingest_description is not None: self.model.addDescription(self.summary_level_curie, self.ingest_description) def _set_version_level_triples(self): self.model.addType(self.version_level_curie, self.globaltt['Dataset']) self.graph.addTriple( self.version_level_curie, self.globaltt['title'], self.ingest_title + " Monarch version " + self.data_release_version, True) if self.ingest_description is not None: self.model.addDescription(self.version_level_curie, self.ingest_description) self.graph.addTriple( self.version_level_curie, self.globaltt['Date Created'], Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date)) self.graph.addTriple( self.version_level_curie, self.globaltt['version'], Literal(self.data_release_version, datatype=XSD.date)) self.graph.addTriple(self.version_level_curie, self.globaltt['creator'], 
self.curie_map.get("")) # eval's to MI.org self.graph.addTriple(self.version_level_curie, self.globaltt['Publisher'], self.curie_map.get("")) # eval's to MI.org self.graph.addTriple(self.version_level_curie, self.globaltt['isVersionOf'], self.summary_level_curie, object_is_literal=False) self.graph.addTriple(self.version_level_curie, self.globaltt['distribution'], self.distribution_level_turtle_curie, object_is_literal=False) def _set_distribution_level_triples(self): self.model.addType(self.distribution_level_turtle_curie, self.globaltt['Dataset']) self.model.addType(self.distribution_level_turtle_curie, self.globaltt['distribution']) self.graph.addTriple( self.distribution_level_turtle_curie, self.globaltt['title'], self.ingest_title + " distribution " + self.distribution_type, True) if self.ingest_description is not None: self.model.addDescription(self.distribution_level_turtle_curie, self.ingest_description) self.graph.addTriple( self.distribution_level_turtle_curie, self.globaltt['version'], Literal(self.data_release_version, datatype=XSD.date)) self.graph.addTriple( self.distribution_level_turtle_curie, self.globaltt['Date Created'], Literal(datetime.today().strftime("%Y%m%d"), datatype=XSD.date)) self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['creator'], self.curie_map.get("")) # eval's to MI.org self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['Publisher'], self.curie_map.get("")) # eval's to MI.org self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['created_with'], "https://github.com/monarch-initiative/dipper") self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['format'], "https://www.w3.org/TR/turtle/") self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['downloadURL'], self.download_url) if self.license_url is None: self.graph.addTriple( self.distribution_level_turtle_curie, self.globaltt['license'], 
'https://project-open-data.cio.gov/unknown-license/') else: self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['license'], self.license_url) if self.data_rights is not None: self.graph.addTriple(self.distribution_level_turtle_curie, self.globaltt['rights'], self.data_rights) self._declare_as_ontology() def set_ingest_source_file_version_num(self, file_iri, version): """ This method sets the version of a remote file or resource that is used in the ingest. It writes this triple: file_iri - 'pav:version' -> version Version is an untyped literal Note: if your version is a date or timestamp, use set_ingest_source_file_version_date() instead :param file_iri: a remote file or resource used in ingest :param version: a number or string (e.g. v1.2.3) that the source (OMIM, CTD) uses to refer to this version of the file/resource used during the ingest :return: None """ self.graph.addTriple(file_iri, self.globaltt['version'], version, object_is_literal=True) def set_ingest_source_file_version_date(self, file_iri, date, datatype=XSD.date): """ This method sets the version that the source (OMIM, CTD, whatever) uses to refer to this version of the remote file/resource that was used in the ingest It writes this triple: file_iri - 'pav:version' -> date or timestamp Version is added as a literal of datatype XSD date Note: if file_iri was retrieved using get_files(), then the following triple was created and you might not need this method: file_iri - 'pav:retrievedOn' -> download date :param file_iri: a remote file or resource used in ingest :param date: a date in YYYYMMDD format that the source (OMIM, CTD). 
You can add timestamp as a version by using a different datatype (below) :param datatype: an XSD literal datatype, default is XSD.date uses to refer to this version of the file/resource used during the ingest :return: None """ self.graph.addTriple(file_iri, self.globaltt['version'], date, object_is_literal=True, literal_type=datatype) def set_ingest_source_file_version_retrieved_on(self, file_iri, date, datatype=XSD.date): """ This method sets the date on which a remote file/resource (from OMIM, CTD, etc) was retrieved. It writes this triple: file_iri - 'pav:retrievedOn' -> date or timestamp Version is added as a literal of datatype XSD date by default Note: if file_iri was retrieved using get_files(), then the following triple was created and you might not need this method: file_iri - 'pav:retrievedOn' -> download date :param file_iri: a remote file or resource used in ingest :param date: a date in YYYYMMDD format that the source (OMIM, CTD). You can add timestamp as a version by using a different datatype (below) :param datatype: an XSD literal datatype, default is XSD.date uses to refer to this version of the file/resource used during the ingest :return: None """ self.graph.addTriple(file_iri, self.globaltt['retrieved_on'], date, object_is_literal=True, literal_type=datatype) def set_ingest_source(self, url, predicate=None, is_object_literal=False): """ This method writes a triple to the dataset graph indicating that the ingest used a file or resource at [url] during the ingest. Triple emitted is version_level_curie dcterms:source [url] This triple is likely to be redundant if Source.get_files() is used to retrieve the remote files/resources, since this triple should also be emitted as files/resources are being retrieved. This method is provided as a convenience method for sources that do their own downloading of files. 
:param url: a remote resource used as a source during ingest :param predicate: the predicate to use for the triple ["dcterms:source"] from spec (https://www.w3.org/TR/2015/NOTE-hcls-dataset-20150514/) "Use dct:source when the source dataset was used in whole or in part. Use pav:retrievedFrom when the source dataset was used in whole and was not modified from its original distribution. Use prov:wasDerivedFrom when the source dataset was in whole or in part and was modified from its original distribution." :return: None """ if predicate is None: predicate = self.globaltt["Source"] self.graph.addTriple(self.version_level_curie, predicate, url, object_is_literal=is_object_literal, subject_category=blv.terms['DataSetVersion']) def get_graph(self): """ This method returns the dataset graph :param :return: dataset graph """ return self.graph def get_license(self): """ This method returns the license info :param :return: license info """ return self.license_url def set_citation(self, citation_id): """ This method adds [citaton_id] argument to the set of citations, and also adds a triple indicating that version level cito:citesAsAuthority [citation_id] :param: citation_id :return: none """ self.citation.add(citation_id) self.graph.addTriple(self.version_level_curie, self.globaltt['citesAsAuthority'], citation_id) def _declare_as_ontology(self, version_info=None): """ Declare the distribution level IRI as an ontology, and also make triple distribution level IRI - version_iri -> version level IRI TEC: I am not convinced dipper reformatting external data as RDF triples makes an OWL ontology (nor that it should be considered a goal). Proper ontologies are built by ontologists. Dipper reformats data and annotates/decorates it with a minimal set of carefully arranged terms drawn from from multiple proper ontologies. Which allows the whole (dipper's RDF triples and parent ontologies) to function as a single ontology we can reason over when combined in a store such as SciGraph. 
Including more than the minimal ontological terms in dipper's RDF output constitutes a liability as it allows greater divergence between dipper artifacts and the proper ontologies. :param version_info: a string describing version info for the ontology :return: """ model = Model(self.graph) model.addOntologyDeclaration(self.summary_level_curie) model.addOWLVersionIRI(self.summary_level_curie, self.version_level_curie) if version_info is not None: model.addOWLVersionInfo(self.distribution_level_turtle_curie, version_info) @staticmethod def make_id(long_string, prefix='MONARCH'): """ A method to create DETERMINISTIC identifiers based on a string's digest. currently implemented with sha1 Duplicated from Source.py to avoid circular imports. :param long_string: string to use to generate identifier :param prefix: prefix to prepend to identifier [Monarch] :return: a Monarch identifier """ return ':'.join((prefix, Dataset.hash_id(long_string))) @staticmethod def hash_id(word): # same as graph/GraphUtils.digest_id(wordage) """ Given a string, make a hash Duplicated from Source.py. :param word: str string to be hashed :return: hash of id """ return 'b' + hashlib.sha1(word.encode('utf-8')).hexdigest()[1:20]
def make_association(self, record):
    """
    Construct the gene to phenotype association for one record.

    The record is mutated: 'experiment_type' and 'pheno_obj' keys are added
    before the association is modeled.

    :param record: dict read with the keys 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name' and 'Reference'
    :return: None
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_type = record['Experiment Type'].split('(')[0]
    record['experiment_type'] = list()
    for exp_type in experiment_type.split(','):
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {
            'term': None,
            'apo_id': None
        },
        'quality': {
            'term': None,
            'apo_id': None
        },
        'has_quality': False  # descriptive and don't bother looking for a quality
    }
    if ':' in phenotype:
        # "entity: quality" form
        pheno_obj['has_quality'] = True
        ent_qual = phenotype.split(': ')
        entity = ent_qual[0]
        quality = ent_qual[1]
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = self.globaltt['has phenotype']

    if pheno_obj['has_quality']:
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        'yeastgenome.org', gene, relation, pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    model.addTriple(
        subject_id=pheno_id,
        predicate_id=self.globaltt['subclass_of'],
        obj=self.globaltt['phenotype'])

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    g2p_assoc.description = self._make_description(record)

    # add the references
    # ''.split('|') yields [''], so drop empty entries; otherwise a record
    # without references would emit a source with an empty identifier
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|') if ref]

    # created Ref prefix in curie map to route to proper reference URL in SGD
    if references:
        # make first ref in list the source
        g2p_assoc.add_source(identifier=references[0])
        ref_model = Reference(
            self.graph, references[0], self.globaltt['publication'])
        ref_model.addRefToGraph()

        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=references[0], obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # best effort: a failure on one record is reported, not propagated
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def make_association(self, record):
    """
    Construct the gene to phenotype association for one record.

    The record is mutated: 'experiment_type' and 'pheno_obj' keys are added
    before the association is modeled.

    :param record: dict read with the keys 'Experiment Type', 'Phenotype',
        'SGDID', 'Gene Name', 'Reference' and the unmodeled columns that are
        folded into the association description
    :return: None
    """
    # prep record
    # remove description and map Experiment Type to apo term
    experiment_type = record['Experiment Type'].split('(')[0]
    record['experiment_type'] = list()
    for exp_type in experiment_type.split(','):
        exp_type = exp_type.strip()
        record['experiment_type'].append({
            'id': self.apo_term_id[exp_type],
            'term': exp_type,
        })

    phenotype = record['Phenotype']
    pheno_obj = {
        'entity': {
            'term': None,
            'apo_id': None
        },
        'quality': {
            'term': None,
            'apo_id': None
        },
        # False = phenotype was descriptive and don't bother looking for a quality
        'has_quality': False
    }
    if ':' in phenotype:
        # "entity: quality" form
        pheno_obj['has_quality'] = True
        ent_qual = phenotype.split(': ')
        entity = ent_qual[0]
        quality = ent_qual[1]
        pheno_obj['entity']['term'] = entity
        pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
        pheno_obj['quality']['term'] = quality
        pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
    else:
        pheno_obj['entity']['term'] = phenotype
        pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
    record['pheno_obj'] = pheno_obj

    # begin modeling
    model = Model(self.graph)

    # define the triple
    gene = 'SGD:{}'.format(record['SGDID'])
    relation = Model.object_properties['has_phenotype']  # has phenotype

    if pheno_obj['has_quality']:
        pheno_label = '{0}:{1}'.format(
            pheno_obj['entity']['term'],
            pheno_obj['quality']['term'])
        pheno_id = 'MONARCH:{0}{1}'.format(
            pheno_obj['entity']['apo_id'].replace(':', '_'),
            pheno_obj['quality']['apo_id'].replace(':', '_'))
    else:
        pheno_label = pheno_obj['entity']['term']
        pheno_id = pheno_obj['entity']['apo_id']

    g2p_assoc = Assoc(
        self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)

    assoc_id = g2p_assoc.make_association_id(
        definedby='yeastgenome.org', subject=gene, predicate=relation,
        object=pheno_id)
    g2p_assoc.set_association_id(assoc_id=assoc_id)

    # add to graph to mint assoc id
    g2p_assoc.add_association_to_graph()

    model.addLabel(subject_id=gene, label=record['Gene Name'])

    # add the association triple
    model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

    # make pheno subclass of UPHENO:0001001
    model.addTriple(
        subject_id=pheno_id,
        predicate_id=Model.object_properties['subclass_of'],
        obj='UPHENO:0001001')

    # label nodes
    # pheno label
    model.addLabel(subject_id=pheno_id, label=pheno_label)

    # add the description: all the unmodeled data in a '|' delimited list
    description = [
        'genomic_background: {}'.format(record['Strain Background']),
        'allele: {}'.format(record['Allele']),
        'chemical: {}'.format(record['Chemical']),
        'condition: {}'.format(record['Condition']),
        'details: {}'.format(record['Details']),
        'feature_name: {}'.format(record['Feature Name']),
        'gene_name: {}'.format(record['Gene Name']),
        'mutant_type: {}'.format(record['Mutant Type']),
        'reporter: {}'.format(record['Reporter']),
    ]
    g2p_assoc.description = " | ".join(description)

    # add the references
    # ''.split('|') yields [''], so drop empty entries; otherwise a record
    # without references would emit a source with an empty identifier
    references = [
        ref for ref in record['Reference'].replace(' ', '').split('|') if ref]

    # created RGDRef prefix in curie map to route to proper reference URL in RGD
    if references:
        # make first ref in list the source
        g2p_assoc.add_source(identifier=references[0])
        ref_model = Reference(
            self.graph, references[0], Reference.ref_types['publication'])
        ref_model.addRefToGraph()

        # create equivalent source for any other refs in list
        for ref in references[1:]:
            model.addSameIndividual(sub=references[0], obj=ref)

    # add experiment type as evidence
    for exp_type in record['experiment_type']:
        g2p_assoc.add_evidence(exp_type['id'])
        model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

    # best effort: a failure on one record is reported, not propagated
    try:
        g2p_assoc.add_association_to_graph()
    except Exception as e:
        print(e)
    return
def _add_study_provenance(
        self,
        phenotyping_center,
        colony,
        project_name,
        pipeline_name,
        pipeline_stable_id,
        procedure_stable_id,
        procedure_name,
        parameter_stable_id,
        parameter_name,
        statistical_method,
        resource_name):
    """
    Add the provenance of one study to the graph: its parts, the measured
    parameter, the colony, the phenotyping center, the pipeline and
    the project(s).

    :param phenotyping_center: str, from self.files['g2p_assertions']['columns']
    :param colony: str, from self.files['g2p_assertions']
    :param project_name: str, from self.files['g2p_assertions'];
        may hold several comma separated project names
    :param pipeline_name: str, from self.files['g2p_assertions']
    :param pipeline_stable_id: str, from self.files['g2p_assertions']
    :param procedure_stable_id: str, from self.files['g2p_assertions']
    :param procedure_name: str, from self.files['g2p_assertions']
    :param parameter_stable_id: str, from self.files['g2p_assertions']
    :param parameter_name: str, from self.files['g2p_assertions']
    :param statistical_method: str, from self.files['g2p_assertions']
    :param resource_name: str, from self.files['g2p_assertions']
    :return: study bnode
    """
    provenance_model = Provenance(self.graph)
    model = Model(self.graph)

    # Add provenance
    # A study is a blank node equal to its parts
    study_bnode = self.make_id(
        "{0}{1}{2}{3}{4}{5}{6}{7}".format(
            phenotyping_center,
            colony,
            project_name,  # switched to from 'project_fullname' 2020 V12
            pipeline_stable_id,
            procedure_stable_id,
            parameter_stable_id,
            statistical_method,
            resource_name),
        '_')

    model.addIndividualToGraph(study_bnode, None, self.globaltt['study'])

    pipeline_curie = 'IMPC-pipe:' + pipeline_stable_id
    procedure_curie = 'IMPC-proc:' + procedure_stable_id
    # the parameter id is "<procedure>#<parameter>"
    parameter_curie = 'IMPC-param:' + procedure_stable_id
    parameter_curie += '#' + parameter_stable_id

    # Add study parts (nodes linked to study with has_part property)
    model.addIndividualToGraph(procedure_curie, procedure_name)
    study_parts = [procedure_curie]

    # ? stable or curie
    study_parts.append(self.resolve(statistical_method))
    provenance_model.add_study_parts(study_bnode, study_parts)

    # Add parameter/measure statement: study measures parameter
    parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

    # logging.info("Adding Provenance for %s", project_name)
    model.addIndividualToGraph(parameter_curie, parameter_label)
    provenance_model.add_study_measure(
        study_bnode, parameter_curie, object_is_literal=False)

    # Add Colony
    colony_bnode = self.make_id("{0}".format(colony), '_')
    model.addIndividualToGraph(colony_bnode, colony)

    # Add study agent
    phenotyping_center_id = self.localtt[phenotyping_center]
    model.addIndividualToGraph(
        phenotyping_center_id, phenotyping_center,
        self.globaltt['organization'])

    # self.graph
    model.addTriple(
        study_bnode, self.globaltt['has_agent'], phenotyping_center_id)

    # add pipeline and project
    model.addIndividualToGraph(pipeline_curie, pipeline_name)

    # self.graph
    model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_curie)

    # as of V12 col 'project_fullname' became empty switched to 'project_name'
    if project_name is not None and project_name != '':
        for prj_nm in project_name.split(','):
            project_name_id = self.localtt[prj_nm]
            # label each project with its own name, not the whole
            # comma separated list
            model.addIndividualToGraph(
                project_name_id, prj_nm, self.globaltt['project'])
            model.addTriple(
                study_bnode, self.globaltt['part_of'], project_name_id)

    return study_bnode
def make_triples(self, source, package):
    """
    Write the triples for one drug package.

    For source 'drugbank' the package's 'targets' are modeled; for source
    'drugcentral' its 'indications' and 'interactions' are modeled.
    Any other source writes nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding 'unii' plus the per-source lists
    :return: None
    """
    model = Model(self.graph)
    terms = self.globaltt

    def add_subclass(child_id, parent_label):
        # child_id rdfs:subClassOf <term mapped from parent_label>
        model.addTriple(
            subject_id=child_id,
            predicate_id=terms['subclass_of'],
            obj=terms[parent_label])

    if source == 'drugbank':
        for tgt in package['targets']:
            # drug -[action]-> protein target
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=tgt['action'],
                obj=tgt['uniprot'])
            model.addLabel(subject_id=tgt['uniprot'], label=tgt['name'])
            add_subclass(tgt['uniprot'], 'polypeptide')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=terms['equivalent_class'],
                obj=package['unii'])
            model.addTriple(
                subject_id=tgt['action'],
                predicate_id=terms['subPropertyOf'],
                obj=terms['molecularly_interacts_with'])
            add_subclass(package['unii'], 'molecular entity')

    elif source == 'drugcentral':
        for indic in package['indications']:
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=terms['is substance that treats'],
                obj=indic['snomed_id'])
            add_subclass(package['unii'], 'molecular entity')
            add_subclass(indic['snomed_id'], 'disease')
            model.addLabel(
                subject_id=indic['snomed_id'], label=indic['snomed_name'])

        for inter in package['interactions']:
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=terms['molecularly_interacts_with'],
                obj=inter['uniprot'])
            model.addLabel(
                subject_id=inter['uniprot'], label=inter['target_name'])
            add_subclass(package['unii'], 'molecular entity')
            model.addDescription(
                subject_id=inter['uniprot'],
                description=inter['target_class'])
            add_subclass(inter['uniprot'], 'polypeptide')
    return
def make_triples(self, source, package):
    """
    Write the triples for one drug package.

    For source 'drugbank' the package's 'targets' are modeled; for source
    'drugcentral' its 'indications' and 'interactions' are modeled.
    Any other source writes nothing.

    :param source: 'drugbank' or 'drugcentral'
    :param package: dict holding 'unii' plus the per-source lists
    :return: None
    """
    model = Model(self.graph)

    def add_subclass(child_id, parent_id):
        # child_id -[subclass_of]-> parent_id
        model.addTriple(
            subject_id=child_id,
            predicate_id=Model.object_properties['subclass_of'],
            obj=parent_id)

    if source == 'drugbank':
        for tgt in package['targets']:
            # drug -[action]-> protein target
            model.addTriple(
                subject_id=package['unii'],
                predicate_id=tgt['action'],
                obj=tgt['uniprot'])
            model.addLabel(subject_id=tgt['uniprot'], label=tgt['name'])
            add_subclass(tgt['uniprot'], 'SO:0000104')
            model.addTriple(
                subject_id=package['drugbank_id'],
                predicate_id=Model.object_properties['equivalent_class'],
                obj=package['unii'])
            model.addTriple(
                subject_id=tgt['action'],
                predicate_id='rdfs:subPropertyOf',
                obj='RO:0002436')
            add_subclass(package['unii'], 'CHEBI:23367')

    elif source == 'drugcentral':
        for indic in package['indications']:
            model.addTriple(
                subject_id=package['unii'],
                predicate_id='RO:0002606',
                obj=indic['snomed_id'])
            add_subclass(package['unii'], 'CHEBI:23367')
            add_subclass(indic['snomed_id'], 'DOID:4')
            model.addLabel(
                subject_id=indic['snomed_id'], label=indic['snomed_name'])

        for inter in package['interactions']:
            model.addTriple(
                subject_id=package['unii'],
                predicate_id='RO:0002436',
                obj=inter['uniprot'])
            model.addLabel(
                subject_id=inter['uniprot'], label=inter['target_name'])
            add_subclass(package['unii'], 'CHEBI:23367')
            model.addDescription(
                subject_id=inter['uniprot'],
                description=inter['target_class'])
            add_subclass(inter['uniprot'], 'SO:0000104')
    return
def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
    """
    Relate a gene to each of its database cross references.

    Depending on the cross reference prefix this writes a gene-product
    triple (HPRD), an xref (ENSEMBL, OMIM genes) and then either an
    equivalentClass or a sameAs statement, and marks clique leaders
    according to the map read from the 'clique_leader' resource.

    :param dbxrefs: '|' separated cross references, e.g.
        MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479
    :param gene_id: curie of the gene the cross references belong to
    :param taxon: key looked up in the clique leader map
    :return: None
    """
    clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])
    model = Model(self.testgraph if self.test_mode else self.graph)
    skipped_prefixes = ['Vega', 'IMGT/GENE-DB', 'Araport', '']
    gene_term = 'gene'  # key into self.globaltt

    def is_omim_gene(omim_number):
        # True when the OMIM number is typed as a gene
        return omim_number in self.omim_type and \
            self.omim_type[omim_number] == self.globaltt[gene_term]

    for raw_xref in dbxrefs.strip().split('|'):
        # everything before the last ':' is the prefix
        # (restores nonterminal ':' as in HGNC:HGNC:16851)
        *namespace_parts, local_id = raw_xref.split(':')
        prefix = ':'.join(namespace_parts).strip()
        if prefix in self.localtt:
            prefix = self.localtt[prefix]

        # skip some of these for now based on curie prefix
        if prefix in skipped_prefixes:
            continue

        xref_curie = ':'.join((prefix, local_id))

        if prefix == 'HPRD':
            # proteins are not == genes.
            model.addTriple(
                gene_id, self.globaltt['has gene product'], xref_curie)
            continue

        if prefix == 'ENSEMBL':
            model.addXref(gene_id, xref_curie)
        elif prefix == 'OMIM':
            omim_number = xref_curie[5:]
            if omim_number in self.omim_replaced:
                # swap in the gene-typed replacement(s); last wins
                # NOTE(review): if no replacement is a gene, the original
                # OMIM curie is still used for the equivalence below
                # -- confirm this is intended
                for replacement in self.omim_replaced[omim_number]:
                    if is_omim_gene(replacement):
                        xref_curie = 'OMIM:' + replacement
                        model.addXref(gene_id, xref_curie)
            elif is_omim_gene(omim_number):
                model.addXref(gene_id, xref_curie)
            else:
                # no equivalence between ncbigene and omim-nongene
                continue

        # designate clique leaders
        # (perhaps premature as this ingest can't know what else exists)
        try:
            if self.class_or_indiv.get(gene_id) != 'C':
                model.addSameIndividual(gene_id, xref_curie)
            else:
                model.addEquivalentClass(gene_id, xref_curie)
                if taxon in clique_map:
                    leader_prefix = clique_map[taxon]
                    if leader_prefix == prefix:
                        model.makeLeader(xref_curie)
                    elif leader_prefix == gene_id.split(':')[0]:
                        model.makeLeader(gene_id)
        except AssertionError as err:
            LOG.warning("Error parsing %s: %s", gene_id, err)