def __init__(self, graph, feature_id=None, label=None, feature_type=None,
             description=None, feature_category=None):
    """
    Bind this object to a graph and record the feature's descriptive fields.

    :param graph: the graph to attach to; must be a ``Graph`` instance
    :param feature_id: stored as ``self.fid``
    :param label: stored as ``self.label``
    :param feature_type: stored as ``self.ftype``
    :param description: stored as ``self.description``
    :param feature_category: stored as ``self.feature_category``
    :raises ValueError: if ``graph`` is not a ``Graph``
    """
    # Reject anything that is not a Graph before touching any state
    if not isinstance(graph, Graph):
        raise ValueError("{} is not a graph".format(graph))
    self.graph = graph

    # Helpers and lookup tables taken from the graph itself
    self.model = Model(self.graph)
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    self.gfxutl = GraphUtils(self.curie_map)

    # Descriptive fields supplied by the caller
    self.fid = feature_id
    self.feature_category = feature_category
    self.label = label
    self.ftype = feature_type
    self.description = description

    # Not known at construction time; start out unset
    self.start = self.stop = self.taxon = None
def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
    """
    Link a drug to a disease with the ``substance_that_treats`` object
    property, reify that link as an association, and attach the therapy
    approval status to the association.

    :param drug_id: Id as curie of the drug
    :param disease_id: Id as curie of the disease
    :param therapy_status_id: Id of the therapy approval status; used as the
                              object of the approval-status triple
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    treats = graph_utils.object_properties['substance_that_treats']
    # Placeholder relationship, note this does not exist in RO
    approval_relation = "RO:has_approval_status"

    # Direct drug -> disease triple
    graph_utils.addTriple(self.graph, drug_id, treats, disease_id)

    # Reified association carrying the same subject/relationship/object
    association_id = self.make_cgd_id("assoc{0}{1}".format(drug_id, disease_id))
    association = Assoc(self.name)
    association.set_subject(drug_id)
    association.set_relationship(treats)
    association.set_object(disease_id)
    association.set_association_id(association_id)
    association.add_association_to_graph(self.graph)

    # Hang the approval status off the association node
    graph_utils.addTriple(
        self.graph, association_id, approval_relation, therapy_status_id)
def __init__(self, graph, definedby, entity_id, phenotype_id, rel=None,
             entity_category=None, phenotype_category=None):
    """
    Set up an association whose subject is an entity and whose object is a
    phenotype.

    :param graph: passed through to the parent constructor
    :param definedby: passed through to the parent constructor
    :param entity_id: id used as the association subject
    :param phenotype_id: id used as the association object
    :param rel: relationship to use; when None, the ``'has phenotype'``
                entry of ``self.globaltt`` is used
    :param entity_category: stored as ``self.subject_category``
    :param phenotype_category: stored as ``self.object_category``
    """
    super().__init__(graph, definedby)

    self.entity_id = entity_id
    self.phenotype_id = phenotype_id

    # Fall back to the default relationship only when none was supplied
    relationship = self.globaltt['has phenotype'] if rel is None else rel

    # Optional qualifiers start out unset
    self.start_stage_id = self.end_stage_id = None
    self.environment_id = self.stage_process_id = None

    self.set_subject(entity_id)
    self.set_object(phenotype_id)
    self.set_relationship(relationship)

    self.subject_category = entity_category
    self.object_category = phenotype_category
    self.gut = GraphUtils(None)
def __init__(self, graph):
    """
    Bind this object to a graph and cache the graph's lookup tables.

    :param graph: the graph to attach to; must be a ``Graph`` instance
    :raises ValueError: if ``graph`` is not a ``Graph``
    """
    # Reject anything that is not a Graph before touching any state
    if not isinstance(graph, Graph):
        raise ValueError("{} is not a graph".format(graph))
    self.graph = graph

    self.model = Model(self.graph)
    # Translation tables and curie map are read straight off the graph
    self.globaltt = self.graph.globaltt
    self.globaltcid = self.graph.globaltcid
    self.curie_map = self.graph.curie_map
    self.gut = GraphUtils(self.curie_map)
def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
    """
    Add a gene class for ``hgnc_symbol`` to the graph and relate the variant
    to it via ``Genotype.addAlleleOfGene``.

    The gene id is looked up in ``self.gene_map``. When the symbol is not in
    the map, an id is made with ``self.make_cgd_id`` from the variant id and
    the symbol, and a warning is logged.

    :param variant_id: id of the variant
    :param hgnc_symbol: gene symbol, also used as the gene class label
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)

    if hgnc_symbol in self.gene_map:
        gene_id = self.gene_map[hgnc_symbol]
    else:
        gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
        # Logger.warn is a deprecated alias; warning() with %-args also
        # defers formatting until the record is actually emitted.
        logger.warning(
            "Can't map gene symbol %s to entrez ID", hgnc_symbol)

    gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
    geno.addAlleleOfGene(variant_id, gene_id)
def write(self, fmt='turtle', stream=None, write_metadata_in_main_graph=False):
    """
    Write out the graphs held by this source: the dataset description
    graph, the test graph (only when ``self.test_mode`` is set) and the
    main graph.

    The dataset and test graphs are always serialized as turtle. The main
    graph is serialized as ``fmt`` and goes to
    ``<outdir>/<name>.<extension>`` unless stdout is requested.

    :param fmt: serialization format of the main graph; formats without a
                known file extension use the format name as the extension
    :param stream: None to write the main graph to its file, or 'stdout'
                   (case/whitespace-insensitive); anything else is logged as
                   an error and the main graph is not written
    :param write_metadata_in_main_graph: when true, the dataset graph is
                   added to the main graph before the main graph is written
    :return: None
    """
    extensions = {
        'rdfxml': 'xml',
        'turtle': 'ttl',
        'nt': 'nt',  # ntriples
        'nquads': 'nq',
        'n3': 'n3',  # notation3
    }

    # work out the output file for the regular graph
    dest = None
    if self.name is None:
        LOG.warning("No output file set. Using stdout")
        stream = 'stdout'
    else:
        base = '/'.join((self.outdir, self.name))
        # unknown formats fall back to the format name as the extension
        dest = '.'.join((base, extensions.get(fmt, fmt)))
        LOG.info("Setting outfile to %s", dest)

        # the dataset file name is always formatted as turtle
        self.datasetfile = '/'.join(
            (self.outdir, self.name + '_dataset.ttl'))
        LOG.info("Setting dataset file to %s", self.datasetfile)

    writer = GraphUtils(None)

    # the _dataset description is always turtle
    writer.write(
        self.dataset.get_graph(), 'turtle', filename=self.datasetfile)

    if self.test_mode:
        # unless we stop hardcoding, the test dataset is always turtle
        LOG.info("Setting testfile to %s", self.testfile)
        writer.write(self.testgraph, 'turtle', filename=self.testfile)

    if write_metadata_in_main_graph:
        self.graph = self.graph + self.dataset.get_graph()

    # decide where the main graph goes
    if stream is not None and stream.lower().strip() != 'stdout':
        LOG.error("I don't understand our stream.")
        return
    # filename=None is what this method passes when stdout was requested
    outfile = dest if stream is None else None

    writer.write(self.graph, fmt, filename=outfile)
def write(self, fmt='turtle', stream=None):
    """
    Write out the graphs held by this source: the dataset description
    graph, the test graph (only when ``self.testMode`` is set) and the
    main graph.

    The dataset and test graphs are always serialized as turtle. The main
    graph is serialized as ``fmt`` and goes to
    ``<outdir>/<name>.<extension>`` unless stdout is requested. If a
    dataset exists and has no version yet, its version is set by date.

    When ``self.name`` is None no file names can be derived, so the main
    graph is written with ``file=None`` (this method's stdout convention)
    and the dataset graph is written the same way.

    :param fmt: serialization format of the main graph; formats without a
                known file extension use the format name as the extension
    :param stream: None to write the main graph to its file, or 'stdout'
                   (case/whitespace-insensitive); anything else is logged as
                   an error and the main graph is not written
    :return: None
    """
    fmt_ext = {
        'rdfxml': 'xml',
        'turtle': 'ttl',
        'nt': 'nt',  # ntriples
        'nquads': 'nq',
        'n3': 'n3',
    }

    # make the regular graph output file
    dest = None
    # Bound up front: previously this was only assigned inside the
    # `self.name is not None` branch, so the stdout fallback crashed with
    # UnboundLocalError when the dataset graph was written below.
    datasetfile = None
    if self.name is not None:
        dest = '/'.join((self.outdir, self.name))
        # unknown formats fall back to the format name as the extension
        dest = '.'.join((dest, fmt_ext.get(fmt, fmt)))
        logger.info("Setting outfile to %s", dest)

        # make the datasetfile name, always format as turtle
        datasetfile = '/'.join((self.outdir, self.name + '_dataset.ttl'))

        if self.dataset is not None and self.dataset.version is None:
            self.dataset.set_version_by_date()
            logger.info(
                "No version for %s setting to date issued.", self.name)
    else:
        logger.warning("No output file set. Using stdout")
        stream = 'stdout'

    gu = GraphUtils(None)

    # the _dataset description is always turtle; the dataset is treated as
    # optional above, so do not dereference it blindly here
    if self.dataset is not None:
        gu.write(self.dataset.getGraph(), 'turtle', file=datasetfile)
    else:
        logger.warning("No dataset set. Skipping dataset description.")

    # unless we stop hardcoding above, the test dataset is always turtle
    if self.testMode:
        gu.write(self.testgraph, 'turtle', file=self.testfile)

    # print graph out
    if stream is None:
        outfile = dest
    elif stream.lower().strip() == 'stdout':
        outfile = None
    else:
        logger.error("I don't understand your stream.")
        return

    gu.write(self.graph, fmt, file=outfile)
def add_disease_drug_variant_to_graph(self, table):
    """
    Add variant / disease / drug relationships to the graph, one set per row.

    Each row must unpack into exactly eleven values, in this order:
    variant_key, variant_label, diagnoses_key, diagnoses,
    specific_diagnosis, organ, relationship, drug_key, drug_label,
    therapy_status, pubmed_id.

    ``specific_diagnosis`` is used as the disease label when it is not
    None, otherwise ``diagnoses`` is. When ``pubmed_id`` is not None a
    journal-article reference is added and attached, with evidence code
    ECO:0000033, to the variant-phenotype association.

    See ongoing discussion of how to best model here:
    https://github.com/monarch-initiative/mckb/issues/9

    :param table: iterable of iterables, for example, a tuple of tuples
                  from _get_disease_drug_variant_relationship
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    genotype = Genotype(self.graph)

    # Property ids that are the same for every row
    environment_rel = "RO:has_environment"
    quality_property = "BFO:0000159"
    biomarker_rel = "RO:has_biomarker"

    for (variant_key, variant_label, diagnoses_key, diagnoses,
         specific_diagnosis, organ, relationship, drug_key, drug_label,
         therapy_status, pubmed_id) in table:

        # Prefer the more specific diagnosis when one is given
        diagnoses_label = (
            diagnoses if specific_diagnosis is None else specific_diagnosis)

        # Arbitrary IDs to be replaced by ontology mappings
        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
        disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
        therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
        disease_quality = "CGD:{0}".format(relationship).replace(" ", "_")
        drug_id = self._get_drug_id(drug_key, drug_label)

        genotype.addGenotype(
            variant_id, variant_label,
            genotype.genoparts['sequence_alteration'])

        disease_instance_id = self.make_cgd_id(
            'disease{0}{1}'.format(diagnoses_label, variant_key))
        phenotype_instance_id = self.make_cgd_id(
            'phenotype{0}{1}{2}'.format(
                diagnoses_label, variant_key, relationship))

        # A detrimental effect gets its own label wording
        if relationship == "detrimental effect":
            label_template = "{0} with therapeutic response {1} to health"
        else:
            label_template = "{0} with {1} to therapy"
        phenotype_instance_label = label_template.format(
            diagnoses_label, relationship)

        # Reified association for disease caused_by genotype
        variant_disease_annot = self.make_cgd_id(
            "assoc{0}{1}".format(variant_key, diagnoses_label))

        # Add individuals/classes
        graph_utils.addClassToGraph(
            self.graph, disease_id, diagnoses_label, 'DOID:4')
        graph_utils.addClassToGraph(
            self.graph, drug_id, drug_label, 'CHEBI:23888')
        graph_utils.addIndividualToGraph(
            self.graph, phenotype_instance_id, phenotype_instance_label,
            disease_id)
        graph_utils.loadObjectProperties(
            self.graph, {relationship: environment_rel})

        # Publication support is optional
        if pubmed_id is None:
            source_id = None
            evidence = None
        else:
            source_id = "PMID:{0}".format(pubmed_id)
            reference = Reference(
                source_id, Reference.ref_types['journal_article'])
            reference.addRefToGraph(self.graph)
            evidence = 'ECO:0000033'

        has_phenotype = graph_utils.object_properties['has_phenotype']
        association = G2PAssoc(
            self.name, variant_id, phenotype_instance_id, has_phenotype)
        association.set_association_id(variant_disease_annot)
        if evidence:
            association.add_evidence(evidence)
        if source_id:
            association.add_source(source_id)
        association.add_association_to_graph(self.graph)

        graph_utils.addTriple(
            self.graph, variant_disease_annot, environment_rel, drug_id)
        graph_utils.addTriple(
            self.graph, phenotype_instance_id, quality_property,
            disease_quality)

        # Add therapy-disease association and approval status
        disease_instance_label = "{0} with biomarker {1}".format(
            diagnoses_label, variant_label)
        graph_utils.addIndividualToGraph(
            self.graph, disease_instance_id, disease_instance_label,
            disease_id)
        graph_utils.addTriple(
            self.graph, disease_instance_id, biomarker_rel, variant_id)
        graph_utils.addClassToGraph(
            self.graph, therapy_status_id, therapy_status)
        self._add_therapy_drug_association(
            drug_id, disease_instance_id, therapy_status_id)
def _add_variant_cdna_variant_assoc_to_graph(self, row):
    """
    Add the genomic and cDNA placement of one variant to the graph, plus
    its COSMIC / dbSNP cross references.

    The row must unpack into exactly 27 values (see the unpacking below
    for the order); see the add_variant_info_to_graph() docstring for the
    expected structure. Only applicable for structure 2.

    :param row: iterable holding one row of data
    :return: None
    """
    graph_utils = GraphUtils(curie_map.get())
    genotype = Genotype(self.graph)
    as_literal = True

    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source, variant_gene, bp_pos, variant_cdna,
     cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
     variant_base, primary_transcript_exons,
     primary_transcript_variant_sub_types, variant_type, chromosome,
     genome_build, build_version, build_date) = row

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    # Add gene
    self._add_variant_gene_relationship(variant_id, variant_gene)

    # Transcript reference for nucleotide position
    transcript_curie = self._make_transcript_curie(transcript_id)

    # Make region IDs
    cdna_region_id = ":_{0}Region".format(transcript_curie)
    chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
        genome_build, chromosome, genome_pos_start, genome_pos_end)

    # Add the genome build
    taxon_id = 'NCBITaxon:9606'
    build_id = "UCSC:{0}".format(genome_build)
    genotype.addGenome(taxon_id, "Human")
    genotype.addReferenceGenome(build_id, genome_build, taxon_id)

    # Add chromosome
    # the chrom class (generic) id
    chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
    chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')

    # first, add the chromosome class (in the taxon)
    genotype.addChromosomeClass(chromosome, taxon_id, 'Human')

    # then, add the chromosome instance (from the given build)
    genotype.addChromosomeInstance(
        chromosome, build_id, genome_build, chrom_class_id)

    # Add variant coordinates in reference to chromosome
    self._add_feature_with_coords(
        variant_id, genome_pos_start, genome_pos_end,
        chrom_instance_id, chrom_region_id)

    # Add mutation coordinates in reference to gene
    self._add_feature_with_coords(
        variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

    # Add nucleotide mutation
    graph_utils.addTriple(
        self.graph, variant_id,
        genotype.properties['reference_nucleotide'], ref_base, as_literal)
    graph_utils.addTriple(
        self.graph, variant_id,
        genotype.properties['altered_nucleotide'], variant_base, as_literal)

    # Here we update any internal cgd variant IDS with a cosmic ID or
    # dbSNP ID. Alternatively we could do this using sql rather than a
    # sparql update which may be safer

    # Add SNP xrefs
    cosmic_curies = []
    if cosmic_id is not None:
        raw_cosmic_ids = cosmic_id.split(', ')
        cosmic_curies = [
            re.sub(r'COSM(\d+)', r'COSMIC:\1', raw_id)
            for raw_id in raw_cosmic_ids]
        for curie, raw_id in zip(cosmic_curies, raw_cosmic_ids):
            graph_utils.addIndividualToGraph(
                self.graph, curie, raw_id,
                genotype.genoparts['missense_variant'])

        # If there are multiple ids set them equivalent to the first
        first_curie = cosmic_curies[0]
        for other_curie in cosmic_curies[1:]:
            graph_utils.addSameIndividual(
                self.graph, first_curie, other_curie)

        self._replace_entity(
            self.graph, variant_id, first_curie, self.bindings)

    if db_snp_id is not None:
        db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
        graph_utils.addIndividualToGraph(
            self.graph, db_snp_curie, db_snp_id,
            genotype.genoparts['missense_variant'])

        if cosmic_id is None:
            # No COSMIC id took over the variant, so dbSNP does
            self._replace_entity(
                self.graph, variant_id, db_snp_curie, self.bindings)
        else:
            # Otherwise tie every COSMIC curie to the dbSNP curie
            for curie in cosmic_curies:
                graph_utils.addSameIndividual(
                    self.graph, curie, db_snp_curie)
def _add_variant_protein_variant_assoc_to_graph(self, row):
    """
    Add the transcript, polypeptide and (for missense variants) amino acid
    change of one variant to the graph.

    Only the first eleven values of the row are used (see the unpacking
    below for the order); see the add_variant_info_to_graph() docstring
    for the expected structure.

    The amino acid sequence the change is placed on is, in order of
    preference: the UniProt polypeptide, the NCBI polypeptide, or a
    made-up CGD id when neither mapping is available. When both mappings
    exist they are also declared the same individual.

    :param row: iterable holding one row of data
    :return: None
    """
    gu = GraphUtils(curie_map.get())
    geno = Genotype(self.graph)
    is_literal = True

    (variant_key, variant_label, amino_acid_variant, amino_acid_position,
     transcript_id, transcript_priority, protein_variant_type,
     functional_impact, stop_gain_loss, transcript_gene,
     protein_variant_source) = row[0:11]

    variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

    transcript_curie = self._make_transcript_curie(transcript_id)
    uniprot_curie = self._make_uniprot_polypeptide_curie(transcript_id)
    ncbi_protein_curie = self._make_ncbi_polypeptide_curie(transcript_id)

    geno.addGenotype(variant_id, variant_label,
                     geno.genoparts['sequence_alteration'])

    # Make fake amino acid sequence in case we
    # can't get a CCDS to Uniprot and/or NCBI Protein mapping
    aa_seq_id = self.make_cgd_id('transcript{0}'.format(amino_acid_variant))

    # Add Transcript:
    geno.addTranscript(variant_id, transcript_curie, transcript_id,
                       geno.genoparts['transcript'])

    # Add polypeptide
    if ncbi_protein_curie is not None:
        geno.addPolypeptide(ncbi_protein_curie,
                            self.transcript_xrefs['RefSeq'][transcript_id],
                            transcript_curie)
        aa_seq_id = ncbi_protein_curie
    if uniprot_curie is not None:
        geno.addPolypeptide(uniprot_curie,
                            self.transcript_xrefs['UniProt'][transcript_id],
                            transcript_curie)
        # Overrides ncbi_protein_curie,
        # but we set them as equal individuals below
        aa_seq_id = uniprot_curie

    # No else branch here: resetting aa_seq_id to the fake id whenever the
    # two curies are not both present threw away a real mapping when only
    # one source resolved (the fake id is already the default above).
    if ncbi_protein_curie is not None and uniprot_curie is not None:
        gu.addSameIndividual(self.graph, ncbi_protein_curie, uniprot_curie)

    is_missense = (
        protein_variant_type == 'nonsynonymous - missense'
        or re.search(r'missense', variant_label) is not None)
    if is_missense:
        geno.addGenotype(variant_id, variant_label,
                         geno.genoparts['missense_variant'])

    # Get gene ID from gene map
    self._add_variant_gene_relationship(variant_id, transcript_gene)

    # e.g. "p.V600E": reference residue, position, altered residue
    amino_acid_regex = re.compile(
        r'^p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})$')

    # Only missense variants are parsed; a missing amino acid string is
    # treated as unparseable rather than crashing on .rstrip()
    match = None
    if is_missense and amino_acid_variant is not None:
        match = amino_acid_regex.match(amino_acid_variant.rstrip())

    if match is None:
        logger.debug(
            "Could not parse amino acid information"
            " from %s variant:"
            " %s type: %s",
            amino_acid_variant, variant_label, protein_variant_type)
        return

    ref_amino_acid, position, altered_amino_acid = match.groups()

    # Add amino acid change to model
    gu.addTriple(self.graph, variant_id,
                 geno.properties['reference_amino_acid'],
                 ref_amino_acid, is_literal)
    gu.addTriple(self.graph, variant_id,
                 geno.properties['results_in_amino_acid_change'],
                 altered_amino_acid, is_literal)

    aa_region_id = ":_{0}{1}{2}Region".format(position, position, aa_seq_id)
    self._add_feature_with_coords(variant_id, position, position,
                                  aa_seq_id, aa_region_id)