Пример #1
0
    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):

        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description
        self.start = None
        self.stop = None
        self.taxon = None
Пример #2
0
    def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
        """
        Create an association linking a drug and disease with
        RO:0002606 (substance_that_treats) and any supporting information
        such as FDA approval and source (not implemented)
        :param drug_id: Id as curie of the drug
        :param disease_id: Id as curie of the disease
        :param therapy_status: (Optional) String label of therapy approval status
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        # Placeholder relationship, note this does not exist in RO
        relationship_id = "RO:has_approval_status"
        gu.addTriple(self.graph, drug_id, gu.object_properties['substance_that_treats'], disease_id)
        # Make association
        drug_disease_annot = self.make_cgd_id("assoc{0}{1}".format(drug_id, disease_id))

        therapy_disease_assoc = Assoc(self.name)
        therapy_disease_assoc.set_subject(drug_id)
        therapy_disease_assoc.set_relationship(gu.object_properties['substance_that_treats'])
        therapy_disease_assoc.set_object(disease_id)
        therapy_disease_assoc.set_association_id(drug_disease_annot)
        therapy_disease_assoc.add_association_to_graph(self.graph)

        gu.addTriple(self.graph, drug_disease_annot, relationship_id, therapy_status_id)
Пример #3
0
    def __init__(self,
                 graph,
                 definedby,
                 entity_id,
                 phenotype_id,
                 rel=None,
                 entity_category=None,
                 phenotype_category=None):
        super().__init__(graph, definedby)
        self.entity_id = entity_id
        self.phenotype_id = phenotype_id

        if rel is None:
            rel = self.globaltt['has phenotype']

        self.start_stage_id = None
        self.end_stage_id = None
        self.environment_id = None
        self.stage_process_id = None

        self.set_subject(entity_id)
        self.set_object(phenotype_id)
        self.set_relationship(rel)

        self.subject_category = entity_category
        self.object_category = phenotype_category
        self.gut = GraphUtils(None)

        return
Пример #4
0
 def __init__(self, graph):
     if isinstance(graph, Graph):
         self.graph = graph
     else:
         raise ValueError("{} is not a graph".format(graph))
     self.model = Model(self.graph)
     self.globaltt = self.graph.globaltt
     self.globaltcid = self.graph.globaltcid
     self.curie_map = self.graph.curie_map
     self.gut = GraphUtils(self.curie_map)
Пример #5
0
 def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
     """
     :param variant_id
     :param hgnc_symbol
     :return: None
     """
     gu = GraphUtils(curie_map.get())
     geno = Genotype(self.graph)
     if hgnc_symbol in self.gene_map:
         gene_id = self.gene_map[hgnc_symbol]
     else:
         gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
         logger.warn("Can't map gene symbol {0} "
                     "to entrez ID".format(hgnc_symbol))
     gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
     geno.addAlleleOfGene(variant_id, gene_id)
     return
Пример #6
0
    def write(self, fmt='turtle', stream=None, write_metadata_in_main_graph=False):
        """
        This convenience method will write out all of the graphs
            associated with the source.
        Right now these are hardcoded to be a single main "graph"
        and a "src_dataset.ttl" and a "src_test.ttl"
        If you do not supply stream='stdout'
        it will default write these to files.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.
        :return: None

        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',         # ntriples
            'nquads': 'nq',
            'n3': 'n3'          # notation3
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                dest = '.'.join((dest, fmt))
            LOG.info("Setting outfile to %s", dest)

            # make the dataset_file name, always format as turtle
            self.datasetfile = '/'.join(
                (self.outdir, self.name + '_dataset.ttl'))
            LOG.info("Setting dataset file to %s", self.datasetfile)
        else:
            LOG.warning("No output file set. Using stdout")
            stream = 'stdout'

        graph_util = GraphUtils(None)

        # the  _dataset description is always turtle
        graph_util.write(self.dataset.get_graph(), 'turtle', filename=self.datasetfile)

        if self.test_mode:
            # unless we stop hardcoding, the test dataset is always turtle
            LOG.info("Setting testfile to %s", self.testfile)
            graph_util.write(self.testgraph, 'turtle', filename=self.testfile)

        if write_metadata_in_main_graph:
            self.graph = self.graph + self.dataset.get_graph()

        # print graph out
        if stream is None:
            outfile = dest
        elif stream.lower().strip() == 'stdout':
            outfile = None
        else:
            LOG.error("I don't understand our stream.")
            return

        graph_util.write(self.graph, fmt, filename=outfile)
Пример #7
0
    def write(self, fmt='turtle', stream=None):
        """
        This convenience method will write out all of the graphs
        associated with the source.
        Right now these are hardcoded to be a single "graph"
        and a "src_dataset.ttl" and a "src_test.ttl"
        If you do not supply stream='stdout'
        it will default write these to files.

        In addition, if the version number isn't yet set in the dataset,
        it will be set to the date on file.
        :return: None

        """
        fmt_ext = {
            'rdfxml': 'xml',
            'turtle': 'ttl',
            'nt': 'nt',  # ntriples
            'nquads': 'nq',
            'n3': 'n3'
        }

        # make the regular graph output file
        dest = None
        if self.name is not None:
            dest = '/'.join((self.outdir, self.name))
            if fmt in fmt_ext:
                dest = '.'.join((dest, fmt_ext.get(fmt)))
            else:
                dest = '.'.join((dest, fmt))
            logger.info("Setting outfile to %s", dest)

            # make the datasetfile name, always format as turtle
            datasetfile = '/'.join((self.outdir, self.name + '_dataset.ttl'))

            if self.dataset is not None and self.dataset.version is None:
                self.dataset.set_version_by_date()
                logger.info("No version for " + self.name +
                            " setting to date issued.")
        else:
            logger.warning("No output file set. Using stdout")
            stream = 'stdout'

        gu = GraphUtils(None)

        # the  _dataset descriptions is always turtle
        gu.write(self.dataset.getGraph(), 'turtle', file=datasetfile)

        # unless we stop hardcoding above, the test dataset is always turtle
        if self.testMode:
            gu.write(self.testgraph, 'turtle', file=self.testfile)

        # print graph out
        if stream is None:
            f = dest
        elif stream.lower().strip() == 'stdout':
            f = None
        else:
            logger.error("I don't understand your stream.")
            return

        gu.write(self.graph, fmt, file=f)
        return
Пример #8
0
    def add_disease_drug_variant_to_graph(self, table):
        """
        Takes an iterable of iterables as input with the following structure,
        optional indices can be Null:
        [[variant_key, variant_label, diagnoses_key, diagnoses,
          specific_diagnosis, organ, relationship,
          drug_key, drug, therapy_status (optional), pubmed_id(optional)]]

        See ongoing discussion of how to best model here:
        https://github.com/monarch-initiative/mckb/issues/9

        :param table: iterable of iterables, for example, a tuple of tuples
                      from _get_disease_drug_variant_relationship
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        for row in table:
            (variant_key, variant_label, diagnoses_key, diagnoses,
             specific_diagnosis, organ, relationship,
             drug_key, drug_label, therapy_status, pubmed_id) = row

            if specific_diagnosis is not None:
                diagnoses_label = specific_diagnosis
            else:
                diagnoses_label = diagnoses

            # Arbitrary IDs to be replaced by ontology mappings
            variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
            disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
            therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
            relationship_id = "RO:has_environment"
            disease_quality = ("CGD:{0}".format(relationship)).replace(" ", "_")
            has_quality_property = "BFO:0000159"
            drug_id = self._get_drug_id(drug_key, drug_label)

            geno.addGenotype(variant_id, variant_label,
                             geno.genoparts['sequence_alteration'])

            disease_instance_id = self.make_cgd_id('disease{0}{1}'.format(
                                                     diagnoses_label, variant_key))

            phenotype_instance_id = self.make_cgd_id('phenotype{0}{1}{2}'.format(
                                                     diagnoses_label, variant_key, relationship))

            phenotype_instance_label = "{0} with {1} to therapy".format(diagnoses_label, relationship)
            if relationship == "detrimental effect":
                phenotype_instance_label = "{0} with therapeutic response {1} to health"\
                                           .format(diagnoses_label, relationship)

            # Reified association for disease caused_by genotype
            variant_disease_annot = self.make_cgd_id("assoc{0}{1}".format(variant_key, diagnoses_label))

            # Add individuals/classes
            gu.addClassToGraph(self.graph, disease_id, diagnoses_label, 'DOID:4')

            gu.addClassToGraph(self.graph, drug_id, drug_label, 'CHEBI:23888')
            gu.addIndividualToGraph(self.graph, phenotype_instance_id, phenotype_instance_label,
                                    disease_id)
            gu.loadObjectProperties(self.graph, {relationship: relationship_id})

            if pubmed_id is not None:
                source_id = "PMID:{0}".format(pubmed_id)
                ref = Reference(source_id, Reference.ref_types['journal_article'])
                ref.addRefToGraph(self.graph)
                evidence = 'ECO:0000033'
            else:
                source_id = None
                evidence = None

            rel_id = gu.object_properties['has_phenotype']
            variant_phenotype_assoc = G2PAssoc(self.name,
                                               variant_id,
                                               phenotype_instance_id,
                                               rel_id)

            variant_phenotype_assoc.set_association_id(variant_disease_annot)
            if evidence:
                variant_phenotype_assoc.add_evidence(evidence)

            if source_id:
                variant_phenotype_assoc.add_source(source_id)

            variant_phenotype_assoc.add_association_to_graph(self.graph)
            gu.addTriple(self.graph, variant_disease_annot, relationship_id, drug_id)
            gu.addTriple(self.graph, phenotype_instance_id, has_quality_property, disease_quality)

            # Add therapy-disease association and approval status
            marker_relation = "RO:has_biomarker"

            disease_instance_label = "{0} with biomarker {1}".format(diagnoses_label, variant_label)
            gu.addIndividualToGraph(self.graph, disease_instance_id, disease_instance_label,
                                    disease_id)
            gu.addTriple(self.graph, disease_instance_id, marker_relation, variant_id)

            gu.addClassToGraph(self.graph, therapy_status_id, therapy_status)
            self._add_therapy_drug_association(drug_id, disease_instance_id, therapy_status_id)

        return
Пример #9
0
    def _add_variant_cdna_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and cDNA variants
        given a row of data
        :param iterable: row of data, see add_variant_info_to_graph()
                                      docstring for expected structure.
                                      Only applicable for structure 2.
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_literal = True

        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna,
         cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
         variant_base, primary_transcript_exons,
         primary_transcript_variant_sub_types, variant_type, chromosome,
         genome_build, build_version, build_date) = row

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        # Add gene
        self._add_variant_gene_relationship(variant_id, variant_gene)

        # Transcript reference for nucleotide position
        transcript_curie = self._make_transcript_curie(transcript_id)

        # Make region IDs
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(genome_build,
                                                          chromosome,
                                                          genome_pos_start,
                                                          genome_pos_end)

        # Add the genome build
        genome_label = "Human"
        build_id = "UCSC:{0}".format(genome_build)
        taxon_id = 'NCBITaxon:9606'
        geno.addGenome(taxon_id, genome_label)
        geno.addReferenceGenome(build_id, genome_build, taxon_id)

        # Add chromosome

        chrom_class_id = makeChromID(chromosome, '9606', 'CHR')  # the chrom class (generic) id
        chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')

        # first, add the chromosome class (in the taxon)
        geno.addChromosomeClass(chromosome, taxon_id, 'Human')

        # then, add the chromosome instance (from the given build)
        geno.addChromosomeInstance(chromosome, build_id, genome_build, chrom_class_id)

        # Add variant coordinates in reference to chromosome
        self._add_feature_with_coords(variant_id,genome_pos_start,
                                      genome_pos_end, chrom_instance_id, chrom_region_id)

        # Add mutation coordinates in reference to gene
        self._add_feature_with_coords(variant_id, bp_pos,
                                      bp_pos, transcript_curie, cdna_region_id)

        # Add nucleotide mutation
        gu.addTriple(self.graph, variant_id,
                     geno.properties['reference_nucleotide'],
                     ref_base, is_literal)
        gu.addTriple(self.graph, variant_id,
                     geno.properties['altered_nucleotide'],
                     variant_base, is_literal)

        """
        Here we update any internal cgd variant IDS with a cosmic ID
        or dbSNP ID.  Alternatively we could do this using sql rather
        than a sparql update which may be safer
        """
        # Add SNP xrefs
        if cosmic_id is not None:
            cosmic_id_list = cosmic_id.split(', ')
            cosmic_curie_list = []
            for c_id in cosmic_id_list:
                cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                cosmic_curie_list.append(cosmic_curie)
                gu.addIndividualToGraph(self.graph, cosmic_curie, c_id,
                                        geno.genoparts['missense_variant'])

            # If there are multiple ids set them equivalent to the first
            for curie in cosmic_curie_list[1:]:
                gu.addSameIndividual(self.graph, cosmic_curie_list[0], curie)

            self._replace_entity(self.graph, variant_id, cosmic_curie_list[0], self.bindings)

        if db_snp_id is not None:
            db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
            gu.addIndividualToGraph(self.graph, db_snp_curie, db_snp_id,
                                    geno.genoparts['missense_variant'])

            if cosmic_id is None:
                self._replace_entity(self.graph, variant_id, db_snp_curie, self.bindings)
            else:
                cosmic_id_list = cosmic_id.split(', ')
                for c_id in cosmic_id_list:
                    cosmic_curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', c_id)
                    gu.addSameIndividual(self.graph, cosmic_curie, db_snp_curie)

        return
Пример #10
0
    def _add_variant_protein_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and protein variants
        given a row of data
        :param iterable: row of data, see add_variant_info_to_graph()
                                      docstring for expected structure
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_missense = False
        is_literal = True

        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = row[0:11]

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        transcript_curie = self._make_transcript_curie(transcript_id)
        uniprot_curie = self._make_uniprot_polypeptide_curie(transcript_id)
        ncbi_protein_curie = self._make_ncbi_polypeptide_curie(transcript_id)

        geno.addGenotype(variant_id, variant_label,
                         geno.genoparts['sequence_alteration'])

        # Make fake amino acid sequence in case we
        # can't get a CCDS to Uniprot and/or NCBI Protein mapping
        aa_seq_id = self.make_cgd_id('transcript{0}'.format(amino_acid_variant))

        # Add Transcript:
        geno.addTranscript(variant_id, transcript_curie, transcript_id,
                           geno.genoparts['transcript'])

        # Add polypeptide
        if ncbi_protein_curie is not None:
            geno.addPolypeptide(ncbi_protein_curie,
                                self.transcript_xrefs['RefSeq'][transcript_id],
                                transcript_curie)
            aa_seq_id = ncbi_protein_curie
        if uniprot_curie is not None:
            geno.addPolypeptide(uniprot_curie,
                                self.transcript_xrefs['UniProt'][transcript_id],
                                transcript_curie)
            # Overrides ncbi_protein_curie,
            # but we set them as equal individuals below
            aa_seq_id = uniprot_curie

        if ncbi_protein_curie is not None and uniprot_curie is not None:
            gu.addSameIndividual(self.graph, ncbi_protein_curie, uniprot_curie)
        else:
            aa_seq_id = self.make_cgd_id('transcript{0}'.format(amino_acid_variant))

        if protein_variant_type == 'nonsynonymous - missense' \
                or re.search(r'missense', variant_label):
            is_missense = True
            geno.addGenotype(variant_id, variant_label,
                             geno.genoparts['missense_variant'])

        # Get gene ID from gene map
        self._add_variant_gene_relationship(variant_id, transcript_gene)

        amino_acid_regex = re.compile(r'^p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})$')

        if is_missense:
            match = re.match(amino_acid_regex, amino_acid_variant.rstrip())
        else:
            match = None

        if match is not None:
            ref_amino_acid = match.group(1)
            position = match.group(2)
            altered_amino_acid = match.group(3)
        else:
            logger.debug("Could not parse amino acid information"
                         " from {0} variant:"
                         " {1} type: {2}".format(amino_acid_variant,
                                                 variant_label,
                                                 protein_variant_type))

        # Add amino acid change to model
        if is_missense is True and match is not None:
            gu.addTriple(self.graph, variant_id,
                         geno.properties['reference_amino_acid'],
                         ref_amino_acid, is_literal)
            gu.addTriple(self.graph, variant_id,
                         geno.properties['results_in_amino_acid_change'],
                         altered_amino_acid, is_literal)

            aa_region_id = ":_{0}{1}{2}Region".format(position, position, aa_seq_id)
            self._add_feature_with_coords(variant_id, position,
                                          position, aa_seq_id, aa_region_id)

        return