Пример #1
0
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say that a given publication
        is_about a gene.  Publications are added as NamedIndividuals.
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        is_about = gu.getNode(gu.object_properties['is_about'])
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                ##### set filter=None in init if you don't want to have a filter
                #if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #       or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                ##### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                if gene_num == '-' or pubmed_num == '-':
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                # add the gene, in case it hasn't before
                gu.addClassToGraph(g, gene_id, None)
                # add the publication as a NamedIndividual
                gu.addIndividualToGraph(g, pubmed_id, None, None)  # add type publication
                self.graph.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

                if not self.testMode and limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def addRefToGraph(self, g):

        gu = GraphUtils(curie_map.get())

        n = self.short_citation
        if n is None:
            n = self.title

        if self.ref_url is not None:
            ref_uri = URIRef(self.ref_url)
            g.add((ref_uri, DC['title'], Literal(self.title)))
            g.add((ref_uri, RDF['type'], gu.getNode(self.ref_type)))
            g.add((ref_uri, RDFS['label'], Literal(n)))
        elif self.ref_id is not None:
            gu.addIndividualToGraph(g, self.ref_id, n, self.ref_type)
            if self.title is not None:
                gu.addTitle(g, self.ref_id, self.title)
        else:
            # should never be true
            logger.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for a in self.author_list:
        #        gu.addTriple(
        #           g, self.ref_id, self.props['has_author'], a, True)
        return
Пример #3
0
    def test_therapeutic_relationship(self):
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils
        from dipper import curie_map

        # Make testutils object and load ttl
        test_query = TestUtils(self.source.graph)
        test_query.load_testgraph_from_turtle(self.source)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models to be automatically generated?
        sparql_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # SPARQL variables to check
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_uri = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = gu.getNode(disease_id)
        eco = 'ECO:0000033'
        rel_id = gu.object_properties['substance_that_treats']
        pubmed_id = 'PMID:16785264'
        pubmed_uri = gu.getNode(pubmed_id)

        # consider replacing with make_ctd_chem_disease_assoc_id()
        assoc_id = self.source.make_association_id('ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_uri = gu.getNode(assoc_id)

        # One of the expected outputs from query
        expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        self.assertTrue(expected_output in sparql_output, "did not find expected association: " + assoc_id +
                        " found: " + pprint.pformat(sparql_output))

        logger.info("Test query data finished.")
Пример #4
0
    def test_therapeutic_relationship(self):
        from dipper.utils.TestUtils import TestUtils
        from dipper.utils.GraphUtils import GraphUtils

        # Make testutils object and load bindings
        test_query = TestUtils(self.ctd.graph)
        self.ctd.load_bindings()

        # Expected structure
        sparql_query = """
                       SELECT ?assoc ?pubmed ?disease ?chemical
                       WHERE {
                       ?assoc a Annotation: ;
                           dc:evidence OBO:ECO_0000033 ;
                           dc:source ?pubmed ;
                           :hasObject ?disease ;
                           :hasPredicate OBO:RO_0002606 ;
                           :hasSubject ?chemical .}
                       """

        # SPARQL variables to check
        gu = GraphUtils(curie_map.get())
        chem_id = 'MESH:D009538'
        chem_uri = gu.getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = gu.getNode(disease_id)
        pubmed_id = 'PMID:16785264'
        pubmed_uri = gu.getNode(pubmed_id)
        rel_id = gu.object_properties['substance_that_treats']
        eco = 'ECO:0000033'
        # TODO PYLINT  make_association_id() does not exist in CTD
        # there is "_make_association()" with a different sig

        assoc_id = self.ctd.make_association_id(
            'ctd', chem_id, rel_id, disease_id, eco, pubmed_id)
        assoc_uri = gu.getNode(assoc_id)

        # Expected output from query
        expected_output = [assoc_uri, pubmed_uri, disease_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        self.assertTrue(expected_output in sparql_output)

        logger.info("Test finished.")
Пример #5
0
    def test_curieprefixes(self):
        """
        This will ensure that we can create identifiers for all of the defined curie prefixes using the
        GraphUtils.getNode() method
        :return:
        """
        from dipper.utils.GraphUtils import GraphUtils

        gu = GraphUtils(self.curie_map)

        # add one id per curie as classes to the graph
        for p in self.curie_map.keys():
            testid = p+':testme'
            n = gu.getNode(testid)
            m = "prefix \""+p+"\" has an error...can't create graph node"
            self.assertTrue(n is not None, m)

        return
Пример #6
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
        the graph additions are in the addXToFeature functions,
        but should be separated.
    TODO:
        this will need to be extended to properly deal with
        fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:00000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        # was GENO:0000626 (staining_intensity),
        # but changing to has_sequence_attribute
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'

    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',

        # the following are sequence attributes:
        'band_intensity':  'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, id, label, type, description=None):
        self.id = id
        self.label = label
        self.type = type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        self.start = None
        self.stop = None
        self.nobnodes = True  # TODO remove this before official release
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :return:
        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """

        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, graph, add_region=True, region_id=None,
            feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
            which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
            which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :return:
        """

        if feature_as_class:
            self.gu.addClassToGraph(graph, self.id, self.label, self.type,
                                    self.description)
        else:
            self.gu.addIndividualToGraph(graph, self.id, self.label, self.type,
                                         self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and \
                        self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None and\
                        self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_'+rid+"-Region"
                region_id = rid
                if self.nobnodes:
                    region_id = ':'+region_id
            self.gu.addTriple(graph, self.id, self.properties['location'],
                              region_id)
            self.gu.addIndividualToGraph(
                graph, region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.gu.addType(graph, region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(graph,
                                    self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(graph,
                                    self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(graph, region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
            Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return:
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        i = '_'
        if self.nobnodes:
            i = ':'+i
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        i += reference
        if coordinate is not None:
            # just in case it isn't a string already
            i = '-'.join((i, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                i = '-'.join((i, tstring))

        return i

    def addRegionPositionToGraph(
            self, graph, region_id, begin_position_id,
            end_position_id):

        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['begin'],
                              begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['end'],
                              end_position_id)

        return

    def addPositionToGraph(
            self, graph, reference_id, position,
            position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
            we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return:  Identifier of the position created
        """

        iid = self._makePositionId(reference_id, position, position_types)
        n = self.gu.getNode(iid)
        pos = self.gu.getNode(self.properties['position'])
        ref = self.gu.getNode(self.properties['reference'])
        if position is not None:
            graph.add((n, pos, Literal(position, datatype=XSD['integer'])))
        graph.add((n, ref, self.gu.getNode(reference_id)))
        if position_types is not None:
            for t in position_types:
                graph.add((n, RDF['type'], self.gu.getNode(t)))
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']

        if s is not None:
            graph.add((n, RDF['type'], self.gu.getNode(s)))

        return iid

    def addSubsequenceOfFeature(self, graph, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph:
        :param parentid:
        :return:
        """
        self.gu.addTriple(
            graph, self.id, self.properties['is_subsequence_of'], parentid)
        self.gu.addTriple(
            graph, parentid, self.properties['has_subsequence'], self.id)

        return

    def addTaxonToFeature(self, graph, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.gu.addTriple(
            graph, self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def loadAllProperties(self, graph):

        prop_dict = {
            Assoc(None).ANNOTPROP: self.annotation_properties,
            Assoc(None).OBJECTPROP: self.object_properties,
            Assoc(None).DATAPROP: self.data_properties
        }

        for p in prop_dict:
            self.gu.loadProperties(graph, prop_dict.get(p), p)

        return

    def addFeatureProperty(self, graph, property_type, property):
        self.gu.addTriple(graph, self.id, property_type, property)
        return

    def setNoBNodes(self, nobnodes):
        self.nobnodes = nobnodes
        return
Пример #7
0
class Genotype():
    """
    These methods provide convenient methods to add items related to a genotype and it's parts to a supplied graph.
    They follow the patterns set out in GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features, we use the GenomicFeature class to create them.
    """

    # special genotype parts mapped to their GENO and SO classes that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'genomic_background': 'GENO:0000611',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000008',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion' : 'GENO:0000534',
        'targeted_gene_complement' : 'GENO:0000527',
        'biological_region' : 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide': 'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide': 'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000638',
        'coding_transgene_feature': 'GENO:0000637',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        'is_sequence_variant_instance_of': 'GENO:0000408',  # links a alternate locus (instance) to a gene (class)
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        'has_member_with_allelotype': 'GENO:0000225',  # use this when relating populations
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'transcribed_to': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of' : 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_expression-variant_part' : 'GENO:0000532',
        'targeted_by' : 'GENO:0000634',  # between a (reagent-targeted gene) and a morpholino
        'derives_sequence_from_gene': 'GENO:0000639',   # FIXME should this just be subsequence of?
        'feature_to_gene_relation': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with https://github.com/monarch-initiative/GENO-ontology/issues/21
        'reference_nucleotide': 'GENO:reference_nucleotide',  # Made up term
        'reference_amino_acid': 'GENO:reference_amino_acid',  # Made up term
        'altered_nucleotide': 'GENO:altered_nucleotide',  # Made up term
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'  # Made up term
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):

        self.gu = GraphUtils(curie_map.get())

        self.graph = graph

        self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP)

        return

    def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None):
        """
        If a genotype_type is not supplied, we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:
        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label, genotype_type, genotype_description)

        return

    def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None):
        """
        Make an allele object. If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:
        """
        # TODO should we accept a list of allele types?
        if (allele_type is None):
            allele_type = self.genoparts['allele']  #TODO is this a good idea?
        self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description)

        return

    def addGene(self, gene_id, gene_label, gene_type=None, gene_description=None):
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.gu.addClassToGraph(self.graph, gene_id, gene_label, gene_type, gene_description)

        return

    def addConstruct(self, construct_id, construct_label, construct_type=None, construct_description=None):
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.gu.addIndividualToGraph(self.graph, construct_id, construct_label, construct_type, construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.  Examples of uses include between:
        an allele and a construct or strain here, a cell line and it's parent genotype.  Adding the
        parent and child to the graph should happen outside of this function call to
        ensure graph integrity.
        :param child_id:
        :param parent_id:
        :return:
        """

        self.gu.addTriple(self.graph, child_id, self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        self.gu.addTriple(self.graph, child_id, self.properties['derives_sequence_from_gene'], parent_id)
        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided, it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if (rel_id is None):
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.gu.addTriple(self.graph, allele_id, rel_id, gene_id)
        return

    def addTranscript(self, variant_id, transcript_id, transcript_label=None, transcript_type=None):
        """
        Add gene/variant/allele transcribes_to relationship
        :param variant_id:
        :param transcript_id:
        :param transcript_label:
        :param transcript_type:
        :return:
        """
        self.gu.addIndividualToGraph(self.graph, transcript_id, transcript_label, transcript_type)
        self.gu.addTriple(self.graph, variant_id, self.properties['transcribed_to'], transcript_id)

        return

    def addPolypeptide(self, polypeptide_id, polypeptide_label=None, transcript_id=None, polypeptide_type=None, ):
        """
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:
        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.gu.addIndividualToGraph(self.graph, polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.gu.addTriple(self.graph, transcript_id, self.properties['translates_to'], polypeptide_id)

        return


    def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None, allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles (reference or variant loci) are
        traditionally added, you can add any node (such as sequence_alterations for unlocated variations)
        to a vslc if they are known to be paired.  However, if a sequence_alteration's loci is unknown,
        it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:
        """

        # vslc has parts allele1/allele2
        gu = self.gu

        vslc = gu.getNode(vslc_id)
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        if zygosity_id is not None:
            gu.addTriple(self.graph, vslc_id, self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.  The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(vslc_id, parent_id, self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between a parent_id and the supplied part.
        By default the relationship will be BFO:has_part, but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.gu.addTriple(self.graph, parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self, sa_id, sa_label, sa_type=None, sa_description=None):
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.gu.addIndividualToGraph(self.graph, sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])
        return

    def addGenomicBackground(self, background_id, background_label, background_type=None, background_description=None):
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.gu.addIndividualToGraph(self.graph, background_id, background_label, background_type, background_description)

        return

    def addGenomicBackgroundToGenotype(self, background_id, genotype_id):
        self.gu.addType(self.graph, background_id, self.genoparts['genomic_background'])
        self.addParts(background_id, genotype_id, self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background, but could be added to any
        genotype part (including a gene, regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:
        :return:
        """
        in_taxon = self.gu.getNode(self.properties['in_taxon'])
        s = self.gu.getNode(genopart_id)
        self.graph.add((s, in_taxon, self.gu.getNode(taxon_id)))

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        # for example, add a morphant reagent thingy to the genotype, assuming it's a extrinsic_genotype
        p = self.object_properties['has_expression-variant_part']
        self.gu.addTriple(self.graph, genotype_id, p, reagent_id)

        return

    def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type, gene_id, description=None):
        """
        Here, a gene-targeting reagent is added.  The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :return:
        """
        # TODO add default type to reagent_type
        self.gu.addIndividualToGraph(self.graph, reagent_id, reagent_label, reagent_type, description)

        self.gu.addTriple(self.graph, reagent_id, self.object_properties['targets_instance_of'], gene_id)

        return

    def addReagentTargetedGene(self, reagent_id, gene_id, targeted_gene_id=None, targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular reagent (such as a morpholino or rnai).
        If an instance id is not supplied, we will create it as an anonymous individual which is of the
        type GENO:reagent_targeted_gene.  We will also add the targets relationship between the reagent and gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
            rdf:label targeted_gene_label
            dc:description description
        <reagent_id> GENO:targets_instance_of <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :return:
        """

        # akin to a variant locus
        if (targeted_gene_id is None):
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
        self.gu.addIndividualToGraph(self.graph, targeted_gene_id, targeted_gene_label,
                                     self.genoparts['reagent_targeted_gene'], description)

        self.gu.addTriple(self.graph, targeted_gene_id,
                          self.object_properties['is_targeted_expression_variant_of'], gene_id)

        self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']
        self.gu.addIndividualToGraph(self.graph, tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        self.gu.addTriple(self.graph, population_id,
                          self.properties['has_member_with_allelotype'], member_id)

        return


    def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.gu.addIndividualToGraph(self.graph, tgc_id, tgc_label, tgc_type, tgc_description)

        return


    def addGenome(self, taxon_id, taxon_label=None):
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addClassToGraph(self.graph, genome_id, genome_label, Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addIndividualToGraph(self.graph, build_id, build_label, Feature.types['reference_genome'])
        self.gu.addType(self.graph, build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        # scrub off the taxon prefix.  put it in base space

        genome_id = re.sub('.*\:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(self, chr, tax_id, tax_label=None, build_id=None, build_label=None):
        # if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome.
        # if a build is included, punn the chromosome as a subclass of SO:chromsome, and
        # make the build-specific chromosome an instance of the supplied chr.  The chr then becomes part of the
        # build or genome.

        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.gu.addClassToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            chrinbuild_id = makeChromID(chr, build_id)  # the build-specific chromosome
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class
            self.gu.addIndividualToGraph(self.graph, chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome as a member of the build  (both ways)
            self.gu.addMember(self.graph, build_id, chrinbuild_id)
            self.gu.addMemberOf(self.graph, chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')  # the chrom class (generic) id
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.gu.addClassToGraph(self.graph, chrom_class_id, chrom_class_label,
                                Feature.types['chromosome'])

        return

    def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.  typically a genome-specific chr
        :return:
        """

        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.gu.addIndividualToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome'])
        self.gu.addType(self.graph, chr_id, chr_type)

        # add the build-specific chromosome as a member of the build  (both ways)
        self.gu.addMember(self.graph, reference_id, chr_id)
        self.gu.addMemberOf(self.graph, chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return:
        """
        vslc_label = ''

        if (gene_label is None and allele1_label is None and allele2_label is None):
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label
Пример #8
0
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        'is_about': 'IAO:00000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        return self.properties

    def _is_valid(self):

        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):

        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        s = self.gu.getNode(self.sub)
        o = self.gu.getNode(self.obj)
        p = self.gu.getNode(self.rel)

        if s is None:
            logging.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
                
        elif p is None:
            logging.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
                
        elif o is None:
            logging.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return
        else:
            g.add((s, p, o))

        if self.assoc_id is None:
            self.set_association_id()

        node = self.gu.getNode(self.assoc_id)
        g.add((node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_object'], self.obj)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        if self.evidence is not None and len(self.evidence) > 0:
            for e in self.evidence:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_evidence'], e)

        if self.source is not None and len(self.source) > 0:
            for s in self.source:
                if re.match('http', s):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'], s,
                                      True)
                else:
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'], s)

        if self.provenance is not None and len(self.provenance) > 0:
            for p in self.provenance:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_provenance'], p)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):

        self._add_basic_association_to_graph(g)

        return

    def set_subject(self, identifier):
        self.sub = identifier
        return

    def set_object(self, identifier):
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        This will set the association ID based on the internal parts
            of the association.
        To be used in cases where an external association identifier
            should be used.

        :param assoc_id:
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(self.definedby, self.sub,
                                                     self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):

        return self.assoc_id

    def set_description(self, description):
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):

        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_provenance(self, identifier):

        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    def load_all_properties(self, g):
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for p in props:
            self.gu.loadProperties(g, props[p], p)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url)
        """

        source = None
        if re.compile('http').match(pub_id):
            source = URIRef(pub_id)
        else:
            u = self.gu.getNode(pub_id)
            if u is not None:
                source = URIRef(u)
            else:
                logger.error(
                    "An id we don't know how to deal with: %s", pub_id)

        return source

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is empty or None, it will convert it to blank.
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return:
        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash += attributes

        for i, val in enumerate(items_to_hash):
            if val is None:
                items_to_hash[i] = ''

        byte_string = '+'.join(items_to_hash).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))