Пример #1
0
    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(repo_id, repo_label,
                                       self.globaltt['collection'])
            reference.addPage(repo_id, repo_page)

        return
Пример #2
0
    def _add_assertion_provenance(
            self,
            assoc_id,
            evidence_line_bnode
    ):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id(
            "assertion{0}{1}".format(assoc_id, self.localtt['IMPC']), '_')

        model.addIndividualToGraph(assertion_bnode, None, self.globaltt['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, self.localtt['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, self.globaltt['proposition_asserted_in'], assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            self.resolve('is_assertion_supported_by_evidence'),  # "SEPIO:0000111"
            evidence_line_bnode)

        return
Пример #3
0
    def _add_assertion_provenance(
            self,
            assoc_id,
            evidence_line_bnode
    ):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id(
            "assertion{0}{1}".format(assoc_id, self.localtt['IMPC']), '_')

        model.addIndividualToGraph(assertion_bnode, None, self.globaltt['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, self.localtt['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, self.globaltt['is_asserted_in'], assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            self.globaltt['is_assertion_supported_by_evidence'],  # "SEPIO:0000111"
            evidence_line_bnode)
Пример #4
0
    def _process_breed_row(self, row):
        model = Model(self.graph)
        # in test mode, keep all breeds of our test species
        if self.test_mode and row['gb_species_id'] not in self.test_ids[
                'taxon']:
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [row['breed_id']]

        breed_id = 'OMIA-breed:' + str(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:' + str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' (' + species_label + ')'

        model.addIndividualToGraph(
            breed_id,
            breed_label,
            tax_id,
            ind_category=blv.terms['PopulationOfIndividualOrganisms'])
        self.label_hash[breed_id] = breed_label
Пример #5
0
    def _add_assertion_provenance(self, assoc_id, evidence_line_bnode,
                                  impc_map):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id(
            "assertion{0}{1}".format(assoc_id,
                                     impc_map['asserted_by']['IMPC']), '_')

        model.addIndividualToGraph(
            assertion_bnode, None,
            provenance_model.provenance_types['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, impc_map['asserted_by']['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, provenance_model.object_properties['is_asserted_in'],
            assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            provenance_model.object_properties['is_assertion_supported_by'],
            evidence_line_bnode)

        return
Пример #6
0
    def _add_assertion_provenance(
            self, assoc_id, evidence_line_bnode, impc_map):
        """
        Add assertion level provenance, currently always IMPC
        :param assoc_id:
        :param evidence_line_bnode:
        :return:
        """
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)
        assertion_bnode = self.make_id("assertion{0}{1}".format(
            assoc_id, impc_map['asserted_by']['IMPC']),  '_')

        model.addIndividualToGraph(
            assertion_bnode, None,
            provenance_model.provenance_types['assertion'])

        provenance_model.add_assertion(
            assertion_bnode, impc_map['asserted_by']['IMPC'],
            'International Mouse Phenotyping Consortium')

        self.graph.addTriple(
            assoc_id, provenance_model.object_properties['is_asserted_in'],
            assertion_bnode)

        self.graph.addTriple(
            assertion_bnode,
            provenance_model.object_properties['is_assertion_supported_by'],
            evidence_line_bnode)

        return
Пример #7
0
    def _process_collection(self, collection_id, label, page):
        """
        This function will process the data supplied internally
        about the repository from Coriell.

        Triples:
            Repository a ERO:collection
            rdf:label Literal(label)
            foaf:page Literal(page)

        :param collection_id:
        :param label:
        :param page:
        :return:
        """
        # #############    BUILD THE CELL LINE REPOSITORY    #############
        for graph in [self.graph, self.testgraph]:
            # TODO: How to devise a label for each repository?
            model = Model(graph)
            reference = Reference(graph)
            repo_id = 'CoriellCollection:' + collection_id
            repo_label = label
            repo_page = page

            model.addIndividualToGraph(
                repo_id, repo_label, self.globaltt['collection'])
            reference.addPage(repo_id, repo_page)

        return
Пример #8
0
    def _build_gene_disease_model(self,
                                  gene_id,
                                  relation_id,
                                  disease_id,
                                  variant_label,
                                  consequence_predicate=None,
                                  consequence_id=None,
                                  allelic_requirement=None,
                                  pmids=None):
        """
        Builds gene variant disease model

        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        pmids = [] if pmids is None else pmids

        is_variant = False
        variant_or_gene = gene_id

        variant_id_string = variant_label
        variant_bnode = self.make_id(variant_id_string, "_")

        if consequence_predicate is not None \
                and consequence_id is not None:
            is_variant = True
            model.addTriple(variant_bnode, consequence_predicate,
                            consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(consequence_id,
                               consequence_id.strip(':').replace('_', ' '))

        if is_variant:
            variant_or_gene = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(variant_bnode, variant_label,
                                       self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(self.graph, self.name, variant_or_gene, disease_id,
                         relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        if allelic_requirement is not None and is_variant is False:
            model.addTriple(assoc.assoc_id,
                            self.globaltt['has_allelic_requirement'],
                            allelic_requirement)
            if allelic_requirement.startswith(':'):
                model.addLabel(
                    allelic_requirement,
                    allelic_requirement.strip(':').replace('_', ' '))
Пример #9
0
    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            self.check_header(self.files['straininfo']['file'], f.readline())
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, reiken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return
Пример #10
0
class Environment():
    """
    These methods provide convenient methods
    to add items related to an experimental environment
    and it's parts to a supplied graph.

    This is a stub.
    """


    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        if env_type is None:
            env_type = self.globaltt['environmental_system']

        self.model.addIndividualToGraph(
            env_id, env_label, env_type, env_description)

        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None, cond_description=None):
        if cond_type is None:
            cond_type = self.globaltt['environmental_condition']

        self.model.addIndividualToGraph(
            cond_id, cond_label, cond_type, cond_description)

        return

    def addComponentToEnvironment(self, env_id, component_id):

        self.graph.addTriple(env_id, self.globaltt['has_part'], component_id)

        return

    def addComponentAttributes(self, component_id, entity_id, value=None, unit=None):

        self.graph.addTriple(
            component_id, self.globaltt['has_part'], entity_id)
        # TODO add value and units

        return
Пример #11
0
class Environment():
    """
    These methods provide convenient methods
    to add items related to an experimental environment
    and it's parts to a supplied graph.

    This is a stub ready for expansion.
    """


    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        if env_type is None:
            env_type = self.globaltt['environmental_system']

        self.model.addIndividualToGraph(
            env_id, env_label, env_type, env_description)

        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None, cond_description=None):
        if cond_type is None:
            cond_type = self.globaltt['environmental_condition']

        self.model.addIndividualToGraph(
            cond_id, cond_label, cond_type, cond_description)

        return

    def addComponentToEnvironment(self, env_id, component_id):

        self.graph.addTriple(env_id, self.globaltt['has_part'], component_id)

        return

    def addComponentAttributes(self, component_id, entity_id, value=None, unit=None):

        self.graph.addTriple(
            component_id, self.globaltt['has_part'], entity_id)
        # TODO add value and units

        return
Пример #12
0
    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, reiken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return
Пример #13
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:' + re.sub(r':', '',
                                      gene_id) + '-' + disorder_num + 'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(alt_locus, alt_label,
                                       Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Пример #14
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:'+re.sub(r':', '', gene_id)+'-'+disorder_num+'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(
                alt_locus, alt_label, Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Пример #15
0
 def _add_gene_to_graph(self, gene, variant_bnode, gene_id, relation):
     """
     :param gene:
     :param variant_bnode:
     :return:
     """
     model = Model(self.graph)
     if gene_id:
         self.graph.addTriple(variant_bnode, relation, gene_id)
     elif gene:
         LOG.info("gene %s not mapped to NCBI gene, making blank node", gene)
         gene_bnode = self.make_id("{0}".format(gene), "_")
         model.addIndividualToGraph(gene_bnode, gene)
         self.graph.addTriple(variant_bnode, relation, gene_bnode)
Пример #16
0
 def _add_gene_to_graph(self, gene, variant_bnode, gene_id, relation):
     """
     :param gene:
     :param variant_bnode:
     :return:
     """
     model = Model(self.graph)
     if gene_id:
         self.graph.addTriple(variant_bnode, relation, gene_id)
     elif gene:
         LOG.info("gene %s not mapped to NCBI gene, making blank node",
                  gene)
         gene_bnode = self.make_id("{0}".format(gene), "_")
         model.addIndividualToGraph(gene_bnode, gene)
         self.graph.addTriple(variant_bnode, relation, gene_bnode)
Пример #17
0
    def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
        """
        :param gene_id: str Non curified ID
        :param gene_label: str Gene symbol
        :param anatomy_curie: str curified anatomy term
        :param rank: str rank
        :return: None
        """
        g2a_association = Assoc(self.graph, self.name)
        model = Model(self.graph)
        gene_curie = "ENSEMBL:{}".format(gene_id)

        rank = re.sub(r',', '', str(rank))  # ? can't do RE on a float ...
        model.addIndividualToGraph(gene_curie, None)
        g2a_association.sub = gene_curie
        g2a_association.obj = anatomy_curie
        g2a_association.rel = self.globaltt['expressed in']
        g2a_association.add_association_to_graph()
        g2a_association.add_predicate_object(
            self.globaltt['has_quantifier'], float(rank), 'Literal', 'xsd:float')
Пример #18
0
    def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
        """
        :param gene_id: str Non curified ID
        :param gene_label: str Gene symbol
        :param anatomy_curie: str curified anatomy term
        :param rank: str rank
        :return: None
        """
        g2a_association = Assoc(self.graph, self.name)
        model = Model(self.graph)
        gene_curie = "ENSEMBL:{}".format(gene_id)

        rank = re.sub(r',', '', str(rank))  # ? can't do RE on a float ...
        model.addIndividualToGraph(gene_curie, None)
        g2a_association.sub = gene_curie
        g2a_association.obj = anatomy_curie
        g2a_association.rel = self.globaltt['expressed in']
        g2a_association.add_association_to_graph()
        g2a_association.add_predicate_object(
            self.globaltt['has_quantifier'], float(rank), 'Literal', 'xsd:float')
        return
Пример #19
0
    def _process_breed_row(self, row):
        model = Model(self.g)
        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [int(row['breed_id'])]

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        model.addIndividualToGraph(breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

        return
Пример #20
0
 def _add_gene_anatomy_association(self, gene_id, anatomy_curie, rank):
     """
     :param gene_id: str Non curified ID
     :param gene_label: str Gene symbol
     :param anatomy_curie: str curified anatomy term
     :param rank: str rank
     :return: None
     """
     g2a_association = Assoc(self.graph, self.name)
     genotype = Genotype(self.graph)
     model = Model(self.graph)
     gene_curie = "ENSEMBL:{}".format(gene_id)
     rank = re.sub(r',', '', rank)
     model.addIndividualToGraph(ind_id=gene_curie, label=None,
                                ind_type=genotype.genoparts['gene'])
     g2a_association.sub = gene_curie
     g2a_association.obj = anatomy_curie
     g2a_association.rel = Assoc.object_properties['expressed_in']
     g2a_association.add_association_to_graph()
     g2a_association.add_predicate_object(
         Assoc.datatype_properties['has_quantifier'],
         float(rank), 'Literal', 'xsd:float')
     return
Пример #21
0
class Decipher(Source):
    """
    Deprecated - please see the EBIGene2Phen class, which parses the same
    file but fetches it from EBI which has clearer terms for redistribution,
    while Decipher has restrictive terms due to containing patient data in
    password protected datasets.

    The Decipher group curates and assembles the Development Disorder Genotype
    Phenotype Database (DDG2P) which is a curated list of genes reported to be
    associated with developmental disorders, compiled by clinicians as part of
    the DDD study to facilitate clinical feedback of likely causal variants.

    Beware that the redistribution of this data is a bit unclear from the
    [license](https://decipher.sanger.ac.uk/legal).
    If you intend to distribute this data, be sure to have the appropriate
    licenses in place.

    """

    files = {
        'annot': {
            'file': 'ddg2p.zip',
            'url': 'https://decipher.sanger.ac.uk/files/ddd/ddg2p.zip',
            'headers'}
    }

    def __init__(self, graph_type, are_bnodes_skolemized):
        super().__init__(
            graph_type,
            are_bnodes_skolemized,
            'decipher',
            ingest_title='Development Disorder Genotype Phenotype Database',
            ingest_url='https://decipher.sanger.ac.uk/',
            license_url='https://decipher.sanger.ac.uk/legal',
            data_rights='https://decipher.sanger.ac.uk/datasharing',
            # file_handle=None
        )

        if 'disease' not in self.all_test_ids:
            LOG.warning("not configured with disease test ids.")
            self.test_ids = []
        else:
            self.test_ids = self.all_test_ids['disease']

        self.graph = self.graph
        self.geno = Genotype(self.graph)
        self.model = Model(self.graph)
        self.graph_type = graph_type
        self.are_bnodes_skolemized = are_bnodes_skolemized

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)

        # since there's a dependency on HGNC files; fetch those too

        hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc.fetch(is_dl_forced)

        return

    def parse(self, limit=None):
        if limit is not None:
            LOG.info("Only parsing first %s rows", limit)

        LOG.info("Parsing files...")

        if self.test_only:
            self.test_mode = True
            self.graph = self.testgraph
        else:
            self.graph = self.graph

        self.geno = Genotype(self.graph)

        # rare disease-phenotype associations
        self._process_ddg2p_annotations(limit)

        LOG.info("Finished parsing.")

        return

    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        :param limit:
        :return:

        """

        line_counter = 0
        if self.graph is not None:
            graph = self.graph
        else:
            graph = self.graph

        # in order for this to work, we need to map the HGNC id-symbol;
        hgnc = HGNC(self.graph_type, self.are_bnodes_skolemized)
        hgnc_symbol_id_map = hgnc.get_symbol_id_map()

        myzip = ZipFile(
            '/'.join((self.rawdir, self.files['annot']['file'])), 'r')

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        with myzip.open(fname, 'r') as f:
            f = io.TextIOWrapper(f)
            reader = csv.reader(f, delimiter='\t', quotechar='\"')
            # score_means_by_measure = {}
            # strain_scores_by_measure = {}   # TODO theseare unused
            for row in reader:
                line_counter += 1
                if re.match(r'#', row[0]):   # skip comments
                    continue

                (gencode_gene_name, mode, category, consequence, disease, omim,
                 ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    LOG.error(
                        "Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                    continue
                # add the gene
                self.model.addClassToGraph(hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs

                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                if omim.strip() != '':
                    omim_id = 'OMIM:'+str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    self.model.addClassToGraph(omim_id, None)

                    # ??? rel is never used
                    # if category.strip() == 'Confirmed DD gene':
                    #     rel = self.self.globaltt['has phenotype']
                    # elif category.strip() == 'Probable DD gene':
                    #    rel = self.self.globaltt['has phenotype']
                    # elif category.strip() == 'Possible DD gene':
                    #    rel = self.self.globaltt['contributes to']
                    # elif category.strip() == 'Not DD gene':
                    #    # TODO negative annotation
                    #    continue
                    assoc = G2PAssoc(graph, self.name, allele_id, omim_id)
                    # TODO 'rel' is assigned to but never used

                    for p in re.split(r';', pubmed_ids):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:' + str(p)
                            r = Reference(
                                graph, pmid, self.globaltt['journal article'])
                            r.addRefToGraph()
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph()
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    LOG.warning(
                        "No omim id on line %d\n%s", line_counter, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        myzip.close()
        LOG.warning(
            "gene-disorder associations with no omim id: %d",
            unmapped_omim_counter)
        LOG.warning("unmapped gene count: %d", unmapped_gene_count)

        return

    def make_allele_by_consequence(self, consequence, gene_id, gene_symbol):
        """
        Given a "consequence" label that describes a variation type,
        create an anonymous variant of the specified gene as an instance of
        that consequence type.

        :param consequence:
        :param gene_id:
        :param gene_symbol:
        :return: allele_id
        """

        allele_id = None

        # Loss of function : Nonsense, frame-shifting indel,
        #   essential splice site mutation, whole gene deletion or any other
        #   mutation where functional analysis demonstrates clear reduction
        #   or loss of function
        # All missense/in frame : Where all the mutations described in the data
        #   source are either missense or in frame deletions and there is no
        #   evidence favoring either loss-of-function, activating or
        #   dominant negative effect
        # Dominant negative : Mutation within one allele of a gene that creates
        #   a significantly greater deleterious effect on gene product
        #   function than a monoallelic loss of function mutation
        # Activating : Mutation, usually missense that results in
        #   a constitutive functional activation of the gene product
        # Increased gene dosage : Copy number variation that increases
        #   the functional dosage of the gene
        # Cis-regulatory or promotor mutation : Mutation in cis-regulatory
        #   elements that lies outwith the known transcription unit and
        #   promotor of the controlled gene
        # Uncertain : Where the exact nature of the mutation is unclear or
        #   not recorded

        type_id = self.resolve(consequence, mandatory=False)
        if type_id == consequence:
            LOG.warning("Consequence type unmapped: %s", str(consequence))
            type_id = self.globaltt['sequence_variant']

        # make the allele
        allele_id = ''.join((gene_id, type_id))
        allele_id = re.sub(r':', '', allele_id)
        allele_id = '_:'+allele_id  # make this a BNode
        allele_label = ' '.join((consequence, 'allele in', gene_symbol))

        self.model.addIndividualToGraph(allele_id, allele_label, type_id)
        self.geno.addAlleleOfGene(allele_id, gene_id)

        return allele_id
Пример #22
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        src_key = 'allele_pheno'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            if row[0] != '!gaf-version: 2.0':
                LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
                # raise
            for row in reader:
                if row[0][0] == '!':
                    continue
                # db = row[col.index('DB')]
                gene_num = row[col.index('DB Object ID')]
                # gene_symbol = row[col.index('DB Object Symbol')]
                is_not = row[col.index('Qualifier')]
                phenotype_id = row[col.index('GO ID')]
                ref = row[col.index('DB:Reference (|DB:Reference)')].strip()
                eco_symbol = row[col.index('Evidence Code')]
                with_or_from = row[col.index('With (or) From')]
                # aspect = row[col.index('Aspect')]
                # gene_name = row[col.index('DB Object Name')]
                # gene_synonym = row[col.index('DB Object Synonym (|Synonym)')]
                # gene_class = row[col.index('DB Object Type')]
                # taxon = row[col.index('Taxon(|taxon)')]
                # date = row[col.index('Date')]
                # assigned_by = row[col.index('Assigned By')]
                # blank = row[col.index('Annotation Extension')]
                # blank2 = row[col.index('Gene Product Form ID')]

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_curie = None
                if eco_symbol.strip() != '' and eco_symbol in self.gaf_eco:
                    eco_curie = self.gaf_eco[eco_symbol]
                else:
                    LOG.warning(
                        'Evidence code %s is not found in the (gaf) gaf_eco',
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        reader.line_num)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            # rnai_num = re.sub(r'WormBase:', '', allele_id)
                            # rnai_id = allele_id
                            # rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            # geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id,
                                self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            # vl_id = '_:'+'-'.join((gene_num, allele_num))
                            # geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            # geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id,
                                         phenotype_id)

                        if eco_curie is not None:
                            assoc.add_evidence(eco_curie)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(self.globaltt[
                                    'inference from background scientific knowledge']
                                                   )
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and reader.line_num > limit:
                    break
Пример #23
0
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say
        that a given publication is_about a gene.
        Publications are added as NamedIndividuals.

        These are filtered on the taxon.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        assoc_counter = 0
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # ## set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #        or (self.filter == 'geneids' and \
                #            (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                if gene_num == '-' or pubmed_num == '-':
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                else:
                    model.addIndividualToGraph(gene_id, None)
                # add the publication as a NamedIndividual
                # add type publication
                model.addIndividualToGraph(pubmed_id, None, None)
                reference = Reference(g, pubmed_id,
                                      Reference.ref_types['journal_article'])
                reference.addRefToGraph()
                g.addTriple(pubmed_id, model.object_properties['is_about'],
                            gene_id)
                assoc_counter += 1
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info("Processed %d pub-gene associations", assoc_counter)

        return
Пример #24
0
    def _get_gene_info(self, limit):
        """
        Currently loops through the gene_info file and
        creates the genes as classes, typed with SO.  It will add their label,
        any alternate labels as synonyms, alternate ids as equivlaent classes.
        HPRDs get added as protein products.
        The chromosome and chr band get added as blank node regions,
        and the gene is faldo:located
        on the chr band.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            row = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(row))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations, modification_date,
                 feature_type) = line.split('\t')

                # ##set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #           or (self.filter == 'geneids' and \
                #               (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(gene_id, label, gene_type_id,
                                               desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if str(chrom) != '-' and str(chrom) != '':
                    if re.search(r'\|', str(chrom)) and \
                            str(chrom) not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        logger.info(
                            '%s is non-uniquely mapped to %s.' +
                            ' Skipping for now.', gene_id, str(chr))
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    # if(not re.match(
                    #        r'(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                    #    print('odd chr=',str(chr))
                    if str(chrom) == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in re.split(r'\|', str(chrom)):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        band_match = re.match(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$',
                                              map_loc)
                        if band_match is not None and \
                                len(band_match.groups()) > 0:
                            # if tax_num != '9606':
                            #     continue
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first
                            bid = re.sub(r'^' + c, '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c + bid, tax_num, 'CHR')
                            # print(map_loc,'-->',bid,'-->',maploc_id)
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug('not regular band pattern for %s: %s',
                                         gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Пример #25
0
class Provenance:
    """
    To model provenance as the basis for an association.
    This encompasses:
        * Process history leading to a claim being made,
          including processes through which evidence is evaluated
        * Processes through which information used as evidence is created.

    Provenance metadata includes accounts of who conducted these processes,
     what entities participated in them, and when/where they occurred.

    """
    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def add_date_created(self, prov_type, date):
        self.graph.addTriple(object_is_literal=True,
                             subject_id=prov_type,
                             predicate_id=self.globaltt['created_on'],
                             obj=date)
        return

    def add_study_parts(self, study, study_parts):
        for part in study_parts:
            self.graph.addTriple(study, self.globaltt['has_part'], part)
        return

    def add_study_to_measurements(self, study, measurements):
        for measurement in measurements:
            self.graph.addTriple(measurement, self.globaltt['output_of'],
                                 study)
        return

    def add_study_measure(self, study, measure):
        self.graph.addTriple(study, self.globaltt['measures_parameter'],
                             measure)
        return

    def add_assertion(self, assertion, agent, agent_label, date=None):
        """
        Add assertion to graph
        :param assertion:
        :param agent:
        :param evidence_line:
        :param date:
        :return: None
        """
        self.model.addIndividualToGraph(assertion, None,
                                        self.globaltt['assertion'])

        self.add_agent_to_graph(agent, agent_label,
                                self.globaltt['organization'])

        self.graph.addTriple(assertion, self.globaltt['created_by'], agent)

        if date is not None:
            self.graph.addTriple(self.graph, assertion,
                                 self.globaltt['date_created'], date)

        return

    def add_agent_to_graph(self,
                           agent_id,
                           agent_label,
                           agent_type=None,
                           agent_description=None):

        if agent_type is None:
            agent_type = self.globaltt['organization']
        self.model.addIndividualToGraph(agent_id, agent_label, agent_type,
                                        agent_description)

        return

    def add_assay_to_graph(self,
                           assay_id,
                           assay_label,
                           assay_type=None,
                           assay_description=None):
        if assay_type is None:
            assay_type = self.globaltt['assay']
        self.model.addIndividualToGraph(assay_id, assay_label, assay_type,
                                        assay_description)

        return
Пример #26
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self, graph, feature_id=None, label=None,
            feature_type=None, description=None):

        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.fid = feature_id
        self.label = label
        self.ftype = feature_type
        self.description = description
        self.start = None
        self.stop = None
        self.taxon = None
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        :return:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:

        :return:

        """

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(self.start['type'])
                if self.stop is not None and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_:'+rid+"-Region"
                region_id = rid

            self.graph.addTriple(self.fid, self.globaltt['location'], region_id)
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return:
        """

        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

        return

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id, self.globaltt['position'], position, object_is_literal=True,
                literal_type="xsd:integer")
        self.graph.addTriple(pos_id, self.globaltt['reference'], reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param graph:
        :param parentid:

        :return:

        """
        self.graph.addTriple(self.fid, self.globaltt['is subsequence of'], parentid)
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(parentid, self.globaltt['has subsequence'], self.fid)

        return

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(self.fid, self.globaltt['in taxon'], self.taxon)

        return

    def addFeatureProperty(self, property_type, feature_property):
        self.graph.addTriple(self.fid, property_type, feature_property)
        return
Пример #27
0
    def _add_evidence(
            self,
            assoc_id,
            eco_id,
            p_value,
            percentage_change,
            effect_size,
            study_bnode
    ):
        """
        :param assoc_id: assoc curie used to reify a
        genotype to phenotype association, generated in _process_data()
        :param eco_id: eco_id as curie, hardcoded in _process_data()
        :param p_value: str, from self.files['all']
        :param percentage_change: str, from self.files['all']
        :param effect_size: str, from self.files['all']
        :param study_bnode: str, from self.files['all']
        :param phenotyping_center: str, from self.files['all']
        :return: str, evidence_line_bnode as curie
        """

        evidence_model = Evidence(self.graph, assoc_id)
        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # Add line of evidence
        evidence_line_bnode = self.make_id(
            "{0}{1}".format(assoc_id, study_bnode), '_')
        evidence_model.add_supporting_evidence(evidence_line_bnode)
        model.addIndividualToGraph(evidence_line_bnode, None, eco_id)

        # Add supporting measurements to line of evidence
        measurements = {}
        if p_value is not None or p_value != "":
            p_value_bnode = self.make_id(
                "{0}{1}{2}".format(evidence_line_bnode, 'p_value', p_value), '_')
            model.addIndividualToGraph(p_value_bnode, None, self.globaltt['p-value'])
            try:
                measurements[p_value_bnode] = float(p_value)
            except ValueError:
                measurements[p_value_bnode] = p_value
        if percentage_change is not None and percentage_change != '':

            fold_change_bnode = self.make_id(
                "{0}{1}{2}".format(
                    evidence_line_bnode, 'percentage_change', percentage_change), '_')
            model.addIndividualToGraph(
                fold_change_bnode, None, self.resolve('percentage_change'))
            measurements[fold_change_bnode] = percentage_change
        if effect_size is not None or effect_size != "":
            fold_change_bnode = self.make_id(
                "{0}{1}{2}".format(
                    evidence_line_bnode, 'effect_size', effect_size), '_')
            model.addIndividualToGraph(
                fold_change_bnode, None, self.globaltt['effect size estimate'])
            measurements[fold_change_bnode] = effect_size

        evidence_model.add_supporting_data(evidence_line_bnode, measurements)

        # Link evidence to provenance by connecting to study node
        provenance_model.add_study_to_measurements(study_bnode, measurements.keys())
        self.graph.addTriple(
            evidence_line_bnode, self.globaltt['has_supporting_activity'],
            study_bnode)

        return evidence_line_bnode
Пример #28
0
    def _process_qtls_genomic_location(
            self, raw, txid, build_id, build_label, common_name, limit=None):
        """
        This method

        Triples created:

        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        # assume that chrs get added to the genome elsewhere

        taxon_curie = 'NCBITaxon:' + txid
        eco_id = self.globaltt['quantitative trait analysis evidence']
        LOG.info("Processing QTL locations for %s from %s", taxon_curie, raw)
        with gzip.open(raw, 'rt', encoding='ISO-8859-1') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                line_counter += 1
                if re.match(r'^#', ' '.join(row)):
                    continue

                (chromosome, qtl_source, qtl_type, start_bp, stop_bp, frame, strand,
                 score, attr) = row
                example = '''
Chr.Z   Animal QTLdb    Production_QTL  33954873      34023581...
QTL_ID=2242;Name="Spleen percentage";Abbrev="SPLP";PUBMED_ID=17012160;trait_ID=2234;
trait="Spleen percentage";breed="leghorn";"FlankMarkers=ADL0022";VTO_name="spleen mass";
MO_name="spleen weight to body weight ratio";Map_Type="Linkage";Model="Mendelian";
Test_Base="Chromosome-wise";Significance="Significant";P-value="<0.05";F-Stat="5.52";
Variance="2.94";Dominance_Effect="-0.002";Additive_Effect="0.01
                '''
                str(example)
                # make dictionary of attributes
                # keys are:
                # QTL_ID,Name,Abbrev,PUBMED_ID,trait_ID,trait,FlankMarkers,
                # VTO_name,Map_Type,Significance,P-value,Model,
                # Test_Base,Variance, Bayes-value,PTO_name,gene_IDsrc,peak_cM,
                # CMO_name,gene_ID,F-Stat,LOD-score,Additive_Effect,
                # Dominance_Effect,Likelihood_Ratio,LS-means,Breed,
                # trait (duplicate with Name),Variance,Bayes-value,
                # F-Stat,LOD-score,Additive_Effect,Dominance_Effect,
                # Likelihood_Ratio,LS-means

                # deal with poorly formed attributes
                if re.search(r'"FlankMarkers";', attr):
                    attr = re.sub(r'FlankMarkers;', '', attr)
                attr_items = re.sub(r'"', '', attr).split(";")
                bad_attrs = set()
                for attributes in attr_items:
                    if not re.search(r'=', attributes):
                        # remove this attribute from the list
                        bad_attrs.add(attributes)

                attr_set = set(attr_items) - bad_attrs
                attribute_dict = dict(item.split("=") for item in attr_set)

                qtl_num = attribute_dict.get('QTL_ID')
                if self.test_mode and int(qtl_num) not in self.test_ids:
                    continue
                # make association between QTL and trait based on taxon

                qtl_id = common_name + 'QTL:' + str(qtl_num)
                model.addIndividualToGraph(qtl_id, None, self.globaltt['QTL'])
                geno.addTaxon(taxon_curie, qtl_id)

                #
                trait_id = 'AQTLTrait:' + attribute_dict.get('trait_ID')

                # if pub is in attributes, add it to the association
                pub_id = None
                if 'PUBMED_ID' in attribute_dict.keys():
                    pub_id = attribute_dict.get('PUBMED_ID')
                    if re.match(r'ISU.*', pub_id):
                        pub_id = 'AQTLPub:' + pub_id.strip()
                        reference = Reference(graph, pub_id)
                    else:
                        pub_id = 'PMID:' + pub_id.strip()
                        reference = Reference(
                            graph, pub_id, self.globaltt['journal article'])
                    reference.addRefToGraph()

                # Add QTL to graph
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id,
                    self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)
                if 'P-value' in attribute_dict.keys():
                    scr = re.sub(r'<', '', attribute_dict.get('P-value'))
                    if ',' in scr:
                        scr = re.sub(r',', '.', scr)
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)

                assoc.add_association_to_graph()
                # TODO make association to breed
                # (which means making QTL feature in Breed background)

                # get location of QTL
                chromosome = re.sub(r'Chr\.', '', chromosome)
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                qtl_feature = Feature(graph, qtl_id, None, self.globaltt['QTL'])
                if start_bp == '':
                    start_bp = None
                qtl_feature.addFeatureStartLocation(
                    start_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                if stop_bp == '':
                    stop_bp = None
                qtl_feature.addFeatureEndLocation(
                    stop_bp, chrom_in_build_id, strand,
                    [self.globaltt['FuzzyPosition']])
                qtl_feature.addTaxonToFeature(taxon_curie)
                qtl_feature.addFeatureToGraph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        # LOG.warning("Bad attribute flags in this file")  # what does this even mean??
        LOG.info("Done with QTL genomic mappings for %s", taxon_curie)
        return
Пример #29
0
    def _process_qtls_genetic_location(
            self, raw, txid, common_name, limit=None):
        """
        This function processes

        Triples created:

        :param limit:
        :return:

        """
        aql_curie = self.files[common_name + '_cm']['curie']

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['quantitative trait analysis evidence']

        taxon_curie = 'NCBITaxon:' + txid

        LOG.info("Processing genetic location for %s from %s", taxon_curie, raw)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (qtl_id,
                 qtl_symbol,
                 trait_name,
                 assotype,
                 empty,
                 chromosome,
                 position_cm,
                 range_cm,
                 flankmark_a2,
                 flankmark_a1,
                 peak_mark,
                 flankmark_b1,
                 flankmark_b2,
                 exp_id,
                 model_id,
                 test_base,
                 sig_level,
                 lod_score,
                 ls_mean,
                 p_values,
                 f_statistics,
                 variance,
                 bayes_value,
                 likelihood_ratio,
                 trait_id, dom_effect,
                 add_effect,
                 pubmed_id,
                 gene_id,
                 gene_id_src,
                 gene_id_type,
                 empty2) = row

                if self.test_mode and int(qtl_id) not in self.test_ids:
                    continue

                qtl_id = common_name + 'QTL:' + qtl_id.strip()
                trait_id = ':'.join((aql_curie, trait_id.strip()))

                # Add QTL to graph
                feature = Feature(graph, qtl_id, qtl_symbol, self.globaltt['QTL'])
                feature.addTaxonToFeature(taxon_curie)

                # deal with the chromosome
                chrom_id = makeChromID(chromosome, taxon_curie, 'CHR')

                # add a version of the chromosome which is defined as
                # the genetic map
                build_id = 'MONARCH:'+common_name.strip()+'-linkage'
                build_label = common_name+' genetic map'
                geno.addReferenceGenome(build_id, build_label, taxon_curie)
                chrom_in_build_id = makeChromID(chromosome, build_id, 'MONARCH')
                geno.addChromosomeInstance(
                    chromosome, build_id, build_label, chrom_id)
                start = stop = None
                # range_cm sometimes ends in "(Mb)"  (i.e pig 2016 Nov)
                range_mb = re.split(r'\(', range_cm)
                if range_mb is not None:
                    range_cm = range_mb[0]

                if re.search(r'[0-9].*-.*[0-9]', range_cm):
                    range_parts = re.split(r'-', range_cm)

                    # check for poorly formed ranges
                    if len(range_parts) == 2 and\
                            range_parts[0] != '' and range_parts[1] != '':
                        (start, stop) = [
                            int(float(x.strip())) for x in re.split(r'-', range_cm)]
                    else:
                        LOG.info(
                            "A cM range we can't handle for QTL %s: %s",
                            qtl_id, range_cm)
                elif position_cm != '':
                    match = re.match(r'([0-9]*\.[0-9]*)', position_cm)
                    if match is not None:
                        position_cm = match.group()
                        start = stop = int(float(position_cm))

                # FIXME remove converion to int for start/stop
                # when schema can handle floats add in the genetic location
                # based on the range
                feature.addFeatureStartLocation(
                    start, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureEndLocation(
                    stop, chrom_in_build_id, None,
                    [self.globaltt['FuzzyPosition']])
                feature.addFeatureToGraph()

                # sometimes there's a peak marker, like a rsid.
                # we want to add that as a variant of the gene,
                # and xref it to the qtl.
                dbsnp_id = None
                if peak_mark != '' and peak_mark != '.' and \
                        re.match(r'rs', peak_mark.strip()):
                    dbsnp_id = 'dbSNP:'+peak_mark.strip()

                    model.addIndividualToGraph(
                        dbsnp_id, None,
                        self.globaltt['sequence_alteration'])
                    model.addXref(qtl_id, dbsnp_id)

                gene_id = gene_id.replace('uncharacterized ', '').strip()
                if gene_id is not None and gene_id != '' and gene_id != '.'\
                        and re.fullmatch(r'[^ ]*', gene_id) is not None:

                    # we assume if no src is provided and gene_id is an integer,
                    # then it is an NCBI gene ... (okay, lets crank that back a notch)
                    if gene_id_src == '' and gene_id.isdigit() and \
                            gene_id in self.gene_info:
                        # LOG.info(
                        #    'Warm & Fuzzy saying %s is a NCBI gene for %s',
                        #    gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '' and gene_id.isdigit():
                        LOG.warning(
                            'Cold & Prickely saying %s is a NCBI gene for %s',
                            gene_id, common_name)
                        gene_id_src = 'NCBIgene'
                    elif gene_id_src == '':
                        LOG.error(
                            ' "%s" is a NOT NCBI gene for %s', gene_id, common_name)
                        gene_id_src = None

                    if gene_id_src == 'NCBIgene':
                        gene_id = 'NCBIGene:' + gene_id
                        # we will expect that these will get labels elsewhere
                        geno.addGene(gene_id, None)
                        # FIXME what is the right relationship here?
                        geno.addAffectedLocus(qtl_id, gene_id)

                        if dbsnp_id is not None:
                            # add the rsid as a seq alt of the gene_id
                            vl_id = '_:' + re.sub(
                                r':', '', gene_id) + '-' + peak_mark.strip()
                            geno.addSequenceAlterationToVariantLocus(
                                dbsnp_id, vl_id)
                            geno.addAffectedLocus(vl_id, gene_id)

                # add the trait
                model.addClassToGraph(trait_id, trait_name)

                # Add publication
                reference = None
                if re.match(r'ISU.*', pubmed_id):
                    pub_id = 'AQTLPub:'+pubmed_id.strip()
                    reference = Reference(graph, pub_id)
                elif pubmed_id != '':
                    pub_id = 'PMID:' + pubmed_id.strip()
                    reference = Reference(
                        graph, pub_id, self.globaltt['journal article'])

                if reference is not None:
                    reference.addRefToGraph()

                # make the association to the QTL
                assoc = G2PAssoc(
                    graph, self.name, qtl_id, trait_id, self.globaltt['is marker for'])
                assoc.add_evidence(eco_id)
                assoc.add_source(pub_id)

                # create a description from the contents of the file
                # desc = ''

                # assoc.addDescription(g, assoc_id, desc)

                # TODO add exp_id as evidence
                # if exp_id != '':
                #     exp_id = 'AQTLExp:'+exp_id
                #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                if p_values != '':
                    scr = re.sub(r'<', '', p_values)
                    scr = re.sub(r',', '.', scr)  # international notation
                    if scr.isnumeric():
                        score = float(scr)
                        assoc.set_score(score)  # todo add score type
                # TODO add LOD score?
                assoc.add_association_to_graph()

                # make the association to the dbsnp_id, if found
                if dbsnp_id is not None:
                    # make the association to the dbsnp_id
                    assoc = G2PAssoc(
                        graph, self.name, dbsnp_id, trait_id,
                        self.globaltt['is marker for'])
                    assoc.add_evidence(eco_id)
                    assoc.add_source(pub_id)

                    # create a description from the contents of the file
                    # desc = ''
                    # assoc.addDescription(g, assoc_id, desc)

                    # TODO add exp_id
                    # if exp_id != '':
                    #     exp_id = 'AQTLExp:'+exp_id
                    #     gu.addIndividualToGraph(g, exp_id, None, eco_id)

                    if p_values != '':
                        scr = re.sub(r'<', '', p_values)
                        scr = re.sub(r',', '.', scr)
                        if scr.isnumeric():
                            score = float(scr)
                            assoc.set_score(score)  # todo add score type
                    # TODO add LOD score?

                    assoc.add_association_to_graph()

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        LOG.info("Done with QTL genetic info")
        return
Пример #30
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = line.split('\t')

                # set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(
                        discontinued_gene_id, discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(
                        discontinued_gene_id, discontinued_symbol)
                    model.addDeprecatedIndividual(
                        discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if (not self.testMode) and\
                        (limit is not None and line_counter > limit):
                    break

        return
Пример #31
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and it's parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """

    # special genotype parts mapped to their
    # GENO and SO classes that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'sex_qualified_genotype': 'GENO:0000645',
        'male_genotype': 'GENO:0000646',
        'female_genotype': 'GENO:0000647',
        'genomic_background': 'GENO:0000611',
        'unspecified_genomic_background': 'GENO:0000649',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000512',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',  # not really used any more
        'transgenic_insertion': 'SO:0001218',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide':
            'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide':
            'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000637',
        'coding_transgene_feature': 'GENO:0000638',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263',
        'RNAi_reagent': 'SO:0000337',
        'heritable_phenotypic_marker': 'SO:0001500'
    }

    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'has_sex_agnostic_genotype_part': 'GENO:0000650',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # is_seq_var_inst_of links a alternate locus (instance)
        # to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use has_member_with_allelotype when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'has_gene_product': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_variant_part': 'GENO:0000382',
        # targeted_by isa between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should derives_sequence_from_gene just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'has_affected_locus': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        # FIXME
        # reference_nucleotide, reference_amino_acid, altered_nucleotide
        # results_in_amino_acid_change are FIXME Made up terms
        'reference_nucleotide': 'GENO:reference_nucleotide',
        'reference_amino_acid': 'GENO:reference_amino_acid',
        'altered_nucleotide': 'GENO:altered_nucleotide',
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)

        return

    def addGenotype(
            self, genotype_id, genotype_label, genotype_type=None,
            genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """

        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.model.addIndividualToGraph(
            genotype_id, genotype_label, genotype_type, genotype_description)
        return

    def addAllele(
            self, allele_id, allele_label, allele_type=None,
            allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(
            allele_id, allele_label, allele_type, allele_description)

        return

    def addGene(
            self, gene_id, gene_label, gene_type=None,
            gene_description=None):
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.model.addClassToGraph(
            gene_id, gene_label, gene_type, gene_description)

        return

    def addConstruct(self, construct_id, construct_label, construct_type=None,
                     construct_description=None):
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and it's parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(
            child_id, self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        self.graph.addTriple(
            child_id, self.properties['derives_sequence_from_gene'], parent_id)
        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.properties['has_affected_locus']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addGeneProduct(
            self, sequence_id, product_id,
            product_label=None, product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(
                product_id, product_label, product_type)
        self.graph.addTriple(
            sequence_id, self.properties['has_gene_product'], product_id)

        return

    def addPolypeptide(
            self, polypeptide_id, polypeptide_label=None,
            transcript_id=None, polypeptide_type=None, ):
        """
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """

        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.model.addIndividualToGraph(
            polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(
                transcript_id, self.properties['translates_to'],
                polypeptide_id)

        return

    def addPartsToVSLC(
            self, vslc_id, allele1_id, allele2_id, zygosity_id=None,
            allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2

        # vslc = gu.getNode(vslc_id)  # TODO unused
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(
                vslc_id, self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """

        self.addParts(
            vslc_id, parent_id, self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """

        if part_relationship is None:
            part_relationship = self.properties['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(
            self, sa_id, sa_label, sa_type=None, sa_description=None):
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.model.addIndividualToGraph(
            sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])
        return

    def addGenomicBackground(
            self, background_id, background_label,
            background_type=None, background_description=None):
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addIndividualToGraph(
            background_id, background_label, background_type,
            background_description)

        return

    def addGenomicBackgroundToGenotype(
            self, background_id, genotype_id, background_type=None):
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(
            genopart_id, self.properties['in_taxon'], taxon_id)

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(
            genotype_id, self.properties['has_variant_part'], reagent_id)

        return

    def addGeneTargetingReagent(
            self, reagent_id, reagent_label, reagent_type, gene_id,
            description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:

        :return:

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(
            reagent_id, reagent_label, reagent_type, description)

        self.graph.addTriple(
            reagent_id, self.object_properties['targets_instance_of'], gene_id)

        return

    def addReagentTargetedGene(
            self, reagent_id, gene_id, targeted_gene_id=None,
            targeted_gene_label=None, description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_instance_of <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :return:

        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(
            targeted_gene_id, targeted_gene_label,

            self.genoparts['reagent_targeted_gene'], description)

        if gene_id is not None:
            self.graph.addTriple(
                targeted_gene_id,
                self.object_properties['is_targeted_expression_variant_of'],
                gene_id)

        self.graph.addTriple(
            targeted_gene_id, self.properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(
            self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']

        self.model.addIndividualToGraph(
            tgs_id, tgs_label, tgs_type, tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        self.graph.addTriple(
            population_id,
            self.properties['has_member_with_allelotype'],
            member_id)
        return

    def addTargetedGeneComplement(
            self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.model.addIndividualToGraph(
            tgc_id, tgc_label, tgc_type, tgc_description)

        return

    def addGenome(self, taxon_id, taxon_label=None):
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addClassToGraph(
            genome_id, genome_label, Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(
            build_id, build_label, Feature.types['reference_genome'])
        self.model.addType(build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as BNODE?

        genome_id = re.sub(r'.*\:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(
            self, chr, tax_id, tax_label=None, build_id=None,
            build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family()
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chr, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(
                chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(
            chrom_class_id, chrom_class_label, Feature.types['chromosome'])

        return

    def addChromosomeInstance(
            self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(
            chr_id, chr_label, Feature.types['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return:
        """

        vslc_label = ''

        if gene_label is None and \
                allele1_label is None and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label

    def make_experimental_model_with_genotype(
             self, genotype_id, genotype_label, taxon_id, taxon_label):

        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:'+animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(
            animal_id, Genotype.object_properties['has_genotype'], genotype_id)
        return animal_id
Пример #32
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and risk_allele_frequency != 'NR':
            hap_description = str(risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(
            hap_id, hap_label.strip(), self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            graph.addTriple(hap_id, self.globaltt['has_variant_part'], snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Unexpected data field for haplotype %s \n "
                "will not add snp details", hap_label)
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], context_list[index])

            if len(mapped_genes) == len(snp_labels):
                so_class = self.resolve(context_list[index])
                # removed the '+' for recursive  one-or-more rdfs:subClassOf  paths
                # just so it did not return an empty graph
                so_query = """
SELECT ?variant_label
    WHERE {{
        {0} rdfs:subClassOf {1} ;
        rdfs:label ?variant_label .
    }}
                """.format(so_class, self.globaltt['gene_variant'])

                query_result = so_ontology.query(so_query)

                if len(list(query_result)) == 1:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])

                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[index])
                if gene_id is not None:
                    graph.addTriple(
                        snp_curie, self.resolve(context_list[index]), gene_id)

            else:
                LOG.warning(
                    "More mapped genes than snps, cannot disambiguate for %s",
                    hap_label)

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Пример #33
0
    def _process_haplotype(
            self, hap_id, hap_label, chrom_num, chrom_pos, context,
            risk_allele_frequency, mapped_gene, so_ontology):
        tax_id = 'NCBITaxon:9606'

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        geno = Genotype(g)
        model = Model(g)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency != '' and \
                risk_allele_frequency != 'NR':
            hap_description = \
                str(risk_allele_frequency) + \
                ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   Feature.types['haplotype'], hap_description)
        geno.addTaxon(tax_id, hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)
        snp_curies = list()

        for index, snp in enumerate(snp_labels):
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                # make blank node
                snp_curie = self.make_id(snp, "_")

            g.addTriple(hap_id, geno.object_properties['has_variant_part'],
                        snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        length = len(snp_labels)
        if not all(len(lst) == length
                   for lst in [chrom_nums, chrom_positions, context_list]):
            logger.warn(
                "Unexpected data field for haplotype {} \n "
                "will not add snp details".format(hap_label))
            return

        variant_in_gene_count = 0
        for index, snp_curie in enumerate(snp_curies):
            self._add_snp_to_graph(
                snp_curie, snp_labels[index], chrom_nums[index],
                chrom_positions[index], context_list[index])

            if len(mapped_genes) == len(snp_labels):

                so_class = self._map_variant_type(context_list[index])

                if so_class is None:
                    raise ValueError("Unknown SO class {} in haplotype {}"
                                     .format(context_list[index], hap_label))
                so_query = """
                    SELECT ?variant_label
                    WHERE {{
                        {0} rdfs:subClassOf+ SO:0001564 ;
                            rdfs:label ?variant_label .
                    }}
                """.format(so_class)

                query_result = so_ontology.query(so_query)
                if len(list(query_result)) > 0:
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])
                    if gene_id is not None:
                        geno.addAffectedLocus(snp_curie, gene_id)
                        geno.addAffectedLocus(hap_id, gene_id)
                        variant_in_gene_count += 1

                if context_list[index] == 'upstream_gene_variant':
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])
                    if gene_id is not None:
                        g.addTriple(
                            snp_curie,
                            Feature.object_properties[
                                'upstream_of_sequence_of'],
                            gene_id)
                elif context_list[index] == 'downstream_gene_variant':
                    gene_id = DipperUtil.get_ncbi_id_from_symbol(
                        mapped_genes[index])
                    if gene_id is not None:
                        g.addTriple(
                            snp_curie,
                            Feature.object_properties[
                                'downstream_of_sequence_of'],
                            gene_id)
            else:
                logger.warn("More mapped genes than snps, "
                            "cannot disambiguate for {}".format(hap_label))

        # Seperate in case we want to apply a different relation
        # If not this is redundant with triples added above
        if len(mapped_genes) == variant_in_gene_count \
                and len(set(mapped_genes)) == 1:
            gene_id = DipperUtil.get_ncbi_id_from_symbol(mapped_genes[0])
            geno.addAffectedLocus(hap_id, gene_id)

        return
Пример #34
0
    def _process_haplotype(self, hap_id, hap_label, chrom_num, chrom_pos,
                           context, risk_allele_frequency, mapped_gene,
                           so_ontology):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        # add the feature to the graph
        hap_description = None
        if risk_allele_frequency not in ['', 'NR']:
            hap_description = str(
                risk_allele_frequency) + ' [risk allele frequency]'

        model.addIndividualToGraph(hap_id, hap_label.strip(),
                                   self.globaltt['haplotype'], hap_description)
        geno.addTaxon(self.globaltt["H**o sapiens"], hap_id)

        snp_labels = re.split(r';\s?', hap_label)
        chrom_nums = re.split(r';\s?', chrom_num)
        chrom_positions = re.split(r';\s?', chrom_pos)
        context_list = re.split(r';\s?', context)
        mapped_genes = re.split(r';\s?', mapped_gene)

        # Not having four "PAX5" as a list might be better, but it breaks unit tests
        # mapped_genes = list(set(mapped_genes)) # make uniq
        # snp_labels = list(set(snp_labels)) # make uniq

        snp_curies = list()

        for snp in snp_labels:
            snp_curie, snp_type = self._get_curie_and_type_from_id(snp)
            if snp_type is None:
                LOG.info('cant find type for SNP in %s', snp)
                # make blank node
                snp_curie = self.make_id(snp, "_")
                model.addLabel(snp_curie, snp)
            elif snp_curie[0] == '_':  # arrived an unlabeled blanknode
                model.addLabel(snp_curie, snp)

            graph.addTriple(hap_id, self.globaltt['has_variant_part'],
                            snp_curie)
            snp_curies.append(snp_curie)

        # courtesy http://stackoverflow.com/a/16720915
        # check lengths of mutiple lists
        length = len(snp_curies)
        if not all(
                len(lst) == length for lst in
            [snp_labels, chrom_nums, chrom_positions, context_list]):
            LOG.warning(
                "Incongruous data field(s) for haplotype %s \n "
                "will not add snp details", hap_label)
        else:

            variant_in_gene_count = 0
            for index, snp_curie in enumerate(snp_curies):
                self._add_snp_to_graph(snp_curie, snp_labels[index],
                                       chrom_nums[index],
                                       chrom_positions[index],
                                       context_list[index])

                if mapped_genes and len(mapped_genes) != len(snp_labels):
                    LOG.warning("More mapped genes than snps,"
                                " cannot disambiguate for\n%s\n%s",
                                mapped_genes, snp_labels)  # hap_label)
                else:
                    so_class = self.resolve(context_list[index])
                    so_query = """
        SELECT ?variant_label
        WHERE {{
            {0} rdfs:subClassOf+ {1} ;
            rdfs:label ?variant_label .
        }}
                    """.format(so_class, self.globaltt['gene_variant'])

                    query_result = so_ontology.query(so_query)

                    gene_id = DipperUtil.get_hgnc_id_from_symbol(
                        mapped_genes[index])

                    if gene_id is not None and len(list(query_result)) == 1:
                        if context_list[index] in [
                                'upstream_gene_variant',
                                'downstream_gene_variant'
                        ]:
                            graph.addTriple(snp_curie,
                                            self.resolve(context_list[index]),
                                            gene_id)
                        else:
                            geno.addAffectedLocus(snp_curie, gene_id)
                            variant_in_gene_count += 1

            # Seperate in case we want to apply a different relation
            # If not this is redundant with triples added above
            if len(mapped_genes) == variant_in_gene_count and \
                    len(set(mapped_genes)) == 1:
                gene_id = DipperUtil.get_hgnc_id_from_symbol(mapped_genes[0])
                geno.addAffectedLocus(hap_id, gene_id)
Пример #35
0
class Reference:
    """
    To model references for associations
        (such as journal articles, books, etc.).

    By default, references will be typed as "documents",
        unless if the type is set otherwise.

    If a short_citation is set, this will be used for the individual's label.
        We may wish to subclass this later.

    """

    def __init__(self, graph, ref_id=None, ref_type=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("%s is not a graph", graph)

        # assert ref_id is not None

        self.ref_id = ref_id
        self.ref_url = None
        self.title = None
        self.year = None
        self.author_list = None
        self.short_citation = None

        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map

        if ref_type is None:
            self.ref_type = self.globaltt['document']
        else:
            self.ref_type = ref_type
            if ref_type[:4] not in ('IAO:', 'SIO:'):
                LOG.warning("Got Pub ref type of:  %s", ref_type)

        if ref_id is not None and ref_id[:4] == 'http':
            self.ref_url = ref_id

        return

    def setTitle(self, title):
        self.title = title
        return

    def setYear(self, year):

        self.year = year

        return

    def setType(self, reference_type):

        self.ref_type = reference_type

        return

    def setAuthorList(self, author_list):
        """

        :param author_list: Array of authors
        :return:
        """

        self.author_list = author_list
        return

    def addAuthor(self, author):

        self.author_list += [author]

        return

    def setShortCitation(self, citation):
        self.short_citation = citation
        return

    def addPage(self, subject_id, page_url):
        self.graph.addTriple(
            subject_id, self.globaltt['page'],  # foaf:page  not  <sio:web page>
            page_url, object_is_literal=True)
        return

    def addTitle(self, subject_id, title):
        if title is not None and title != '':
            self.graph.addTriple(
                subject_id, self.globaltt['title (dce)'], title, object_is_literal=True)
        return

    def addRefToGraph(self):

        cite = self.short_citation
        if cite is None and self.title is not None:
            cite = self.title

        if self.ref_url is not None:
            if self.title is not None:
                self.addTitle(self.ref_url, self.title)
            self.model.addType(self.ref_url, self.ref_type)
            if cite is not None:
                self.model.addLabel(self.ref_url, cite)
        elif self.ref_id is not None:
            self.model.addIndividualToGraph(self.ref_id, cite, self.ref_type)
            if self.title is not None:
                self.addTitle(self.ref_id, self.title)
        else:
            # should never be true
            LOG.error("You are missing an identifier for a reference.")

        # TODO what is the property here to add the date?
        # if self.year is not None:
        #    gu.addTriple()

        # if self.author_list is not None:
        #    for auth in self.author_list:
        #        gu.addTriple(
        #           graph, self.ref_id, self.props['has_author'], auth, True)
        return
Пример #36
0
    def _get_gene_info(self, limit):
        """
        Currently loops through the gene_info file and
        creates the genes as classes, typed with SO.  It will add their label,
        any alternate labels as synonyms, alternate ids as equivlaent classes.
        HPRDs get added as protein products.
        The chromosome and chr band get added as blank node regions,
        and the gene is faldo:located
        on the chr band.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        model = Model(g)

        # not unzipping the file
        logger.info("Processing 'Gene Info' records")
        line_counter = 0
        gene_info = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", gene_info)
        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            model.addClassToGraph(tax_id, None)
        with gzip.open(gene_info, 'rb') as f:
            row = f.readline().decode().strip().split('\t')
            logger.info("Header has %i columns", len(row))
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, symbol, locustag, synonyms, xrefs, chrom,
                 map_loc, desc, gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date, feature_type) = line.split('\t')

                # ##set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #           or (self.filter == 'geneids' and \
                #               (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self.map_type_of_gene(gtype.strip())

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol
                # sequence feature, not a gene
                if gene_type_id == 'SO:0000110':
                    self.class_or_indiv[gene_id] = 'I'
                else:
                    self.class_or_indiv[gene_id] = 'C'

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    continue

                if self.class_or_indiv[gene_id] == 'C':
                    model.addClassToGraph(gene_id, label, gene_type_id, desc)
                    # NCBI will be the default leader,
                    # so we will not add the leader designation here.
                else:
                    model.addIndividualToGraph(
                        gene_id, label, gene_type_id, desc)
                    # in this case, they aren't genes.
                    # so we want someone else to be the leader.

                if name != '-':
                    model.addSynonym(gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        model.addSynonym(
                            gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if xrefs.strip() != '-':
                    self._add_gene_equivalencies(xrefs, gene_id, tax_num)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # no idea why there's two bands listed - possibly 2 assemblies
                # 419     ART3      4    with   4q21.1|4p15.1-p14
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # this is of "unknown" type == susceptibility
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3
                # unlocated scaffold
                # 101928066       LOC101928066    1|Un    -\
                # mouse --> 2C3
                # 11435   Chrna1  2       2 C3|2 43.76 cM
                # mouse --> 11B1.1
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # we will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if str(chrom) != '-' and str(chrom) != '':
                    if re.search(r'\|', str(chrom)) and \
                            str(chrom) not in ['X|Y', 'X; Y']:
                        # means that there's uncertainty in the mapping.
                        # so skip it
                        # TODO we'll need to figure out how to deal with
                        # >1 loc mapping
                        logger.info(
                            '%s is non-uniquely mapped to %s.' +
                            ' Skipping for now.',
                            gene_id, str(chr))
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    # if(not re.match(
                    #        r'(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                    #    print('odd chr=',str(chr))
                    if str(chrom) == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in re.split(r'\|', str(chrom)):
                        # assume that the chromosome label is added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use taxnum for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        model.addSynonym(mychrom, mychrom_syn)
                        band_match = re.match(
                            r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                        if band_match is not None and \
                                len(band_match.groups()) > 0:
                            # if tax_num != '9606':
                            #     continue
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            #   chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first
                            bid = re.sub(r'^'+c, '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')
                            # print(map_loc,'-->',bid,'-->',maploc_id)
                            # Assume it's type will be added elsewhere
                            band = Feature(g, maploc_id, None, None)
                            band.addFeatureToGraph()
                            # add the band as the containing feature
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases: examples are:
                            # 15q11-q22,Xp21.2-p11.23,15q22-qter,10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12,1p13.3|1p21.3-p13.1,
                            # 12cen-q21,22q13.3|22q13.3
                            logger.debug(
                                'not regular band pattern for %s: %s',
                                gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            g.addTriple(
                                gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

        return
Пример #37
0
class Pathway():
    """
    This provides convenience methods to deal with gene and protein collections
    in the context of pathways.
    """
    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gut = GraphUtils(self.curie_map)

    def addPathway(self,
                   pathway_id,
                   pathway_label,
                   pathway_type=None,
                   pathway_description=None):
        """
        Adds a pathway as a class.  If no specific type is specified, it will
        default to a subclass of "GO:cellular_process" and "PW:pathway".
        :param pathway_id:
        :param pathway_label:
        :param pathway_type:
        :param pathway_description:
        :return:
        """

        if pathway_type is None:
            pathway_type = self.globaltt['cellular_process']
        self.model.addClassToGraph(pathway_id, pathway_label, pathway_type,
                                   pathway_description)
        self.model.addSubClass(pathway_id, self.globaltt['pathway'])

    def addGeneToPathway(self, gene_id, pathway_id):
        """
        When adding a gene to a pathway, we create an intermediate
        'gene product' that is involved in
        the pathway, through a blank node.

        gene_id RO:has_gene_product _gene_product
        _gene_product RO:involved_in pathway_id

        :param pathway_id:
        :param gene_id:
        :return:
        """
        # bnode
        gene_product = ':'.join(
            ('_', self.gut.digest_id(gene_id.replace(':', '') + 'product')))
        self.model.addIndividualToGraph(gene_product, None,
                                        self.globaltt['gene_product'])
        self.graph.addTriple(gene_product, self.globaltt['label'], pathway_id)

        self.graph.addTriple(gene_id, self.globaltt['has gene product'],
                             gene_product)
        self.addComponentToPathway(gene_product, pathway_id)

    def addComponentToPathway(self, component_id, pathway_id):
        """
        This can be used directly when the component is directly involved in
        the pathway.  If a transforming event is performed on the component
        first, then the addGeneToPathway should be used instead.

        :param pathway_id:
        :param component_id:
        :param component_category: biolink category for component_id
        :param pathway_category: biolink category for pathway_id
        :return:
        """
        self.graph.addTriple(component_id, self.globaltt['involved in'],
                             pathway_id)
Пример #38
0
    def _process_genes(self, taxid, limit=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        LOG.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    LOG.warning("Too few columns in: " + row)
                    raise ValueError("Data error for file %s", raw)
                (ensembl_gene_id, external_gene_name, description, gene_biotype,
                 entrezgene, ensembl_peptide_id, uniprotswissprot) = row[0:7]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[7]
                else:
                    hgnc_id = None

                if self.test_mode and entrezgene != '' and \
                        int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:' + ensembl_gene_id
                peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None
                gene_biotype = gene_biotype.strip()
                gene_type_id = self.resolve(gene_biotype, False)
                if gene_type_id == gene_biotype.strip():   # did not resolve
                    gene_type_id = self.globaltt['polypeptide']

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)
                model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                model.addIndividualToGraph(uniprot_curie, None, gene_type_id)

                if entrezgene != '':
                    if taxid == '9606':
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)
                if hgnc_id is not None and hgnc_id != '':
                    model.addEquivalentClass(gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)
                if ensembl_peptide_id != '':
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and line_counter > limit:
                    break

        return
Пример #39
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        model = Model(g)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                model.addClassToGraph(disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    model.addClassToGraph(
                        gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_:'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))

                    model.addIndividualToGraph(alt_locus_id, alt_label,
                                               geno.genoparts['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(g, self.name, alt_locus_id,
                                     disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph()

                    rlist = a.find('./Gene/ExternalReferenceList')
                    eqid = None

                    for r in rlist.findall('ExternalReference'):
                        if r.find('Source').text == 'Ensembl':
                            eqid = 'ENSEMBL:'+r.find('Reference').text
                        elif r.find('Source').text == 'HGNC':
                            eqid = 'HGNC:'+r.find('Reference').text
                        elif r.find('Source').text == 'OMIM':
                            eqid = 'OMIM:'+r.find('Reference').text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            model.addClassToGraph(eqid, None)
                            model.addEquivalentClass(gene_id, eqid)
                elem.clear()  # empty the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        return
Пример #40
0
class Pathway():
    """
    This provides convenience methods to deal with gene and protein collections
    in the context of pathways.
    """

    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addPathway(
            self, pathway_id, pathway_label, pathway_type=None,
            pathway_description=None):
        """
        Adds a pathway as a class.  If no specific type is specified, it will
        default to a subclass of "GO:cellular_process" and "PW:pathway".
        :param pathway_id:
        :param pathway_label:
        :param pathway_type:
        :param pathway_description:
        :return:
        """

        if pathway_type is None:
            pathway_type = self.globaltt['cellular_process']
        self.model.addClassToGraph(
            pathway_id, pathway_label, pathway_type, pathway_description)
        self.model.addSubClass(pathway_id, self.globaltt['pathway'])

        return

    def addGeneToPathway(self, gene_id, pathway_id):
        """
        When adding a gene to a pathway, we create an intermediate
        'gene product' that is involved in
        the pathway, through a blank node.

        gene_id RO:has_gene_product _gene_product
        _gene_product RO:involved_in pathway_id

        :param pathway_id:
        :param gene_id:
        :return:
        """

        gene_product = '_:'+re.sub(r':', '', gene_id) + 'product'
        self.model.addIndividualToGraph(
            gene_product, None, self.globaltt['gene_product'])
        self.graph.addTriple(
            gene_id, self.globaltt['has gene product'], gene_product)
        self.addComponentToPathway(gene_product, pathway_id)

        return

    def addComponentToPathway(self, component_id, pathway_id):
        """
        This can be used directly when the component is directly involved in
        the pathway.  If a transforming event is performed on the component
        first, then the addGeneToPathway should be used instead.

        :param pathway_id:
        :param component_id:
        :return:
        """
        self.graph.addTriple(component_id, self.globaltt['involved in'], pathway_id)

        return
Пример #41
0
    def _process_data(self, src_key, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.test_mode:      # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[src_key]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning(
                        'Expected %i values but find %i in  row %i',
                        len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.test_mode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning(
                        'Novel Affected status  %s at row: %i of %s',
                        affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join((
                        patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join((
                        patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(
                    cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(
                        equiv_cell_line, None, cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   graph,age_id,age,self.globaltt['age'])
                    # gu.addTriple(
                    #   graph,age_id,self.globaltt['has measurement value'],age,
                    #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(
                        family_comp_id, family_label, self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:'+re.sub(
                        'MONARCH:', '', self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(
                            graph, karyotype_feature_id, karyotype_feature_label,
                            self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    varl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((varl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = varl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = varl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' ['+catalog_id.strip()+']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(
                        patient_id, self.globaltt['has_genotype'], genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for disease in omim_num.split(';'):
                        if disease is not None and disease != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if disease not in omim_map:
                                disease_id = 'OMIM:' + disease.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(
                                    graph, self.name,
                                    patient_id, disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(
                                    cell_line_id,
                                    self.globaltt['is model of'],
                                    disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', disease)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for pmid in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + pmid.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(
                            pubmed_id, self.globaltt['mentions'], cell_line_id)

                if not self.test_mode and (
                        limit is not None and line_counter > limit):
                    break
        return
Пример #42
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:' + tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)", num_cols,
                        expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), tax_num,
                                'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc,
                                                       build_id, assembly,
                                                       band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), assembly,
                                'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly,
                                               chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:' + dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_' + gene_num + '-' + variant_num
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(vl_id, vl_label,
                                               geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d",
                                    int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)",
                                     phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(m.group(1), 'Orphanet:',
                                               phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(r'^ORPHA', 'Orphanet',
                                               phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(r'^Human Phenotype Ontology',
                                               '', phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:',
                                               phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(g, self.name, seqalt_id,
                                         phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:' + xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Пример #43
0
    def _get_gene_history(self, limit):
        """
        Loops through the gene_history file and adds the old gene ids
        as deprecated classes, where the new gene id is the replacement for it.
        The old gene symbol is added as a synonym to the gene.
        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = line.split('\t')

                # set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #             or (self.filter == 'geneids' and \
                #                 (int(gene_num) not in self.gene_ids))):
                #         continue
                #  end filter

                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(('NCBIGene', discontinued_num))

                # add the two genes
                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                    model.addClassToGraph(discontinued_gene_id,
                                          discontinued_symbol)

                    # add the new gene id to replace the old gene id
                    model.addDeprecatedClass(discontinued_gene_id, [gene_id])
                else:
                    model.addIndividualToGraph(gene_id, None)
                    model.addIndividualToGraph(discontinued_gene_id,
                                               discontinued_symbol)
                    model.addDeprecatedIndividual(discontinued_gene_id,
                                                  [gene_id])

                # also add the old symbol as a synonym of the new gene
                model.addSynonym(gene_id, discontinued_symbol)

                if (not self.testMode) and\
                        (limit is not None and line_counter > limit):
                    break

        return
Пример #44
0
    def _process_disease2gene(self, row):
        """
        Here, we process the disease-to-gene associations.
        Note that we ONLY process direct associations
        (not inferred through chemicals).
        Furthermore, we also ONLY process "marker/mechanism" associations.

        We preferentially utilize OMIM identifiers over MESH identifiers
        for disease/phenotype.
        Therefore, if a single OMIM id is listed under the "omim_ids" list,
        we will choose this over any MeSH id that might be listed as
        the disease_id. If multiple OMIM ids are listed in the omim_ids column,
        we toss this for now.
        (Mostly, we are not sure what to do with this information.)

        We associate "some variant of gene X" with the phenotype,
        rather than the gene directly.

        We also pull in the MeSH labels here (but not OMIM) to ensure that
        we have them (as they may not be brought in separately).
        :param row:
        :return:

        """

        # if self.testMode:
        # g = self.testgraph
        # else:
        #     g = self.graph
        # self._check_list_len(row, 9)
        # geno = Genotype(g)
        # gu = GraphUtils(curie_map.get())
        model = Model(self.g)
        (gene_symbol, gene_id, disease_name, disease_id, direct_evidence,
         inference_chemical_name, inference_score, omim_ids, pubmed_ids) = row

        # we only want the direct associations; skipping inferred for now
        if direct_evidence == '' or direct_evidence != 'marker/mechanism':
            return

        # scrub some of the associations...
        # it seems odd to link human genes to the following "diseases"
        diseases_to_scrub = [
            'MESH:D004283',  # dog diseases
            'MESH:D004195',  # disease models, animal
            'MESH:D030342',  # genetic diseases, inborn
            'MESH:D040181',  # genetic dieases, x-linked
            'MESH:D020022']   # genetic predisposition to a disease

        if disease_id in diseases_to_scrub:
            logger.info(
                "Skipping association between NCBIGene:%s and %s",
                str(gene_id), disease_id)
            return

        intersect = list(
            set(['OMIM:' + str(i) for i in omim_ids.split('|')] +
                [disease_id]) & set(self.test_diseaseids))
        if self.testMode and (
                int(gene_id) not in self.test_geneids or len(intersect) < 1):
            return

        # there are three kinds of direct evidence:
        # (marker/mechanism | marker/mechanism|therapeutic | therapeutic)
        # we are only using the "marker/mechanism" for now
        # TODO what does it mean for a gene to be therapeutic for disease?
        # a therapeutic target?

        gene_id = 'NCBIGene:' + gene_id

        preferred_disease_id = disease_id
        if omim_ids is not None and omim_ids != '':
            omim_id_list = re.split(r'\|', omim_ids)
            # If there is only one OMIM ID for the Disease ID
            # or in the omim_ids list,
            # use the OMIM ID preferentially over any MeSH ID.
            if re.match(r'OMIM:.*', disease_id):
                if len(omim_id_list) > 1:
                    # the disease ID is an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    # Currently no entries satisfy this condition
                    pass
                elif disease_id != ('OMIM:' + omim_ids):
                    # the disease ID is an OMIM ID and
                    # there is only one non-equiv OMIM entry in omim_ids
                    # we preferentially use the disease_id here
                    logger.warning(
                        "There may be alternate identifier for %s: %s",
                        disease_id, omim_ids)
                    # TODO: What should be done with the alternate disease IDs?
            else:
                if len(omim_id_list) == 1:
                    # the disease ID is not an OMIM ID
                    # and there is only one OMIM entry in omim_ids.
                    preferred_disease_id = 'OMIM:' + omim_ids
                elif len(omim_id_list) > 1:
                    # This is when the disease ID is not an OMIM ID and
                    # there is more than one OMIM entry in omim_ids.
                    pass

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself. So we
        # make an anonymous alternate locus, and put that in the association.
        alt_id = gene_id + '-' + preferred_disease_id + 'VL'
        # can't have colons in the bnodes
        alt_locus = re.sub(r':', '', alt_id)
        alt_locus = "_:" + alt_locus

        alt_label = 'some variant of ' + gene_symbol + ' that is ' \
                    + direct_evidence + ' for ' + disease_name
        model.addIndividualToGraph(
            alt_locus, alt_label,
            self.geno.genoparts['variant_locus'])
        # assume that the label gets added elsewhere
        model.addClassToGraph(gene_id, None)
        self.geno.addAffectedLocus(alt_locus, gene_id)
        model.addBlankNodeAnnotation(alt_locus)

        # not sure if MESH is getting added separately.
        # adding labels here for good measure
        dlabel = None
        if re.match(r'MESH', preferred_disease_id):
            dlabel = disease_name
        model.addClassToGraph(preferred_disease_id, dlabel)

        # Add the disease to gene relationship.
        rel_id = self._get_relationship_id(direct_evidence)
        refs = self._process_pubmed_ids(pubmed_ids)

        self._make_association(alt_locus, preferred_disease_id, rel_id, refs)

        return
Пример #45
0
    def _process_genes(self, taxid, limit=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        col = list(self.columns['bmq_attributes'])
        if taxid != '9606' and 'hgnc_id' in col:
            col.remove('hgnc_id')
        col_exp = [
            self.columns['bmq_headers'][self.columns['bmq_attributes'].index(x)]
            for x in col]

        LOG.info("Processing Ensembl genes for NCBITaxon:%s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t')
            row = next(reader)
            if not self.check_fileheader(col_exp, row):
                pass
            for row in reader:
                ensembl_gene_id = row[col.index('ensembl_gene_id')]
                external_gene_name = row[col.index('external_gene_name')]
                description = row[col.index('description')].strip()
                gene_biotype = row[col.index('gene_biotype')].strip()
                entrezgene = row[col.index('entrezgene_id')].strip()
                ensembl_peptide_id = row[col.index('ensembl_peptide_id')].strip()
                uniprotswissprot = row[col.index('uniprotswissprot')].strip()
                hgnc_curie = None
                # in the case of human genes, we also get the hgnc id,
                if taxid == '9606' and 'hgnc_id' in col:
                    hgnc_curie = row[col.index('hgnc_id')].strip()

                if self.test_mode and entrezgene != '' and \
                        entrezgene not in self.gene_ids:
                    continue

                gene_id = 'ENSEMBL:' + ensembl_gene_id
                entrez_curie = 'NCBIGene:{}'.format(entrezgene)

                if description == '':
                    description = None

                gene_type_id = self.resolve(
                    gene_biotype, mandatory=False,
                    default=self.globaltt['polypeptide'])

                model.addClassToGraph(
                    gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    if taxid == '9606':
                        # Use HGNC for eq in human data
                        model.addXref(gene_id, entrez_curie)
                    else:
                        model.addEquivalentClass(gene_id, entrez_curie)

                if hgnc_curie is not None and hgnc_curie != '':
                    model.addEquivalentClass(gene_id, hgnc_curie)
                geno.addTaxon('NCBITaxon:' + taxid, gene_id)
                if ensembl_peptide_id is not None and ensembl_peptide_id != '':
                    peptide_curie = 'ENSEMBL:{}'.format(ensembl_peptide_id)
                    model.addIndividualToGraph(peptide_curie, None, gene_type_id)
                    geno.addGeneProduct(gene_id, peptide_curie)
                    if uniprotswissprot != '':
                        uniprot_curie = 'UniProtKB:{}'.format(uniprotswissprot)
                        model.addIndividualToGraph(uniprot_curie, None, gene_type_id)
                        geno.addGeneProduct(gene_id, uniprot_curie)
                        model.addXref(peptide_curie, uniprot_curie)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Пример #46
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'
    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',
        'haplotype': 'GENO:0000871',

        # the following are sequence attributes:
        'band_intensity': 'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self,
                 graph,
                 feature_id=None,
                 label=None,
                 feature_type=None,
                 description=None):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        self.id = feature_id
        self.label = label
        self.type = feature_type
        self.description = description
        self.start = None
        self.stop = None
        return

    def addFeatureStartLocation(self,
                                coordinate,
                                reference_id,
                                strand=None,
                                position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(self,
                              coordinate,
                              reference_id,
                              strand=None,
                              position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        :return:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        :return:

        """

        loc = dict()
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(self,
                          add_region=True,
                          region_id=None,
                          feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:

        :return:

        """

        if feature_as_class:
            self.model.addClassToGraph(self.id, self.label, self.type,
                                       self.description)
        else:
            self.model.addIndividualToGraph(self.id, self.label, self.type,
                                            self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and \
                        self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None and\
                        self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_:' + rid + "-Region"
                region_id = rid

            self.graph.addTriple(self.id, self.properties['location'],
                                 region_id)
            self.model.addIndividualToGraph(region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.model.addType(region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(self.stop['reference'],
                                    self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return:
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        curie = '_:'
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        curie += reference
        if coordinate is not None:
            # just in case it isn't a string already
            curie = '-'.join((curie, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                curie = '-'.join((curie, tstring))

        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id,
                                 end_position_id):

        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.properties['begin'],
                                 begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.properties['end'],
                                 end_position_id)

        return

    def addPositionToGraph(self,
                           reference_id,
                           position,
                           position_types=None,
                           strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(pos_id,
                                 self.properties['position'],
                                 position,
                                 object_is_literal=True,
                                 literal_type="xsd:integer")
        self.graph.addTriple(pos_id, self.properties['reference'],
                             reference_id)
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']

        if s is not None:
            self.model.addType(pos_id, s)

        return pos_id

    def addSubsequenceOfFeature(self, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph:
        :param parentid:

        :return:

        """
        self.graph.addTriple(self.id, self.properties['is_subsequence_of'],
                             parentid)
        self.graph.addTriple(parentid, self.properties['has_subsequence'],
                             self.id)

        return

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.graph.addTriple(self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def addFeatureProperty(self, property_type, property):
        self.graph.addTriple(self.id, property_type, property)
        return
Пример #47
0
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' [' + catalog_id.strip() + ']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(graph, self.name, patient_id,
                                                 disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(cell_line_id,
                                                self.globaltt['is model of'],
                                                disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return
Пример #48
0
class Pathway():
    """
    This provides convenience methods to deal with gene and protein collections
    in the context of pathways.
    """

    pathway_parts = {
        'signal_transduction': 'GO:0007165',
        'cellular_process': 'GO:0009987',
        'pathway': 'PW:0000001',
        'gene_product': 'CHEBI:33695'  # bioinformation molecule
    }

    object_properties = {
        'involved_in': 'RO:0002331',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205'
    }

    properties = object_properties.copy()

    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)

        return

    def addPathway(self,
                   pathway_id,
                   pathway_label,
                   pathway_type=None,
                   pathway_description=None):
        """
        Adds a pathway as a class.  If no specific type is specified, it will
        default to a subclass of "GO:cellular_process" and "PW:pathway".
        :param pathway_id:
        :param pathway_label:
        :param pathway_type:
        :param pathway_description:
        :return:
        """

        if pathway_type is None:
            pathway_type = self.pathway_parts['cellular_process']
        self.model.addClassToGraph(pathway_id, pathway_label, pathway_type,
                                   pathway_description)
        self.model.addSubClass(pathway_id, self.pathway_parts['pathway'])

        return

    def addGeneToPathway(self, gene_id, pathway_id):
        """
        When adding a gene to a pathway, we create an intermediate
        'gene product' that is involved in
        the pathway, through a blank node.

        gene_id RO:has_gene_product _gene_product
        _gene_product RO:involved_in pathway_id

        :param pathway_id:
        :param gene_id:
        :return:
        """

        gene_product = '_:' + re.sub(r':', '', gene_id) + 'product'
        self.model.addIndividualToGraph(gene_product, None,
                                        self.pathway_parts['gene_product'])
        self.graph.addTriple(gene_id,
                             self.object_properties['has_gene_product'],
                             gene_product)
        self.addComponentToPathway(gene_product, pathway_id)

        return

    def addComponentToPathway(self, component_id, pathway_id):
        """
        This can be used directly when the component is directly involved in
        the pathway.  If a transforming event is performed on the component
        first, then the addGeneToPathway should be used instead.

        :param pathway_id:
        :param component_id:
        :return:
        """
        self.graph.addTriple(component_id,
                             self.object_properties['involved_in'], pathway_id)

        return
Пример #49
0
class Genotype():
    """
    These methods provide convenient methods to
    add items related to a genotype and it's parts to a supplied graph.
    They follow the patterns set out in
    GENO https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features,
    we use the GenomicFeature class to create them.

    """
    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        return

    def addGenotype(self,
                    genotype_id,
                    genotype_label,
                    genotype_type=None,
                    genotype_description=None):
        """
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:

        """
        if genotype_type is None:
            genotype_type = self.globaltt['intrinsic_genotype']

        self.model.addIndividualToGraph(genotype_id, genotype_label,
                                        genotype_type, genotype_description)
        return

    def addAllele(self,
                  allele_id,
                  allele_label,
                  allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional,
        recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:

        """

        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.globaltt['allele']  # TODO is this a good idea?
        self.model.addIndividualToGraph(allele_id, allele_label, allele_type,
                                        allele_description)

        return

    def addGene(self,
                gene_id,
                gene_label=None,
                gene_type=None,
                gene_description=None):
        ''' genes are classes '''
        if gene_type is None:
            gene_type = self.globaltt['gene']
        self.model.addClassToGraph(gene_id, gene_label, gene_type,
                                   gene_description)

        return

    def addConstruct(self,
                     construct_id,
                     construct_label,
                     construct_type=None,
                     construct_description=None):
        # TODO add base type for construct
        # if (constrcut_type is None):
        #    constrcut_type=self.construct_base_type
        self.model.addIndividualToGraph(construct_id, construct_label,
                                        construct_type, construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and it's parent genotype.  Adding the parent and child to
        the graph should happen outside of this function call to ensure graph
        integrity.
        :param child_id:
        :param parent_id:
        :return:

        """

        self.graph.addTriple(child_id, self.globaltt['derives_from'],
                             parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        self.graph.addTriple(child_id, self.globaltt['sequence_derives_from'],
                             parent_id)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:is_allele_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt["is_allele_of"]
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addAffectedLocus(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a
        GENO:has_affected_feature.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:

        """
        if rel_id is None:
            rel_id = self.globaltt['has_affected_feature']
        self.graph.addTriple(allele_id, rel_id, gene_id)
        return

    def addGeneProduct(self,
                       sequence_id,
                       product_id,
                       product_label=None,
                       product_type=None):
        """
        Add gene/variant/allele has_gene_product relationship
        Can be used to either describe a gene to transcript relationship
        or gene to protein
        :param sequence_id:
        :param product_id:
        :param product_label:
        :param product_type:
        :return:

        """
        if product_label is not None and product_type is not None:
            self.model.addIndividualToGraph(product_id, product_label,
                                            product_type)
        self.graph.addTriple(sequence_id, self.globaltt['has gene product'],
                             product_id)

        return

    def addPolypeptide(self,
                       polypeptide_id,
                       polypeptide_label=None,
                       transcript_id=None,
                       polypeptide_type=None):
        """
        :param polypeptide_id:
        :param polypeptide_label:
        :param polypeptide_type:
        :param transcript_id:
        :return:

        """
        if polypeptide_type is None:
            polypeptide_type = self.globaltt['polypeptide']
        self.model.addIndividualToGraph(polypeptide_id, polypeptide_label,
                                        polypeptide_type)
        if transcript_id is not None:
            self.graph.addTriple(transcript_id, self.globaltt['translates_to'],
                                 polypeptide_id)

        return

    def addPartsToVSLC(self,
                       vslc_id,
                       allele1_id,
                       allele2_id,
                       zygosity_id=None,
                       allele1_rel=None,
                       allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are traditionally added, you can add any
        node (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:

        """

        # vslc has parts allele1/allele2

        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.globaltt['homozygous']
            else:
                zygosity_id = self.globaltt['heterozygous']

        if zygosity_id is not None:
            self.graph.addTriple(vslc_id, self.globaltt['has_zygosity'],
                                 zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """

        self.addParts(vslc_id, parent_id, self.globaltt['has_variant_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:

        """
        if part_relationship is None:
            part_relationship = self.globaltt['has_part']
        # Fail loudly if parent or child identifiers are None
        if parent_id is None:
            raise TypeError('Attempt to pass None as parent')
        elif part_id is None:
            raise TypeError('Attempt to pass None as child')
        elif part_relationship is None:
            part_relationship = self.globaltt['has_part']

        self.graph.addTriple(parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self,
                              sa_id,
                              sa_label,
                              sa_type=None,
                              sa_description=None):

        if sa_type is None:
            sa_type = self.globaltt['sequence_alteration']

        self.model.addIndividualToGraph(sa_id, sa_label, sa_type,
                                        sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        self.addParts(sa_id, vl_id, self.globaltt['has_variant_part'])
        return

    def addGenomicBackground(self,
                             background_id,
                             background_label,
                             background_type=None,
                             background_description=None):
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addIndividualToGraph(background_id, background_label,
                                        background_type,
                                        background_description)

        return

    def addGenomicBackgroundToGenotype(self,
                                       background_id,
                                       genotype_id,
                                       background_type=None):
        if background_type is None:
            background_type = self.globaltt['genomic_background']
        self.model.addType(background_id, background_type)
        self.addParts(background_id, genotype_id,
                      self.globaltt['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part (including a gene,
        regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:

        :return:

        """
        self.graph.addTriple(genopart_id, self.globaltt['in taxon'], taxon_id)

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        # for example, add a morphant reagent thingy to the genotype,
        # assuming it's a extrinsic_genotype
        self.graph.addTriple(genotype_id, self.globaltt['has_variant_part'],
                             reagent_id)

        return

    def addGeneTargetingReagent(self,
                                reagent_id,
                                reagent_label,
                                reagent_type,
                                gene_id,
                                description=None):
        """
        Here, a gene-targeting reagent is added.
        The actual targets of this reagent should be added separately.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:

        :return:

        """

        # TODO add default type to reagent_type
        self.model.addIndividualToGraph(reagent_id, reagent_label,
                                        reagent_type, description)

        self.graph.addTriple(reagent_id, self.globaltt['targets_gene'],
                             gene_id)

        return

    def addReagentTargetedGene(self,
                               reagent_id,
                               gene_id,
                               targeted_gene_id=None,
                               targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied,
        we will create it as an anonymous individual which is of the type
        GENO:reagent_targeted_gene.
        We will also add the targets relationship between the reagent and
        gene class.

        <targeted_gene_id> a GENO:reagent_targeted_gene
        rdf:label targeted_gene_label
        dc:description description
        <reagent_id> GENO:targets_gene <gene_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :return:

        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
            targeted_gene_id = targeted_gene_id.replace(":", "")
        self.model.addIndividualToGraph(targeted_gene_id, targeted_gene_label,
                                        self.globaltt['reagent_targeted_gene'],
                                        description)

        if gene_id is not None:
            self.graph.addTriple(targeted_gene_id,
                                 self.globaltt['is_expression_variant_of'],
                                 gene_id)

        self.graph.addTriple(targeted_gene_id, self.globaltt['is_targeted_by'],
                             reagent_id)

        return

    def addTargetedGeneSubregion(self,
                                 tgs_id,
                                 tgs_label,
                                 tgs_type=None,
                                 tgs_description=None):
        if tgs_type is None:
            tgs_type = self.globaltt['targeted_gene_subregion']

        self.model.addIndividualToGraph(tgs_id, tgs_label, tgs_type,
                                        tgs_description)

    def addMemberOfPopulation(self, member_id, population_id):
        self.graph.addTriple(population_id,
                             self.globaltt['has_member_with_allelotype'],
                             member_id)
        return

    def addTargetedGeneComplement(self,
                                  tgc_id,
                                  tgc_label,
                                  tgc_type=None,
                                  tgc_description=None):
        if tgc_type is None:
            tgc_type = self.globaltt['targeted_gene_complement']
        self.model.addIndividualToGraph(tgc_id, tgc_label, tgc_type,
                                        tgc_description)

        return

    def addGenome(self, taxon_num, taxon_label=None, genome_id=None):
        ncbitaxon = 'NCBITaxon:' + taxon_num
        if taxon_label is None:
            if ncbitaxon in self.globaltcid:
                taxon_label = self.globaltcid[ncbitaxon]
            else:
                logging.warning('Add ' + ncbitaxon +
                                ' to global translation table')
                taxon_label = taxon_num
        elif ncbitaxon in self.globaltcid and taxon_label != self.globaltcid[
                ncbitaxon]:
            logging.warning('"' + self.globaltcid[ncbitaxon] +
                            '" may need updating from "' + taxon_label +
                            '" in global translation table')
            logging.warning(
                '"' + taxon_label + '": " ' + self.globaltcid[ncbitaxon] +
                '"' + ' may need to be added to a local translation table')

        genome_label = taxon_label + ' genome'
        if genome_id is None:
            genome_id = self.makeGenomeID(taxon_num)
        self.model.addClassToGraph(genome_id, genome_label,
                                   self.globaltt['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        genome_id = self.makeGenomeID(taxon_id)
        self.model.addIndividualToGraph(build_id, build_label,
                                        self.globaltt['reference_genome'])
        self.model.addType(build_id, genome_id)
        if re.match(r'[0-9]+', taxon_id):
            taxon_id = 'NCBITaxon:' + taxon_id
        self.addTaxon(taxon_id, build_id)

        return

    @staticmethod
    def makeGenomeID(taxon_id):
        # scrub off the taxon prefix.  put it in base space
        # TODO: revisit as yet another BNODE?
        # should never be called if a real genome iri exists
        # should create the opaque bode and label together
        # genome_id = re.sub(r'.*\:', '_:', taxon_id) + 'genome'
        genome_id = '_:' + taxon_id + 'genome'
        return genome_id

    def addChromosome(self,
                      chrom,
                      tax_id,
                      tax_label=None,
                      build_id=None,
                      build_label=None):
        """
        if it's just the chromosome, add it as an instance of a SO:chromosome,
        and add it to the genome. If a build is included,
        punn the chromosome as a subclass of SO:chromsome, and make the
        build-specific chromosome an instance of the supplied chr.
        The chr then becomes part of the build or genome.
        """
        family = Family(self.graph)
        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chrom), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chrom, tax_label)
        else:
            chr_label = makeChromLabel(chrom)
        genome_id = self.makeGenomeID(tax_id)
        self.model.addClassToGraph(chr_id, chr_label,
                                   self.globaltt['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome
            chrinbuild_id = makeChromID(chrom, build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chrom, build_label)
            # add the build-specific chromosome as an instance of the chr class

            self.model.addIndividualToGraph(chrinbuild_id, chrinbuild_label,
                                            chr_id)

            # add the build-specific chromosome
            # as a member of the build (both ways)
            family.addMember(build_id, chrinbuild_id)
            family.addMemberOf(chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.model.addClassToGraph(chrom_class_id, chrom_class_label,
                                   self.globaltt['chromosome'])

        return

    def addChromosomeInstance(self,
                              chr_num,
                              reference_id,
                              reference_label,
                              chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
        typically a genome-specific chr

        :return:

        """
        family = Family(self.graph)
        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.model.addIndividualToGraph(chr_id, chr_label,
                                        self.globaltt['chromosome'])
        if chr_type is not None:
            self.model.addType(chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        family.addMember(reference_id, chr_id)
        family.addMemberOf(chr_id,
                           reference_id)  # usage dependent, todo: ommit

        return

    @staticmethod
    def make_variant_locus_label(gene_label, allele_label):
        if gene_label is None:
            gene_label = ''
        label = gene_label.strip() + '<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return:
        """

        vslc_label = ''

        if gene_label is None and allele1_label is None and allele2_label is None:
            LOG.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label

    def make_experimental_model_with_genotype(self, genotype_id,
                                              genotype_label, taxon_id,
                                              taxon_label):

        animal_id = '-'.join((taxon_id, 'with', genotype_id))
        animal_id = re.sub(r':', '', animal_id)
        animal_id = '_:' + animal_id

        animal_label = ' '.join((genotype_label, taxon_label))
        self.model.addIndividualToGraph(animal_id, animal_label, taxon_id)
        self.graph.addTriple(animal_id, self.globaltt['has_genotype'],
                             genotype_id)
        return animal_id
Пример #50
0
class Environment():
    """
    These methods provide convenient methods
    to add items related to an experimental environment
    and it's parts to a supplied graph.

    This is a stub ready for expansion.
    """

    # special genotype parts mapped to their GENO and SO classes
    # that we explicitly reference here
    environment_parts = {
        'environmental_system': 'ENVO:01000254',
        'environmental_condition': 'XCO:0000000',
        'morpholio_reagent': 'REO:0000042',
        'talen_reagent': 'REO:0001022',
        'crispr_reagent': 'REO:crispr_TBD'
    }

    object_properties = {
        'has_part': 'BFO:0000051',
    }

    annotation_properties = {
    }

    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        return

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        if env_type is None:
            env_type = self.environment_parts['environmental_system']

        self.model.addIndividualToGraph(
            env_id, env_label, env_type, env_description)

        return

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None, cond_description=None):
        if cond_type is None:
            cond_type = self.environment_parts['environmental_condition']

        self.model.addIndividualToGraph(
            cond_id, cond_label, cond_type, cond_description)

        return

    def addComponentToEnvironment(self, env_id, component_id):

        self.graph.addTriple(
            env_id,
            self.model.object_properties['has_part'],
            component_id)

        return

    def addComponentAttributes(
            self, component_id, entity_id, value=None, unit=None):

        self.graph.addTriple(
            component_id, self.model.object_properties['has_part'],
            entity_id)
        # TODO add value and units

        return
Пример #51
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Пример #52
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
    the graph additions are in the addXToFeature functions,
    but should be separated.
    TODO:
    this will need to be extended to properly deal with
    fuzzy positions in faldo.

    """

    def __init__(
            self,
            graph,
            feature_id=None,
            label=None,
            feature_type=None,
            description=None,
            feature_category=None
    ):

        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".format(graph))
        self.model = Model(self.graph)
        self.globaltt = self.graph.globaltt
        self.globaltcid = self.graph.globaltcid
        self.curie_map = self.graph.curie_map
        self.gfxutl = GraphUtils(self.curie_map)
        self.fid = feature_id
        self.feature_category = feature_category
        self.label = label
        self.ftype = feature_type
        self.description = description
        self.start = None
        self.stop = None
        self.taxon = None

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand, position_types)

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None, position_types=None
    ):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:

        """

        self.stop = self._getLocation(coordinate, reference_id, strand, position_types)

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:

        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.globaltt['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        :param strand:
        """
        strand_id = None
        if strand == '+':
            strand_id = self.globaltt['plus_strand']
        elif strand == '-':
            strand_id = self.globaltt['minus_strand']
        elif strand == '.':
            strand_id = self.globaltt['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            LOG.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, add_region=True, region_id=None, feature_as_class=False,
            feature_category=None):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
        which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
        which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
        faldo:location region_id
        region_id a faldo:region
        faldo:begin start_position
        faldo:end end_position
        start_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id
        end_position a
        (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param add_region [True]
        :param region_id [None]
        :param feature_as_class [False]
        :param feature_category: a biolink category CURIE for feature
        """

        if feature_category is None:
            feature_category = self.feature_category

        if feature_as_class:
            self.model.addClassToGraph(
                self.fid, self.label, self.ftype, self.description,
                class_category=feature_category)
        else:
            self.model.addIndividualToGraph(
                self.fid, self.label, self.ftype, self.description,
                ind_category=feature_category)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            regionchr = re.sub(r'\w+\:_?', '', self.start['reference'])
            if region_id is None:
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(self.start['type'])
                if self.stop is not None and self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    if strand is not None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                # assume that the strand is the same for both start and stop.
                # this will need to be fixed in the future
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                # blank node, bnode
                rid = rid + "-Region"
                curie = '_:' + self.gfxutl.digest_id(rid)
                self.model.addLabel(curie, rid)
                region_id = curie

            self.graph.addTriple(
                self.fid,
                self.globaltt['location'],
                region_id,
                subject_category=feature_category
            )
            self.model.addIndividualToGraph(region_id, None, self.globaltt['Region'])
        else:
            region_id = self.fid
            self.model.addType(region_id, self.globaltt['region'])

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(
                self.start['reference'], self.start['coordinate'], self.start['type'])
            self.addPositionToGraph(
                self.start['reference'], self.start['coordinate'], self.start['type'],
            )

        if self.stop is not None:
            endp = self._makePositionId(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])
            self.addPositionToGraph(
                self.stop['reference'], self.stop['coordinate'], self.stop['type'])

        self.addRegionPositionToGraph(region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.globaltt['plus_strand'] in tylist:
            strand = 'plus'
        elif self.globaltt['minus_strand'] in tylist:
            strand = 'minus'
        elif self.globaltt['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
        Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: bnode_curie
        """
        # blank node, bnode
        if reference is None:
            LOG.error("Trying to make position with no reference.")
            return None

        reference = re.sub(r'\w+\:', '', reference, 1)
        if reference[0] == '_':
            # in this case the reference is a bnode curie as well
            # ... this is a bad smell of over modleing
            reference = reference[1:]
        unique_words = reference
        if coordinate is not None:
            # just in case it isn't a string already
            unique_words = '-'.join((unique_words, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                unique_words = '-'.join((unique_words, tstring))

        curie = '_:' + self.gfxutl.digest_id(unique_words)

        # attach the wordage via a label
        # I want to see more of this (TEC 201905)
        # including a type should be mandatory as well
        self.model.addLabel(curie, unique_words)
        return curie

    def addRegionPositionToGraph(self, region_id, begin_position_id, end_position_id):

        if begin_position_id is None:
            pass
            # LOG.warn("No begin position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['begin'], begin_position_id)

        if end_position_id is None:
            pass
            # LOG.warn("No end position specified for region %s", region_id)
        else:
            self.graph.addTriple(region_id, self.globaltt['end'], end_position_id)

    def addPositionToGraph(
            self, reference_id, position, position_types=None, strand=None
    ):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
        we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
        faldo:position Integer(numeric position)
        faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:

        :return:  Identifier of the position created

        """
        pos_id = self._makePositionId(reference_id, position, position_types)
        if position is not None:
            self.graph.addTriple(
                pos_id,
                self.globaltt['position'],
                position,
                object_is_literal=True,
                literal_type="xsd:integer"
            )
        self.graph.addTriple(
            pos_id, self.globaltt['reference'], reference_id
        )
        if position_types is not None:
            for pos_type in position_types:
                self.model.addType(pos_id, pos_type)
        strnd = None
        if strand is not None:
            strnd = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                strnd = self._getStrandType(strand)
        # else:
        #    strnd = self.globaltt['both_strand']
        if strnd is None and (position_types is None or position_types == []):
            strnd = self.globaltt['Position']

        if strnd is not None:
            self.model.addType(pos_id, strnd)

        return pos_id

    def addSubsequenceOfFeature(
            self, parentid, subject_category=None, object_category=None
    ):
        """
        This will add reciprocal triples like:
        feature <is subsequence of> parent
        parent has_subsequence feature
        :param graph:
        :param parentid:

        :return:

        """
        self.graph.addTriple(
            self.fid,
            self.globaltt['is subsequence of'],
            parentid,
            subject_category=subject_category,
            object_category=object_category
        )
        # this should be expected to be done in reasoning not ETL
        self.graph.addTriple(
            parentid,
            self.globaltt['has subsequence'],
            self.fid,
            subject_category=object_category,
            object_category=subject_category
        )

    def addTaxonToFeature(self, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        self.taxon = taxonid
        self.graph.addTriple(
            self.fid,
            self.globaltt['in taxon'],
            self.taxon,
            subject_category=self.feature_category
        )

    def addFeatureProperty(self, property_type, feature_property):

        self.graph.addTriple(
            self.fid,
            property_type,
            feature_property,
            subject_category=self.feature_category
        )
Пример #53
0
    def _add_study_provenance(
            self,
            phenotyping_center,
            colony,
            project_fullname,
            pipeline_name,
            pipeline_stable_id,
            procedure_stable_id,
            procedure_name,
            parameter_stable_id,
            parameter_name,
            statistical_method,
            resource_name
    ):
        """
        :param phenotyping_center: str, from self.files['all']
        :param colony: str, from self.files['all']
        :param project_fullname: str, from self.files['all']
        :param pipeline_name: str, from self.files['all']
        :param pipeline_stable_id: str, from self.files['all']
        :param procedure_stable_id: str, from self.files['all']
        :param procedure_name: str, from self.files['all']
        :param parameter_stable_id: str, from self.files['all']
        :param parameter_name: str, from self.files['all']
        :param statistical_method: str, from self.files['all']
        :param resource_name: str, from self.files['all']
        :return: study bnode
        """

        provenance_model = Provenance(self.graph)
        model = Model(self.graph)

        # Add provenance
        # A study is a blank node equal to its parts
        study_bnode = self.make_id("{0}{1}{2}{3}{4}{5}{6}{7}".format(
            phenotyping_center, colony, project_fullname, pipeline_stable_id,
            procedure_stable_id, parameter_stable_id, statistical_method,
            resource_name), '_')

        model.addIndividualToGraph(
            study_bnode, None, self.globaltt['study'])

        # List of nodes linked to study with has_part property
        study_parts = []

        # Add study parts
        if procedure_stable_id in self.localtt:
            procedure_stable_id2 = self.localtt[procedure_stable_id]
        else:
            procedure_stable_id2 = self.resolve(procedure_stable_id)

        model.addIndividualToGraph(procedure_stable_id2, procedure_name)
        study_parts.append(procedure_stable_id2)

        study_parts.append(self.resolve(statistical_method))
        provenance_model.add_study_parts(study_bnode, study_parts)

        # Add parameter/measure statement: study measures parameter
        parameter_label = "{0} ({1})".format(parameter_name, procedure_name)

        logging.info("Adding Provenance for %s", project_fullname)
        pram_id = self.resolve(parameter_stable_id)
        model.addIndividualToGraph(pram_id, parameter_label)
        provenance_model.add_study_measure(study_bnode, pram_id)

        # Add Colony
        colony_bnode = self.make_id("{0}".format(colony), '_')
        model.addIndividualToGraph(colony_bnode, colony)

        # Add study agent
        phenotyping_center_id = self.localtt[phenotyping_center]
        model.addIndividualToGraph(
            phenotyping_center_id,
            phenotyping_center,
            self.globaltt['organization'])

        # self.graph
        model.addTriple(
            study_bnode, self.globaltt['has_agent'],  phenotyping_center_id)

        if pipeline_stable_id in self.localtt:
            pipeline_stable_id2 = self.localtt[pipeline_stable_id]
        else:
            pipeline_stable_id2 = self.resolve(pipeline_stable_id)

        # add pipeline and project
        model.addIndividualToGraph(pipeline_stable_id2, pipeline_name)
        # self.graph
        model.addTriple(study_bnode, self.globaltt['part_of'], pipeline_stable_id2)

        if project_fullname in self.localtt:
            project_fullname_id = self.localtt[project_fullname]
        else:
            project_fullname_id = self.resolve(project_fullname)

        model.addIndividualToGraph(
            project_fullname_id, project_fullname, self.globaltt['project'])

        # self.graph
        model.addTriple(study_bnode, self.globaltt['part_of'], project_fullname_id)

        return study_bnode
Пример #54
0
class Evidence:
    """
    To model evidence as the basis for an association.
    This encompasses:
        * measurements taken from the lab, and their significance.
            these can be derived from papers or other agents.
        * papers
    >1 measurement may result from an assay,
        each of which may have it's own significance

    """
    evidence_types = {
        'measurement datum': 'IAO:0000109',
        'zscore': 'STATO:0000104',
        'pvalue': 'OBI:0000175',
        'fold_change': 'STATO:0000169',
        'assay': 'OBI:0000070',
        'statistical_hypothesis_test': 'OBI:0000673',
        'effect_size': 'STATO:0000085',
        'percent_change': 'STATO:percent_change',
        'blood test evidence': 'ECO:0001016'
    }

    data_types = {
        'proportional_reporting_ratio': 'OAE:0001563',
        'odds_ratio': 'STATO:0000182',
        'count': 'SIO:000794',
    }

    object_properties = {
        'has_evidence': 'SEPIO:0000006',
        'has_supporting_evidence': 'SEPIO:0000007',
        'has_evidence': 'SEPIO:0000006',
        'has_supporting_data': 'SEPIO:0000084',
        'is_evidence_for': 'SEPIO:0000031',
        'is_refuting_evidence_for': 'SEPIO:0000033',
        'is_supporting_evidence_for': 'SEPIO:0000032',
        'is_evidence_supported_by': 'SEPIO:000010',
        'is_evidence_with_support_from': 'SEPIO:0000059',
        'has_significance': 'STATO:has_significance',
        'has_supporting_reference': 'SEPIO:0000124',
        'source': 'dc:source'
    }

    data_property = {
        'has_value': 'STATO:0000129',
        'has_measurement': 'IAO:0000004'
    }

    def __init__(self, graph, association):
        if isinstance(graph, Graph):
            self.graph = graph
        else:
            raise ValueError("{} is not a graph".graph)
        self.model = Model(self.graph)
        self.association = association

        return

    def add_supporting_evidence(self, evidence_line, type=None, label=None):
        """
        Add supporting line of evidence node to association id

        :param assoc_id: curie or iri, association id
        :param evidence_line: curie or iri, evidence line
        :return: None
        """
        self.graph.addTriple(self.association,
                             self.object_properties['has_supporting_evidence'],
                             evidence_line)
        if type is not None:
            self.model.addIndividualToGraph(evidence_line,
                                            label, type)
        return

    def add_evidence(self, evidence_line, ev_type=None, label=None):
        """
        Add line of evidence node to association id

        :param assoc_id: curie or iri, association id
        :param evidence_line: curie or iri, evidence line
        :return: None
        """
        self.graph.addTriple(self.association,
                             self.object_properties['has_evidence'],
                             evidence_line)
        if ev_type is not None:
            self.model.addIndividualToGraph(evidence_line, label, ev_type)
        return

    def add_data_individual(self, data_curie, label=None, ind_type=None):
        """
        Add data individual
        :param data_curie: str either curie formatted or long string,
                           long strings will be converted to bnodes
        :param type: str curie
        :param label: str
        :return: None
        """
        part_length = len(data_curie.split(':'))
        if part_length == 0:
            curie = "_:{}".format(data_curie)
        elif part_length > 2:
            raise ValueError("Misformatted curie {}".format(data_curie))
        else:
            curie = data_curie

        self.model.addIndividualToGraph(curie, label, ind_type)
        return

    def add_supporting_data(self, evidence_line, measurement_dict):
        """
        Add supporting data
        :param evidence_line:
        :param data_object: dict, where keys are curies or iris
        and values are measurement values for example:
            {
              "_:1234" : "1.53E07"
              "_:4567": "20.25"
            }
        Note: assumes measurements are RDF:Type 'ed elsewhere
        :return: None
        """
        for measurement in measurement_dict:
            self.graph.addTriple(evidence_line,
                                 self.object_properties['has_supporting_data'],
                                 measurement)

            self.graph.addTriple(measurement,
                                 self.data_property['has_value'],
                                 measurement_dict[measurement], True)
        return

    def add_supporting_publication(self, evidence_line, publication,
                                   label=None, pub_type=None):
        """
        <evidence> <SEPIO:0000124> <source>
        <source> <rdf:type> <type>
        <source> <rdfs:label> "label"
        :param evidence_line: str curie
        :param publication: str curie
        :param label: optional, str type as curie
        :param type: optional, str type as curie
        :return:
        """
        self.graph.addTriple(
            evidence_line,
            self.object_properties['has_supporting_reference'], publication
        )
        self.model.addIndividualToGraph(publication, label, pub_type)
        return

    def add_source(self, evidence_line, source, label=None, src_type=None):
        """
        Applies the triples:
        <evidence> <dc:source> <source>
        <source> <rdf:type> <type>
        <source> <rdfs:label> "label"

        TODO this should belong in a higher level class
        :param evidence_line: str curie
        :param source: str source as curie
        :param label: optional, str type as curie
        :param type: optional, str type as curie
        :return: None
        """
        self.graph.addTriple(evidence_line,
                             self.object_properties['source'],
                             source)
        self.model.addIndividualToGraph(source, label, src_type)
        return
Пример #55
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Пример #56
0
    def _build_gene_disease_model(
            self,
            gene_id,
            relation_id,
            disease_id,
            variant_label,
            consequence_predicate=None,
            consequence_id=None,
            allelic_requirement=None,
            pmids=None):
        """
        Builds gene variant disease model

        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        pmids = [] if pmids is None else pmids

        is_variant = False
        variant_or_gene = gene_id

        variant_id_string = variant_label
        variant_bnode = self.make_id(variant_id_string, "_")

        if consequence_predicate is not None \
                and consequence_id is not None:
            is_variant = True
            model.addTriple(variant_bnode,
                            consequence_predicate,
                            consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(consequence_id,
                               consequence_id.strip(':').replace('_', ' '))

        if is_variant:
            variant_or_gene = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(variant_bnode,
                                       variant_label,
                                       self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(
            self.graph, self.name, variant_or_gene, disease_id, relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        if allelic_requirement is not None and is_variant is False:
            model.addTriple(
                assoc.assoc_id, self.globaltt['has_allelic_requirement'],
                allelic_requirement)
            if allelic_requirement.startswith(':'):
                model.addLabel(
                    allelic_requirement,
                    allelic_requirement.strip(':').replace('_', ' '))
Пример #57
0
    def _get_process_allelic_variants(self, entry, g):
        model = Model(g)
        reference = Reference(g)
        geno = Genotype(g)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(
                            al_id, al_label, geno.genoparts['variant_locus'],
                            al_description)
                        geno.addAlleleOfGene(
                            al_id, 'OMIM:'+str(entry_num),
                            geno.object_properties[
                                'is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            g.addTriple(
                                pmid, model.object_properties['is_about'],
                                al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = \
                                re.split(r',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                re.split(
                                    r';;;',
                                    al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [
                                (re.match(r'(RCV\d+);*', r)).group(1)
                                for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" +
                            str(entry_num)+"#" + str(al_num).zfill(4))
                    elif re.search(
                            r'moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s',
                                     al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Пример #58
0
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say
        that a given publication is_about a gene.
        Publications are added as NamedIndividuals.

        These are filtered on the taxon.

        :param limit:
        :return:

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        assoc_counter = 0
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # ## set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #     if ((self.filter == 'taxids' and \
                #          (int(tax_num) not in self.tax_ids))
                #        or (self.filter == 'geneids' and \
                #            (int(gene_num) not in self.gene_ids))):
                #         continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                if gene_num == '-' or pubmed_num == '-':
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                if self.class_or_indiv.get(gene_id) == 'C':
                    model.addClassToGraph(gene_id, None)
                else:
                    model.addIndividualToGraph(gene_id, None)
                # add the publication as a NamedIndividual
                # add type publication
                model.addIndividualToGraph(pubmed_id, None, None)
                reference = Reference(
                    g, pubmed_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph()
                g.addTriple(
                    pubmed_id, model.object_properties['is_about'], gene_id)
                assoc_counter += 1
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info(
            "Processed %d pub-gene associations", assoc_counter)

        return