Пример #1
0
    def _map_eom_terms(self, raw, limit=None):
        """
        Map morphology terms to HP classes using the local tsv file.

        The first line of the file is treated as a header and skipped.
        Every other non-blank line must hold five tab-separated columns:
        morphology term id, morphology term label, HP id, HP label, notes.

        Triples:
            <eom id> owl:equivalentClass <hp id>

        :param raw: path of the tab-separated mapping file
        :param limit: if not None, stop once more than this many data
            rows have been read
        :return: None
        """

        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                # Drop only the line terminator (so empty trailing columns
                # survive) and ignore blank lines, which would otherwise
                # break the five-column unpacking below.
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                line_counter += 1

                (morphology_term_id, morphology_term_label,
                 hp_id, hp_label, notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = hp_id.replace('_', ':')
                if 'HP:' in hp_id:
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(
                        self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning(
                        'No matching HP term for %s', morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def _parse_curated_chem_disease(self, limit):
        """
        Parse the curated chemical-disease publication file.

        For every data row the chemical and the disease are added as
        classes, the publication (when present) is added as a reference,
        and an association between chemical and disease is created.
        Rows whose first column starts with '#' are skipped as comments;
        every other row is unpacked into ten tab-separated columns.

        :param limit: if not None (and not in test mode), stop once this
            many data rows have been processed
        :return: None
        """
        line_counter = 0
        file_path = '/'.join(
            (self.rawdir, self.static_files['publications']['file']))
        gu = GraphUtils(curie_map.get())
        with open(file_path, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter="\t")
            for row in reader:
                # catch comment lines
                if row and row[0].startswith('#'):
                    continue
                line_counter += 1
                self._check_list_len(row, 10)
                (pub_id, disease_label, disease_id, disease_cat, evidence,
                 chem_label, chem_id, cas_rn, gene_symbol, gene_acc) = row

                rel_id = self._get_relationship_id(evidence)
                chem_id = 'MESH:'+chem_id
                gu.addClassToGraph(self.g, chem_id, chem_label)
                gu.addClassToGraph(self.g, disease_id, None)
                if pub_id != '':
                    pub_id = 'PMID:'+pub_id
                    r = Reference(
                        pub_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(self.g)
                    pub_ids = [pub_id]
                else:
                    # NOTE(review): an empty list is passed when the row
                    # has no publication; confirm _make_association
                    # accepts it.
                    pub_ids = []

                # The ids already carry their prefixes at this point;
                # prefixing them again produced 'MESH:MESH:...' and
                # 'PMID:PMID:...', and raised a TypeError whenever the
                # publication was missing.
                self._make_association(chem_id, disease_id, rel_id, pub_ids)

                if not self.testMode and limit is not None \
                        and line_counter >= limit:
                    break
        return
Пример #3
0
    def _get_phenotypicseries_parents(entry, g):
        """
        Extract the phenotypic series parent relationship out of the entry.

        When the entry is flagged with 'phenotypicSeriesExists', the
        series numbers are collected from the entry's own
        'phenotypeMapList' and from the one nested under 'geneMap'.
        Each series is added as a class, and the subclass relationship is
        recorded via addSubclass(g, series_id, omimid).

        :param entry: mapping describing one OMIM entry; must contain
            'mimNumber'
        :param g: graph the classes and subclass triples are added to
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        omimid = 'OMIM:'+str(entry['mimNumber'])

        # collect the phenotype map lists from both possible locations
        phenolists = []
        if 'phenotypicSeriesExists' in entry \
                and entry['phenotypicSeriesExists'] is True:
            if 'phenotypeMapList' in entry:
                phenolists.append(entry['phenotypeMapList'])
            if 'geneMap' in entry and 'phenotypeMapList' in entry['geneMap']:
                phenolists.append(entry['geneMap']['phenotypeMapList'])

        # the phenotypic series mappings; maps without a series number are
        # skipped in both lists (previously only the geneMap list was
        # guarded, so the top-level list could raise a KeyError)
        serieslist = []
        for phenolist in phenolists:
            for p in phenolist:
                if 'phenotypicSeriesNumber' in p['phenotypeMap']:
                    serieslist.append(
                        p['phenotypeMap']['phenotypicSeriesNumber'])

        # add this entry as a subclass of the series entry
        for ser in serieslist:
            series_id = 'OMIM:'+ser
            gu.addClassToGraph(g, series_id, None)
            gu.addSubclass(g, series_id, omimid)

        return
Пример #4
0
    def _get_gene_history(self, limit):
        """
        Add discontinued gene ids from the gene_history file as deprecated
        classes, where the current gene id is the replacement.

        The old gene symbol is also added as a synonym of the current
        gene.  Rows with '-' as either gene id are skipped, as are rows
        whose taxon is not in self.tax_ids and (in test mode) rows whose
        gene is not in self.gene_ids.

        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been processed
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_history']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip comments
                if line.startswith('#'):
                    continue
                (tax_num, gene_num, discontinued_num, discontinued_symbol,
                 discontinued_date) = line.split('\t')

                # '-' marks a missing id; check before any int() conversion
                if gene_num == '-' or discontinued_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                discontinued_gene_id = ':'.join(
                    ('NCBIGene', discontinued_num))

                # add the two genes
                gu.addClassToGraph(g, gene_id, None)
                gu.addClassToGraph(
                    g, discontinued_gene_id, discontinued_symbol)

                # add the new gene id to replace the old gene id
                gu.addDeprecatedClass(g, discontinued_gene_id, [gene_id])

                # also add the old symbol as a synonym of the new gene
                gu.addSynonym(g, gene_id, discontinued_symbol)

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        return
Пример #5
0
    def _process_phenotypicseries(self, limit):
        """
        Create classes from the OMIM phenotypic series list.

        These are grouping classes to hook the more granular OMIM
        diseases.  Every line up to and including the first one that
        starts with 'Phenotypic Series' is treated as header and skipped.
        Each remaining data line holds a label and a series number
        separated by a tab.

        :param limit: accepted for signature parity; not used here
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        logger.info("getting phenotypic series titles")
        gu = GraphUtils(curie_map.get())
        start = False
        with open('/'.join(
                (self.rawdir, self.files['phenotypicSeries']['file']))) as f:
            for line in f:
                # there's several lines of header in the file,
                # so need to skip several lines:
                if not start:
                    if line.startswith('Phenotypic Series'):
                        start = True
                    continue
                # Skip blank and whitespace-only lines, plus lines made of
                # a single bare word (the original pattern skipped those
                # too).  The raw string avoids the invalid '\w' escape.
                if re.match(r'\w*\s*$', line):
                    continue
                (ps_label, ps_num) = line.strip().split('\t')
                omim_id = 'OMIM:'+ps_num
                gu.addClassToGraph(g, omim_id, ps_label)

        return
Пример #6
0
    def _process_genes(self, taxid, limit=None):
        """
        Add the Ensembl genes of one taxon to the graph.

        Each row must supply at least five tab-separated columns:
        ensembl gene id, external gene name, description, gene biotype and
        entrez gene id.  For taxid '9606' a sixth column, when present,
        is read as the HGNC id.  Every gene is added as a class,
        linked to its NCBIGene/HGNC equivalents when those are given, and
        associated with its taxon.

        :param taxid: taxon number as a string; also the key into
            self.files
        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been processed
        :return: None
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                # five columns are unpacked below, so fewer than five is
                # a data error (the previous check of < 4 let four-column
                # rows through to a ValueError)
                if len(row) < 5:
                    logger.error("Data error for file %s", raw)
                    return
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene) = row[0:5]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606' and len(row) > 5:
                    hgnc_id = row[5]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:'+ensembl_gene_id
                if description == '':
                    description = None
                # NOTE(review): the mapped gene type is discarded right
                # away, so genes are added untyped.  Kept as-is to
                # preserve the current output; confirm whether typing
                # should be re-enabled.
                gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                gu.addClassToGraph(
                    g, gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
                if hgnc_id is not None and hgnc_id != '':
                    gu.addEquivalentClass(g, gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #7
0
    def _get_gene2pubmed(self, limit):
        """
        Loop through the gene2pubmed file and add a simple triple saying
        that a given publication is_about a gene.

        Publications are added through addIndividualToGraph and genes
        through addClassToGraph.  Rows with '-' as gene or
        publication id are skipped, as are rows whose taxon is not in
        self.tax_ids and (in test mode) rows whose gene is not in
        self.gene_ids.

        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been processed
        :return: None
        """

        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        is_about = gu.getNode(gu.object_properties['is_about'])
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip comments
                if line.startswith('#'):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # '-' marks a missing id; it has to be ruled out before
                # the int() conversions below, which would raise on it
                if gene_num == '-' or pubmed_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                # add the gene, in case it hasn't before
                gu.addClassToGraph(g, gene_id, None)
                # add the publication as a NamedIndividual
                # add type publication
                gu.addIndividualToGraph(g, pubmed_id, None, None)
                # write to the selected graph (g), not always self.graph,
                # so that test mode keeps its triples in the test graph
                g.add((gu.getNode(pubmed_id), is_about, gu.getNode(gene_id)))

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        return
Пример #8
0
    def _process_ortholog_classes(self, limit=None):
        """
        This method adds the KEGG orthology classes to the graph.

        The class name column is a ';'-separated list: the first item is
        used as the label, the remaining items are added as synonyms and
        the last item is also added as the description.

        Triples created:
        <orthology_class_id> is a class
        <orthology_class_id> has label <orthology_symbols>
        <orthology_class_id> has description <orthology_description>
        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None
        """

        logger.info("Processing ortholog classes")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ortholog_classes']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (orthology_class_id, orthology_class_name) = row

                if self.testMode and orthology_class_id \
                        not in self.test_ids['ortholog_classes']:
                    continue

                # FIXME: What's the proper route for this?
                # The orthology class is essentially a KEGG gene ID
                # that is species agnostic.
                # Add the ID and label as a class.
                # Would it be considered a gene as well?

                other_labels = orthology_class_name.split(';')
                # the first one is the label we'll use
                orthology_label = other_labels[0]

                orthology_class_id = 'KEGG-'+orthology_class_id.strip()

                orthology_type = OrthologyAssoc.terms['gene_family']
                gu.addClassToGraph(
                    g, orthology_class_id, orthology_label, orthology_type)
                if len(other_labels) > 1:
                    # add the rest as synonyms; the first item is already
                    # the label, so it is not repeated as a synonym, and
                    # the padding left by the ';' split is removed
                    for s in other_labels[1:]:
                        gu.addSynonym(g, orthology_class_id, s.strip())

                    # add the last one as the description
                    gu.addDescription(
                        g, orthology_class_id, other_labels[-1].strip())

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Done with ortholog classes")
        return
Пример #9
0
    def _process_orthologs(self, raw, limit=None):
        """
        Map the genes of one species to their KEGG orthology classes.

        Every row of the tab-separated input holds a gene id and an
        orthology class id; both get the 'KEGG:' prefix.

        Triples created:
        <gene_id> is a class
        <orthology_class_id> is a class

        <assoc_id> has subject <gene_id>
        <assoc_id> has object <orthology_class_id>
        :param raw: path of the tab-separated gene/orthology file
        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None

        """

        logger.info("Processing orthologs")
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row_number, row in enumerate(rows, start=1):
                (raw_gene, raw_class) = row

                ko_id = 'KEGG:'+raw_class.strip()
                kegg_gene = 'KEGG:'+raw_gene.strip()

                # register the KO id as the gene-family grouping class
                # of this gene
                family_assoc = OrthologyAssoc(self.name, kegg_gene, None)
                family_assoc.add_gene_family_to_graph(g, ko_id)

                # both ends become classes; labels are expected
                # to be supplied elsewhere
                gu.addClassToGraph(g, kegg_gene, None)
                gu.addClassToGraph(g, ko_id, None)

                if self.testMode or limit is None:
                    continue
                if row_number > limit:
                    break

        logger.info("Done with orthologs")
        return
Пример #10
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        Link KEGG gene IDs to their NCBI Gene counterparts.

        Each row of the tab-separated input holds a KEGG gene id, an NCBI
        gene id written with the 'ncbi-geneid' prefix, and a link type
        (which is not used).

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None

        """

        logger.info("Processing KEGG gene IDs to NCBI gene IDs")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row_number, row in enumerate(rows, start=1):
                (kegg_raw, ncbi_raw, link_type) = row

                if self.testMode and kegg_raw not in self.test_ids['genes']:
                    continue

                # swap the source prefix for the NCBIGene one
                ncbi_id = ncbi_raw.replace('ncbi-geneid', 'NCBIGene')
                kegg_id = 'KEGG-'+kegg_raw

                # The KEGG gene may already be in the graph from the genes
                # table; adding it again covers ids that only occur here.
                gu.addClassToGraph(g, kegg_id, None)
                gu.addClassToGraph(g, ncbi_id, None)
                gu.addEquivalentClass(g, kegg_id, ncbi_id)

                if self.testMode or limit is None:
                    continue
                if row_number > limit:
                    break

        logger.info("Done with KEGG gene IDs to NCBI gene IDs")
        return
Пример #11
0
    def _process_diseases(self, limit=None):
        """
        Add the KEGG diseases to the graph as labelled classes.

        Each disease name is also remembered in self.label_hash under its
        prefixed id, unless that id is already present there.  This
        happens before the test-mode filter, so the hash is filled for
        every row that is read.

        Triples created:
        <disease_id> is a class
        <disease_id> rdfs:label <disease_name>
        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None

        """

        logger.info("Processing diseases")
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row_number, row in enumerate(rows, start=1):
                (raw_id, label) = row

                kegg_disease = 'KEGG-'+raw_id.strip()
                if kegg_disease not in self.label_hash:
                    self.label_hash[kegg_disease] = label

                if self.testMode \
                        and kegg_disease not in self.test_ids['disease']:
                    continue

                # The disease is added as a plain labelled class;
                # not all of these come from MONDO yet, see:
                # https://github.com/monarch-initiative/human-disease-ontology/issues/3
                # It is deliberately not typed as DOID:4, to avoid
                # bulking up the graph.
                gu.addClassToGraph(g, kegg_disease, label)

                if self.testMode or limit is None:
                    continue
                if row_number > limit:
                    break

        logger.info("Done with diseases")
        return
Пример #12
0
 def _add_variant_gene_relationship(self, variant_id, hgnc_symbol):
     """
     Add the gene as a class and record the variant as an allele of it.

     The gene id is looked up in self.gene_map by symbol.  When the
     symbol is unknown, an id is generated with make_cgd_id from the
     variant id and the symbol, and a warning is logged.

     :param variant_id: identifier of the variant
     :param hgnc_symbol: gene symbol, also used as the class label
     :return: None
     """
     gu = GraphUtils(curie_map.get())
     geno = Genotype(self.graph)
     if hgnc_symbol in self.gene_map:
         gene_id = self.gene_map[hgnc_symbol]
     else:
         gene_id = self.make_cgd_id("{0}{1}".format(variant_id, hgnc_symbol))
         # logger.warn is a deprecated alias; lazy %-style arguments
         # produce the same message text without eager formatting
         logger.warning("Can't map gene symbol %s "
                        "to entrez ID", hgnc_symbol)
     gu.addClassToGraph(self.graph, gene_id, hgnc_symbol)
     geno.addAlleleOfGene(variant_id, gene_id)
     return
Пример #13
0
    def process_gene_ids(self, limit):
        """
        Add the genes listed in the gzipped, comma-separated gene id file.

        Each row holds taxon number, gene number, gene symbol, gene
        synonym and a status column, e.g.
        6239,WBGene00000001,aap-1,Y110A7A.10,Live

        Every gene becomes a class typed as a gene, labelled with its
        symbol (falling back to the synonym, or no label at all),
        associated with its taxon, and flagged as deprecated when the
        status is 'Dead'.  A non-empty synonym is added as a synonym.

        :param limit: if not None (and not in test mode), stop once more
            than this many rows have been read
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['gene_ids']['file']))
        g = self.testgraph if self.testMode else self.graph
        gu = GraphUtils(curie_map.get())

        logger.info("Processing Gene IDs")
        geno = Genotype(g)
        with gzip.open(raw, 'rb') as gzfile:
            text_stream = io.TextIOWrapper(gzfile, newline="")
            rows = csv.reader(text_stream, delimiter=',', quotechar='\"')
            for row_number, row in enumerate(rows, start=1):
                (taxon_num, gene_num, symbol, synonym, status) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                wb_gene = 'WormBase:'+gene_num
                ncbi_taxon = 'NCBITaxon:'+taxon_num

                # label preference: symbol, then synonym, then nothing
                label = symbol or synonym or None
                gu.addClassToGraph(
                    g, wb_gene, label, Genotype.genoparts['gene'])
                if status == 'Dead':
                    gu.addDeprecatedClass(g, wb_gene)
                geno.addTaxon(ncbi_taxon, wb_gene)
                if synonym != '':
                    gu.addSynonym(g, wb_gene, synonym)

                if self.testMode or limit is None:
                    continue
                if row_number > limit:
                    break

        return
Пример #14
0
    def _get_titles(self, limit):
        """
        Read the titles file and add one class per row.

        The file is tab-separated with a single header row, which is
        skipped.  Each remaining row is unpacked as
        (shortname, title, nbk_num).  The class id is
        'GeneReviews:' + nbk_num, labelled with the title and given the
        shortname as a synonym.

        Every nbk_num is recorded in self.book_ids regardless of the
        limit; only the graph additions are subject to it.

        :param limit: if not None, graph additions are made only for rows
            whose line number (header included) is below this value
        :return: None
        """
        raw = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding='latin-1') as csvfile:
            rows = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row_number, row in enumerate(rows, start=1):
                if row_number == 1:
                    # header row
                    continue
                (shortname, title, nbk_num) = row
                book_class = 'GeneReviews:'+nbk_num

                # a global set of the book nums
                self.book_ids.add(nbk_num)

                if limit is not None and row_number >= limit:
                    continue
                gu.addClassToGraph(self.graph, book_class, title)
                gu.addSynonym(self.graph, book_class, shortname)

        return
Пример #15
0
    def _get_mappedids(self, entry, g):
        """
        Extract the Orphanet, UMLS and NCBI gene mappings from the entry.

        All three are read from entry['externalLinks'] when present:
        Orphanet and UMLS ids are added as classes and as xrefs of the
        OMIM id; NCBI gene ids are added as equivalent classes, but only
        when the entry's OMIM type is a gene.

        :param entry: mapping describing one OMIM entry; must contain
            'mimNumber'
        :param g: graph the classes, xrefs and equivalences are added to
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        omimid = 'OMIM:'+str(entry['mimNumber'])
        if 'externalLinks' in entry:
            links = entry['externalLinks']
            if 'orphanetDiseases' in links:
                # triple semi-colon delimited list of
                # double semi-colon delimited orphanet ID/disease pairs
                # 2970;;566;;Prune belly syndrome
                for item in links['orphanetDiseases'].split(';;;'):
                    (orpha_num, internal_num, orpha_label) = item.split(';;')
                    orpha_id = 'Orphanet:'+orpha_num.strip()
                    gu.addClassToGraph(g, orpha_id, orpha_label.strip())
                    gu.addXref(g, omimid, orpha_id)

            if 'umlsIDs' in links:
                # comma delimited list of UMLS ids
                for umls_num in links['umlsIDs'].split(','):
                    umls_id = 'UMLS:'+umls_num
                    gu.addClassToGraph(g, umls_id, None)
                    gu.addXref(g, omimid, umls_id)

            if self._get_omimtype(entry) == Genotype.genoparts['gene'] \
                    and 'geneIDs' in links:
                # comma delimited list of NCBI gene ids
                for gene_num in links['geneIDs'].split(','):
                    gu.addEquivalentClass(g, omimid, 'NCBIGene:'+gene_num)

        return
Пример #16
0
    def add_disease_drug_variant_to_graph(self, table):
        """
        Add variant / disease / drug relationships to the graph.

        Takes an iterable of iterables; every row must unpack into eleven
        values, optional ones can be Null:
        [[variant_key, variant_label, diagnoses_key, diagnoses,
          specific_diagnosis, organ, relationship,
          drug_key, drug, therapy_status (optional), pubmed_id(optional)]]

        The specific diagnosis, when given, is preferred over the general
        diagnosis as the disease label.

        See ongoing discussion of how to best model here:
        https://github.com/monarch-initiative/mckb/issues/9

        :param table: iterable of iterables, for example, a tuple of tuples
                      from _get_disease_drug_variant_relationship
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        for (variant_key, variant_label, diagnoses_key, diagnoses,
             specific_diagnosis, organ, relationship,
             drug_key, drug_label, therapy_status, pubmed_id) in table:

            if specific_diagnosis is None:
                dx_label = diagnoses
            else:
                dx_label = specific_diagnosis

            # Arbitrary IDs to be replaced by ontology mappings
            variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
            disease_id = self._get_disease_id(diagnoses_key, dx_label)
            status_id = self.make_cgd_id('{0}'.format(therapy_status))
            environment_rel = "RO:has_environment"
            quality_id = ("CGD:{0}".format(relationship)).replace(" ", "_")
            quality_rel = "BFO:0000159"
            drug_id = self._get_drug_id(drug_key, drug_label)

            geno.addGenotype(
                variant_id, variant_label,
                geno.genoparts['sequence_alteration'])

            disease_inst = self.make_cgd_id(
                'disease{0}{1}'.format(dx_label, variant_key))
            phenotype_inst = self.make_cgd_id(
                'phenotype{0}{1}{2}'.format(
                    dx_label, variant_key, relationship))

            if relationship == "detrimental effect":
                phenotype_label = \
                    "{0} with therapeutic response {1} to health".format(
                        dx_label, relationship)
            else:
                phenotype_label = "{0} with {1} to therapy".format(
                    dx_label, relationship)

            # Id of the reified variant-to-phenotype association
            annot_id = self.make_cgd_id(
                "assoc{0}{1}".format(variant_key, dx_label))

            # Classes for the disease and the drug, an individual for
            # the phenotype
            gu.addClassToGraph(self.graph, disease_id, dx_label, 'DOID:4')
            gu.addClassToGraph(self.graph, drug_id, drug_label, 'CHEBI:23888')
            gu.addIndividualToGraph(
                self.graph, phenotype_inst, phenotype_label, disease_id)
            gu.loadObjectProperties(
                self.graph, {relationship: environment_rel})

            # a publication, when given, supplies both source and evidence
            has_publication = pubmed_id is not None
            if has_publication:
                source_id = "PMID:{0}".format(pubmed_id)
                evidence = 'ECO:0000033'
                reference = Reference(
                    source_id, Reference.ref_types['journal_article'])
                reference.addRefToGraph(self.graph)

            assoc = G2PAssoc(
                self.name, variant_id, phenotype_inst,
                gu.object_properties['has_phenotype'])
            assoc.set_association_id(annot_id)
            if has_publication:
                assoc.add_evidence(evidence)
                assoc.add_source(source_id)
            assoc.add_association_to_graph(self.graph)

            gu.addTriple(self.graph, annot_id, environment_rel, drug_id)
            gu.addTriple(self.graph, phenotype_inst, quality_rel, quality_id)

            # Add therapy-disease association and approval status
            biomarker_rel = "RO:has_biomarker"
            disease_inst_label = "{0} with biomarker {1}".format(
                dx_label, variant_label)
            gu.addIndividualToGraph(
                self.graph, disease_inst, disease_inst_label, disease_id)
            gu.addTriple(self.graph, disease_inst, biomarker_rel, variant_id)

            gu.addClassToGraph(self.graph, status_id, therapy_status)
            self._add_therapy_drug_association(
                drug_id, disease_inst, status_id)

        return
Пример #17
0
    def _get_gene_info(self, limit):
        """
        Loop through the gzipped gene_info file and add the genes to the graph.

        The graph written to is ``self.testgraph`` in test mode, otherwise ``self.graph``.
        Rows are kept only when their taxon is in ``self.tax_ids`` (and, in test mode, when
        their gene number is in ``self.gene_ids``).  For each kept row this will:

          * add the gene as a class, typed by ``self._map_type_of_gene``, labelled with its
            symbol (no label when the symbol is 'NEWENTRY') and described by ``desc``;
          * add the full name as a synonym, and every pipe-delimited synonym and
            "other designation" as a related synonym;
          * add the xrefs: HPRD ids through the ``has_gene_product`` property, all others
            (except Vega and IMGT/GENE-DB) as equivalent classes;
          * add each listed chromosome as a chromosome class, and make the gene
            ``is_subsequence_of`` either the chromosome band (when ``map_loc`` looks like a
            simple band such as 9q34.1) or the chromosome itself;
          * add the taxon to the gene.

        Rows that list more than one chromosome (other than the X|Y case) stop being
        processed at the chromosome step: the class, synonyms and xrefs are already in the
        graph, but no location and no taxon are added for them.

        :param limit: when not None (and not in test mode), stop once more than
            ``limit`` taxon-filtered rows have been processed
        :return: None
        """
        gu = GraphUtils(curie_map.get())

        g = self.testgraph if self.testMode else self.graph

        geno = Genotype(g)

        # the file is read straight from its gzipped form (not unzipped first)
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", myfile)

        # matches "regular" cytogenetic bands such as 9q34, Xp22.33, 4q21.1
        # TODO we probably need a different regex per organism
        band_pattern = re.compile(r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$')

        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            geno.addGenome(tax_id, str(tax_num))   # tax label can get added elsewhere
            gu.addClassToGraph(g, tax_id, None)   # label added elsewhere

        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                (tax_num, gene_num, symbol, locustag,
                 synonyms, xrefs, chrom, map_loc, desc,
                 gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date) = line.split('\t')

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self._map_type_of_gene(gtype)

                label = None if symbol == 'NEWENTRY' else symbol

                # TODO might have to figure out if things aren't genes, and make them individuals
                # genes are handled as classes here, not as individuals
                gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

                if name != '-':
                    gu.addSynonym(g, gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])

                # deal with the xrefs
                # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
                if xrefs.strip() != '-':
                    for r in xrefs.strip().split('|'):
                        fixedr = self._cleanup_id(r)
                        if fixedr is None or fixedr.strip() == '':
                            continue
                        if fixedr.startswith('HPRD'):
                            # proteins are not == genes.
                            gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
                        elif fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
                            # skip some of these for now
                            gu.addEquivalentClass(g, gene_id, fixedr)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # 419     ART3      4    with   4q21.1|4p15.1-p14   # no idea why there's two bands listed - possibly 2 assemblies
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3   #this is of "unknown" type == susceptibility
                # 101928066       LOC101928066    1|Un    -         # unlocated scaffold
                # 11435   Chrna1  2       2 C3|2 43.76 cM           # mouse --> 2C3
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM       # mouse --> 11B1.1
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table when there is > 1 listed
                # with the exception of human X|Y, i will only take those that align to one chr

                # FIXME remove the chr mapping below when we pull in the genomic coords
                if chrom != '-' and chrom != '':
                    if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                        # this means that there's uncertainty in the mapping.  skip it
                        # TODO we'll need to figure out how to deal with >1 loc mapping
                        # NOTE(review): this `continue` also bypasses the addTaxon call and
                        # the limit check at the bottom of the loop -- confirm that is intended
                        logger.info('%s is non-uniquely mapped to %s.  Skipping for now.', gene_id, chrom)
                        continue

                    if chrom == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing

                    # map_loc is the same for every chromosome of the row, so match it once
                    band_match = band_pattern.match(map_loc)

                    # do this in a loop to allow PAR regions like X|Y
                    for c in chrom.split('|'):
                        geno.addChromosomeClass(c, tax_id, None)  # assume that the chromosome label will get added elsewhere
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        mychrom_syn = makeChromLabel(c, tax_num)  # temporarily use the taxnum for the disambiguating label
                        gu.addSynonym(g, mychrom, mychrom_syn)
                        if band_match is not None:
                            # this matches the regular kind of chrs, so make that kind of band
                            # the map_loc already starts with the chromosome, strip it first
                            # (escaped so the chromosome name is always taken literally)
                            bid = re.sub('^' + re.escape(c), '', map_loc)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')  # the generic location (no coordinates)
                            band = Feature(maploc_id, None, None)  # Assume it's type will be added elsewhere
                            band.addFeatureToGraph(g)
                            # add the band as the containing feature
                            gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id)
                        else:
                            # TODO handle these cases
                            # examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter, 10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,  12cen-q21, 22q13.3|22q13.3
                            logger.debug('not regular band pattern for %s: %s', gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom)

                geno.addTaxon(tax_id, gene_id)

                # the limit is ignored in test mode
                if not self.testMode and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #18
0
class Pathway:
    """
    Convenience methods for placing genes, gene products and other
    components into pathways within a graph.
    """

    # curies for the pathway-related classes used below
    pathway_parts = {
        'signal_transduction': 'GO:0007165',
        'cellular_process': 'GO:0009987',
        'pathway': 'PW:0000001',
        'gene_product': 'CHEBI:33695'  # bioinformation molecule
    }

    # curies for the relations used below
    object_properties = {
        'involved_in': 'RO:0002331',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205'
    }

    # independent copy of the object properties
    properties = dict(object_properties)

    def __init__(self, graph, nobnodes=False):
        """
        Keep hold of the target graph and load this class's object
        properties into it.

        :param graph: the graph that all methods of this instance write to
        :param nobnodes: when True, generated gene-product ids get a
            leading ':' (see addGeneToPathway)
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.nobnodes = nobnodes

        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

    def addPathway(
            self, pathway_id, pathway_label, pathway_type=None,
            pathway_description=None):
        """
        Add a pathway to the graph as a class.

        The class is added with the given type, falling back to the
        'cellular_process' term when no type is supplied.  In either case
        an addSubclass call relating the generic 'pathway' term and the
        pathway id is made as well.

        :param pathway_id: identifier of the pathway
        :param pathway_label: label for the pathway class
        :param pathway_type: type for the class; defaults to the
            'cellular_process' entry of pathway_parts
        :param pathway_description: optional description for the class
        :return: None
        """
        class_type = (
            self.pathway_parts['cellular_process']
            if pathway_type is None else pathway_type)

        self.gu.addClassToGraph(
            self.graph, pathway_id, pathway_label, class_type,
            pathway_description)
        self.gu.addSubclass(
            self.graph, self.pathway_parts['pathway'], pathway_id)

    def addGeneToPathway(self, pathway_id, gene_id):
        """
        Attach a gene to a pathway by way of an intermediate
        'gene product' node whose id is derived from the gene id:

            gene_id RO:has_gene_product _gene_product
            _gene_product RO:involved_in pathway_id

        The generated id is '_' + the gene id without its colons +
        'product', prefixed with ':' when nobnodes is set.

        :param pathway_id: identifier of the pathway
        :param gene_id: identifier of the gene
        :return: None
        """
        product_id = '_{}product'.format(gene_id.replace(':', ''))
        if self.nobnodes:
            product_id = ':' + product_id

        # the intermediate node is typed as a gene product
        self.gu.addIndividualToGraph(
            self.graph, product_id, None,
            self.pathway_parts['gene_product'])
        # gene --has_gene_product--> product
        self.gu.addTriple(
            self.graph, gene_id,
            self.object_properties['has_gene_product'], product_id)
        # product --involved_in--> pathway
        self.addComponentToPathway(pathway_id, product_id)

    def addComponentToPathway(self, pathway_id, component_id):
        """
        State that a component is involved in a pathway.

        Use this when the component takes part in the pathway directly;
        for a gene that acts through its product, use addGeneToPathway.

        :param pathway_id: identifier of the pathway
        :param component_id: identifier of the participating component
        :return: None
        """
        involved_in = self.object_properties['involved_in']
        self.gu.addTriple(
            self.graph, component_id, involved_in, pathway_id)
Пример #19
0
    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:'+catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_person'
                    if self.nobnodes:
                        patient_id = ':'+patient_id
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    gu.addIndividualToGraph(
                        g, cell_line_id, line_label, cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:'+dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        gu.addIndividualToGraph(
                            g, equiv_cell_line, None, cell_line_reagent_id)
                        gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    gu.addMember(g, repository, cell_line_id)

                    if cat_remark != '':
                        gu.addDescription(g, cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                        # this would give a BNode that is an instance of Age.
                        # but i don't know how to connect
                        # the age node to the cell line? we need to ask @mbrush
                        # age_id = '_'+re.sub('\s+','_',age)
                        # gu.addIndividualToGraph(
                        #   g,age_id,age,self.terms['age'])
                        # gu.addTriple(
                        #   g,age_id,self.properties['has_measurement'],age,
                        #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    gu.addPerson(g, patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        gu.addSubclass(
                    #           g,self.terms['ethnic_group'],mapped_race)

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:'+family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        gu.addIndividualToGraph(
                            g, family_comp_id, family_label,
                            geno.genoparts['family'])

                        # Add the patient as a member of the family
                        gu.addMemberOf(g, patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                        if self.nobnodes:
                            karyotype_id = ':'+karyotype_id
                        # add karyotype as karyotype_variation_complement
                        gu.addIndividualToGraph(
                            g, karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(
                                karyotype_feature_id, karyotype_feature_label,
                                geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph(g)
                            f.loadAllProperties(g)
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene+'('+mutation+')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                    + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    if gvc_id is not None and gvc_id != karyotype_id \
                            and self.nobnodes:
                        gvc_id = ':'+gvc_id

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            if self.nobnodes:
                                vslc_id = ':'+vslc_id
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            gu.addIndividualToGraph(
                                g, vslc_id, vslc_label,
                                geno.genoparts[
                                    'variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:'+o+'.'+v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'],
                                    geno.object_properties[
                                        'has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        gu.addType(g, patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_geno'+catalog_id.strip()
                        if self.nobnodes:
                            genotype_id = ':'+genotype_id

                    # add the gvc
                    if gvc_id is not None:
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id,
                                    geno.object_properties[
                                        'has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' ['+catalog_id.strip()+']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(
                            genotype_id, genotype_label,
                            geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        gu.addTriple(
                            g, patient_id,
                            geno.properties['has_genotype'], genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:'+d.strip()
                                        # assume the label is taken care of
                                        gu.addClassToGraph(g, disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            self.name, patient_id, disease_id)
                                        assoc.add_association_to_graph(g)

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        gu.addTriple(
                                            g, cell_line_id,
                                            gu.properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:'+s.strip()
                            ref = Reference(pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph(g)
                            gu.addTriple(
                                g, pubmed_id, gu.properties['mentions'],
                                cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break

            Assoc(self.name).load_all_properties(g)

        return
Пример #20
0
    def _process_phenotype_data(self, limit):
        """
        Parse the catalog file into strains, alleles, genes,
        allele-to-phenotype associations and one genotype per strain.

        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        The work is done in two passes:
          1. every catalog row is read; the strain individual, its
             publications and the allele-to-phenotype associations are
             written immediately, while the alleles and genes seen for each
             strain are collected in ``self.strain_hash``;
          2. for every collected strain the variant loci, VSLCs, GVC,
             background and genotype are built and linked to the strain.

        Side effects: ``self.strain_hash`` and ``self.id_label_hash`` are
        reset and repopulated by this call.

        :param limit: when not None (and not in test mode), stop reading
            rows once the line counter exceeds this value; the counter
            includes the three header lines
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                # the test-id filter is applied to the raw id,
                # i.e. before the holding-center suffix is removed
                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        # the prefix is detected case-insensitively, so it
                        # must be rewritten case-insensitively as well;
                        # otherwise e.g. 'geneid:123' is left unnormalised
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id, flags=re.I)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for term in mp_ids.split(','):
                        mps = re.search(r'\[(.*)\]', term.strip())
                        if mps is not None:
                            phenotype_ids.append(mps.group(1).strip())

                # pubmed ids are space delimited;
                # str.split() with no argument drops the empty tokens that
                # leading/trailing whitespace would otherwise produce,
                # so no bare 'PMID:' reference is ever created
                pubmed_ids = []
                for num in pubmed_nums.split():
                    pmid = 'PMID:'+num
                    pubmed_ids.append(pmid)
                    r = Reference(pmid,
                                  Reference.ref_types['journal_article'])
                    r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                # strains are typed as mouse unless flagged as ES,
                # in which case the stem cell class is used
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                         gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash[s]
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        # the allele can only be tied to a gene when the
                        # strain has exactly one of each
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_'+gene+'-VL'
                        vl_id = re.sub(r':', '', vl_id)
                        if self.nobnodes:
                            vl_id = ':'+vl_id
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                # (sorted so the derived ids/labels are deterministic)
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    if self.nobnodes:
                        vslc_id = ':' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r':', '', gvc_id)
                        if self.nobnodes:
                            gvc_id = ':'+gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = \
                        '_' + re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    # NOTE: the genotype id is deliberately derived before
                    # bkgd_id receives its optional ':' prefix below;
                    # do not reorder these statements
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    if self.nobnodes:
                        bkgd_id = ':'+bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified ('+s+')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for "+s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id,
                        geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    gu.addTriple(
                        g, s, geno.object_properties['has_genotype'],
                        genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            gu.loadProperties(
                g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
            gu.loadProperties(
                g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
            gu.loadProperties(
                g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
            gu.loadAllProperties(g)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(genes_with_no_ids)))

        return
Пример #21
0
class Monochrom(Source):
    """
    This class will leverage the GENO ontology and modeling patterns to build
    an ontology of chromosomes for any species. These classes represent major
    structural pieces of Chromosomes which are often universally referenced,
    using physical properties/observations that remain constant over different
    genome builds (such as banding patterns and arms). The idea is to create a
    scaffold upon which we can hang build-specific chromosomal coordinates,
    and reason across them.

    In general, this will take the cytogenic bands files from UCSC, and create
    missing grouping classes, in order to build the partonomy from a very
    specific chromosomal band up through the chromosome itself and enable
    overlap and containment queries.  We use RO:subsequence_of as our
    relationship between nested chromosomal parts. For example,
    13q21.31 ==>  13q21.31,  13q21.3,  13q21,  13q2,  13q, 13

    At the moment, this only computes the bands for
    Human, Mouse, Zebrafish, and Rat
    but will be expanding in the future as needed.

    Because this is a universal framework to represent the chromosomal
    structure of any species, we must mint identifiers for each chromosome
    and part. We differentiate species by first creating a species-specific
    genome, then for each species-specific chromosome we include the NCBI taxon
    number together with the chromosome number, like:
    ```<species number>chr<num><band>```.  For 13q21.31, this would be
    9606chr13q21.31.
    We then create triples for a given band like:
    <pre>
    CHR:9606chr1p36.33 rdf[type] SO:chromosome_band
    CHR:9606chr1p36 subsequence_of :9606chr1p36.3
    </pre>
    where any band in the file is an instance of a chr_band
    (or a more specific type), is a subsequence of it's containing region.

    We determine the containing regions of the band by parsing the band-string;
    since each alphanumeric is a significant "place", we can split it with the
    shorter strings being parents of the longer string

    Since this is small, and we have not limited other items in our test set to
    a small region, we simply use the whole graph (genome)
    for testing purposes, and copy the main graph to the test graph.

    Since this Dipper class is building an ONTOLOGY,
    rather than instance-level data, we must also include domain and range
    constraints, and other owl-isms.

    TODO: any species by commandline argument

    We are currently mapping these to the **CHR idspace**,
    but this is NOT YET APPROVED and is subject to change.
    """

    files = {
        '9606': {
            'file': '9606cytoBand.txt.gz',
            'url': MCDL + '/hg19/database/cytoBand.txt.gz',
            'build_num': 'hg19',
            'genome_label': 'Human'
        },
        '10090': {
            'file': '10090cytoBand.txt.gz',
            'url': MCDL + '/mm10/database/cytoBandIdeo.txt.gz',
            'build_num': 'mm10',
            'genome_label': 'Mouse'
        },
        # Note that there are no bands, arms or staining components
        # for the following genomes at the moment
        '7955': {
            'file': '7955cytoBand.txt.gz',
            'url': MCDL + '/danRer10/database/cytoBandIdeo.txt.gz',
            'build_num': 'danRer10',
            'genome_label': 'Zebrafish'
        },
        '10116': {
            'file': '10116cytoBand.txt.gz',
            'url': MCDL + '/rn6/database/cytoBandIdeo.txt.gz',
            'build_num': 'rn6',
            'genome_label': 'Rat'
        },
        '9913': {
            'file': 'bosTau7cytoBand.txt.gz',
            'url': MCDL + '/bosTau7/database/cytoBandIdeo.txt.gz',
            'build_num': 'bosTau7',
            'genome_label': 'cow'
        },
        '9031': {
            'file': 'galGal4cytoBand.txt.gz',
            'url': MCDL + '/galGal4/database/cytoBandIdeo.txt.gz',
            'build_num': 'galGal4',
            'genome_label': 'chicken'
        },
        '9823': {
            'file': 'susScr3cytoBand.txt.gz',
            'url': MCDL + '/susScr3/database/cytoBandIdeo.txt.gz',
            'build_num': 'susScr3',
            'genome_label': 'pig'
        },
        '9940': {
            'file': 'oviAri3cytoBand.txt.gz',
            'url': MCDL + '/oviAri3/database/cytoBandIdeo.txt.gz',
            'build_num': 'oviAri3',
            'genome_label': 'sheep'
        },
        '9796': {
            'file': 'equCab2cytoBand.txt.gz',
            'url': MCDL + '/equCab2/database/cytoBandIdeo.txt.gz',
            'build_num': 'equCab2',
            'genome_label': 'horse'
        },
    }

    region_type_map = {
        'acen': Feature.types['centromere'],
        'gvar': Feature.types['chromosome_band'],
        'stalk': Feature.types['chromosome_band'],
        'gneg': Feature.types['chromosome_band'],
        'gpos100': Feature.types['chromosome_band'],
        'gpos25': Feature.types['chromosome_band'],
        'gpos33': Feature.types['chromosome_band'],
        'gpos50': Feature.types['chromosome_band'],
        'gpos66': Feature.types['chromosome_band'],
        'gpos75': Feature.types['chromosome_band'],
        'chromosome': Feature.types['chromosome'],
        'chromosome_arm': Feature.types['chromosome_arm'],
        'chromosome_band': Feature.types['chromosome_band'],
        'chromosome_part': Feature.types['chromosome_part']
    }

    def __init__(self, tax_ids=None):
        super().__init__('monochrom')

        self.tax_ids = tax_ids
        self.load_bindings()
        self.gu = GraphUtils(curie_map.get())

        # Defaults
        if self.tax_ids is None:
            self.tax_ids = [
                9606, 10090, 7955, 10116, 9913, 9031, 9823, 9940, 9796]

        self._check_tax_ids()

        # TODO add license
        self.dataset = Dataset(
            'monochrom', 'Monarch Chromosome Ontology',
            'http://monarchinitiative.org', None,
            'http://creativecommons.org/licenses/by/4.0/')

        return

    def fetch(self, is_dl_forced=False):

        self.get_files(is_dl_forced)
        return

    def parse(self, limit=None):

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        for taxon in self.tax_ids:
            self._get_chrbands(limit, str(taxon))

        self.load_core_bindings()
        self.load_bindings()

        # using the full graph as the test here
        self.testgraph = self.graph
        logger.info("Found %d nodes", len(self.graph))
        logger.info("Done parsing files.")

        return

    def _get_chrbands(self, limit, taxon):
        """
        For the given taxon, it will fetch the chr band file.
        We will not deal with the coordinate information with this parser.
        Here, we only are concerned with building the partonomy.
        :param limit:
        :return:

        """
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files[taxon]['file']))
        logger.info("Processing Chr bands from FILE: %s", myfile)
        geno = Genotype(self.graph)

        # build the organism's genome from the taxon
        genome_label = self.files[taxon]['genome_label']
        taxon_id = 'NCBITaxon:'+taxon

        # add the taxon as a class.  adding the class label elsewhere
        self.gu.addClassToGraph(self.graph, taxon_id, None)
        self.gu.addSynonym(self.graph, taxon_id, genome_label)

        self.gu.loadObjectProperties(self.graph, Feature.object_properties)

        genome_id = geno.makeGenomeID(taxon_id)
        geno.addGenome(taxon_id, genome_label)
        self.gu.addOWLPropertyClassRestriction(
            self.graph, genome_id, Genotype.object_properties['in_taxon'],
            taxon_id)

        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # chr13	4500000	10000000	p12	stalk
                (chrom, start, stop, band, rtype) = line.split('\t')
                line_counter += 1

                # NOTE
                # some less-finished genomes have placed and unplaced scaffolds
                # * Placed scaffolds:
                #    Scaffold has an oriented location within a chromosome.
                # * Unlocalized scaffolds:
                #     scaffold 's chromosome  is known,
                #     scaffold's position, orientation or both is not known.
                # *Unplaced scaffolds:
                #   it is not known which chromosome the scaffold belongs to.

                # find out if the thing is a full on chromosome, or a scaffold:
                # ex: unlocalized scaffold: chr10_KL568008v1_random
                # ex: unplaced scaffold: chrUn_AABR07022428v1
                placed_scaffold_pattern = r'chr(\d+|X|Y|Z|W|MT|M)'

                # TODO unused
                # unlocalized_scaffold_pattern = \
                #    placed_scaffold_pattern + r'_(\w+)_random'
                # unplaced_scaffold_pattern = r'chrUn_(\w+)'

                m = re.match(placed_scaffold_pattern+r'$', chrom)
                if m is not None and len(m.groups()) == 1:
                    # the chromosome is the first match of the pattern
                    # ch = m.group(1)  # TODO unused
                    pass
                else:
                    # let's skip over anything that isn't a placed_scaffold
                    # at the class level
                    logger.info("Skipping non-placed chromosome %s", chrom)
                    continue
                # the chrom class, taxon as the reference
                cclassid = makeChromID(chrom, taxon, 'CHR')

                # add the chromosome as a class
                geno.addChromosomeClass(chrom, taxon_id, genome_label)
                self.gu.addOWLPropertyClassRestriction(
                    self.graph, cclassid,
                    self.gu.object_properties['member_of'], genome_id)

                # add the band(region) as a class
                maplocclass_id = cclassid+band
                maplocclass_label = makeChromLabel(chrom+band, genome_label)
                if band is not None and band.strip() != '':
                    region_type_id = self.map_type_of_region(rtype)
                    self.gu.addClassToGraph(
                        self.graph, maplocclass_id, maplocclass_label,
                        region_type_id)
                else:
                    region_type_id = Feature.types['chromosome']
                # add the staining intensity of the band
                if re.match(r'g(neg|pos|var)', rtype):
                    if region_type_id in [
                            Feature.types['chromosome_band'],
                            Feature.types['chromosome_subband']]:
                        stain_type = Feature.types.get(rtype)
                        if stain_type is not None:
                            self.gu.addOWLPropertyClassRestriction(
                                self.graph, maplocclass_id,
                                Feature.properties['has_staining_intensity'],
                                Feature.types.get(rtype))
                    else:
                        # usually happens if it's a chromosome because
                        # they don't actually have banding info
                        logger.info("feature type %s != chr band",
                                    region_type_id)
                else:
                    logger.warning('staining type not found: %s', rtype)

                # get the parent bands, and make them unique
                parents = list(self.make_parent_bands(band, set()))
                # alphabetical sort will put them in smallest to biggest
                parents.sort(reverse=True)

                # print("PARENTS of",maplocclass_id,"=",parents)
                # add the parents to the graph, in hierarchical order
                # TODO this is somewhat inefficient due to
                # re-adding upper-level nodes when iterating over the file
                # TODO PYLINT Consider using enumerate
                # instead of iterating with range and len
                for i in range(len(parents)):
                    pclassid = cclassid+parents[i]  # class chr parts
                    pclass_label = \
                        makeChromLabel(chrom+parents[i], genome_label)

                    rti = getChrPartTypeByNotation(parents[i])

                    self.gu.addClassToGraph(
                        self.graph, pclassid, pclass_label, rti)

                    # for canonical chromosomes,
                    # then the subbands are subsequences of the full band
                    # add the subsequence stuff as restrictions
                    if i < len(parents) - 1:
                        pid = cclassid+parents[i+1]   # the instance
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, pclassid,
                            Feature.object_properties['is_subsequence_of'],
                            pid)
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, pid,
                            Feature.object_properties['has_subsequence'],
                            pclassid)

                    else:
                        # add the last one (p or q usually)
                        # as attached to the chromosome
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, pclassid,
                            Feature.object_properties['is_subsequence_of'],
                            cclassid)
                        self.gu.addOWLPropertyClassRestriction(
                            self.graph, cclassid,
                            Feature.object_properties['has_subsequence'],
                            pclassid)

                # connect the band here to the first one in the parent list
                if len(parents) > 0:
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, maplocclass_id,
                        Feature.object_properties['is_subsequence_of'],
                        cclassid+parents[0])
                    self.gu.addOWLPropertyClassRestriction(
                        self.graph, cclassid+parents[0],
                        Feature.object_properties['has_subsequence'],
                        maplocclass_id)

                if limit is not None and line_counter > limit:
                    break

        self.gu.loadAllProperties(self.graph)

        # TODO figure out the staining intensities for the encompassing bands

        return

    def make_parent_bands(self, band, child_bands):
        """
        Collect the enclosing (grouping) bands of a cytogenetic band label
        by repeatedly dropping its last character (and any period left
        dangling at the end).

        The label must begin with the arm letter for anything to be
        collected, e.g.
        q21.31 ==>  q21.3, q21, q2, q

        :param band: band label, e.g. 'q21.31'
        :param child_bands: set that receives the enclosing band labels
        :return: child_bands (mutated in place), or a new empty set
                 when band is empty

        """
        band_pattern = re.compile(r'([pq][A-H\d]+(?:\.\d+)?)')
        hit = band_pattern.match(band)

        if len(band) == 0:
            # nothing to derive; the caller's set is deliberately not returned
            return set()

        current = band
        while hit:
            # chop one character, then drop a period left at the end
            parent = re.sub(r'\.$', '', str(current[0:len(current)-1]))
            child_bands.add(parent)
            current = parent
            hit = band_pattern.match(current)

        return child_bands

    def map_type_of_region(self, regiontype):
        """
        Translate a region-type code into the identifier stored for it in
        self.region_type_map.  Codes without an entry are logged and fall
        back to Feature.types['chromosome_part'].

        Note that "stalk" refers to the short arm of acrocentric chromosomes
        chr13,14,15,21,22 for human.
        :param regiontype: region-type code to look up
        :return: the mapped identifier, or the chromosome_part default

        """
        fallback_id = Feature.types['chromosome_part']

        if regiontype not in self.region_type_map:
            logger.warning(
                "Unmapped code %s. Defaulting to chr_part 'SO:0000830'.",
                regiontype)
            return fallback_id

        return self.region_type_map.get(regiontype)

    def _check_tax_ids(self):
        for taxon in self.tax_ids:
            if str(taxon) not in self.files:
                raise Exception("Taxon " + str(taxon) +
                                " not supported by source Monochrom")

    def getTestSuite(self):
        """
        Return the unit-test suite for this source.

        No test case is wired up at present, so this always yields None.
        :return: None
        """
        # A UCSCBandsTestCase-based suite (tests.test_ucscbands) was
        # sketched here previously but is not enabled.
        return None
Пример #22
0
    def _process_kegg_disease2gene(self, limit=None):
        """
        Read the KEGG disease/gene file and associate diseases with genes.
        We are conservative: a row is only used when its disease id is
        absent from self.kegg_disease_hash (i.e. there is no OMIM mapping),
        and it is skipped when the disease label contains 'includ', which
        KEGG uses for grouping classes.

        For every row that is kept:
        <disease_id> is added as a class under DOID:4
        <alternate_locus> is an Individual
        <alternate_locus> has type <variant_locus>
        <alternate_locus> is an allele of <gene_id>
        a G2PAssoc is built from <alternate_locus>, <disease_id> and the
        is_marker_for relation, and added to the graph

        :param limit: stop once more than this many rows have been read
                      (ignored in test mode)
        :return:
        """

        logger.info("Processing KEGG disease to gene")
        g = self.testgraph if self.testMode else self.graph
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        rel = gu.object_properties['is_marker_for']
        gu.loadAllProperties(g)
        noomimset = set()
        raw = '/'.join((self.rawdir, self.files['disease_gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(filereader, start=1):
                (gene_id, disease_id) = row

                # the test-id filter is applied to the raw (unprefixed) gene id
                if self.testMode and gene_id not in self.test_ids['genes']:
                    continue

                gene_id = 'KEGG-' + gene_id.strip()
                disease_id = 'KEGG-' + disease_id.strip()

                # only diseases without an omim id that are not grouping classes
                if disease_id not in self.kegg_disease_hash:
                    disease_label = (
                        self.label_hash[disease_id]
                        if disease_id in self.label_hash else None)
                    if re.search('includ', str(disease_label)):
                        # they use 'including' when it's a grouping class
                        logger.info("Skipping this association because it's a grouping class: %s", disease_label)
                        continue

                    # type this disease_id as a disease
                    gu.addClassToGraph(g, disease_id, disease_label, 'DOID:4')
                    noomimset.add(disease_id)

                    alt_locus_id = self._make_variant_locus_id(gene_id, disease_id)
                    alt_label = self.label_hash[alt_locus_id]
                    gu.addIndividualToGraph(
                        g, alt_locus_id, alt_label,
                        geno.genoparts['variant_locus'])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # record the variant locus / disease association
                    assoc = G2PAssoc(self.name, alt_locus_id, disease_id, rel)
                    assoc.load_all_properties(g)
                    assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Done with KEGG disease to gene")
        logger.info("Found %d diseases with no omim id", len(noomimset))

        return
Пример #23
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    A Feature holds an id, label, type and optional description plus
    optional start/stop locations; addFeatureToGraph() writes them out.

    TODO:
        the graph additions are in the addXToFeature functions,
        but should be separated.
    TODO:
        this will need to be extended to properly deal with
        fuzzy positions in faldo.
    """

    # relations whose object is another resource
    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:0000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        # was GENO:0000626 (staining_intensity),
        # but changing to has_sequence_attribute
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'

    }

    # relations whose object is a literal
    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    # union of all of the above, for lookup by name
    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',

        # the following are sequence attributes:
        'band_intensity':  'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, id, label, type, description=None):
        """
        :param id: identifier of the feature
        :param label: label of the feature
        :param type: type identifier of the feature
        :param description: optional description
        """
        self.id = id
        self.label = label
        self.type = type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        self.start = None
        self.stop = None
        self.nobnodes = True  # TODO remove this before official release
        return

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """

        # make an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        self.start = self._getLocation(coordinate, reference_id, strand,
                                       position_types)

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """

        self.stop = self._getLocation(coordinate, reference_id, strand,
                                      position_types)

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand: '+', '-', '.' or None
        :param position_types: extra type ids; an empty list (but not None)
            adds the generic Position type
        :return: dict with the keys 'coordinate', 'reference' and 'type'
        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """
        Map a strand symbol onto its faldo strand-position type.
        :param strand: '+', '-', '.' or None
        :return: the type id, or None when strand is None or unrecognised
            (unrecognised values are logged)
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, graph, add_region=True, region_id=None,
            feature_as_class=False):
        """
        We make the assumption here that all features are instances.
        The features are located on a region,
            which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
            which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param add_region: make a separate region node for the location;
            forced to False when neither start nor stop is set, in which
            case the feature itself is typed as the faldo:Region
        :param region_id: id to use for the region; generated from the
            start/stop locations when None
        :param feature_as_class: add the feature as a class
            instead of an individual
        :return:
        """

        if feature_as_class:
            self.gu.addClassToGraph(graph, self.id, self.label, self.type,
                                    self.description)
        else:
            self.gu.addIndividualToGraph(graph, self.id, self.label, self.type,
                                         self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                region_id = self._makeRegionId()
            self.gu.addTriple(graph, self.id, self.properties['location'],
                              region_id)
            self.gu.addIndividualToGraph(
                graph, region_id, None, 'faldo:Region')
        else:
            region_id = self.id
            self.gu.addType(graph, region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(graph,
                                    self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(graph,
                                    self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(graph, region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _makeRegionId(self):
        """
        Build the identifier of the region spanned by this feature from
        its start/stop locations: <reference>-<start>-<stop>[-<strand>],
        wrapped as '_...-Region' (with a leading ':' when nobnodes is set).
        A coordinate that is not known is written as 'UN'.

        At least one of self.start / self.stop must be set.
        :return: the region identifier
        """
        # take the reference from whichever end is defined, so that a
        # feature with only a stop location does not fail here
        anchor = self.start if self.start is not None else self.stop
        regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])

        # in case the values are undefined
        # if we know only one of the coordinates,
        # then we'll add an "unknown" other.
        st = sp = 'UN'
        strand = None
        if self.start is not None and \
                self.start['coordinate'] is not None:
            st = str(self.start['coordinate'])
            strand = self._getStrandStringFromPositionTypes(
                self.start['type'])
        if self.stop is not None and \
                self.stop['coordinate'] is not None:
            sp = str(self.stop['coordinate'])
            # NOTE(review): the stop strand is only consulted (and then
            # replaces the start strand) when the start already supplied
            # one; possibly 'is None' was intended -- left unchanged so
            # that existing region ids stay stable.
            if strand is not None:
                strand = self._getStrandStringFromPositionTypes(
                    self.stop['type'])
        # assume that the strand is the same for both start and stop.
        # this will need to be fixed in the future
        region_items = [regionchr, st, sp]
        if strand is not None:
            region_items += [strand]
        rid = '-'.join(region_items)
        rid = re.sub(r'\w+\:', '', rid, count=1)  # replace the id prefix
        rid = '_'+rid+"-Region"
        if self.nobnodes:
            rid = ':'+rid

        return rid

    def _getStrandStringFromPositionTypes(self, tylist):
        """
        :param tylist: list of position type ids
        :return: 'plus', 'minus' or 'both' for the first strand type found
            in tylist (checked in that order), otherwise None
        """
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
            Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return: the position identifier,
            or None (after logging an error) when reference is None
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        i = '_'
        if self.nobnodes:
            i = ':'+i
        reference = re.sub(r'\w+\:', '', reference, count=1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        i += reference
        if coordinate is not None:
            # just in case it isn't a string already
            i = '-'.join((i, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                i = '-'.join((i, tstring))

        return i

    def addRegionPositionToGraph(
            self, graph, region_id, begin_position_id,
            end_position_id):
        """
        Attach the begin and/or end position to the region;
        a position id that is None is silently left out.
        :param graph:
        :param region_id:
        :param begin_position_id:
        :param end_position_id:
        :return:
        """

        if begin_position_id is None:
            pass
            # logger.warn(
            #   "No begin position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['begin'],
                              begin_position_id)

        if end_position_id is None:
            pass
            # logger.warn("No end position specified for region %s", region_id)
        else:
            self.gu.addTriple(graph, region_id, self.properties['end'],
                              end_position_id)

        return

    def addPositionToGraph(
            self, graph, reference_id, position,
            position_types=None, strand=None):
        """
        Add the positional information to the graph, following the faldo model.
        We assume that if the strand is None,
            we give it a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph:
        :param reference_id:
        :param position:
        :param position_types:
        :param strand:
        :return:  Identifier of the position created
        """

        iid = self._makePositionId(reference_id, position, position_types)
        n = self.gu.getNode(iid)
        pos = self.gu.getNode(self.properties['position'])
        ref = self.gu.getNode(self.properties['reference'])
        if position is not None:
            graph.add((n, pos, Literal(position, datatype=XSD['integer'])))
        graph.add((n, ref, self.gu.getNode(reference_id)))
        if position_types is not None:
            for t in position_types:
                graph.add((n, RDF['type'], self.gu.getNode(t)))
        s = None
        if strand is not None:
            s = strand
            if not re.match(r'faldo', strand):
                # not already mapped to faldo, so expect we need to map it
                s = self._getStrandType(strand)
        # else:
        #    s = self.types['both_strand']
        if s is None and (position_types is None or position_types == []):
            s = self.types['Position']

        if s is not None:
            graph.add((n, RDF['type'], self.gu.getNode(s)))

        return iid

    def addSubsequenceOfFeature(self, graph, parentid):
        """
        This will add reciprocal triples like:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph:
        :param parentid:
        :return:
        """
        self.gu.addTriple(
            graph, self.id, self.properties['is_subsequence_of'], parentid)
        self.gu.addTriple(
            graph, parentid, self.properties['has_subsequence'], self.id)

        return

    def addTaxonToFeature(self, graph, taxonid):
        """
        Given the taxon id, this will add the following triple:
        feature in_taxon taxonid
        :param graph:
        :param taxonid:
        :return:
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        self.gu.addTriple(
            graph, self.id, Assoc.properties['in_taxon'], self.taxon)

        return

    def loadAllProperties(self, graph):
        """
        Load this class's annotation, object and data properties
        into the graph, each under the matching Assoc property kind.
        :param graph:
        :return:
        """

        prop_dict = {
            Assoc(None).ANNOTPROP: self.annotation_properties,
            Assoc(None).OBJECTPROP: self.object_properties,
            Assoc(None).DATAPROP: self.data_properties
        }

        for p in prop_dict:
            self.gu.loadProperties(graph, prop_dict.get(p), p)

        return

    def addFeatureProperty(self, graph, property_type, property):
        """
        Add the triple: feature property_type property
        :param graph:
        :param property_type:
        :param property:
        :return:
        """
        self.gu.addTriple(graph, self.id, property_type, property)
        return

    def setNoBNodes(self, nobnodes):
        """
        :param nobnodes: when true, generated region/position ids
            get a leading ':'
        :return:
        """
        self.nobnodes = nobnodes
        return
Пример #24
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID.
        Currently split based on the link_type field.
        'equivalent' links are genes: both ids are added and made
        equivalent classes.
        'reverse' links become variant-locus to OMIM associations.
        'original' links are logged and skipped, since they are sometimes
        a gene and sometimes a disease.
        Any other link type is logged as a warning.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a class
        <kegg_gene_id> equivalentClass <omim_gene_id>

        <alternate_locus> is an Individual
        <alternate_locus> has type <variant_locus>
        <alternate_locus> is an allele of <kegg_gene_id>
        a G2PAssoc is built from <alternate_locus>, <omim_id> and the
        is_marker_for relation, and added to the graph
        :param limit: stop once more than this many rows have been read
                      (ignored in test mode)
        :return:
        """

        logger.info("Processing OMIM to KEGG gene")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        # the relation is the same for every row, so look it up once
        rel = gu.object_properties['is_marker_for']
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, omim_id, link_type) = row

                # the test-id filter is applied to the raw (unprefixed) gene id
                if self.testMode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-'+kegg_gene_id.strip()
                omim_id = re.sub('omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!  so add them as a class then make equivalence
                    gu.addClassToGraph(g, omim_id, None)
                    geno.addGene(kegg_gene_id, None)
                    gu.addEquivalentClass(g, kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID and the KEGG gene ID
                    # we do this with omim ids because they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus'])
                    geno.addAlleleOfGene(alt_locus_id, kegg_gene_id)

                    # Add the disease to gene relationship.
                    assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph(g)

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    logger.info('Unable to handle original link for %s-%s', kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    # (Logger.warn is a deprecated alias of Logger.warning)
                    logger.warning('Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type)

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        logger.info("Done with OMIM to KEGG gene")
        gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)

        return
Пример #25
0
    def _process_nlx_157874_1_view(self, raw, limit=None):
        """
        This table contains the Elements of Morphology data that has been
        screen-scraped into DISCO.
        Note that foaf:depiction is inverse of foaf:depicts relationship.

        Since it is bad form to have two definitions,
        we concatenate the two into one string.

        For each row, the morphology term is added through GraphUtils as:
            a class with its label                 (addClassToGraph)
            a definition: "objective_def.  subjective_def."  (addDefinition)
            depictions for the small and large figure urls   (addDepiction)
            a comment with the comments text                 (addComment)
            hasExactSynonym for each ';'-separated synonym   (addSynonym)
            hasRelatedSynonym for each ';'-separated 'replaces' entry,
                unless that column is identical to the synonyms column
            a page for the term url                          (addPage)

        :param raw: path of the tab-separated file; the first line is a header
        :param limit: stop once more than this many rows have been read
        :return:
        """

        def _with_trailing_period(text):
            # Strip before testing, so that "foo. " or multi-line text that
            # already ends in a period is not given a second one, and
            # whitespace-only text stays empty instead of becoming ".".
            text = text.strip()
            if text != '' and not text.endswith('.'):
                text += '.'
            return text

        gu = GraphUtils(curie_map.get())
        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            filereader = csv.reader(f1, delimiter='\t', quotechar='\"')
            for line in filereader:
                line_counter += 1
                (morphology_term_id, morphology_term_num,
                 morphology_term_label, morphology_term_url,
                 terminology_category_label, terminology_category_url,
                 subcategory, objective_definition, subjective_definition,
                 comments, synonyms, replaces, small_figure_url,
                 large_figure_url, e_uid, v_uid, v_uuid,
                 v_last_modified) = line

                # note:
                # e_uid v_uid v_uuid v_last_modified morphology_term_num
                # terminology_category_label terminology_category_url
                # subcategory
                # are currently unused.

                # Add morphology term to graph as a class with its label.
                gu.addClassToGraph(self.graph, morphology_term_id,
                                   morphology_term_label)

                # Assemble the description text:
                # objective definition first, then the subjective one
                definition = '  '.join((
                    _with_trailing_period(objective_definition),
                    _with_trailing_period(subjective_definition))).strip()

                gu.addDefinition(self.graph, morphology_term_id, definition)

                # <term id> FOAF:depicted_by literal url
                # <url> type foaf:depiction

                # do we want both images?
                # morphology_term_id has depiction small_figure_url
                if small_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    small_figure_url)

                # morphology_term_id has depiction large_figure_url
                if large_figure_url != '':
                    gu.addDepiction(self.graph, morphology_term_id,
                                    large_figure_url)

                # morphology_term_id has comment comments
                if comments != '':
                    gu.addComment(self.graph, morphology_term_id,
                                  comments.strip())

                # morphology_term_id hasExactSynonym synonyms (; delimited)
                if synonyms != '':
                    for s in synonyms.split(';'):
                        s = s.strip()
                        if s == '':
                            # a trailing or doubled ';' leaves a blank entry
                            continue
                        gu.addSynonym(
                            self.graph, morphology_term_id, s,
                            gu.properties['hasExactSynonym'])

                # morphology_term_id hasRelatedSynonym replaces (; delimited)
                if replaces != '' and replaces != synonyms:
                    for s in replaces.split(';'):
                        s = s.strip()
                        if s == '':
                            continue
                        gu.addSynonym(
                            self.graph, morphology_term_id, s,
                            gu.properties['hasRelatedSynonym'])

                # morphology_term_id has page morphology_term_url
                gu.addPage(self.graph, morphology_term_id, morphology_term_url)

                if limit is not None and line_counter > limit:
                    break
        return
Пример #26
0
    def _process_omim2disease(self, limit=None):
        """
        Read the OMIM/KEGG disease link file, record every link in
        self.omim_disease_hash (OMIM id -> KEGG ids) and
        self.kegg_disease_hash (KEGG id -> OMIM ids), and then write out
        only the pairs that map 1:1 in both directions.

        Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> equivalentClass <omim_disease_id>
        :param limit: stop once more than this many OMIM ids have been
                      examined (ignored in test mode); the file itself
                      is always read completely
        :return:
        """

        logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
        g = self.testgraph if self.testMode else self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                # the link type column is read but not used
                (omim_disease_id, kegg_disease_id, link_type) = row

                kegg_disease_id = 'KEGG-' + kegg_disease_id.strip()
                omim_disease_id = re.sub('omim', 'OMIM', omim_disease_id)

                # links from OMIM ID -> KEGG ID
                self.omim_disease_hash.setdefault(
                    omim_disease_id, []).append(kegg_disease_id)

                # links from KEGG ID -> OMIM ID
                self.kegg_disease_hash.setdefault(
                    kegg_disease_id, []).append(omim_disease_id)

        # Now walk the OMIM side and keep only 1:1 omim:KEGG entries.
        for omim_disease_id, kegg_ids in self.omim_disease_hash.items():
            if self.testMode and omim_disease_id not in self.test_ids['disease']:
                continue

            if not self.testMode and limit is not None \
                    and line_counter > limit:
                break
            line_counter += 1

            if len(kegg_ids) != 1:
                continue
            kegg_disease_id = ''.join(kegg_ids)
            if len(self.kegg_disease_hash[kegg_disease_id]) != 1:
                continue

            # add ids, and deal with the labels separately
            gu.addClassToGraph(g, kegg_disease_id, None)
            gu.addClassToGraph(g, omim_disease_id, None)
            gu.addEquivalentClass(g, kegg_disease_id, omim_disease_id)  # safe?
            # gu.addXref(g, kegg_disease_id, omim_disease_id)

        logger.info("Done with KEGG disease to OMIM disease mappings.")
        return
Пример #27
0
class Genotype():
    """
    Convenience methods for adding items related to a genotype and its parts
    to a supplied graph.

    They follow the patterns set out in GENO
    https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features, we use the GenomicFeature class
    to create them.

    Every ``add*`` method writes into ``self.graph`` through the GraphUtils
    instance held in ``self.gu``; none of them return a value.
    """

    # special genotype parts mapped to their GENO and SO classes
    # that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'genomic_background': 'GENO:0000611',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000008',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide': 'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide': 'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000638',
        'coding_transgene_feature': 'GENO:0000637',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263'
    }

    # relationships between graph nodes, keyed by a readable name
    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # links a alternate locus (instance) to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use this when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'transcribed_to': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_expression-variant_part': 'GENO:0000532',
        # between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should this just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'feature_to_gene_relation': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        'reference_nucleotide': 'GENO:reference_nucleotide',  # Made up term
        'reference_amino_acid': 'GENO:reference_amino_acid',  # Made up term
        'altered_nucleotide': 'GENO:altered_nucleotide',  # Made up term
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'  # Made up term
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # union of the object and annotation properties, for lookups by name
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        Bind this helper to a graph and load the object properties into it.
        :param graph: the graph that all subsequent add* calls write to
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

        return

    def addGenotype(self, genotype_id, genotype_label, genotype_type=None,
                    genotype_description=None):
        """
        Add a genotype as an individual.
        If a genotype_type is not supplied,
        we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:
        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.gu.addIndividualToGraph(
            self.graph, genotype_id, genotype_label, genotype_type,
            genotype_description)

        return

    def addAllele(self, allele_id, allele_label, allele_type=None,
                  allele_description=None):
        """
        Make an allele object.
        If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type
            (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:
        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.gu.addIndividualToGraph(
            self.graph, allele_id, allele_label, allele_type,
            allele_description)

        return

    def addGene(self, gene_id, gene_label, gene_type=None,
                gene_description=None):
        """
        Add a gene as a class; the type defaults to the 'gene' genopart.
        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:
        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.gu.addClassToGraph(
            self.graph, gene_id, gene_label, gene_type, gene_description)

        return

    def addConstruct(self, construct_id, construct_label, construct_type=None,
                     construct_description=None):
        """
        Add a construct as an individual.  No default type is applied;
        construct_type is passed through as given (possibly None).
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:
        """
        # TODO add a base type for constructs and default construct_type to it
        self.gu.addIndividualToGraph(
            self.graph, construct_id, construct_label, construct_type,
            construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.
        Examples of uses include between:
        an allele and a construct or strain here,
        a cell line and its parent genotype.
        Adding the parent and child to the graph should happen
        outside of this function call to ensure graph integrity.
        :param child_id:
        :param parent_id:
        :return:
        """

        self.gu.addTriple(
            self.graph, child_id, self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a derives_sequence_from_gene relationship from child to parent.
        :param child_id:
        :param parent_id:
        :return:
        """
        self.gu.addTriple(
            self.graph, child_id,
            self.properties['derives_sequence_from_gene'], parent_id)

        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided,
        it is a GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.gu.addTriple(self.graph, allele_id, rel_id, gene_id)

        return

    def addTranscript(self, variant_id, transcript_id, transcript_label=None,
                      transcript_type=None):
        """
        Add gene/variant/allele transcribed_to relationship.
        The transcript is added as an individual, then linked from the variant.
        :param variant_id:
        :param transcript_id:
        :param transcript_label:
        :param transcript_type:
        :return:
        """
        self.gu.addIndividualToGraph(
            self.graph, transcript_id, transcript_label, transcript_type)
        self.gu.addTriple(
            self.graph, variant_id, self.properties['transcribed_to'],
            transcript_id)

        return

    def addPolypeptide(self, polypeptide_id, polypeptide_label=None,
                       transcript_id=None, polypeptide_type=None):
        """
        Add a polypeptide as an individual (default type: 'polypeptide') and,
        when a transcript is given, a translates_to link from that transcript.
        :param polypeptide_id:
        :param polypeptide_label:
        :param transcript_id:
        :param polypeptide_type:
        :return:
        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.gu.addIndividualToGraph(
            self.graph, polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.gu.addTriple(
                self.graph, transcript_id, self.properties['translates_to'],
                polypeptide_id)

        return

    def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id,
                       zygosity_id=None, allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While traditionally alleles
        (reference or variant loci) are added, you can add any node
        (such as sequence_alterations for unlocated variations) to a vslc
        if they are known to be paired.  However, if a sequence_alteration's
        loci is unknown, it probably should be added directly to the GVC.

        If no zygosity is supplied it is inferred: identical allele ids
        give 'homozygous', anything else gives 'heterozygous'.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:
        """

        # vslc has parts allele1/allele2; a blank allele2 is not added
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        # zygosity_id is always set by this point
        self.gu.addTriple(
            self.graph, vslc_id, self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.
        The vslc is added as a part of the parent (has_alternate_part).
        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(
            vslc_id, parent_id, self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between
        a parent_id and the supplied part.
        By default the relationship will be BFO:has_part,
        but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        # note the direction: the parent is the subject of the triple
        self.gu.addTriple(self.graph, parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self, sa_id, sa_label, sa_type=None,
                              sa_description=None):
        """
        Add a sequence alteration as an individual;
        the type defaults to the 'sequence_alteration' genopart.
        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:
        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.gu.addIndividualToGraph(
            self.graph, sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Make the sequence alteration a has_alternate_part of the variant locus.
        :param sa_id:
        :param vl_id:
        :return:
        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])

        return

    def addGenomicBackground(self, background_id, background_label,
                             background_type=None,
                             background_description=None):
        """
        Add a genomic background as an individual;
        the type defaults to the 'genomic_background' genopart.
        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:
        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.gu.addIndividualToGraph(
            self.graph, background_id, background_label, background_type,
            background_description)

        return

    def addGenomicBackgroundToGenotype(self, background_id, genotype_id):
        """
        Type the background as a genomic_background and attach it
        to the genotype with has_reference_part.
        :param background_id:
        :param genotype_id:
        :return:
        """
        self.gu.addType(
            self.graph, background_id, self.genoparts['genomic_background'])
        self.addParts(
            background_id, genotype_id,
            self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with
        RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background,
        but could be added to any genotype part
        (including a gene, regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:
        :return:
        """
        # the triple is assembled from nodes and added to the graph directly
        predicate = self.gu.getNode(self.properties['in_taxon'])
        subject = self.gu.getNode(genopart_id)
        self.graph.add((subject, predicate, self.gu.getNode(taxon_id)))

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Attach a reagent to a genotype with has_expression-variant_part.
        For example, add a morphant reagent to the genotype,
        assuming it's an extrinsic_genotype.
        :param reagent_id:
        :param genotype_id:
        :return:
        """
        rel = self.object_properties['has_expression-variant_part']
        self.gu.addTriple(self.graph, genotype_id, rel, reagent_id)

        return

    def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type,
                                gene_id, description=None):
        """
        Here, a gene-targeting reagent is added as an individual, together
        with a targets_instance_of relationship from the reagent to gene_id.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id:
        :param description:
        :return:
        """
        # TODO add default type to reagent_type
        self.gu.addIndividualToGraph(
            self.graph, reagent_id, reagent_label, reagent_type, description)

        self.gu.addTriple(
            self.graph, reagent_id,
            self.object_properties['targets_instance_of'], gene_id)

        return

    def addReagentTargetedGene(self, reagent_id, gene_id,
                               targeted_gene_id=None, targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular
        reagent (such as a morpholino or rnai).
        If an instance id is not supplied, one is built as
        '_' + gene_id + '-' + reagent_id.  The instance is typed as
        GENO:reagent_targeted_gene.

        Triples added by this method:
        <targeted_gene_id> a GENO:reagent_targeted_gene
            (with targeted_gene_label and description)
        <targeted_gene_id> GENO:is_targeted_expression_variant_of <gene_id>
        <targeted_gene_id> GENO:targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:
        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
        self.gu.addIndividualToGraph(
            self.graph, targeted_gene_id, targeted_gene_label,
            self.genoparts['reagent_targeted_gene'], description)

        self.gu.addTriple(
            self.graph, targeted_gene_id,
            self.object_properties['is_targeted_expression_variant_of'],
            gene_id)

        self.gu.addTriple(
            self.graph, targeted_gene_id,
            self.object_properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None,
                                 tgs_description=None):
        """
        Add a targeted gene subregion as an individual;
        the type defaults to the 'targeted_gene_subregion' genopart.
        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:
        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']
        self.gu.addIndividualToGraph(
            self.graph, tgs_id, tgs_label, tgs_type, tgs_description)

        return

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Link a population to a member with has_member_with_allelotype.
        :param member_id:
        :param population_id:
        :return:
        """
        self.gu.addTriple(
            self.graph, population_id,
            self.properties['has_member_with_allelotype'], member_id)

        return

    def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None,
                                  tgc_description=None):
        """
        Add a targeted gene complement as an individual;
        the type defaults to the 'targeted_gene_complement' genopart.
        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:
        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.gu.addIndividualToGraph(
            self.graph, tgc_id, tgc_label, tgc_type, tgc_description)

        return

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add a genome class for the taxon, labelled '<taxon_label> genome'.
        The taxon id doubles as the label when no label is given.
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addClassToGraph(
            self.graph, genome_id, genome_label, Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a build as a reference_genome individual, type it with the
        taxon's genome id, and attach the taxon to it.
        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:
        """
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addIndividualToGraph(
            self.graph, build_id, build_label,
            Feature.types['reference_genome'])
        self.gu.addType(self.graph, build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        """
        Build a genome id from a taxon id by scrubbing off the taxon prefix
        (everything up to and including the last colon is replaced by a
        single ':') and appending 'genome'.
        e.g. 'NCBITaxon:9606' becomes ':9606genome'.
        :param taxon_id:
        :return: the genome id string
        """
        # raw string: '\:' is not a valid escape in a plain string literal
        genome_id = re.sub(r'.*:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(self, chr, tax_id, tax_label=None, build_id=None,
                      build_label=None):
        """
        Add a chromosome class at the taxon level and, when a build is given,
        a build-specific chromosome that is an instance of that class and a
        member of the build.
        :param chr: chromosome name/number
            (shadows the builtin; kept because it is part of the signature)
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to build_id
        :return:
        """
        # if it's just the chromosome, add it as an instance of a SO:chromosome
        # if a build is included, punn the chromosome as a subclass of
        # SO:chromsome, and make the build-specific chromosome an instance
        # of the supplied chr.  The chr then becomes part of the build.

        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.gu.addClassToGraph(
            self.graph, chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome; str() matches the call above
            chrinbuild_id = makeChromID(str(chr), build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class
            self.gu.addIndividualToGraph(
                self.graph, chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome
            # as a member of the build  (both ways)
            self.gu.addMember(self.graph, build_id, chrinbuild_id)
            self.gu.addMemberOf(self.graph, chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic chromosome class for a taxon.
        The 'NCBITaxon:' prefix is stripped from taxon_id before the
        chromosome id is built with the 'CHR' prefix.
        :param chrom_num:
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        # the chrom class (generic) id
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.gu.addClassToGraph(
            self.graph, chrom_class_id, chrom_class_label,
            Feature.types['chromosome'])

        return

    def addChromosomeInstance(self, chr_num, reference_id, reference_label,
                              chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.
            typically a genome-specific chr
        :return:
        """

        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.gu.addIndividualToGraph(
            self.graph, chr_id, chr_label, Feature.types['chromosome'])
        # NOTE(review): chr_type defaults to None and is passed to addType
        # unconditionally -- confirm that addType tolerates a None type
        self.gu.addType(self.graph, chr_id, chr_type)

        # add the build-specific chromosome
        # as a member of the build  (both ways)
        self.gu.addMember(self.graph, reference_id, chr_id)
        self.gu.addMemberOf(self.graph, chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Build a label of the form 'gene<allele>' from stripped labels.
        A missing (None) gene or allele label contributes an empty string,
        e.g. (None, 'a1') gives '<a1>'.
        :param gene_label:
        :param allele_label:
        :return: the label string
        """
        gene_part = '' if gene_label is None else gene_label.strip()
        allele_part = '' if allele_label is None else allele_label.strip()

        return gene_part + '<' + allele_part + '>'

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) in monarch-style:
        'gene<allele1>/gene<allele2>'.  When allele2_label is None the part
        after the slash is empty.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label, or None when all three labels are None
        """
        if gene_label is None and allele1_label is None \
                and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        return '/'.join((top, bottom))
Пример #28
0
    def _process_ddg2p_annotations(self, limit):
        """
        The ddg2p annotations associate a gene symbol to an omim disease,
        along with some HPO ids and pubs. The gene symbols come from gencode,
        which in turn come from HGNC official gene symbols.  Therefore,
        we use the HGNC source class to get the id/symbol mapping for
        use in our annotations here.

        According to http://www.gencodegenes.org/faq.html,
        "Gene names are usually HGNC or MGI-approved gene symbols mapped
        to the GENCODE genes by the Ensembl xref pipeline. Sometimes,
        when there is no official gene symbol, the Havana clone-based
        name is used."

        The kind of variation that is linked to a disease is indicated
        (LOF, GOF, CNV, etc) in the source data.
        Here, we create an anonymous variant of the specified gene of
        the indicated type (mapped to the sequence ontology (SO)).

        Rows are read from 'ddg2p.txt' inside the zipped annotation file.
        Comment rows, blank rows, rows whose gene symbol cannot be mapped to
        HGNC, and rows categorised 'Not DD gene' produce no association.

        :param limit: when not in test mode, stop once more than this many
            rows have been read (None means no limit)
        :return:

        """

        line_counter = 0
        if self.g is not None:
            g = self.g
        else:
            g = self.graph
        gu = GraphUtils(curie_map.get())

        # in order for this to work, we need to map the HGNC id-symbol;
        hgnc = HGNC()
        hgnc_symbol_id_map = hgnc.get_symbol_id_map()

        zip_path = '/'.join((self.rawdir, self.files['annot']['file']))

        # use the ddg2p.txt file
        fname = 'ddg2p.txt'

        unmapped_omim_counter = 0
        unmapped_gene_count = 0
        # both handles are context-managed so the archive is closed
        # even if a row fails to parse
        with ZipFile(zip_path, 'r') as myzip, myzip.open(fname, 'r') as f:
            reader = csv.reader(
                io.TextIOWrapper(f), delimiter='\t', quotechar='\"')
            # score_means_by_measure = {}
            # strain_scores_by_measure = {}   # TODO theseare unused
            for row in reader:
                line_counter += 1
                if not row:   # blank line; nothing to unpack
                    continue
                if row[0].startswith('#'):   # skip comments
                    continue

                (gencode_gene_name, mode, category, consequence, disease, omim,
                 ddg2p_id, pubmed_ids, hpo_codes) = row

                hgnc_id = hgnc_symbol_id_map.get(gencode_gene_name.strip())
                if hgnc_id is None:
                    logger.error(
                        "Couldn't map the gene symbol %s to HGNC.",
                        gencode_gene_name)
                    unmapped_gene_count += 1
                    continue
                # add the gene
                gu.addClassToGraph(g, hgnc_id, gencode_gene_name)

                # TODO make VSLC with the variation
                #   to associate with the disorder
                # TODO use the Inheritance and Mutation consequence
                #   to classify the VSLCs

                allele_id = self.make_allele_by_consequence(
                    consequence, hgnc_id, gencode_gene_name)

                if omim.strip() != '':
                    omim_id = 'OMIM:'+str(omim.strip())
                    # assume this is declared elsewhere in ontology
                    gu.addClassToGraph(g, omim_id, None)

                    if category.strip() == 'Confirmed DD gene':
                        rel = gu.object_properties['has_phenotype']
                    elif category.strip() == 'Probable DD gene':
                        rel = gu.object_properties['has_phenotype']
                    elif category.strip() == 'Possible DD gene':
                        rel = gu.object_properties['contributes_to']
                    elif category.strip() == 'Not DD gene':
                        # TODO negative annotation
                        continue
                    assoc = G2PAssoc(self.name, allele_id, omim_id)
                    # TODO 'rel' is assigned to but never used

                    for p in re.split(r';', pubmed_ids):
                        p = p.strip()
                        if p != '':
                            pmid = 'PMID:'+str(p)
                            r = Reference(
                                pmid, Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                            assoc.add_source(pmid)

                    assoc.add_association_to_graph(g)
                else:
                    # these are unmapped to a disease id.
                    # note that some match OMIM disease labels
                    # but the identifiers are just not included.
                    # TODO consider mapping to OMIM or DOIDs in other ways
                    logger.warning(
                        "No omim id on line %d\n%s", line_counter, str(row))
                    unmapped_omim_counter += 1

                # TODO hpo phenotypes
                # since the DDG2P file is not documented,
                # I don't know what the HPO annotations are actually about
                # are they about the gene?  the omim disease?  something else?
                # So, we wont create associations until this is clarified

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.warning(
            "gene-disorder associations with no omim id: %d",
            unmapped_omim_counter)
        logger.warning("unmapped gene count: %d", unmapped_gene_count)

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)

        return
Пример #29
0
    def _process_diseasegene(self, limit):
        """
        Parse the disease-gene XML file and, for every Disorder element,
        add the disorder, its associated genes (with synonyms and
        equivalent Ensembl/HGNC/OMIM ids), an anonymous variant locus per
        disorder-gene association, and a G2P association with an evidence
        code derived from the association status.

        Associations whose type cannot be mapped to a relationship id are
        logged and skipped.

        :param limit: when not in test mode, stop once more than this many
            disorders have been processed (None means no limit)
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())

        myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

        # external reference sources that are turned into equivalent classes,
        # mapped to the curie prefix used for them; others are skipped for now
        eq_prefix_by_source = {
            "Ensembl": "ENSEMBL:",
            "HGNC": "HGNC:",
            "OMIM": "OMIM:",
        }

        for _event, elem in ET.iterparse(myfile):
            if elem.tag == "Disorder":
                # get the element name and id
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find("OrphaNumber").text

                disorder_id = "Orphanet:" + str(disorder_num)

                if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
                    elem.clear()  # discard skipped elements too
                    continue

                # count processed disorders so that the limit can take effect
                line_counter += 1

                disorder_label = elem.find("Name").text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find("GeneList")
                for gene in gene_list.findall("Gene"):
                    gene_iid = gene.get("id")
                    gene_type = gene.find("GeneType").get("id")
                    gene_iid_to_type[gene_iid] = gene_type

                gu.addClassToGraph(g, disorder_id, disorder_label)  # assuming that these are in the ontology

                assoc_list = elem.find("DisorderGeneAssociationList")
                for a in assoc_list.findall("DisorderGeneAssociation"):
                    gene_iid = a.find(".//Gene").get("id")
                    gene_name = a.find(".//Gene/Name").text
                    gene_symbol = a.find(".//Gene/Symbol").text
                    gene_num = a.find("./Gene/OrphaNumber").text
                    gene_id = "Orphanet:" + str(gene_num)
                    gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find("./Gene/SynonymList")
                    if int(syn_list.get("count")) > 0:
                        for s in syn_list.findall("./Synonym"):
                            gu.addSynonym(g, gene_id, s.text)

                    dgtype = a.find("DisorderGeneAssociationType").get("id")
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = a.find("./DisorderGeneAssociationType/Name").text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO for association (%s | %s).  Skipping.",
                            dg_label,
                            disorder_label,
                            gene_symbol,
                        )
                        continue

                    alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
                    alt_label = " ".join(
                        ("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
                    )
                    if self.nobnodes:
                        alt_locus_id = ":" + alt_locus_id
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = a.find("DisorderGeneAssociationStatus").get("id")
                    eco_id = "ECO:0000323"  # imported automatically asserted information used in automatic assertion
                    if status_code == "17991":  # Assessed  # TODO are these internal ids stable between releases?
                        eco_id = "ECO:0000322"  # imported manually asserted information used in automatic assertion
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph(g)

                    rlist = a.find("./Gene/ExternalReferenceList")

                    for r in rlist.findall("ExternalReference"):
                        # decided per reference, so an unrecognised source
                        # never re-adds the previous reference's id
                        prefix = eq_prefix_by_source.get(r.find("Source").text)
                        if prefix is None:
                            continue  # skip the others for now
                        eqid = prefix + r.find("Reference").text
                        gu.addClassToGraph(g, eqid, None)
                        gu.addEquivalentClass(g, gene_id, eqid)
                elem.clear()  # discard the element

            # stop (rather than return) so the properties below still load
            if not self.testMode and limit is not None and line_counter > limit:
                break

        gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadAllProperties(g)

        return
Пример #30
0
    def _get_identifiers(self, limit):
        """
        Process the id mapping file provided by Biogrid.

        The file has a very large header, which we scan past; the
        identifiers in the remaining rows are turned into equivalence
        axioms (for the accepted gene id types) or into a label on the
        BIOGRID class (for OFFICIAL_SYMBOL rows).

        :param limit: outside of test mode, stop once more than this many
            rows matching the species filter have been seen; None disables
            the cap.  Not applied in test mode.
        :return: None

        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'Homo sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')
        speciesfilters = 'Homo sapiens,Mus musculus'.split(',')

        # TODO make these filters available as commandline options
        # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
        #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
        # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
        # (built once here; it does not change from row to row)
        geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')

        # both the archive and its member are context-managed, so the
        # handles are released even if a row fails to parse
        with ZipFile(f, 'r') as myzip:
            # assume that the first entry is the item
            fname = myzip.namelist()[0]
            with myzip.open(fname, 'r') as csvfile:
                for line in csvfile:
                    # the member yields bytes; decode once per line
                    line = line.decode()

                    # skip header lines, up to and including the
                    # column-header row that starts with BIOGRID_ID
                    if not foundheader:
                        if line.startswith('BIOGRID_ID'):
                            foundheader = True
                        continue

                    line = line.strip()
                    # BIOGRID_ID
                    # IDENTIFIER_VALUE
                    # IDENTIFIER_TYPE
                    # ORGANISM_OFFICIAL_NAME
                    # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                    (biogrid_num, id_num, id_type,
                     organism_label) = line.split('\t')

                    if self.testMode:
                        g = self.testgraph
                        # skip any genes that don't match our test set
                        if int(biogrid_num) not in self.biogrid_ids:
                            continue
                    else:
                        g = self.graph

                    # for each one of these,
                    # create the node and add equivalent classes
                    biogrid_id = 'BIOGRID:'+biogrid_num
                    prefix = self._map_idtype_to_prefix(id_type)

                    if organism_label.strip() in speciesfilters:
                        # only rows for the selected species count
                        # towards the limit
                        line_counter += 1
                        if prefix in geneidtypefilters:
                            mapped_id = ':'.join((prefix, id_num))
                            gu.addEquivalentClass(g, biogrid_id, mapped_id)
                        # this symbol will only get attached
                        # to the biogrid class
                        elif id_type == 'OFFICIAL_SYMBOL':
                            gu.addClassToGraph(g, biogrid_id, id_num)
                        # elif (id_type == 'SYNONYM'):
                        #   FIXME - i am not sure these are synonyms, altids?
                        #   gu.addSynonym(g,biogrid_id,id_num)

                    if not self.testMode and limit is not None \
                            and line_counter > limit:
                        break

        return
Пример #31
0
    def _fill_provenance_graph(self, limit):
        """
        Walk the per-strain, per-sex measure scores and, for every measure
        whose absolute zscore reaches ``self.stdevthreshold`` and whose
        assay has ontology terms, add the assay to the graph and create
        the genotype-to-phenotype association via ``_add_g2p_assoc``.

        Counts of passing / passing-with-ontologies / non-passing scores
        are logged at the end.

        :param limit: not used by this method
        :return: None
        """
        logger.info("Building graph ...")
        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num in self.strain_scores_by_measure:
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:'+str(strain_num)
            for sex in self.strain_scores_by_measure[strain_num]:
                measures = self.strain_scores_by_measure[strain_num][sex]
                for m in measures:
                    scores = measures[m]
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' not in scores:
                        # measures without a zscore are not counted at all
                        continue

                    zscore = scores['zscore']
                    if abs(zscore) >= self.stdevthreshold:
                        scores_passing_threshold_count += 1
                        # logger.info(
                        #   "Score passing threshold: %s | %s | %s",
                        #   strain_id, assay_id, zscore)
                        # add the G2P assoc
                        prov = Provenance()
                        assay_id = 'MPD-assay:'+str(m)
                        assay = self.assayhash[m]
                        assay_label = assay['assay_label']
                        # TODO unused
                        # assay_type = assay['assay_type']
                        assay_description = assay['description']
                        assay_type_id = Provenance.prov_types['assay']

                        zscore_note = '(zscore='+str(zscore)+')'
                        if assay_label is not None:
                            assay_label += ' ('+str(m)+')'
                            comment = ' '.join((assay_label, zscore_note))
                        else:
                            # an assay without a label must not break the
                            # comment; fall back to the zscore note alone
                            comment = zscore_note

                        ont_term_ids = assay.get('ont_terms')
                        if ont_term_ids is not None:
                            scores_passing_threshold_with_ontologies_count += 1
                            prov.add_assay_to_graph(
                                g, assay_id, assay_label, assay_type_id,
                                assay_description)
                            self._add_g2p_assoc(
                                g, strain_id, sex, assay_id, ont_term_ids,
                                comment)
                    else:
                        scores_not_passing_threshold_count += 1

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return
Пример #32
0
    def _process_phenotype_tab(self, raw, limit):
        """
        Read the tab-delimited phenotype annotation file and add one
        association per row to the graph.

        Every row is unpacked into exactly 14 columns
        (db, num, name, qual, pheno_id, publist, eco, onset, freq, w, asp,
        syn, date, curator).  The aspect column selects the association
        type: 'O'/'M' make a D2PAssoc, 'I'/'C' make a DispositionAssoc.
        Rows with any other aspect are logged and get no association.

        :param raw: path of the file to read (opened as utf8)
        :param limit: outside of test mode, stop once more than this many
            rows have been read; None disables the cap
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        # the set of test diseases does not change while reading the file,
        # so fetch it once instead of once per row
        test_disease_ids = None
        if self.testMode:
            test_disease_ids = config.get_config()['test_ids']['disease']

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (db, num, name, qual, pheno_id, publist, eco, onset, freq,
                 w, asp, syn, date, curator) = row
                disease_id = db + ":" + str(num)

                if self.testMode and disease_id.strip() not in test_disease_ids:
                    continue

                # logger.info('adding %s', disease_id)

                gu.addClassToGraph(g, disease_id, None)
                gu.addClassToGraph(g, pheno_id, None)
                eco_id = self._map_evidence_to_codes(eco)
                gu.addClassToGraph(g, eco_id, None)
                if onset is not None and onset.strip() != '':
                    gu.addClassToGraph(g, onset, None)

                # we want to do things differently
                # depending on the aspect of the annotation
                if asp in ('O', 'M'):  # organ abnormality or mortality
                    assoc = D2PAssoc(
                        self.name, disease_id, pheno_id, onset, freq)
                elif asp in ('I', 'C'):
                    # inheritance patterns for the whole disease,
                    # or clinical course / onset
                    assoc = DispositionAssoc(self.name, disease_id, pheno_id)
                else:
                    logger.error("I don't know what this aspect is: %s", asp)
                    # without this reset the name would be unbound on the
                    # first row, or still point at the previous row's
                    # association, which would then be modified and
                    # re-added with this row's evidence
                    assoc = None

                if assoc is not None:
                    assoc.add_evidence(eco_id)

                    # blow these apart if there is a list of pubs
                    for pub in publist.split(';'):
                        pub = pub.strip()
                        if pub == '':
                            continue
                        # URLs are only attached as a source below; no
                        # Reference node is created for them.
                        # TODO map GeneReviews bookshelf and Orphanet
                        #   expert URLs to GeneReviews:/Orphanet: ids
                        if not pub.startswith('http'):
                            r = Reference(pub)
                            if pub.startswith('PMID'):
                                r.setType(
                                    Reference.ref_types['journal_article'])
                            r.addRefToGraph(g)
                        # TODO add curator
                        assoc.add_source(pub)

                    assoc.add_association_to_graph(g)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

            Assoc(None).load_all_properties(g)

        return