Пример #1
0
    def _add_therapy_drug_association(self, drug_id, disease_id, therapy_status_id):
        """
        Record that a drug treats a disease, together with its therapy status.

        Three things are written to self.graph: a direct triple linking the
        drug to the disease with RO:0002606 (substance_that_treats), an
        association carrying the same subject/relationship/object, and a
        triple attaching the therapy status to that association.
        Source information is not implemented.

        :param drug_id: Id as curie of the drug
        :param disease_id: Id as curie of the disease
        :param therapy_status_id: Id of the therapy approval status; it is
            attached to the association exactly as given
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        treats = gu.object_properties['substance_that_treats']

        # direct statement between the drug and the disease
        gu.addTriple(self.graph, drug_id, treats, disease_id)

        # the same statement again, this time as an identified association
        assoc_id = self.make_cgd_id("assoc{0}{1}".format(drug_id, disease_id))
        assoc = Assoc(self.name)
        assoc.set_subject(drug_id)
        assoc.set_relationship(treats)
        assoc.set_object(disease_id)
        assoc.set_association_id(assoc_id)
        assoc.add_association_to_graph(self.graph)

        # Placeholder relationship, note this does not exist in RO
        approval_status_rel = "RO:has_approval_status"
        gu.addTriple(
            self.graph, assoc_id, approval_status_rel, therapy_status_id)
Пример #2
0
    def _get_process_allelic_variants(self, entry, g):
        """
        Add the allelic variants listed in an OMIM entry to the graph.

        Variants whose status is 'live' are added as alleles (variant loci)
        of the entry, together with the publications cited in their text,
        their dbSNP equivalents, their ClinVar xrefs and a link to their
        OMIM page.  Variants whose status contains 'moved' (this covers both
        'moved' and 'removed') are added as deprecated individuals, pointing
        at the replacement when 'movedTo' is present.  Any other status is
        logged as an error.

        Citations that cannot be resolved against the entry's reference list
        and ClinVar accessions that do not look like 'RCV<digits>;;...' are
        logged and skipped instead of aborting the whole entry.

        :param entry: the OMIM entry (indexed by key), or None for a no-op
        :param g: graph that receives the triples
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        if entry is None:
            return

        entry_num = entry['mimNumber']
        entry_id = 'OMIM:'+str(entry_num)

        # process the ref list just to get the pmids
        ref_to_pmid = self._get_pubs(entry, g)

        if 'allelicVariantList' not in entry:
            return

        for al in entry['allelicVariantList']:
            variant = al['allelicVariant']
            al_num = variant['number']
            al_suffix = str(al_num).zfill(4)
            al_id = entry_id+'.'+al_suffix
            status = variant['status']

            if status == 'live':
                al_label = None
                al_description = None
                # reference numbers cited in this variant's text
                cited_refs = set()
                if 'mutations' in variant:
                    al_label = variant['mutations']
                if 'text' in variant:
                    al_description = variant['text']
                    # citations are recognised as '{<reference number>:'
                    cited_refs = set(
                        re.findall(r'\{(\d+)\:', al_description))
                geno.addAllele(
                    al_id, al_label, geno.genoparts['variant_locus'],
                    al_description)
                geno.addAlleleOfGene(
                    al_id, entry_id,
                    geno.object_properties['is_sequence_variant_instance_of'])

                # look up the publication id in the list of references
                for ref_num in cited_refs:
                    pub_id = ref_to_pmid.get(int(ref_num))
                    if pub_id is None:
                        logger.warning(
                            'Reference %s cited by %s is not in the '
                            'reference list; skipping', ref_num, al_id)
                        continue
                    gu.addTriple(
                        g, pub_id, gu.object_properties['is_about'], al_id)

                if 'dbSnps' in variant:
                    for dnum in variant['dbSnps'].split(','):
                        dnum = dnum.strip()
                        if dnum == '':
                            # an empty item would yield the bare id 'dbSNP:'
                            continue
                        did = 'dbSNP:'+dnum
                        gu.addIndividualToGraph(g, did, None)
                        gu.addEquivalentClass(g, al_id, did)

                if 'clinvarAccessions' in variant:
                    # clinvarAccessions are triple-semicolon delimited,
                    # each like RCV000020059;;1
                    for acc in variant['clinvarAccessions'].split(';;;'):
                        rcv = re.match(r'(RCV\d+)\;\;', acc)
                        if rcv is None:
                            logger.warning(
                                'Unrecognised ClinVar accession %r for %s; '
                                'skipping', acc, al_id)
                            continue
                        gu.addXref(g, al_id, 'ClinVar:'+rcv.group(1))

                gu.addPage(
                    g, al_id,
                    "http://omim.org/entry/"+str(entry_num)+"#"+al_suffix)
            elif re.search('moved', status):
                # for both 'moved' and 'removed'
                moved_ids = None
                if 'movedTo' in variant:
                    moved_ids = ['OMIM:'+variant['movedTo']]
                gu.addDeprecatedIndividual(g, al_id, moved_ids)
            else:
                logger.error('Uncaught alleleic variant status %s', status)
        # end loop allelicVariantList

        return
Пример #3
0
    def _process_pathway_disease(self, limit):
        """
        Link KEGG pathways to the diseases associated with them.

        Every row of the tab-delimited pathway_disease file is read as a
        (disease, pathway) pair.  Since we model diseases as processes,
        each pair becomes a triple saying that the pathway may be causally
        upstream of or within the disease process.
        In test mode only pathways listed in self.test_ids['pathway']
        are kept, and the row limit is ignored.

        :param limit: outside test mode, stop once the number of rows read
            exceeds this value; None reads the whole file
        :return: None

        """
        logger.info("Processing KEGG pathways to disease ids")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join([self.rawdir, self.files['pathway_disease']['file']])
        with open(raw, 'r', encoding="iso-8859-1") as tsvfile:
            rows = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            for row_num, row in enumerate(rows, start=1):
                disease_num, kegg_pathway_num = row

                if not self.testMode or \
                        kegg_pathway_num in self.test_ids['pathway']:
                    disease_id = 'KEGG-' + disease_num
                    # will look like KEGG-path:map04130
                    # or KEGG-path:hsa04130
                    pathway_id = 'KEGG-' + kegg_pathway_num
                    relation = GraphUtils.object_properties[
                        'causally_upstream_of_or_within']
                    gu.addTriple(g, pathway_id, relation, disease_id)

                if not self.testMode and \
                        limit is not None and row_num > limit:
                    break

        return
Пример #4
0
    def _get_pubs(self, entry, g):
        """
        Extract mentioned publications from the reference list.

        References that carry a 'pubmedID' are added to the graph as journal
        articles identified by their PMID.  All other references get an
        internal identifier built from the entry number and the reference
        number, plus whatever title, author list and short citation can be
        assembled from the reference.  In both cases the OMIM entry is
        linked to the publication with the 'mentions' property.

        :param entry: the OMIM entry; 'mimNumber' is required and
            'referenceList' is optional
        :param g: graph that receives the references and triples
        :return: dict mapping each referenceNumber to the publication id
            that was used for it (a PMID curie or the internal id)
        """

        ref_to_pmid = {}
        du = DipperUtil()
        entry_num = entry['mimNumber']
        omim_id = 'OMIM:'+str(entry_num)
        gu = GraphUtils(curie_map.get())
        if 'referenceList' not in entry:
            return ref_to_pmid

        for r in entry['referenceList']:
            reference = r['reference']
            if 'pubmedID' in reference:
                pub_id = 'PMID:' + str(reference['pubmedID'])
                ref = Reference(
                    pub_id, Reference.ref_types['journal_article'])
            else:
                # make blank node for internal reference
                pub_id = '_OMIM' + str(entry_num) + 'ref' + \
                    str(reference['referenceNumber'])
                if self.nobnodes:
                    pub_id = ':' + pub_id
                ref = Reference(pub_id)
                title = author_list = source = citation = None
                if 'title' in reference:
                    title = reference['title']
                    ref.setTitle(title)
                if 'authors' in reference:
                    author_list = reference['authors']
                    ref.setAuthorList(author_list)
                    # the text before the first '.,' stands for all authors
                    citation = \
                        re.split(r'\.\,', author_list)[0] + ' et al'
                if 'source' in reference:
                    source = reference['source']
                citation = '; '.join(du.flatten([citation, title, source]))
                ref.setShortCitation(citation)
            ref.addRefToGraph(g)
            ref_to_pmid[reference['referenceNumber']] = pub_id

            # record that the OMIM entry mentions the pub
            gu.addTriple(
                g, omim_id, gu.object_properties['mentions'], pub_id)

        return ref_to_pmid
Пример #5
0
    def _process_pathway_pubmed(self, limit):
        """
        Indicate that a pathway is annotated directly to a paper (is about)
        via its pubmed id.

        Every row of the tab-delimited pathway_pubmed file is read as a
        (pubmed id, pathway) pair.  The paper is added as a journal article
        and linked to the pathway with 'is_about'.
        In test mode only pathways listed in self.test_ids['pathway']
        are kept, and the row limit is ignored.

        :param limit: outside test mode, stop once the number of rows read
            exceeds this value; None reads the whole file
        :return: None
        """
        logger.info("Processing KEGG pathways to pubmed ids")
        g = self.testgraph if self.testMode else self.graph

        gu = GraphUtils(curie_map.get())
        raw = '/'.join([self.rawdir, self.files['pathway_pubmed']['file']])
        with open(raw, 'r', encoding="iso-8859-1") as tsvfile:
            rows = csv.reader(tsvfile, delimiter='\t', quotechar='\"')
            for row_num, row in enumerate(rows, start=1):
                pubmed_id, kegg_pathway_num = row

                if not self.testMode or \
                        kegg_pathway_num in self.test_ids['pathway']:
                    pubmed_id = pubmed_id.upper()
                    # will look like KEGG-path:map04130
                    kegg_id = 'KEGG-' + kegg_pathway_num

                    paper = Reference(
                        pubmed_id, Reference.ref_types['journal_article'])
                    paper.addRefToGraph(g)
                    gu.addTriple(
                        g, pubmed_id,
                        GraphUtils.object_properties['is_about'], kegg_id)

                if not self.testMode and \
                        limit is not None and row_num > limit:
                    break

        return
Пример #6
0
class MPD(Source):
    """
    From the [MPD](http://phenome.jax.org/) website:
    This resource is a collaborative standardized collection of measured data
    on laboratory mouse strains and populations. Includes baseline phenotype
    data sets as well as studies of drug, diet, disease and aging effect.
    Also includes protocols, projects and publications, and SNP,
    variation and gene expression studies.

    Here, we pull the data and model the genotypes using GENO and
    the genotype-to-phenotype associations using the OBAN schema.

    MPD provides measurements for particular assays for several strains.
    Each of these measurements is itself mapped to an MP or VT term
    as a phenotype.  Therefore, we can create a strain-to-phenotype
    association based on those strains that lie outside of the "normal" range
    for the given measurements.  We can compute the average of the
    measurements for all strains tested, and then flag any extreme
    measurements that lie beyond some threshold from the average.

    Our default threshold here is +/-2 standard deviations beyond the mean.

    Because the measurements are made and recorded at the level of
    a specific sex of each strain, we associate the MP/VT phenotype with
    the sex-qualified genotype/strain.

    """
    # base URL under which all of the download files below live
    mdpdl = 'http://phenomedoc.jax.org/MPD_downloads'
    # for each input: the local file name and the URL it is fetched from
    files = {
        'ontology_mappings': {
            'file': 'ontology_mappings.csv',
            'url': mdpdl+'/ontology_mappings.csv'},
        'straininfo': {
            'file': 'straininfo.csv',
            'url': mdpdl+'/straininfo.csv'},
        'assay_metadata': {
            'file': 'measurements.csv',
            'url': mdpdl+'/measurements.csv'},
        'strainmeans': {
            'file': 'strainmeans.csv.gz',
            'url': mdpdl+'/strainmeans.csv.gz'},
        # 'mpd_datasets_metadata': { #TEC does not seem to be used
        #    'file': 'mpd_datasets_metadata.xml.gz',
        #    'url': mdpdl+'/mpd_datasets_metadata.xml.gz'},
    }

    # the following are strain ids for testing
    # test_ids = [
    #   "MPD:2", "MPD:3", "MPD:5", "MPD:6", "MPD:9", "MPD:11", "MPD:18",
    #   "MPD:20", "MPD:24", "MPD:28", "MPD:30", "MPD:33", "MPD:34", "MPD:36",
    #   "MPD:37", "MPD:39", "MPD:40", "MPD:42", "MPD:47", "MPD:66", "MPD:68",
    #   "MPD:71", "MPD:75", "MPD:78", "MPD:122", "MPD:169", "MPD:438",
    #   "MPD:457","MPD:473", "MPD:481", "MPD:759", "MPD:766", "MPD:770",
    #   "MPD:849",  "MPD:857", "MPD:955", "MPD:964", "MPD:988", "MPD:1005",
    #   "MPD:1017", "MPD:1204", "MPD:1233", "MPD:1235", "MPD:1236", "MPD:1237"]

    # strains kept in test mode; compared as 'MPD:' + strain number
    test_ids = [
        'MPD:6', 'MPD:849', 'MPD:425', 'MPD:569', "MPD:10", "MPD:1002",
        "MPD:39", "MPD:2319"]

    # description of the data provider as an agent
    # NOTE(review): these are named 'mgd_*' although the label refers to MPD,
    # and none of the methods of this class reads them - confirm before reuse
    mgd_agent_id = "MPD:db/q?rtn=people/allinv"
    mgd_agent_label = "Mouse Phenotype Database"
    mgd_agent_type = "foaf:organization"

    def __init__(self):
        """
        Set up the MPD source: namespaces, dataset description, the
        z-score threshold and the empty lookup tables filled during parsing.
        """
        Source.__init__(self, 'mpd')
        # @N, not sure if this step is required
        self.namespaces.update(curie_map.get())

        # how far (in absolute z-score) a strain must be from the mean
        # before a phenotype association is made
        self.stdevthreshold = 2
        self.nobnodes = True  # FIXME

        # update the dataset object with details about this resource
        # @N: Note that there is no license as far as I can tell
        self.dataset = Dataset(
            'mpd', 'MPD', 'http://phenome.jax.org', None, None)
        # TODO add a citation for mpd dataset as a whole
        self.dataset.set_citation('PMID:15619963')

        # assay number -> metadata (ontology terms, label, description, ...)
        self.assayhash = {}
        # strain id -> strain name
        self.idlabel_hash = {}
        # assay number -> sex -> list of the strain means for that assay
        self.score_means_by_measure = {}
        # strain number -> sex -> assay number -> {'mean', 'zscore'}
        self.strain_scores_by_measure = {}

        self.geno = Genotype(self.graph)
        self.gu = GraphUtils(curie_map.get())

    def fetch(self, is_dl_forced=False):
        """
        Fetch the MPD files by delegating to get_files.

        :param is_dl_forced: passed through unchanged to get_files
        :return: None
        """
        self.get_files(is_dl_forced)

    def parse(self, limit=None):
        """
        MPD data is delivered in four separate csv files (one of them
        gzipped), which we process in a fixed order and write out as
        one large graph.

        When self.testOnly is set, test mode is switched on and everything
        is written to the test graph instead of the main graph.

        :param limit: row limit handed to each of the file processors
        :return: None
        """
        if limit is not None:
            logger.info("Only parsing first %s rows fo each file", str(limit))

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True
            g = self.testgraph
            self.geno = Genotype(self.testgraph)
        else:
            g = self.graph

        self._process_straininfo(limit)
        # the following will provide us the hash-lookups
        # These must be processed in a specific order

        # mapping between assays and ontology terms
        self._process_ontology_mappings_file(limit)
        # this is the metadata about the measurements
        self._process_measurements_file(limit)
        # get all the measurements per strain
        self._process_strainmeans_file(limit)

        # The following will use the hash populated above
        # to lookup the ids when filling in the graph
        self._fill_provenance_graph(limit)

        logger.info("Finished parsing.")

        self.load_bindings()

        gu = GraphUtils(curie_map.get())
        gu.loadAllProperties(g)
        gu.loadProperties(g, G2PAssoc.object_properties, GraphUtils.OBJPROP)
        # NOTE(review): datatype properties are registered with OBJPROP here;
        # confirm whether a datatype-property constant was intended
        gu.loadProperties(g, G2PAssoc.datatype_properties, GraphUtils.OBJPROP)
        gu.loadProperties(
            g, G2PAssoc.annotation_properties, GraphUtils.ANNOTPROP)

        # report on the graph that was actually filled (test or main)
        logger.info("Found %d nodes", len(g))
        return

    def _process_ontology_mappings_file(self, limit):
        """
        Collect, for every assay, the MP/VT ontology terms it is mapped to.

        Rows are expected to hold (assay id, ontology term, description).
        Rows with a different number of columns or a non-numeric assay id
        are skipped.  Terms that start with 'MP' or 'VT' are added to
        self.assayhash[assay_id]['ont_terms'].

        :param limit: accepted for symmetry with the other processors;
            not used here
        :return: None
        """

        logger.info("Processing ontology mappings...")
        raw = '/'.join(
            (self.rawdir, self.files['ontology_mappings']['file']))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            next(reader, None)
            for row in reader:
                try:
                    (assay_id, ont_term, descrip) = row
                    assay_id = int(assay_id)
                except ValueError:
                    continue
                if re.match(r'(MP|VT)', ont_term):
                    # setdefault at both levels, so an assay entry created
                    # elsewhere without 'ont_terms' does not raise KeyError
                    self.assayhash.setdefault(assay_id, {}).setdefault(
                        'ont_terms', set()).add(ont_term)

        return

    def _process_straininfo(self, limit):
        """
        Add every strain in the straininfo file to the graph.

        Each strain becomes an individual of the mouse taxon, labelled with
        its name, with its MPD short name as a synonym.  Where a stock
        number is given the strain is tied to the vendor's identifier:
        same-individual for JAX ('J') and RIKEN BRC ('Rbrc') stock, xrefs
        to the url and to vendor:stocknum otherwise.  Panel membership is
        recorded as a description.  The strain name is also stored in
        self.idlabel_hash for later label lookups.

        Rows that do not have exactly ten columns are skipped.

        :param limit: accepted for symmetry with the other processors;
            not used here
        :return: None
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing strain info ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        gu = GraphUtils(curie_map.get())

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            f.readline()  # read the header row; skip
            for row in reader:
                if len(row) != 10:
                    # malformed row; presumably a blank line or a trailing
                    # row-count line as seen in measurements.csv
                    continue
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:'+str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:'+str(mpd_strainid)
                gu.addIndividualToGraph(g, strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    gu.addSynonym(g, strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        gu.addSameIndividual(g, strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # RIKEN BRC; drop the prefix repeated in the stock
                        # number before building the curie
                        riken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        gu.addSameIndividual(g, strain_id, riken_id)
                    else:
                        if url != '':
                            gu.addXref(g, strain_id, url, True)
                        if vendor != '':
                            gu.addXref(
                                g, strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    gu.addDescription(g, strain_id, desc)

                # TODO make the panels as a resource collection

        return

    def _process_measurements_file(self, limit):
        """
        Load the per-assay metadata from the measurements file.

        For every row with exactly 16 columns, the description, label,
        type and units of the assay are stored in self.assayhash under the
        numeric assay id.  An existing entry for the assay is extended
        rather than replaced.  Any other row is skipped.

        :param limit: accepted for symmetry with the other processors;
            not used here
        :return: None
        """
        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['assay_metadata']['file']))

        with open(raw, 'r') as f:
            reader = csv.reader(f)
            # read the header row; skip
            header = f.readline()
            logger.info("HEADER: %s", header)
            for row in reader:
                # measnum,projsym,varname,descrip,units,cat1,cat2,cat3,
                # intervention,intparm,appmeth,panelsym,datatype,sextested,
                # nstrainstested,ageweeks
                # Again the last row has changed. contains: '(4486 rows)'
                if len(row) != 16:
                    continue
                assay_id = int(row[0])
                assay_label = row[3]
                assay_units = row[4]
                # compare by value: an empty appmeth column means "no type"
                assay_type = row[10] if row[10] != '' else None

                assay = self.assayhash.setdefault(assay_id, {})
                assay['description'] = self.build_measurement_description(row)
                assay['assay_label'] = assay_label
                assay['assay_type'] = assay_type
                assay['assay_units'] = assay_units

                # TODO add projectsym property?
                # TODO add intervention?
                # ageweeks might be useful for adding to phenotype assoc

            # end loop on measurement metadata

        return

    def _process_strainmeans_file(self, limit):
        """
        Read the gzipped strain means file into two in-memory lookups.

        self.score_means_by_measure maps assay number -> sex -> list of
        the mean values of every strain tested.
        self.strain_scores_by_measure maps strain number -> sex ->
        assay number -> {'mean': ..., 'zscore': ...}.
        Not the most efficient representation, but easy access; the
        lookups are walked later to apply cutoffs and add associations.
        Rows that do not have exactly 16 columns are skipped.

        :param limit: not used here
        :return: None

        """
        logger.info("Processing strain means ...")
        raw = '/'.join((self.rawdir, self.files['strainmeans']['file']))
        means_by_assay = {}
        scores_by_strain = {}
        with gzip.open(raw, 'rb') as gz:
            text = io.TextIOWrapper(gz)
            reader = csv.reader(text)
            text.readline()  # read the header row; skip
            for row in reader:
                if len(row) != 16:
                    continue
                # columns: measnum, varname, strain, strainid, sex, mean,
                # nmice, sd, sem, cv, minval, maxval, logmean, logsd,
                # zscore, logzscore
                (measnum, _varname, _strain, strainid, sex, mean,
                 *_stats, zscore, _logzscore) = row
                strain_num = int(strainid)
                assay_num = int(measnum)
                # assuming the zscore is across all the items
                # in the same measure+var+strain+sex
                # note: it seems that there is only ever 1 varname per measnum.
                # note: some assays only tested one sex!
                # we split this here by sex
                mean_value = float(mean)
                means_by_assay.setdefault(
                    assay_num, {}).setdefault(sex, list()).append(mean_value)
                scores_by_strain.setdefault(
                    strain_num, {}).setdefault(sex, {})[assay_num] = {
                        'mean': mean_value, 'zscore': float(zscore)}
            # end loop over strainmeans

        # publish the lookups only once the whole file has been read
        self.score_means_by_measure = means_by_assay
        self.strain_scores_by_measure = scores_by_strain

        return

    def _fill_provenance_graph(self, limit):
        """
        Turn extreme strain measurements into phenotype associations.

        Walks self.strain_scores_by_measure and, for every strain/sex/assay
        whose absolute z-score reaches self.stdevthreshold, adds the assay
        and a genotype-to-phenotype association for each ontology term
        mapped to that assay.  Assays without ontology terms, or without
        any metadata at all, are counted but produce no association.
        The three counters are logged at the end.

        :param limit: not used here
        :return: None
        """
        logger.info("Building graph ...")
        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        taxon_id = 'NCBITaxon:10090'  # hardcode to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        scores_passing_threshold_count = 0
        scores_passing_threshold_with_ontologies_count = 0
        scores_not_passing_threshold_count = 0

        # loop through all the strains,
        # and make G2P assoc for those with scores beyond threshold
        for strain_num, scores_by_sex in \
                self.strain_scores_by_measure.items():
            if self.testMode and 'MPD:'+str(strain_num) not in self.test_ids:
                continue
            strain_id = 'MPD-strain:'+str(strain_num)
            for sex, measures in scores_by_sex.items():
                for m, score in measures.items():
                    # TODO consider using the means
                    # instead of precomputed zscores
                    if 'zscore' not in score:
                        continue
                    zscore = score['zscore']
                    # written as a negated >= so that a NaN z-score is
                    # counted as not passing, as in a plain if/else
                    if not abs(zscore) >= self.stdevthreshold:
                        scores_not_passing_threshold_count += 1
                        continue
                    scores_passing_threshold_count += 1

                    # metadata may be missing entirely, or hold only the
                    # ontology mapping; look everything up defensively
                    assay_info = self.assayhash.get(m, {})
                    ont_term_ids = assay_info.get('ont_terms')
                    if ont_term_ids is None:
                        continue
                    scores_passing_threshold_with_ontologies_count += 1

                    assay_id = 'MPD-assay:'+str(m)
                    assay_label = assay_info.get('assay_label')
                    zscore_note = '(zscore='+str(zscore)+')'
                    if assay_label is not None:
                        assay_label += ' ('+str(m)+')'
                        comment = ' '.join((assay_label, zscore_note))
                    else:
                        comment = zscore_note
                    assay_description = assay_info.get('description')
                    assay_type_id = Provenance.prov_types['assay']

                    # add the assay, then the G2P assoc
                    prov = Provenance()
                    prov.add_assay_to_graph(
                        g, assay_id, assay_label, assay_type_id,
                        assay_description)
                    self._add_g2p_assoc(
                        g, strain_id, sex, assay_id, ont_term_ids, comment)

        logger.info("Scores passing threshold: %d",
                    scores_passing_threshold_count)
        logger.info("Scores passing threshold with ontologies: %d",
                    scores_passing_threshold_with_ontologies_count)
        logger.info("Scores not passing threshold: %d",
                    scores_not_passing_threshold_count)

        return

    def _add_g2p_assoc(self, g, strain_id, sex, assay_id, phenotypes, comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        The ids of both genotypes start with '_'; when self.nobnodes is set
        they are additionally prefixed with ':'.

        When the strain has no label in self.idlabel_hash, the strain id
        is used in the genotype labels instead.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g: graph that receives the triples
        :param strain_id: id of the strain
        :param sex: sex key of the measurement; 'm' and 'f' select the
            male/female genotype types, anything else the generic
            sex-qualified genotype type
        :param assay_id: id of the assay, added as evidence
        :param phenotypes: the phenotype ids to associate with the strain;
            None adds the genotypes but no association
        :param comment: comment attached to every association made
        :return: None

        """

        eco_id = "ECO:0000059"  # experimental_phenotypic_evidence
        strain_label = self.idlabel_hash.get(strain_id)
        id_stem = re.sub(r':', '', strain_id)

        # strain genotype
        genotype_id = '_'+'-'.join((id_stem, 'genotype'))
        sex_specific_genotype_id = '_'+'-'.join((id_stem, sex, 'genotype'))
        if strain_label is not None:
            genotype_label = '['+strain_label+']'
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            # unlabelled strain: fall back to the id for both labels
            genotype_label = '['+strain_id+']'
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        if self.nobnodes:
            genotype_id = ':'+genotype_id
            sex_specific_genotype_id = ':'+sex_specific_genotype_id

        genotype_type = Genotype.genoparts['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = Genotype.genoparts['male_genotype']
        elif sex == 'f':
            genotype_type = Genotype.genoparts['female_genotype']

        # add the genotype to strain connection
        self.geno.addGenotype(
            genotype_id, genotype_label,
            Genotype.genoparts['genomic_background'])
        self.gu.addTriple(
            g, strain_id,
            Genotype.object_properties['has_genotype'], genotype_id)

        self.geno.addGenotype(
            sex_specific_genotype_id, sex_specific_genotype_label,
            genotype_type)

        # add the strain as the background for the genotype
        self.gu.addTriple(
            g, sex_specific_genotype_id,
            Genotype.object_properties['has_sex_agnostic_genotype_part'],
            genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(
                    self.name, sex_specific_genotype_id, phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()
                self.gu.addComment(g, assoc_id, comment)

        return

    def getTestSuite(self):
        """
        Build the unittest suite for this source.

        :return: the suite loaded from tests.test_mpd.MPDTestCase
        """
        # imported here so the test module is only needed when asked for
        import unittest
        from tests.test_mpd import MPDTestCase

        loader = unittest.TestLoader()
        return loader.loadTestsFromTestCase(MPDTestCase)

    @staticmethod
    def normalise_units(units):
        """
        Placeholder for unit normalisation; currently returns its input
        unchanged.

        :param units: the units value to normalise
        :return: the same value that was passed in
        """
        # todo:
        return units

    @staticmethod
    def build_measurement_description(row):
        """
        Compose a free-text description of an assay from its metadata row.

        :param row: the 16 columns of a measurements row, in this order:
            assay id, projsym, varname, descrip, units, cat1, cat2, cat3,
            intervention, intparm, appmeth, panelsym, datatype, sextested,
            nstrainstested, ageweeks
        :return: the description string
        :raises ValueError: if row does not hold exactly 16 values
        """
        (assay_id, projsym, varname, descrip, units, cat1, cat2, cat3,
         intervention, intparm, appmeth, panelsym, datatype, sextested,
         nstrainstested, ageweeks) = row

        sex_labels = {'f': 'female', 'm': 'male', 'fm': 'male and female'}
        if sextested in sex_labels:
            sextested = sex_labels[sextested]
        else:
            # unknown keys are reported and then used verbatim below
            logger.warning("Unknown sex tested key: %s", sextested)

        description = "This is an assay of [" + descrip + "] shown as a [" + \
                      datatype + "] measured in [" + units + "]"

        if intervention is not None and intervention != "":
            description += " in response to [" + intervention + "]"
        # the arm sentence depends on the intervention parameter itself
        if intparm is not None and intparm != "":
            description += \
                ". This represents the [" + intparm + \
                "] arm, using materials and methods that included [" +\
                appmeth + "]"

        description += \
            ".  The overall experiment is entitled [" + projsym + "].  "

        description += \
            "It was conducted in [" + sextested + "] mice at [" + \
            ageweeks + "] of age in" + " [" + nstrainstested + \
            "] different mouse strains. "

        # cat1 is always listed; cat2/cat3 only when not blank
        keywords = [cat1] + [
            cat for cat in (cat2, cat3) if cat.strip() != ""]
        description += "Keywords: " + ", ".join(keywords) + "."
        return description
Пример #7
0
class Genotype():
    """
    Convenience methods for adding items related to a genotype and its parts
    to a supplied graph.
    They follow the patterns set out in GENO
    https://github.com/monarch-initiative/GENO-ontology.
    For specific sequence features, we use the GenomicFeature class
    to create them.
    """

    # special genotype parts mapped to their GENO and SO classes
    # that we explicitly reference here
    genoparts = {
        'intrinsic_genotype': 'GENO:0000000',
        'extrinsic_genotype': 'GENO:0000524',
        'effective_genotype': 'GENO:0000525',
        'genomic_background': 'GENO:0000611',
        'genomic_variation_complement': 'GENO:0000009',
        'karyotype_variation_complement': 'GENO:0000644',
        'variant_single_locus_complement': 'GENO:0000030',
        'variant_locus': 'GENO:0000002',
        'reference_locus': 'GENO:0000036',
        'allele': 'GENO:0000008',
        'gene': 'SO:0000704',
        'QTL': 'SO:0000771',
        'transgene': 'SO:0000902',
        'pseudogene': 'SO:0000336',
        'cytogenetic marker': 'SO:0000341',
        'sequence_feature': 'SO:0000110',
        'sequence_alteration': 'SO:0001059',
        'insertion': 'SO:0000667',
        'deletion': 'SO:0000159',
        'substitution': 'SO:1000002',
        'duplication': 'SO:1000035',
        'translocation': 'SO:0000199',
        'inversion': 'SO:1000036',
        'tandem_duplication': 'SO:1000173',
        'point_mutation': 'SO:1000008',
        'population': 'PCO:0000001',  # population
        'family': 'PCO:0000020',  # family
        'wildtype': 'GENO:0000511',
        'reagent_targeted_gene': 'GENO:0000504',
        'targeted_gene_subregion': 'GENO:0000534',
        'targeted_gene_complement': 'GENO:0000527',
        'biological_region': 'SO:0001411',
        'missense_variant': 'SO:0001583',
        'transcript': 'SO:0000233',
        'polypeptide': 'SO:0000104',
        'cDNA': 'SO:0000756',
        'sequence_variant_causing_loss_of_function_of_polypeptide': 'SO:1000118',
        'sequence_variant_causing_gain_of_function_of_polypeptide': 'SO:1000125',
        'sequence_variant_causing_inactive_catalytic_site': 'SO:1000120',
        'sequence_variant_affecting_polypeptide_function': 'SO:1000117',
        'regulatory_transgene_feature': 'GENO:0000638',
        'coding_transgene_feature': 'GENO:0000637',
        'protein_coding_gene': 'SO:0001217',
        'ncRNA_gene': 'SO:0001263'
    }

    # relationships (predicates) used between genotype parts
    object_properties = {
        'is_mutant_of': 'GENO:0000440',
        'derives_from': 'RO:0001000',
        'has_alternate_part': 'GENO:0000382',
        'has_reference_part': 'GENO:0000385',
        'in_taxon': 'RO:0002162',
        'has_zygosity': 'GENO:0000608',
        # links an alternate locus (instance) to a gene (class)
        'is_sequence_variant_instance_of': 'GENO:0000408',
        'targets_instance_of': 'GENO:0000414',
        'is_reference_instance_of': 'GENO:0000610',
        'has_part': 'BFO:0000051',
        # use this when relating populations
        'has_member_with_allelotype': 'GENO:0000225',
        'is_allelotype_of': 'GENO:0000206',
        'has_genotype': 'GENO:0000222',
        'has_phenotype': 'RO:0002200',
        'transcribed_to': 'RO:0002205',
        'translates_to': 'RO:0002513',
        'is_targeted_expression_variant_of': 'GENO:0000443',
        'is_transgene_variant_of': 'GENO:0000444',
        'has_expression-variant_part': 'GENO:0000532',
        # between a (reagent-targeted gene) and a morpholino
        'targeted_by': 'GENO:0000634',
        # FIXME should this just be subsequence of?
        'derives_sequence_from_gene': 'GENO:0000639',
        'feature_to_gene_relation': 'GENO:0000418'
    }

    annotation_properties = {
        # TODO change properties with
        # https://github.com/monarch-initiative/GENO-ontology/issues/21
        'reference_nucleotide': 'GENO:reference_nucleotide',  # Made up term
        'reference_amino_acid': 'GENO:reference_amino_acid',  # Made up term
        'altered_nucleotide': 'GENO:altered_nucleotide',  # Made up term
        'results_in_amino_acid_change': 'GENO:results_in_amino_acid_change'  # Made up term
    }

    zygosity = {
        'homoplasmic': 'GENO:0000602',
        'heterozygous': 'GENO:0000135',
        'indeterminate': 'GENO:0000137',
        'heteroplasmic': 'GENO:0000603',
        'hemizygous-y': 'GENO:0000604',
        'hemizygous-x': 'GENO:0000605',
        'homozygous': 'GENO:0000136',
        'hemizygous': 'GENO:0000606',
        'complex_heterozygous': 'GENO:0000402',
        'simple_heterozygous': 'GENO:0000458'
    }

    # object and annotation properties merged into a single lookup
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        Keep a handle on the graph to write to and load this class's
        object properties into it.
        :param graph: the graph that every add* method writes into
        """

        self.gu = GraphUtils(curie_map.get())

        self.graph = graph

        self.gu.loadProperties(self.graph, self.object_properties, self.gu.OBJPROP)

        return

    def addGenotype(self, genotype_id, genotype_label, genotype_type=None, genotype_description=None):
        """
        Add a genotype individual to the graph.
        If a genotype_type is not supplied, we will default to 'intrinsic_genotype'
        :param genotype_id:
        :param genotype_label:
        :param genotype_type:
        :param genotype_description:
        :return:
        """
        if genotype_type is None:
            genotype_type = self.genoparts['intrinsic_genotype']

        self.gu.addIndividualToGraph(self.graph, genotype_id, genotype_label, genotype_type, genotype_description)

        return

    def addAllele(self, allele_id, allele_label, allele_type=None, allele_description=None):
        """
        Make an allele object. If no allele_type is added, it will default to a geno:allele
        :param allele_id: curie for allele (required)
        :param allele_label: label for allele (required)
        :param allele_type: id for an allele type (optional, recommended SO or GENO class)
        :param allele_description: a free-text description of the allele
        :return:
        """
        # TODO should we accept a list of allele types?
        if allele_type is None:
            allele_type = self.genoparts['allele']  # TODO is this a good idea?
        self.gu.addIndividualToGraph(self.graph, allele_id, allele_label, allele_type, allele_description)

        return

    def addGene(self, gene_id, gene_label, gene_type=None, gene_description=None):
        """
        Add a gene to the graph as a class (not as an individual).
        If no gene_type is supplied, it defaults to the 'gene' genopart.
        :param gene_id:
        :param gene_label:
        :param gene_type:
        :param gene_description:
        :return:
        """
        if gene_type is None:
            gene_type = self.genoparts['gene']
        # genes are classes
        self.gu.addClassToGraph(self.graph, gene_id, gene_label, gene_type, gene_description)

        return

    def addConstruct(self, construct_id, construct_label, construct_type=None, construct_description=None):
        """
        Add a construct individual to the graph.  No default type is applied;
        construct_type is passed through as given (possibly None).
        :param construct_id:
        :param construct_label:
        :param construct_type:
        :param construct_description:
        :return:
        """
        # TODO add base type for construct
        self.gu.addIndividualToGraph(self.graph, construct_id, construct_label, construct_type, construct_description)

        return

    def addDerivesFrom(self, child_id, parent_id):
        """
        We add a derives_from relationship between the child and parent id.  Examples of uses include between:
        an allele and a construct or strain here, a cell line and its parent genotype.  Adding the
        parent and child to the graph should happen outside of this function call to
        ensure graph integrity.
        :param child_id:
        :param parent_id:
        :return:
        """

        self.gu.addTriple(self.graph, child_id, self.properties['derives_from'], parent_id)

        return

    def addSequenceDerivesFrom(self, child_id, parent_id):
        """
        Add a derives_sequence_from_gene relationship from the child
        to the parent.
        :param child_id:
        :param parent_id:
        :return:
        """
        self.gu.addTriple(self.graph, child_id, self.properties['derives_sequence_from_gene'], parent_id)
        return

    def addAlleleOfGene(self, allele_id, gene_id, rel_id=None):
        """
        We make the assumption here that if the relationship is not provided, it is a
        GENO:is_sequence_variant_instance_of.

        Here, the allele should be a variant_locus, not a sequence alteration.
        :param allele_id:
        :param gene_id:
        :param rel_id:
        :return:
        """
        if rel_id is None:
            rel_id = self.properties['is_sequence_variant_instance_of']
        self.gu.addTriple(self.graph, allele_id, rel_id, gene_id)
        return

    def addTranscript(self, variant_id, transcript_id, transcript_label=None, transcript_type=None):
        """
        Add the transcript as an individual, and add the
        gene/variant/allele transcribed_to relationship to it.
        :param variant_id:
        :param transcript_id:
        :param transcript_label:
        :param transcript_type:
        :return:
        """
        self.gu.addIndividualToGraph(self.graph, transcript_id, transcript_label, transcript_type)
        self.gu.addTriple(self.graph, variant_id, self.properties['transcribed_to'], transcript_id)

        return

    def addPolypeptide(self, polypeptide_id, polypeptide_label=None, transcript_id=None, polypeptide_type=None, ):
        """
        Add a polypeptide individual; when a transcript is supplied, also add
        the transcript translates_to polypeptide relationship.
        If no polypeptide_type is supplied, it defaults to the 'polypeptide' genopart.
        :param polypeptide_id:
        :param polypeptide_label:
        :param transcript_id:
        :param polypeptide_type:
        :return:
        """
        if polypeptide_type is None:
            polypeptide_type = self.genoparts['polypeptide']
        self.gu.addIndividualToGraph(self.graph, polypeptide_id, polypeptide_label, polypeptide_type)
        if transcript_id is not None:
            self.gu.addTriple(self.graph, transcript_id, self.properties['translates_to'], polypeptide_id)

        return

    def addPartsToVSLC(self, vslc_id, allele1_id, allele2_id, zygosity_id=None, allele1_rel=None, allele2_rel=None):
        """
        Here we add the parts to the VSLC.  While alleles (reference or variant loci) are
        traditionally added, you can add any node (such as sequence_alterations for unlocated variations)
        to a vslc if they are known to be paired.  However, if a sequence_alteration's locus is unknown,
        it probably should be added directly to the GVC.

        If no zygosity is supplied, it is homozygous when both allele ids are
        equal, and heterozygous otherwise.
        :param vslc_id:
        :param allele1_id:
        :param allele2_id:
        :param zygosity_id:
        :param allele1_rel:
        :param allele2_rel:
        :return:
        """

        # vslc has parts allele1/allele2
        if allele1_id is not None:
            self.addParts(allele1_id, vslc_id, allele1_rel)
        if allele2_id is not None and allele2_id.strip() != '':
            self.addParts(allele2_id, vslc_id, allele2_rel)

        # figure out zygosity if it's not supplied
        if zygosity_id is None:
            if allele1_id == allele2_id:
                zygosity_id = self.zygosity['homozygous']
            else:
                zygosity_id = self.zygosity['heterozygous']

        # zygosity_id is always set by now, so the triple is always added
        self.gu.addTriple(self.graph, vslc_id, self.properties['has_zygosity'], zygosity_id)

        return

    def addVSLCtoParent(self, vslc_id, parent_id):
        """
        The VSLC can either be added to a genotype or to a GVC.  The vslc is added as a part of the parent.
        :param vslc_id:
        :param parent_id:
        :return:
        """
        self.addParts(vslc_id, parent_id, self.properties['has_alternate_part'])

        return

    def addParts(self, part_id, parent_id, part_relationship=None):
        """
        This will add a has_part (or subproperty) relationship between a parent_id and the supplied part.
        By default the relationship will be BFO:has_part, but any relationship could be given here.
        :param part_id:
        :param parent_id:
        :param part_relationship:
        :return:
        """
        if part_relationship is None:
            part_relationship = self.properties['has_part']

        # note the direction: the parent is the subject of the triple
        self.gu.addTriple(self.graph, parent_id, part_relationship, part_id)

        return

    def addSequenceAlteration(self, sa_id, sa_label, sa_type=None, sa_description=None):
        """
        Add a sequence alteration individual.  If no sa_type is supplied,
        it defaults to the 'sequence_alteration' genopart.
        :param sa_id:
        :param sa_label:
        :param sa_type:
        :param sa_description:
        :return:
        """
        if sa_type is None:
            sa_type = self.genoparts['sequence_alteration']
        self.gu.addIndividualToGraph(self.graph, sa_id, sa_label, sa_type, sa_description)

        return

    def addSequenceAlterationToVariantLocus(self, sa_id, vl_id):
        """
        Add the sequence alteration as a has_alternate_part of the variant locus.
        :param sa_id:
        :param vl_id:
        :return:
        """
        self.addParts(sa_id, vl_id, self.properties['has_alternate_part'])
        return

    def addGenomicBackground(self, background_id, background_label, background_type=None, background_description=None):
        """
        Add a genomic background individual.  If no background_type is supplied,
        it defaults to the 'genomic_background' genopart.
        :param background_id:
        :param background_label:
        :param background_type:
        :param background_description:
        :return:
        """
        if background_type is None:
            background_type = self.genoparts['genomic_background']
        self.gu.addIndividualToGraph(self.graph, background_id, background_label, background_type, background_description)

        return

    def addGenomicBackgroundToGenotype(self, background_id, genotype_id):
        """
        Type the background as a genomic_background, and add it to the
        genotype as a has_reference_part.
        :param background_id:
        :param genotype_id:
        :return:
        """
        self.gu.addType(self.graph, background_id, self.genoparts['genomic_background'])
        self.addParts(background_id, genotype_id, self.object_properties['has_reference_part'])

        return

    def addTaxon(self, taxon_id, genopart_id):
        """
        The supplied geno part will have the specified taxon added with RO:in_taxon relation.
        Generally the taxon is associated with a genomic_background, but could be added to any
        genotype part (including a gene, regulatory element, or sequence alteration).
        :param taxon_id:
        :param genopart_id:
        :return:
        """
        in_taxon = self.gu.getNode(self.properties['in_taxon'])
        s = self.gu.getNode(genopart_id)
        self.graph.add((s, in_taxon, self.gu.getNode(taxon_id)))

        return

    def addGeneTargetingReagentToGenotype(self, reagent_id, genotype_id):
        """
        Add the reagent (for example, a morphant reagent) to the genotype as a
        has_expression-variant_part, assuming it's an extrinsic_genotype.
        :param reagent_id:
        :param genotype_id:
        :return:
        """
        p = self.object_properties['has_expression-variant_part']
        self.gu.addTriple(self.graph, genotype_id, p, reagent_id)

        return

    def addGeneTargetingReagent(self, reagent_id, reagent_label, reagent_type, gene_id, description=None):
        """
        Here, a gene-targeting reagent is added as an individual, together with
        a targets_instance_of relationship from the reagent to the gene.
        :param reagent_id:
        :param reagent_label:
        :param reagent_type:
        :param gene_id:
        :param description:
        :return:
        """
        # TODO add default type to reagent_type
        self.gu.addIndividualToGraph(self.graph, reagent_id, reagent_label, reagent_type, description)

        self.gu.addTriple(self.graph, reagent_id, self.object_properties['targets_instance_of'], gene_id)

        return

    def addReagentTargetedGene(self, reagent_id, gene_id, targeted_gene_id=None, targeted_gene_label=None,
                               description=None):
        """
        This will create the instance of a gene that is targeted by a molecular reagent (such as a morpholino or rnai).
        If an instance id is not supplied, we will build one as '_' + gene_id + '-' + reagent_id; it is of the
        type GENO:reagent_targeted_gene.  The targeted gene is then linked to the gene
        (is_targeted_expression_variant_of) and to the reagent (targeted_by).

        <targeted_gene_id> a GENO:reagent_targeted_gene
            rdf:label targeted_gene_label
            dc:description description
        <targeted_gene_id> GENO:is_targeted_expression_variant_of <gene_id>
        <targeted_gene_id> GENO:targeted_by <reagent_id>

        :param reagent_id:
        :param gene_id:
        :param targeted_gene_id:
        :param targeted_gene_label:
        :param description:
        :return:
        """

        # akin to a variant locus
        if targeted_gene_id is None:
            targeted_gene_id = '_' + gene_id + '-' + reagent_id
        self.gu.addIndividualToGraph(self.graph, targeted_gene_id, targeted_gene_label,
                                     self.genoparts['reagent_targeted_gene'], description)

        self.gu.addTriple(self.graph, targeted_gene_id,
                          self.object_properties['is_targeted_expression_variant_of'], gene_id)

        self.gu.addTriple(self.graph, targeted_gene_id, self.object_properties['targeted_by'], reagent_id)

        return

    def addTargetedGeneSubregion(self, tgs_id, tgs_label, tgs_type=None, tgs_description=None):
        """
        Add a targeted gene subregion individual.  If no tgs_type is supplied,
        it defaults to the 'targeted_gene_subregion' genopart.
        :param tgs_id:
        :param tgs_label:
        :param tgs_type:
        :param tgs_description:
        :return:
        """
        if tgs_type is None:
            tgs_type = self.genoparts['targeted_gene_subregion']
        self.gu.addIndividualToGraph(self.graph, tgs_id, tgs_label, tgs_type, tgs_description)

        return

    def addMemberOfPopulation(self, member_id, population_id):
        """
        Add a has_member_with_allelotype relationship from the population
        to the member.
        :param member_id:
        :param population_id:
        :return:
        """
        self.gu.addTriple(self.graph, population_id,
                          self.properties['has_member_with_allelotype'], member_id)

        return

    def addTargetedGeneComplement(self, tgc_id, tgc_label, tgc_type=None, tgc_description=None):
        """
        Add a targeted gene complement individual.  If no tgc_type is supplied,
        it defaults to the 'targeted_gene_complement' genopart.
        :param tgc_id:
        :param tgc_label:
        :param tgc_type:
        :param tgc_description:
        :return:
        """
        if tgc_type is None:
            tgc_type = self.genoparts['targeted_gene_complement']
        self.gu.addIndividualToGraph(self.graph, tgc_id, tgc_label, tgc_type, tgc_description)

        return

    def addGenome(self, taxon_id, taxon_label=None):
        """
        Add the genome of a taxon as a class.  The label is the taxon label
        (or the taxon id, when no label is given) followed by ' genome'.
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        if taxon_label is None:
            taxon_label = taxon_id
        genome_label = taxon_label+' genome'
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addClassToGraph(self.graph, genome_id, genome_label, Feature.types['genome'])

        return

    def addReferenceGenome(self, build_id, build_label, taxon_id):
        """
        Add a genome build as a reference_genome individual, type it with the
        taxon's genome id as well, and add the taxon to it.
        :param build_id:
        :param build_label:
        :param taxon_id:
        :return:
        """
        genome_id = self.makeGenomeID(taxon_id)
        self.gu.addIndividualToGraph(self.graph, build_id, build_label, Feature.types['reference_genome'])
        self.gu.addType(self.graph, build_id, genome_id)
        self.addTaxon(taxon_id, build_id)

        return

    def makeGenomeID(self, taxon_id):
        """
        Make the genome id for a taxon: everything up to and including the
        last colon is replaced with a bare ':' (i.e. the prefix is scrubbed
        off, putting the id in base space) and 'genome' is appended.
        For example, 'NCBITaxon:9606' becomes ':9606genome'.
        :param taxon_id:
        :return: the genome id
        """
        # raw string, and no needless escape on the colon
        genome_id = re.sub(r'.*:', ':', taxon_id) + 'genome'

        return genome_id

    def addChromosome(self, chr, tax_id, tax_label=None, build_id=None, build_label=None):
        """
        Add a chromosome class at the taxon level and, if a build is
        included, a build-specific chromosome that is an instance of that
        class and a member of the build.
        :param chr: the chromosome (number or name)
        :param tax_id:
        :param tax_label:
        :param build_id:
        :param build_label: defaults to build_id when not supplied
        :return:
        """
        # if it's just the chromosome, add it as an instance of a SO:chromosome, and add it to the genome.
        # if a build is included, punn the chromosome as a subclass of SO:chromosome, and
        # make the build-specific chromosome an instance of the supplied chr.  The chr then becomes part of the
        # build or genome.

        # first, make the chromosome class, at the taxon level
        chr_id = makeChromID(str(chr), tax_id)
        if tax_label is not None:
            chr_label = makeChromLabel(chr, tax_label)
        else:
            chr_label = makeChromLabel(chr)
        genome_id = self.makeGenomeID(tax_id)
        self.gu.addClassToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome'])
        self.addTaxon(tax_id, genome_id)  # add the taxon to the genome

        if build_id is not None:
            # the build-specific chromosome; the chromosome is stringified
            # here too, so both ids are built from the same kind of value
            chrinbuild_id = makeChromID(str(chr), build_id)
            if build_label is None:
                build_label = build_id
            chrinbuild_label = makeChromLabel(chr, build_label)
            # add the build-specific chromosome as an instance of the chr class
            self.gu.addIndividualToGraph(self.graph, chrinbuild_id, chrinbuild_label, chr_id)

            # add the build-specific chromosome as a member of the build  (both ways)
            self.gu.addMember(self.graph, build_id, chrinbuild_id)
            self.gu.addMemberOf(self.graph, chrinbuild_id, build_id)

        return

    def addChromosomeClass(self, chrom_num, taxon_id, taxon_label):
        """
        Add the generic (taxon-level) chromosome class.  The 'NCBITaxon:'
        prefix is removed from the taxon id before the chromosome id is made.
        :param chrom_num:
        :param taxon_id:
        :param taxon_label:
        :return:
        """
        taxon = re.sub('NCBITaxon:', '', taxon_id)
        chrom_class_id = makeChromID(chrom_num, taxon, 'CHR')  # the chrom class (generic) id
        chrom_class_label = makeChromLabel(chrom_num, taxon_label)
        self.gu.addClassToGraph(self.graph, chrom_class_id, chrom_class_label,
                                Feature.types['chromosome'])

        return

    def addChromosomeInstance(self, chr_num, reference_id, reference_label, chr_type=None):
        """
        Add the supplied chromosome as an instance within the given reference
        :param chr_num:
        :param reference_id: for example, a build id like UCSC:hg19
        :param reference_label:
        :param chr_type: this is the class that this is an instance of.  typically a genome-specific chr
        :return:
        """

        chr_id = makeChromID(str(chr_num), reference_id, 'MONARCH')
        chr_label = makeChromLabel(str(chr_num), reference_label)

        self.gu.addIndividualToGraph(self.graph, chr_id, chr_label, Feature.types['chromosome'])
        self.gu.addType(self.graph, chr_id, chr_type)

        # add the build-specific chromosome as a member of the build  (both ways)
        self.gu.addMember(self.graph, reference_id, chr_id)
        self.gu.addMemberOf(self.graph, chr_id, reference_id)

        return

    def make_variant_locus_label(self, gene_label, allele_label):
        """
        Make a label of the form gene<allele>, with surrounding whitespace
        stripped from both parts.  A missing (None) gene or allele label is
        treated as an empty string.
        :param gene_label:
        :param allele_label:
        :return: the variant locus label
        """
        if gene_label is None:
            gene_label = ''
        if allele_label is None:
            # make_vslc_label may hand over a missing allele label
            allele_label = ''
        label = gene_label.strip()+'<' + allele_label.strip() + '>'

        return label

    def make_vslc_label(self, gene_label, allele1_label, allele2_label):
        """
        Make a Variant Single Locus Complement (VSLC) label in monarch-style:
        the two variant locus labels joined with '/'.  When there is no second
        allele label, the part after the '/' is left empty.
        :param gene_label:
        :param allele1_label:
        :param allele2_label:
        :return: the label, or None when all three labels are missing
        """
        if gene_label is None and allele1_label is None and allele2_label is None:
            logger.error("Not enough info to make vslc label")
            return None

        top = self.make_variant_locus_label(gene_label, allele1_label)
        bottom = ''
        if allele2_label is not None:
            bottom = self.make_variant_locus_label(gene_label, allele2_label)

        vslc_label = '/'.join((top, bottom))

        return vslc_label
Пример #8
0
class Assoc:
    """
    An abstract class for OBAN (Monarch)-style associations,
    to enable attribution of source and evidence
    on statements.
    """

    assoc_types = {
        'association': 'OBAN:association'
    }

    annotation_properties = {
        'replaced_by': 'IAO:0100001',
        'consider': 'OIO:consider',
        'hasExactSynonym': 'OIO:hasExactSynonym',
        'hasRelatedSynonym': 'OIO:hasRelatedSynonym',
        'definition': 'IAO:0000115',
        'has_xref': 'OIO:hasDbXref',
    }

    object_properties = {
        'has_disposition': 'GENO:0000208',
        'has_phenotype': 'RO:0002200',
        'in_taxon': 'RO:0002162',
        'has_quality': 'RO:0000086',
        'towards': 'RO:0002503',
        'has_subject': 'OBAN:association_has_subject',
        'has_object': 'OBAN:association_has_object',
        'has_predicate': 'OBAN:association_has_object_property',
        # IAO "is about" is IAO:0000136 (the id used to carry an extra zero)
        'is_about': 'IAO:0000136',
        'has_evidence': 'RO:0002558',
        'has_source': 'dc:source',
        'has_provenance': 'OBAN:has_provenance'
    }

    datatype_properties = {
        'position': 'faldo:position',
        'has_measurement': 'IAO:0000004'
    }

    # every property above, merged into a single lookup
    properties = annotation_properties.copy()
    properties.update(object_properties)
    properties.update(datatype_properties)

    OWLCLASS = OWL['Class']
    OWLIND = OWL['NamedIndividual']
    OBJECTPROP = OWL['ObjectProperty']
    ANNOTPROP = OWL['AnnotationProperty']
    DATAPROP = OWL['DatatypeProperty']

    SUBCLASS = RDFS['subClassOf']
    BASE = Namespace(curie_map.get()[''])

    def __init__(self, definedby):
        """
        Start an empty association.
        :param definedby: The (data) resource that provided the annotation;
            it becomes part of a generated association id
        """
        self.cu = CurieUtil(curie_map.get())
        self.gu = GraphUtils(curie_map.get())

        # core parts of the association
        self.definedby = definedby
        self.sub = self.obj = self.rel = None
        self.assoc_id = None

        self.description = None
        self.source = []
        self.evidence = []
        # this is going to be used for the refactored evidence/provenance
        self.provenance = []

        self.score = None
        self.score_type = None
        self.score_unit = None

        return

    def get_properties(self):
        """
        :return: the merged annotation/object/datatype property lookup
        """
        return self.properties

    def _is_valid(self):
        """
        Check that the subject, object and relation have all been set.
        :return: True when they are all set
        :raises ValueError: when any of the three is still None
        """

        # check if sub/obj/rel are none...throw error
        if self.sub is None:
            raise ValueError('No subject set for this association')
        if self.obj is None:
            raise ValueError('No object set for this association')
        if self.rel is None:
            raise ValueError('No relation set for this association')

        return True

    def _add_basic_association_to_graph(self, g):
        """
        Write the association to the graph: the direct
        subject-predicate-object triple, the association node with its
        subject/object/predicate, and then whichever of the description,
        evidence, sources, provenance and score have been set.
        If the association id has not been set, one is generated first.

        Nothing is written if a graph node cannot be retrieved for the
        subject, predicate or object; that is logged as an error.
        :param g: the graph to add to
        :return:
        :raises ValueError: when the subject, object or relation is not set
        """

        if not self._is_valid():
            return

        # first, add the direct triple
        # anonymous (blank) nodes are indicated with underscore
        s = self.gu.getNode(self.sub)
        o = self.gu.getNode(self.obj)
        p = self.gu.getNode(self.rel)

        if s is None:
            logger.error(
                "Unable to retrieve graph node for Subject %s ", self.sub)
            return
        if p is None:
            logger.error(
                "Unable to retrieve graph node for Predicate %s ", self.rel)
            return
        if o is None:
            logger.error(
                "Unable to retrieve graph node for Object %s ", self.obj)
            return

        g.add((s, p, o))

        if self.assoc_id is None:
            self.set_association_id()

        node = self.gu.getNode(self.assoc_id)
        g.add((node, RDF['type'],
               self.gu.getNode(self.assoc_types['association'])))

        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_subject'], self.sub)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_object'], self.obj)
        self.gu.addTriple(g, self.assoc_id,
                          self.object_properties['has_predicate'], self.rel)

        if self.description is not None:
            self.gu.addDescription(g, self.assoc_id, self.description)

        # the loops below use their own names, so that the subject and
        # predicate nodes (s, p) above are not silently rebound
        if self.evidence:
            for evidence_id in self.evidence:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_evidence'],
                                  evidence_id)

        if self.source:
            for source_id in self.source:
                if re.match('http', source_id):
                    # TODO assume that the source is a publication?
                    # use Reference class here
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'],
                                      source_id, True)
                else:
                    self.gu.addTriple(g, self.assoc_id,
                                      self.object_properties['has_source'],
                                      source_id)

        if self.provenance:
            for provenance_id in self.provenance:
                self.gu.addTriple(g, self.assoc_id,
                                  self.object_properties['has_provenance'],
                                  provenance_id)

        if self.score is not None:
            self.gu.addTriple(
                g, self.assoc_id, self.properties['has_measurement'],
                Literal(self.score, datatype=XSD['float']), True)
            # TODO
            # update with some kind of instance of scoring object
            # that has a unit and type

        return

    def add_association_to_graph(self, g):
        """
        Add this association to the graph.
        :param g: the graph to add to
        :return:
        """

        self._add_basic_association_to_graph(g)

        return

    def set_subject(self, identifier):
        """
        :param identifier: id of the subject of the association
        :return:
        """
        self.sub = identifier
        return

    def set_object(self, identifier):
        """
        :param identifier: id of the object of the association
        :return:
        """
        self.obj = identifier
        return

    def set_relationship(self, identifier):
        """
        :param identifier: id of the relation (predicate) of the association
        :return:
        """
        self.rel = identifier
        return

    def set_association_id(self, assoc_id=None):
        """
        Set the association ID.
        If an external association identifier is supplied, it is used as-is;
        otherwise the ID is generated from the internal parts of the
        association (definedby, subject, relation, object).

        :param assoc_id: an external association identifier (optional)
        :return:
        """
        if assoc_id is None:
            self.assoc_id = self.make_association_id(self.definedby, self.sub,
                                                     self.rel, self.obj)
        else:
            self.assoc_id = assoc_id

        return

    def get_association_id(self):
        """
        :return: the association id, or None if it has not been set yet
        """

        return self.assoc_id

    def set_description(self, description):
        """
        :param description: free-text description of the association
        :return:
        """
        self.description = description

        return

    def set_score(self, score, unit=None, score_type=None):
        """
        Record a score for the association.  Only the score itself is
        written to the graph at the moment; unit and type are just stored.
        :param score:
        :param unit:
        :param score_type:
        :return:
        """

        self.score = score
        self.score_unit = unit
        self.score_type = score_type

        return

    def add_evidence(self, identifier):
        """
        Add an evidence code to the association object (maintained as a list)
        None and blank identifiers are ignored.
        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.evidence += [identifier]

        return

    def add_source(self, identifier):
        """
        Add a source identifier (such as publication id)
        to the association object (maintained as a list)
        None and blank identifiers are ignored.
        TODO we need to greatly expand this function!

        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.source += [identifier]

        return

    def add_provenance(self, identifier):
        """
        Add a provenance identifier to the association object
        (maintained as a list).  None and blank identifiers are ignored.
        :param identifier:
        :return:
        """

        if identifier is not None and identifier.strip() != '':
            self.provenance += [identifier]

        return

    def load_all_properties(self, g):
        """
        Load the object, annotation and datatype properties of this class
        into the graph, each under its own property type.
        :param g: the graph to load into
        :return:
        """
        props = {
            self.OBJECTPROP: self.object_properties,
            self.ANNOTPROP: self.annotation_properties,
            self.DATAPROP: self.datatype_properties
        }

        for p in props:
            self.gu.loadProperties(g, props[p], p)

        return

    def _get_source_uri(self, pub_id):
        """
        Given some kind of pub_id (which might be a CURIE or url),
        convert it into a proper node.

        :param pub_id:
        :return: source: Well-formed URI for the given identifier (or url),
            or None (with an error logged) if no node can be made for it
        """

        source = None
        if re.compile('http').match(pub_id):
            source = URIRef(pub_id)
        else:
            u = self.gu.getNode(pub_id)
            if u is not None:
                source = URIRef(u)
            else:
                logger.error(
                    "An id we don't know how to deal with: %s", pub_id)

        return source

    @staticmethod
    def make_association_id(definedby, subject, predicate, object,
                            attributes=None):
        """
        A method to create unique identifiers for OBAN-style associations,
        based on all the parts of the association
        If any of the items is None, it will convert it to blank.
        It effectively md5 hashes the (+)-joined string from the values.
        Subclasses of Assoc can submit an additional array of attributes
        that will be added to the ID.

        :param definedby: The (data) resource that provided the annotation
        :param subject:
        :param predicate:
        :param object:
        :param attributes:
        :return: the id, as 'MONARCH:' followed by the md5 hex digest
        """

        # note others available:
        #   md5(), sha1(), sha224(), sha256(), sha384(), and sha512()
        # TEC: at our scale, md5 is in danger of having collisions.
        # putting definedby first,
        # as this will usually be the datasource providing the annotation
        # this will end up making the first few parts of the id
        # be the same for all annotations in that resource
        items_to_hash = [definedby, subject, predicate, object]
        if attributes is not None:
            items_to_hash.extend(attributes)

        # None becomes blank; anything else is stringified so that a
        # non-string attribute (e.g. a number) cannot break the join
        parts = ['' if val is None else str(val) for val in items_to_hash]

        byte_string = '+'.join(parts).encode("utf-8")

        # TODO put this in a util?
        return ':'.join(('MONARCH', hashlib.md5(byte_string).hexdigest()))
Пример #9
0
    def process_catalog(self, limit=None):
        """
        Parse the tab-delimited GWAS catalog file and add, for every
        usable row, the reported variant, its publication, its gene
        relationships and its variant-to-trait associations to the graph.

        In test mode the test graph is filled and only rows whose SNP
        gene ids intersect ``self.test_ids['gene']`` are processed.

        Rows describing haplotypes, rows without a strongest SNP risk
        allele, and rows with an unrecognised variant id are skipped.

        :param limit: (optional) when not in test mode, stop once more
                      than this many non-empty rows have been read
        :return: None

        """
        raw = '/'.join((self.rawdir, self.files['catalog']['file']))
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())
        # built once; used to turn mapped trait URIs into curies
        cu = CurieUtil(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        tax_id = 'NCBITaxon:9606'  # hardcode
        genome_version = 'GRCh38'  # hardcode

        # the test gene ids never change while reading the file,
        # so build the lookup set a single time
        test_gene_ids = set()
        if self.testMode:
            test_gene_ids = {str(i) for i in self.test_ids['gene']}

        # build a hashmap of genomic location to identifiers,
        # to try to get the equivalences
        loc_to_id_hash = {}

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    continue

                line_counter += 1
                (date_added_to_catalog, pubmed_num, first_author, pub_date,
                 journal, link, study_name, disease_or_trait,
                 initial_sample_description, replicate_sample_description,
                 region, chrom_num, chrom_pos, reported_gene_nums,
                 mapped_gene, upstream_gene_num, downstream_gene_num,
                 snp_gene_nums, upstream_gene_distance,
                 downstream_gene_distance, strongest_snp_risk_allele, snps,
                 merged, snp_id_current, context, intergenic_flag,
                 risk_allele_frequency, pvalue, pvalue_mlog, pvalue_text,
                 or_or_beta, confidence_interval_95,
                 platform_with_snps_passing_qc, cnv_flag, mapped_trait,
                 mapped_trait_uri) = row

                # skip if no matches found in test set
                if self.testMode and not (
                        test_gene_ids &
                        set(re.split(r',', snp_gene_nums))):
                    continue

                # example row (tab-separated in the file):
                # 06-May-2015  25917933  Zai CC  20-Nov-2014
                # J Psychiatr Res  http://europepmc.org/abstract/MED/25917933
                # A genome-wide association study of suicide severity
                # scores in bipolar disorder.
                # Suicide in bipolar disorder
                # 959 European ancestry individuals  NA
                # 10p11.22  10  32704340  C10orf68, CCDC7, ITGB1  CCDC7
                # rs7079041-A  rs7079041  0  7079041  intron  0
                # 2E-6  5.698970
                if chrom_num != '' and chrom_pos != '':
                    loc = 'chr'+str(chrom_num)+':'+str(chrom_pos)
                    loc_to_id_hash.setdefault(loc, set())
                else:
                    loc = None

                snp_label = strongest_snp_risk_allele.strip()
                # The risk allele follows the first '-' (rs7079041-A,
                # chr10:106180121-G).  It is removed from the raw label
                # *before* the identifier is assembled; stripping it
                # afterwards would also cut off the position of the
                # ':gwas-chr...' style ids, which contain '-' themselves.
                # TODO add the risk allele as a variation of the snp
                snp_base = re.sub(r'-.*$', '', snp_label).strip()

                if re.search(r' x ', strongest_snp_risk_allele) \
                        or re.search(r',', strongest_snp_risk_allele):
                    # TODO deal with haplotypes
                    logger.warning(
                        "We can't deal with haplotypes yet: %s",
                        strongest_snp_risk_allele)
                    continue
                elif re.match(r'rs', strongest_snp_risk_allele):
                    rs_id = 'dbSNP:'+snp_base
                elif re.match(r'kgp', strongest_snp_risk_allele):
                    # FIXME this isn't correct
                    # http://www.1000genomes.org/faq/what-are-kgp-identifiers
                    # for some information
                    # They were created by Illumina for their genotyping
                    # platform before some variants identified during the
                    # pilot phase of the project had been assigned
                    # rs numbers.
                    rs_id = 'dbSNP:'+snp_base
                elif re.match(r'chr', strongest_snp_risk_allele):
                    # like: chr10:106180121-G
                    rs_id = ':gwas-' + re.sub(r':', '-', snp_base)
                elif snp_label == '':
                    # logger.debug(
                    #    "No strongest SNP risk allele for %s:\n%s",
                    #    pubmed_num, str(row))
                    # FIXME still consider adding in the EFO terms
                    # for what the study measured?
                    continue
                else:
                    logger.warning(
                        "There's a snp id i can't manage: %s",
                        strongest_snp_risk_allele)
                    continue

                if loc is not None:
                    loc_to_id_hash[loc].add(rs_id)

                pubmed_id = 'PMID:'+pubmed_num

                r = Reference(
                    pubmed_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)

                # create the chromosome
                chrom_id = makeChromID(chrom_num, genome_version, 'CHR')

                # add the feature to the graph
                snp_description = None
                if risk_allele_frequency != '' and \
                        risk_allele_frequency != 'NR':
                    snp_description = \
                        str(risk_allele_frequency) + \
                        ' [risk allele frequency]'

                f = Feature(
                    rs_id, snp_label,
                    Feature.types[r'SNP'], snp_description)
                if chrom_num != '' and chrom_pos != '':
                    f.addFeatureStartLocation(chrom_pos, chrom_id)
                    f.addFeatureEndLocation(chrom_pos, chrom_id)
                f.addFeatureToGraph(g)
                f.addTaxonToFeature(g, tax_id)
                # TODO consider adding allele frequency as property;
                # but would need background info to do that

                # also want to add other descriptive info about
                # the variant from the context
                for c in re.split(r';', context):
                    cid = self._map_variant_type(c.strip())
                    if cid is not None:
                        gu.addType(g, rs_id, cid)

                # add deprecation information
                # csv hands every column over as a string, so the merged
                # flag has to be compared against '1', not the int 1
                current_num = snp_id_current.strip()
                if merged.strip() == '1' and current_num != '':
                    # get the current rs_id
                    current_rs_id = 'dbSNP:'
                    if not re.match(r'rs', current_num):
                        current_rs_id += 'rs'
                    current_rs_id += current_num
                    # only record the id once it is complete;
                    # the per-location containers are sets
                    if loc is not None:
                        loc_to_id_hash[loc].add(current_rs_id)
                    gu.addDeprecatedIndividual(g, rs_id, current_rs_id)
                    # TODO check on this
                    # should we add the annotations to the current
                    # or orig?
                    gu.makeLeader(g, current_rs_id)
                else:
                    gu.makeLeader(g, rs_id)

                # add the feature as a sequence alteration
                # affecting various genes
                # note that intronic variations don't necessarily list
                # the genes such as for rs10448080  FIXME
                if snp_gene_nums != '':
                    for s in re.split(r',', snp_gene_nums):
                        s = s.strip()
                        # still have to test for this,
                        # because sometimes there's a leading comma
                        if s != '':
                            gene_id = 'NCBIGene:'+s
                            geno.addAlleleOfGene(rs_id, gene_id)

                # add the up and downstream genes if they are available
                # the snp lies upstream of its downstream gene and
                # downstream of its upstream gene; each triple is guarded
                # by the column it actually uses so that an empty value
                # never produces a bare 'NCBIGene:' id
                if downstream_gene_num != '':
                    downstream_gene_id = 'NCBIGene:'+downstream_gene_num
                    gu.addTriple(
                        g, rs_id,
                        Feature.object_properties[
                            r'upstream_of_sequence_of'],
                        downstream_gene_id)
                if upstream_gene_num != '':
                    upstream_gene_id = 'NCBIGene:'+upstream_gene_num
                    gu.addTriple(
                        g, rs_id,
                        Feature.object_properties[
                            'downstream_of_sequence_of'],
                        upstream_gene_id)

                # study description; currently only assembled, it is not
                # attached to the association (see the FIXME below)
                description = 'A study of ' + disease_or_trait + \
                    ' in ' + initial_sample_description
                if replicate_sample_description != '':
                    description = \
                        ' '.join(
                            (description, 'with',
                             replicate_sample_description))
                if platform_with_snps_passing_qc != '':
                    description = ' '.join(
                        (description, 'on platform',
                         platform_with_snps_passing_qc))
                description = ' '.join((description, '(p='+pvalue+')'))

                # make associations to the EFO terms; there can be >1
                if mapped_trait_uri.strip() != '':
                    for t in re.split(r',', mapped_trait_uri):
                        t = t.strip()
                        tid = cu.get_curie(t)

                        assoc = G2PAssoc(
                            self.name, rs_id, tid,
                            gu.object_properties['contributes_to'])
                        assoc.add_source(pubmed_id)
                        # combinatorial evidence
                        # used in automatic assertion
                        eco_id = 'ECO:0000213'
                        assoc.add_evidence(eco_id)

                        # assoc.set_description(description)
                        # FIXME score should get added to provenance/study
                        # assoc.set_score(pvalue)
                        assoc.add_association_to_graph(g)

                if not self.testMode and\
                        (limit is not None and line_counter > limit):
                    break

            Assoc(self.name).load_all_properties(g)

        # loop through the location hash,
        # and report all locations that carry more than one snp id
        for loc_key, snp_ids in loc_to_id_hash.items():
            if len(snp_ids) > 1:
                logger.info("%s has >1 snp id: %s", loc_key, str(snp_ids))
        return
Пример #10
0
    def add_disease_drug_variant_to_graph(self, table):
        """
        Add variant / disease / drug relationships to ``self.graph``.

        Each row of the input must unpack into exactly eleven values,
        the optional ones may be None:
        [[variant_key, variant_label, diagnoses_key, diagnoses,
          specific_diagnosis, organ, relationship,
          drug_key, drug, therapy_status (optional), pubmed_id(optional)]]

        When a specific diagnosis is given it is used as the disease
        label, otherwise the general diagnosis is used.

        The modelling is still being discussed, see:
        https://github.com/monarch-initiative/mckb/issues/9

        :param table: iterable of iterables, for example, a tuple of tuples
                      from _get_disease_drug_variant_relationship
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)

        # identifiers that are the same for every row
        relationship_id = "RO:has_environment"
        has_quality_property = "BFO:0000159"
        marker_relation = "RO:has_biomarker"

        for (variant_key, variant_label, diagnoses_key, diagnoses,
             specific_diagnosis, organ, relationship,
             drug_key, drug_label, therapy_status, pubmed_id) in table:

            diagnoses_label = diagnoses \
                if specific_diagnosis is None else specific_diagnosis

            # Arbitrary IDs to be replaced by ontology mappings
            variant_id = self.make_cgd_id('variant{0}'.format(variant_key))
            disease_id = self._get_disease_id(diagnoses_key, diagnoses_label)
            therapy_status_id = self.make_cgd_id('{0}'.format(therapy_status))
            disease_quality = \
                "CGD:{0}".format(relationship).replace(" ", "_")
            drug_id = self._get_drug_id(drug_key, drug_label)

            geno.addGenotype(
                variant_id, variant_label,
                geno.genoparts['sequence_alteration'])

            disease_instance_id = self.make_cgd_id(
                'disease{0}{1}'.format(diagnoses_label, variant_key))
            phenotype_instance_id = self.make_cgd_id(
                'phenotype{0}{1}{2}'.format(
                    diagnoses_label, variant_key, relationship))

            if relationship == "detrimental effect":
                phenotype_instance_label = \
                    "{0} with therapeutic response {1} to health".format(
                        diagnoses_label, relationship)
            else:
                phenotype_instance_label = \
                    "{0} with {1} to therapy".format(
                        diagnoses_label, relationship)

            # Reified association for disease caused_by genotype
            variant_disease_annot = self.make_cgd_id(
                "assoc{0}{1}".format(variant_key, diagnoses_label))

            # Add individuals/classes
            gu.addClassToGraph(
                self.graph, disease_id, diagnoses_label, 'DOID:4')
            gu.addClassToGraph(
                self.graph, drug_id, drug_label, 'CHEBI:23888')
            gu.addIndividualToGraph(
                self.graph, phenotype_instance_id,
                phenotype_instance_label, disease_id)
            gu.loadObjectProperties(
                self.graph, {relationship: relationship_id})

            # a publication, when present, is both the source and the
            # reason an evidence code gets attached
            has_publication = pubmed_id is not None
            if has_publication:
                source_id = "PMID:{0}".format(pubmed_id)
                publication = Reference(
                    source_id, Reference.ref_types['journal_article'])
                publication.addRefToGraph(self.graph)

            rel_id = gu.object_properties['has_phenotype']
            variant_phenotype_assoc = G2PAssoc(
                self.name, variant_id, phenotype_instance_id, rel_id)
            variant_phenotype_assoc.set_association_id(variant_disease_annot)
            if has_publication:
                variant_phenotype_assoc.add_evidence('ECO:0000033')
                variant_phenotype_assoc.add_source(source_id)
            variant_phenotype_assoc.add_association_to_graph(self.graph)

            gu.addTriple(
                self.graph, variant_disease_annot, relationship_id, drug_id)
            gu.addTriple(
                self.graph, phenotype_instance_id,
                has_quality_property, disease_quality)

            # Add therapy-disease association and approval status
            disease_instance_label = "{0} with biomarker {1}".format(
                diagnoses_label, variant_label)
            gu.addIndividualToGraph(
                self.graph, disease_instance_id,
                disease_instance_label, disease_id)
            gu.addTriple(
                self.graph, disease_instance_id, marker_relation, variant_id)

            gu.addClassToGraph(self.graph, therapy_status_id, therapy_status)
            self._add_therapy_drug_association(
                drug_id, disease_instance_id, therapy_status_id)
Пример #11
0
    def _process_genes(self, limit=None):
        """
        Read the tab-delimited HGNC gene file and add every gene to the
        graph as a class, together with its NCBIGene/ENSEMBL equivalents,
        its taxon, the publications that are about it and, when it can
        be parsed from the location column, the chromosome band (or the
        whole chromosome) it is a subsequence of.

        In test mode the test graph is filled and rows whose entrez id
        is set but not listed in ``self.gene_ids`` are skipped.

        :param limit: (optional) when not in test mode, stop once more
                      than this many lines (header included) were read
        :return: None
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        # chromosome, followed by an arm letter or the end of the value.
        # '$' has to stay outside of the character class: inside '[...]'
        # it only matches a literal dollar sign, which kept locations
        # without an arm (and the whole-chromosome branch below) from
        # ever being used.
        chr_regex = re.compile(r'(\d+|X|Y|Z|W|MT)(?:[pq]|$)')
        band_regex = re.compile(r'([pq][A-H\d]?\d?(?:\.\d+)?)')

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1

                # skip header (before unpacking, it carries no data)
                if line_counter <= 1:
                    continue

                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id, ensembl_gene_id,
                 vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
                 pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
                 homeodb, snornabase, bioparadigms_slc, orphanet,
                 pseudogene_org, horde_id, merops, imgt, iuphar,
                 kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
                 intermediate_filament_db) = row

                if self.testMode and entrez_id != '' \
                        and int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                gene_type_id = self._get_gene_type(locus_type)
                gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    gu.addDeprecatedClass(g, hgnc_id)
                if entrez_id != '':
                    gu.addEquivalentClass(
                        g, hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    gu.addEquivalentClass(
                        g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
                geno.addTaxon('NCBITaxon:9606', hgnc_id)

                # add pubs as "is about"
                if pubmed_id != '':
                    for p in re.split(r'\|', pubmed_id.strip()):
                        if str(p) != '':
                            gu.addTriple(
                                g, 'PMID:' + str(p.strip()),
                                gu.object_properties['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                chr_match = chr_regex.match(location)
                if chr_match is not None:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                    band_match = band_regex.search(location)
                    f = Feature(hgnc_id, None, None)
                    if band_match is not None:
                        band = chrom + band_match.group(1)
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        # TEC Monoch? Monarchdom??
                        band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                        gu.addClassToGraph(g, band_id, None)
                        f.addSubsequenceOfFeature(g, band_id)
                    else:
                        # no band given: attach to the whole chromosome
                        gu.addClassToGraph(g, chrom_id, None)
                        f.addSubsequenceOfFeature(g, chrom_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

            # end loop through file

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #12
0
class Pathway():
    """
    Convenience methods for placing genes, gene products and other
    components into pathways in a graph.
    """

    # identifiers of the classes used when typing pathways and products
    pathway_parts = {
        'signal_transduction': 'GO:0007165',
        'cellular_process': 'GO:0009987',
        'pathway': 'PW:0000001',
        'gene_product': 'CHEBI:33695'  # bioinformation molecule
    }

    # identifiers of the relations used between genes, products, pathways
    object_properties = {
        'involved_in': 'RO:0002331',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205'
    }

    # independent copy, so changing one mapping leaves the other alone
    properties = dict(object_properties)

    def __init__(self, graph, nobnodes=False):
        """
        Keep hold of the graph and register this class's object
        properties in it.

        :param graph: the graph all later additions are written to
        :param nobnodes: when true, generated gene-product identifiers
                         get a leading ':' (see addGeneToPathway)
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph
        self.nobnodes = nobnodes

        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

    def addPathway(
            self, pathway_id, pathway_label, pathway_type=None,
            pathway_description=None):
        """
        Add a pathway to the graph as a class.  Without an explicit
        type the pathway is typed with 'cellular_process'; in every case
        a subclass statement involving 'PW:pathway' is added as well.

        :param pathway_id: identifier of the pathway
        :param pathway_label: label of the pathway
        :param pathway_type: (optional) identifier of the pathway's type
        :param pathway_description: (optional) description of the pathway
        :return: None
        """
        chosen_type = pathway_type
        if chosen_type is None:
            chosen_type = self.pathway_parts['cellular_process']

        self.gu.addClassToGraph(
            self.graph, pathway_id, pathway_label, chosen_type,
            pathway_description)
        self.gu.addSubclass(
            self.graph, self.pathway_parts['pathway'], pathway_id)

    def addGeneToPathway(self, pathway_id, gene_id):
        """
        Attach a gene to a pathway by way of an intermediate
        'gene product' node, which is the thing involved in the pathway:

        gene_id RO:has_gene_product _gene_product
        _gene_product RO:involved_in pathway_id

        The product identifier is derived from the gene identifier
        (colons removed, wrapped in '_' ... 'product') and is prefixed
        with ':' when nobnodes is set.

        :param pathway_id: identifier of the pathway
        :param gene_id: identifier of the gene
        :return: None
        """
        product_id = '_{0}product'.format(re.sub(r':', '', gene_id))
        if self.nobnodes:
            product_id = ':' + product_id

        product_type = self.pathway_parts['gene_product']
        self.gu.addIndividualToGraph(
            self.graph, product_id, None, product_type)

        has_product = self.object_properties['has_gene_product']
        self.gu.addTriple(self.graph, gene_id, has_product, product_id)

        self.addComponentToPathway(pathway_id, product_id)

    def addComponentToPathway(self, pathway_id, component_id):
        """
        State that a component is involved in a pathway.  Use this when
        the component takes part in the pathway directly; when a
        transforming event happens to the component first,
        addGeneToPathway is the one to use.

        :param pathway_id: identifier of the pathway
        :param component_id: identifier of the involved component
        :return: None
        """
        involved_in = self.object_properties['involved_in']
        self.gu.addTriple(
            self.graph, component_id, involved_in, pathway_id)
Пример #13
0
    def _process_data(self, raw, limit=None):
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)
        line_counter = 0
        gu.loadAllProperties(g)
        gu.loadObjectProperties(g, geno.object_properties)

        # Add the taxon as a class
        taxon_id = 'NCBITaxon:10090'  # map to Mus musculus
        gu.addClassToGraph(g, taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        with gzip.open(raw, 'rt') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                line_counter += 1

                (marker_accession_id, marker_symbol, phenotyping_center,
                 colony, sex, zygosity, allele_accession_id, allele_symbol,
                 allele_name, strain_accession_id, strain_name, project_name,
                 project_fullname, pipeline_name, pipeline_stable_id,
                 procedure_stable_id, procedure_name, parameter_stable_id,
                 parameter_name, top_level_mp_term_id, top_level_mp_term_name,
                 mp_term_id, mp_term_name, p_value, percentage_change,
                 effect_size, statistical_method, resource_name) = row

                if self.testMode and marker_accession_id not in self.test_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity_id = self._map_zygosity(zygosity)

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_'+re.sub(r'\W+', '_', colony)
                if self.nobnodes:
                    colony_id = ':'+colony_id

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = \
                        '_IMPC-'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        allele_accession_id = ':'+allele_accession_id
                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    strain_accession_id = '_'+strain_accession_id
                    if self.nobnodes:
                        strain_accession_id = ':'+strain_accession_id
                elif not re.match(r'MGI', strain_accession_id):
                    logger.info(
                        "Found a strange strain accession...%s",
                        strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = \
                        re.match(r'.*<(.*)>', allele_symbol).group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and \
                        marker_accession_id == '':
                    logger.warning(
                        "Marker unspecified on row %d", line_counter)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = geno.genoparts['variant_locus']
                    geno.addGene(marker_accession_id, marker_symbol,
                                 geno.genoparts['gene'])
                    geno.addAllele(variant_locus_id, variant_locus_name,
                                   variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    sequence_alteration_id = \
                        '_seqalt'+re.sub(r':', '', allele_accession_id)
                    if self.nobnodes:
                        sequence_alteration_id = ':'+sequence_alteration_id
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(sequence_alteration_id,
                                           sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,
                # with unknown zygosity
                stem_cell_class = 'ERO:0002002'
                gu.addIndividualToGraph(g, colony_id, colony, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = \
                    '_'+allele_accession_id+geno.zygosity['indeterminate']
                vslc_colony = re.sub(r':', '', vslc_colony)
                if self.nobnodes:
                    vslc_colony = ':'+vslc_colony
                vslc_colony_label = allele_symbol+'/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(allele_accession_id, colony_genotype_id,
                              geno.object_properties['has_alternate_part'])
                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    geno.zygosity['indeterminate'],
                    geno.object_properties['has_alternate_part'])
                gu.addTriple(
                    g, colony_id,
                    geno.object_properties['has_genotype'],
                    colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = geno.object_properties['has_alternate_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    logger.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '_' + '-'.join((marker_accession_id,
                                          allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                if self.nobnodes:
                    vslc_id = ':'+vslc_id
                gu.addIndividualToGraph(
                    g, vslc_id, vslc_name,
                    geno.genoparts['variant_single_locus_complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    geno.object_properties['has_alternate_part'],
                    allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                gu.addType(
                    g, vslc_id,
                    Genotype.genoparts['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        geno.genoparts['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = \
                        strain_name + '/' + phenotyping_center
                    pheno_center_strain_id = \
                        '-'.join((re.sub(r':', '', genomic_background_id),
                                  re.sub(r'\s', '_', phenotyping_center)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        pheno_center_strain_id = '_'+pheno_center_strain_id
                    if self.nobnodes:
                        pheno_center_strain_id = ':'+pheno_center_strain_id
                    geno.addGenotype(pheno_center_strain_id,
                                     pheno_center_strain_label,
                                     geno.genoparts['genomic_background'])
                    geno.addSequenceDerivesFrom(pheno_center_strain_id,
                                                genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name+' ['+pheno_center_strain_label+']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(pheno_center_strain_id, taxon_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                genotype_name += '['+colony+']'
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id(
                        (colony_id + phenotyping_center + zygosity +
                         strain_accession_id+sex))
                sex_qualified_genotype_label = genotype_name+' ('+sex+')'
                if sex == 'male':
                    sq_type_id = geno.genoparts['male_genotype']
                elif sex == 'female':
                    sq_type_id = geno.genoparts['female_genotype']
                else:
                    sq_type_id = geno.genoparts['sex_qualified_genotype']

                geno.addGenotype(
                    sex_qualified_genotype_id,
                    sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    geno.object_properties['has_alternate_part'])

                if genomic_background_id is not None and \
                        genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                phenotype_id = mp_term_id

                # it seems that sometimes phenotype ids are missing.
                # indicate here
                if phenotype_id is None or phenotype_id == '':
                    logger.warning(
                        "No phenotype id specified for row %d: %s",
                        line_counter, str(row))
                    continue
                # experimental_phenotypic_evidence This was used in ZFIN
                eco_id = "ECO:0000059"

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(self.name, sex_qualified_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph(g)
                assoc_id = assoc.get_association_id()

                # add a free-text description
                description = \
                    ' '.join((mp_term_name, 'phenotype determined by',
                              phenotyping_center, 'in an',
                              procedure_name, 'assay where',
                              parameter_name.strip(),
                              'was measured with an effect_size of',
                              str(round(float(effect_size), 5)),
                              '(p =', "{:.4e}".format(float(p_value)), ').'))

                gu.addDescription(g, assoc_id, description)

                # TODO add provenance information
                # resource_id = resource_name
                # assoc.addSource(g, assoc_id, resource_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, G2PAssoc.object_properties, gu.OBJPROP)
        gu.loadProperties(g, G2PAssoc.annotation_properties, gu.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, gu.DATAPROP)

        return
Пример #14
0
class GeneReviews(Source):
    """
    Here we process the GeneReviews mappings to OMIM,
    plus inspect the GeneReviews (html) books to pull the clinical descriptions
    in order to populate the definitions of the terms in the ontology.
    We define the GeneReviews items as classes that are either grouping classes
    over OMIM disease ids (gene ids are filtered out),
    or are made as subclasses of DOID:4 (generic disease).

    Note that GeneReviews
    [copyright policy](http://www.ncbi.nlm.nih.gov/books/NBK138602/)
    (as of 2015.11.20) says:

    GeneReviews® chapters are owned by the University of Washington, Seattle,
    © 1993-2015. Permission is hereby granted to reproduce, distribute,
    and translate copies of content materials provided that
    (i) credit for source (www.ncbi.nlm.nih.gov/books/NBK1116/)
        and copyright (University of Washington, Seattle)
        are included with each copy;
    (ii) a link to the original material is provided whenever the material is
        published elsewhere on the Web; and
    (iii) reproducers, distributors, and/or translators comply with this
        copyright notice and the GeneReviews Usage Disclaimer.

    This script doesn't pull the GeneReviews books from the NCBI Bookshelf
    directly; scripting this task is expressly prohibited by
    [NCBIBookshelf policy](http://www.ncbi.nlm.nih.gov/books/NBK45311/).
    However, assuming you have acquired the books (in html format) via
    permissible means, a parser for those books is provided here to extract
    the clinical descriptions to define the NBK identified classes.

    """

    files = {
        'idmap': {'file': 'NBKid_shortname_OMIM.txt',
                  'url': GRDL + '/NBKid_shortname_OMIM.txt'},
        'titles': {'file': 'GRtitle_shortname_NBKid.txt',
                   'url': GRDL + '/GRtitle_shortname_NBKid.txt'}
        }

    def __init__(self):
        """
        Initialise the 'genereviews' source: register the dataset and its
        citation, create the graph utility, and prepare the (initially
        empty) book bookkeeping structures.

        The disease test ids are read from the project configuration;
        when they are not configured a warning is logged and an empty
        list is used instead.
        """
        Source.__init__(self, 'genereviews')

        self.load_bindings()

        # page carrying the GeneReviews copyright policy
        # (see the class docstring)
        copyright_policy_url = 'http://www.ncbi.nlm.nih.gov/books/NBK138602/'
        self.dataset = Dataset(
            'genereviews', 'Gene Reviews', 'http://genereviews.org/',
            None, copyright_policy_url)
        self.dataset.set_citation('GeneReviews:NBK1116')

        self.gu = GraphUtils(curie_map.get())

        # NBK numbers seen in the titles file, and per-book file/url records
        self.book_ids = set()
        self.all_books = {}

        has_disease_test_ids = (
            'test_ids' in config.get_config() and
            'disease' in config.get_config()['test_ids'])
        if has_disease_test_ids:
            self.test_ids = config.get_config()['test_ids']['disease']
        else:
            logger.warning("not configured with disease test ids.")
            self.test_ids = []

    def fetch(self, is_dl_forced=False):
        """
        Fetch the GeneReviews id-label map and id-omim mapping files
        (the entries of ``files``) by delegating to ``get_files``.

        :param is_dl_forced: handed straight through to ``get_files``
        :return: None
        """
        self.get_files(is_dl_forced)

    def parse(self, limit=None):
        """
        Run the whole GeneReviews parse: titles, OMIM equivalences,
        book records, and finally the locally available html books.

        :param limit: passed on to each of the per-file processing steps
        :return: None
        """
        if self.testOnly:
            self.testMode = True

        # titles go first: they fill self.book_ids, which the later steps read
        for file_step in (self._get_titles, self._get_equivids):
            file_step(limit)

        self.create_books()
        self.process_nbk_html(limit)

        self.load_bindings()

        # no test subset for now; test == full graph
        self.testgraph = self.graph

        logger.info("Found %d nodes", len(self.graph))

    def _get_equivids(self, limit):
        """
        Process the NBK-to-OMIM mapping file, which looks like:
        #NBK_id GR_shortname    OMIM
        NBK1103 trimethylaminuria       136132
        NBK1103 trimethylaminuria       602079
        NBK1104 cdls    122470
        Every row maps one GeneReviews (NBK) id to one OMIM number; a
        single NBK id may appear on several rows (1:many), and some of the
        OMIM numbers are genes rather than diseases.
        We assume the NBKs are generally higher-level grouping classes,
        so the OMIM ids are attached to them via ``addSubclass``, but only
        those OMIM ids that the OMIM lookup reports as phenotype entries.
        Every book id is additionally linked to DOID:4.

        In test mode only rows whose OMIM id is in ``self.test_ids`` are
        kept. Rows with an OMIM number longer than 6 characters are logged
        and skipped.

        :param limit: outside of test mode, stop reading the file once the
            line counter exceeds this value; also passed to the OMIM lookup
        :return: None
        """
        idmap_path = '/'.join((self.rawdir, self.files['idmap']['file']))
        gu = GraphUtils(curie_map.get())

        # we look some stuff up in OMIM, so initialize here
        omim = OMIM()
        nbk_to_omim_nums = {}
        all_omim_nums = set()
        with open(idmap_path, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(reader, 1):
                if line_counter == 1:  # skip header
                    continue
                (nbk_num, shortname, omim_num) = row
                gr_id = 'GeneReviews:'+nbk_num
                omim_id = 'OMIM:'+omim_num

                # in test mode, keep only the rows for our test ids
                if self.testMode and not (
                        len(self.test_ids) > 0 and
                        omim_id in self.test_ids):
                    continue

                # sometimes there's bad omim nums
                if len(omim_num) > 6:
                    logger.warning(
                        "OMIM number incorrectly formatted "
                        "in row %d; skipping:\n%s",
                        line_counter, '\t'.join(row))
                    continue

                # collect the mappings now; they are processed after the loop
                nbk_to_omim_nums.setdefault(nbk_num, set()).add(omim_num)

                # add the class along with the shortname
                gu.addClassToGraph(self.graph, gr_id, None)
                gu.addSynonym(self.graph, gr_id, shortname)

                all_omim_nums.add(omim_num)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        # get the omim ids that are not genes
        phenotype_entries = omim.process_entries(
            list(all_omim_nums), filter_keep_phenotype_entry_ids,
            None, None, limit)

        omim_total = len(all_omim_nums)
        logger.info("Filtered out %d/%d entries that are genes or features",
                    omim_total-len(phenotype_entries), omim_total)

        for nbk_num in self.book_ids:
            gr_id = 'GeneReviews:'+nbk_num
            for omim_num in nbk_to_omim_nums.get(nbk_num, ()):
                omim_id = 'OMIM:'+omim_num
                # add the gene reviews as a superclass to the omim id,
                # but only if the omim id is not a gene
                if omim_id in phenotype_entries:
                    gu.addClassToGraph(self.graph, omim_id, None)
                    gu.addSubclass(self.graph, gr_id, omim_id)
            # add this as a generic subclass of DOID:4
            gu.addSubclass(self.graph, 'DOID:4', gr_id)

    def _get_titles(self, limit):
        """
        Process the GeneReviews titles file (``files['titles']``).

        The file is read as tab-delimited, latin-1 encoded text with one
        header line. Each remaining row is unpacked into three fields:
        (shortname, title, NBK id).

        Every NBK id is recorded in ``self.book_ids``, regardless of
        ``limit``. A class labelled with the title, plus the shortname as
        a synonym, is added to the graph only while the line counter is
        below ``limit`` (or always, when ``limit`` is None).

        :param limit: line count at which to stop adding classes to the
            graph; None means no limit
        :return: None
        """
        titles_path = '/'.join((self.rawdir, self.files['titles']['file']))
        gu = GraphUtils(curie_map.get())
        with open(titles_path, 'r', encoding='latin-1') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for line_counter, row in enumerate(reader, 1):
                if line_counter == 1:  # skip header
                    continue
                (shortname, title, nbk_num) = row

                self.book_ids.add(nbk_num)  # a global set of the book nums

                # past the limit we keep collecting ids, but add no classes
                if limit is not None and line_counter >= limit:
                    continue

                gr_id = 'GeneReviews:'+nbk_num
                gu.addClassToGraph(self.graph, gr_id, title)
                gu.addSynonym(self.graph, gr_id, shortname)

    def create_books(self):
        """
        Fill ``self.all_books`` with one record per id in ``self.book_ids``.

        Each record holds the book's path relative to the raw directory
        ('file') and its NCBI Bookshelf address ('url').

        :return: None
        """
        # note that although we record the url to the book,
        # NCBI Bookshelf does not allow robots to download content
        for nbk in self.book_ids:
            self.all_books[nbk] = {
                'file': '/'.join(('books', nbk+'.html')),
                'url': 'http://www.ncbi.nlm.nih.gov/books/'+nbk,
            }

    def process_nbk_html(self, limit):
        """
        Here we process the gene reviews books to fetch
        the clinical descriptions to include in the ontology.
        We only use books that have been acquired manually,
        as NCBI Bookshelf does not permit automated downloads.
        This parser will only process the books that are found in
        the ```raw/genereviews/books``` directory,
        permitting partial completion.

        For every book found locally this:
        * adds the text of the first paragraph (plus any list items) of the
          summary section as the definition of the NBK class, suffixed
          with the citation/copyright curies;
        * adds an ``is_about`` triple and a journal-article reference for
          each PubMed link found in the "Literature Cited" section;
        * registers the book url with the dataset.

        :param limit: stop after a processed book once the number of books
            visited (found or not) exceeds this value; None means no limit
        :return: None
        """
        visited = 0
        books_not_found = set()

        # patterns are loop-invariant, so build them once
        summary_id = re.compile(".*Summary.sec0")
        literature_id = re.compile(r".*Literature_Cited")
        pubmed_href = re.compile(r"pubmed")

        # list the local book directory once instead of once per book
        book_dir = '/'.join((self.rawdir, 'books'))
        local_files = set(os.listdir(book_dir)) if self.book_ids else set()

        for nbk in self.book_ids:
            visited += 1
            nbk_id = 'GeneReviews:'+nbk
            book_item = self.all_books.get(nbk)
            book_path = '/'.join((self.rawdir, book_item['file']))

            # figure out if the book is there; if so, process, otherwise skip
            if ''.join((nbk, '.html')) not in local_files:
                books_not_found.add(nbk)
                continue
            logger.info("Processing %s", nbk)

            # close the file as soon as it has been parsed
            with open(book_path) as page:
                soup = BeautifulSoup(page.read())

            # sec0 == clinical description
            clin_summary = soup.find('div', id=summary_id)
            if clin_summary is not None:
                paragraph = clin_summary.find('p')
                # a summary without a <p> contributes no paragraph text
                # rather than aborting the whole run
                ptext = '' if paragraph is None else paragraph.text
                ptext = re.sub(r'\s+', ' ', ptext)

                item_list = clin_summary.find('ul')
                if item_list is not None:
                    ptext += ' '.join(
                        re.sub(r'\s+', ' ', li.text)
                        for li in item_list.find_all('li'))

                # add in the copyright and citation info to description
                ptext = \
                    ' '.join(
                        (ptext,
                         '[GeneReviews:NBK1116, GeneReviews:NBK138602, ' +
                         nbk_id+']'))

                self.gu.addDefinition(self.graph, nbk_id, ptext.strip())

            # get the pubs
            pub_div = soup.find('div', id=literature_id)
            if pub_div is not None:
                ref_divs = pub_div.find_all('div', attrs={'class': "bk_ref"})
                for ref_div in ref_divs:
                    for a in ref_div.find_all(
                            'a', attrs={'href': pubmed_href}):
                        if re.match(r'PubMed:', a.text):
                            pmnum = re.sub(r'PubMed:\s*', '', a.text)
                        else:
                            # the href mentions pubmed but may not end in
                            # a numeric id; skip such links
                            href_match = \
                                re.search(r'\/pubmed\/(\d+)$', a['href'])
                            pmnum = None if href_match is None \
                                else href_match.group(1)
                        if pmnum is None:
                            continue
                        pmid = 'PMID:'+str(pmnum)
                        self.gu.addTriple(
                            self.graph, pmid,
                            self.gu.object_properties['is_about'],
                            nbk_id)
                        reference = Reference(
                            pmid, Reference.ref_types['journal_article'])
                        reference.addRefToGraph(self.graph)

            # TODO add author history, copyright, license to dataset

            # TODO get PMID-NBKID equivalence (near foot of page),
            # and make it "is about" link
            # self.gu.addTriple(
            #   self.graph, pmid,
            #   self.gu.object_properties['is_about'], nbk_id)
            # for example: NBK1191 PMID:20301370

            # add the book to the dataset
            self.dataset.setFileAccessUrl(book_item['url'])

            if limit is not None and visited > limit:
                break

            # finish looping through books

        missing = len(books_not_found)
        if missing > 0:
            if missing > 100:
                logger.warning("There were %d books not found.", missing)
            else:
                logger.warning(
                    "The following %d books were not found locally: %s",
                    missing, str(books_not_found))
        logger.info(
            "Finished processing %d books for clinical descriptions",
            visited-missing)

    def getTestSuite(self):
        """
        Load the GeneReviews test case into a unittest suite.

        :return: the suite built from ``GeneReviewsTestCase``
        """
        import unittest
        from tests.test_genereviews import GeneReviewsTestCase

        loader = unittest.TestLoader()
        return loader.loadTestsFromTestCase(GeneReviewsTestCase)
Пример #15
0
class OMIA(Source):
    """
    This is the parser for the
    [Online Mendelian Inheritance in Animals
    (OMIA)](http://omia.angis.org.au),
    from which we process inherited disorders, other (single-locus) traits,
    and genes in >200 animal species (other than human and mouse and rats).

    We generate the omia graph to include the following information:
    * genes
    * animal taxonomy, and breeds as instances of those taxa
        (breeds are akin to "strains" in other taxa)
    * animal diseases, along with species-specific subtypes of those diseases
    * publications (and their mapping to PMIDs, if available)
    * gene-to-phenotype associations (via an anonymous variant-locus)
    * breed-to-phenotype associations

    We make links between OMIA and OMIM in two ways:
    1.  mappings between OMIA and OMIM are created as OMIA --> hasdbXref OMIM
    2.  mappings between a breed and OMIA disease are created
        to be a model for the mapped OMIM disease,
        IF AND ONLY IF it is a 1:1 mapping.
        there are some 1:many mappings,
        and these often happen if the OMIM item is a gene.

    Because many of these species are not covered in
    the PANTHER orthology datafiles, we also pull any orthology
    relationships from the gene_group files from NCBI.

    """

    files = {
        'data': {
            'file': 'omia.xml.gz',
            'url': 'http://omia.angis.org.au/dumps/omia.xml.gz'},
    }

    def __init__(self):
        """
        Initialise the 'omia' source: register the dataset and create the
        lookup tables that are filled in while the XML dump is parsed.
        """
        Source.__init__(self, 'omia')

        self.load_bindings()

        self.dataset = Dataset(
            'omia', 'Online Mendelian Inheritance in Animals',
            'http://omia.angis.org.au', None, None,
            'http://sydney.edu.au/disclaimer.shtml')

        # one id lookup table per kind of record
        self.id_hash = {
            kind: {}
            for kind in ('article', 'phene', 'breed', 'taxon', 'gene')}
        self.label_hash = {}
        self.gu = GraphUtils(curie_map.get())
        # used to store the omia to omim phene mappings
        self.omia_omim_map = {}
        # used to store the unique genes that have phenes
        # (for fetching orthology)
        self.annotated_genes = set()

        test_diseases = [
            'OMIA:001702', 'OMIA:001867', 'OMIA:000478', 'OMIA:000201',
            'OMIA:000810', 'OMIA:001400']
        test_genes = [
            492297, 434, 492296, 3430235, 200685834, 394659996, 200685845,
            28713538, 291822383]
        test_taxa = [9691, 9685, 9606, 9615, 9913, 93934, 37029, 9627, 9825]
        self.test_ids = {
            'disease': test_diseases,
            'gene': test_genes,
            'taxon': test_taxa,
            # to be filled in during parsing of breed table
            # for lookup by breed-associations
            'breed': []
        }
        # to store a map of omia ids and any molecular info
        # to write a report for curation
        self.stored_omia_mol_gen = {}
        self.g = self.graph
        self.geno = Genotype(self.g)

    def fetch(self, is_dl_forced=False):
        """
        Fetch the OMIA files listed in ``files``, then also download the
        NCBI 'gene_group' file into the NCBIGene raw directory.

        :param is_dl_forced: handed through to ``get_files``; the
            gene_group download is always requested with ``False``
        :return: None
        """
        self.get_files(is_dl_forced)

        # only the gene_group file is needed, so we do not run ncbi.fetch()
        ncbi = NCBIGene()
        gene_group = ncbi.files['gene_group']
        remote_url = gene_group['url']
        local_path = '/'.join((ncbi.rawdir, gene_group['file']))
        self.fetch_from_url(remote_url, local_path, False)

    def parse(self, limit=None):
        """
        Scrub the raw XML dump, then parse it in three passes (species,
        static classes, associations), add orthologs for the annotated
        genes, and write the molecular-genetics report.

        :param limit: passed on to each of the three parsing passes
        :return: None
        """
        # names of tables to iterate - probably don't need all these:
        # Article_Breed, Article_Keyword, Article_Gene, Article_Keyword,
        # Article_People, Article_Phene, Articles, Breed, Breed_Phene,
        # Genes_gb, Group_Categories, Group_MPO, Inherit_Type, Keywords,
        # Landmark, Lida_Links, OMIA_Group, OMIA_author, Omim_Xref, People,
        # Phene, Phene_Gene, Publishers, Resources, Species_gb, Synonyms

        self.scrub()

        if limit is not None:
            logger.info("Only parsing first %d rows", limit)

        logger.info("Parsing files...")

        if self.testOnly:
            self.testMode = True

        # everything below writes into the graph selected here
        self.g = self.testgraph if self.testMode else self.graph
        self.geno = Genotype(self.g)

        # we do three passes through the file:
        # species first (the two others reference it), then the breeds,
        # genes, articles and other static stuff, then the association data
        passes = (
            self.process_species,
            self.process_classes,
            self.process_associations,
        )
        for run_pass in passes:
            run_pass(limit)

        # process the vertebrate orthology for genes
        # that are annotated with phenotypes
        ncbi = NCBIGene()
        ncbi.add_orthologs_by_gene_group(self.g, self.annotated_genes)

        self.load_core_bindings()
        self.load_bindings()

        logger.info("Done parsing.")

        self.write_molgen_report()

    def scrub(self):
        """
        The XML file seems to have mixed-encoding;
        we scrub out the control characters
        from the file for processing.

        The scrubbed text is written, utf-8 encoded, to a temporary gzip
        file next to the original, which then replaces the original.
        Both files are opened in one ``with`` statement so the handles are
        closed even if reading or scrubbing fails, and so no temporary
        file is created when the source file cannot be opened.

        :return: None
        """

        logger.info(
            "Scrubbing out the nasty characters that break our parser.")

        myfile = '/'.join((self.rawdir, self.files['data']['file']))
        tmpfile = '/'.join((self.rawdir, self.files['data']['file']+'.tmp.gz'))
        du = DipperUtil()
        # open the source first: if it is missing we fail before
        # touching the temporary file
        with gzip.open(myfile, 'rb') as source, \
                gzip.open(tmpfile, 'wb') as scrubbed:
            # NOTE(review): no encoding is given, so decoding uses the
            # platform default - confirm this is intended
            filereader = io.TextIOWrapper(source, newline="")
            for line in filereader:
                # re-append the newline after scrubbing
                line = du.remove_control_characters(line) + '\n'
                scrubbed.write(line.encode('utf-8'))

        # move the temp file (both handles are closed by now)
        logger.info("Replacing the original data with the scrubbed file.")
        shutil.move(tmpfile, myfile)
        return

    # ###################### XML LOOPING FUNCTIONS ##################

    def process_species(self, limit):
        """
        Loop through the xml file and process the species.
        We add elements to the graph, and store the
        id-to-label in the label_hash dict.

        The gzip handle is managed by a ``with`` block so that it is
        closed even when parsing raises.

        :param limit: passed on to ``process_xml_table``
        :return: None
        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _, elem in ET.iterparse(filereader):
                # Species ids are == genbank species ids!
                self.process_xml_table(
                    elem, 'Species_gb', self._process_species_table_row,
                    limit)

        return

    def process_classes(self, limit):
        """
        Loop through the xml file and process the articles,
        breed, genes, phenes, and phenotype-grouping classes.
        We add elements to the graph,
        and store the id-to-label in the label_hash dict,
        along with the internal key-to-external id in the id_hash dict.
        The latter are referenced in the association processing functions.

        The gzip handle is managed by a ``with`` block so that it is
        closed even when parsing raises.

        :param limit: passed on to ``process_xml_table``
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # table name -> row handler, in the order they are tried per element
        table_handlers = (
            ('Articles', self._process_article_row),
            ('Breed', self._process_breed_row),
            ('Genes_gb', self._process_gene_row),
            ('OMIA_Group', self._process_omia_group_row),
            ('Phene', self._process_phene_row),
            ('Omim_Xref', self._process_omia_omim_map),
        )

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            parser = ET.XMLParser(encoding='utf-8')

            for _, elem in ET.iterparse(filereader, parser=parser):
                for table_name, row_handler in table_handlers:
                    self.process_xml_table(
                        elem, table_name, row_handler, limit)

        # post-process the omia-omim associations to filter out the genes
        # (keep only phenotypes/diseases)
        self.clean_up_omim_genes()

        return

    def process_associations(self, limit):
        """
        Loop through the xml file and process the article-breed, article-phene,
        breed-phene, phene-gene associations, and the external links to LIDA.

        The gzip handle is managed by a ``with`` block so that it is
        closed even when parsing raises.

        :param limit: passed on to ``process_xml_table``
        :return: None

        """

        myfile = '/'.join((self.rawdir, self.files['data']['file']))

        # table name -> row handler, in the order they are tried per element
        table_handlers = (
            ('Article_Breed', self._process_article_breed_row),
            ('Article_Phene', self._process_article_phene_row),
            ('Breed_Phene', self._process_breed_phene_row),
            ('Lida_Links', self._process_lida_links_row),
            ('Phene_Gene', self._process_phene_gene_row),
            ('Group_MPO', self._process_group_mpo_row),
        )

        with gzip.open(myfile, 'rb') as f:
            filereader = io.TextIOWrapper(f, newline="")

            filereader.readline()  # remove the xml declaration line

            for _, elem in ET.iterparse(filereader):
                for table_name, row_handler in table_handlers:
                    self.process_xml_table(
                        elem, table_name, row_handler, limit)

        return

    # ############ INDIVIDUAL TABLE-LEVEL PROCESSING FUNCTIONS ################

    def _process_species_table_row(self, row):
        # gb_species_id, sci_name, com_name, added_by, date_modified
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        sci_name = row['sci_name']
        com_name = row['com_name']

        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        self.gu.addClassToGraph(self.g, tax_id, sci_name)
        if com_name != '':
            self.gu.addSynonym(self.g, tax_id, com_name)
            self.label_hash[tax_id] = com_name  # for lookup later
        else:
            self.label_hash[tax_id] = sci_name

        return

    def _process_breed_row(self, row):

        # in test mode, keep all breeds of our test species
        if self.testMode and \
                (int(row['gb_species_id']) not in self.test_ids['taxon']):
            return

        # save the breed keys in the test_ids for later processing
        self.test_ids['breed'] += [int(row['breed_id'])]

        breed_id = self.make_breed_id(row['breed_id'])

        self.id_hash['breed'][row['breed_id']] = breed_id
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        breed_label = row['breed_name']
        species_label = self.label_hash.get(tax_id)
        if species_label is not None:
            breed_label = breed_label + ' ('+species_label+')'

        self.gu.addIndividualToGraph(self.g, breed_id, breed_label, tax_id)
        self.label_hash[breed_id] = breed_label

        return

    def _process_phene_row(self, row):

        phenotype_id = None
        sp_phene_label = row['phene_name']
        if sp_phene_label == '':
            sp_phene_label = None
        if 'omia_id' not in row:
            logger.info("omia_id not present for %s", row['phene_id'])
            omia_id = self._make_internal_id('phene', phenotype_id)
        else:
            omia_id = 'OMIA:'+str(row['omia_id'])

        if self.testMode and not\
                (int(row['gb_species_id']) in self.test_ids['taxon'] and
                 omia_id in self.test_ids['disease']):
            return
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = omia_id

        descr = row['summary']
        if descr == '':
            descr = None

        # omia label
        omia_label = self.label_hash.get(omia_id)

        # add the species-specific subclass (TODO please review this choice)
        gb_species_id = row['gb_species_id']

        if gb_species_id != '':
            sp_phene_id = '-'.join((omia_id, gb_species_id))
        else:
            logger.error(
                "No species supplied in species-specific phene table for %s",
                omia_id)
            return

        species_id = 'NCBITaxon:'+str(gb_species_id)
        # use this instead
        species_label = self.label_hash.get('NCBITaxon:'+gb_species_id)
        if sp_phene_label is None and \
                omia_label is not None and species_label is not None:
            sp_phene_label = ' '.join((omia_label, 'in', species_label))
        self.gu.addClassToGraph(
            self.g, sp_phene_id, sp_phene_label, omia_id, descr)
        # add to internal hash store for later lookup
        self.id_hash['phene'][row['phene_id']] = sp_phene_id
        self.label_hash[sp_phene_id] = sp_phene_label
        # add each of the following descriptions,
        # if they are populated, with a tag at the end.
        for item in [
                'clin_feat', 'history', 'pathology', 'mol_gen', 'control']:
            if row[item] is not None and row[item] != '':
                self.gu.addDescription(
                    self.g, sp_phene_id, row[item] + ' ['+item+']')
        # if row['symbol'] is not None:  # species-specific
        # CHECK ME - sometimes spaces or gene labels
        #     gu.addSynonym(g, sp_phene, row['symbol'])

        self.gu.addOWLPropertyClassRestriction(
            self.g, sp_phene_id, self.gu.object_properties['in_taxon'],
            species_id)

        # add inheritance as an association
        inheritance_id = self._map_inheritance_term_id(row['inherit'])
        if inheritance_id is not None:
            assoc = DispositionAssoc(self.name, sp_phene_id, inheritance_id)
            assoc.add_association_to_graph(self.g)

        if row['characterised'] == 'Yes':
            self.stored_omia_mol_gen[omia_id] = {
                'mol_gen': row['mol_gen'],
                'map_info': row['map_info'],
                'species': row['gb_species_id']}

        return

    def write_molgen_report(self):
        """
        Dump ``self.stored_omia_mol_gen`` as a tab-delimited report.

        The file ``omia_molgen_report.txt`` is written into ``self.outdir``
        with a header row followed by one row per stored OMIA id holding its
        molecular description, mapping info and species.

        :return: None
        """
        import csv
        logger.info("Writing G2P report for OMIA")
        report_path = '/'.join((self.outdir, 'omia_molgen_report.txt'))
        detail_keys = ('mol_gen', 'map_info', 'species')

        with open(report_path, 'w', newline='\n') as handle:
            tsv = csv.writer(handle, delimiter='\t')
            # write header
            tsv.writerow(
                ['omia_id', 'molecular_description', 'mapping_info',
                 'species'])
            for omia_id, details in self.stored_omia_mol_gen.items():
                tsv.writerow(
                    [str(omia_id)] + [details[k] for k in detail_keys])

        logger.info(
            "Wrote %d potential G2P descriptions for curation to %s",
            len(self.stored_omia_mol_gen), report_path)

        return

    def _process_article_row(self, row):

        # don't bother in test mode
        if self.testMode:
            return

        iarticle_id = self._make_internal_id('article', row['article_id'])
        self.id_hash['article'][row['article_id']] = iarticle_id
        rtype = None
        if row['journal'] != '':
            rtype = Reference.ref_types['journal_article']
        r = Reference(iarticle_id, rtype)

        if row['title'] is not None:
            r.setTitle(row['title'].strip())
        if row['year'] is not None:
            r.setYear(row['year'])
        r.addRefToGraph(self.g)

        if row['pubmed_id'] is not None:
            pmid = 'PMID:'+str(row['pubmed_id'])
            self.id_hash['article'][row['article_id']] = pmid
            self.gu.addSameIndividual(self.g, iarticle_id, pmid)
            self.gu.addComment(self.g, pmid, iarticle_id)

        return

    def _process_omia_group_row(self, row):
        omia_id = 'OMIA:'+row['omia_id']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        group_name = row['group_name']
        group_summary = row['group_summary']

        disease_id = None
        group_category = row.get('group_category')
        disease_id = \
            self.map_omia_group_category_to_ontology_id(group_category)
        if disease_id is not None:
            self.gu.addClassToGraph(self.g, disease_id, None)
            if disease_id == 'MP:0008762':  # embryonic lethal
                # add this as a phenotype association
                # add embryonic onset
                assoc = D2PAssoc(self.name, omia_id, disease_id)
                assoc.add_association_to_graph(self.g)
                disease_id = None
        else:
            logger.info(
                "No disease superclass defined for %s:  %s",
                omia_id, group_name)
            # default to general disease  FIXME this may not be desired
            disease_id = 'DOID:4'

        if group_summary == '':
            group_summary = None
        if group_name == '':
            group_name = None

        self.gu.addClassToGraph(
            self.g, omia_id, group_name, disease_id, group_summary)

        self.label_hash[omia_id] = group_name

        return

    def _process_gene_row(self, row):
        if self.testMode and row['gene_id'] not in self.test_ids['gene']:
            return
        gene_id = 'NCBIGene:'+str(row['gene_id'])
        self.id_hash['gene'][row['gene_id']] = gene_id
        gene_label = row['symbol']
        self.label_hash[gene_id] = gene_label
        tax_id = 'NCBITaxon:'+str(row['gb_species_id'])
        gene_type_id = NCBIGene.map_type_of_gene(row['gene_type'])
        self.gu.addClassToGraph(self.g, gene_id, gene_label, gene_type_id)
        self.geno.addTaxon(tax_id, gene_id)

        return

    def _process_article_breed_row(self, row):
        # article_id, breed_id, added_by
        # don't bother putting these into the test... too many!

        # and int(row['breed_id']) not in self.test_ids['breed']:
        if self.testMode:
            return

        article_id = self.id_hash['article'].get(row['article_id'])
        breed_id = self.id_hash['breed'].get(row['breed_id'])

        # there's some missing data (article=6038).  in that case skip
        if article_id is not None:
            self.gu.addTriple(
                self.g, article_id, self.gu.object_properties['is_about'],
                breed_id)
        else:
            logger.warning("Missing article key %s", str(row['article_id']))

        return

    def _process_article_phene_row(self, row):
        """
        Linking articles to species-specific phenes.

        :param row:
        :return:
        """
        # article_id, phene_id, added_by
        # look up the article in the hashmap
        phenotype_id = self.id_hash['phene'].get(row['phene_id'])
        article_id = self.id_hash['article'].get(row['article_id'])

        omia_id = self._get_omia_id_from_phene_id(phenotype_id)
        if self.testMode and omia_id not in self.test_ids['disease'] \
                or phenotype_id is None or article_id is None:
            return

        # make a triple, where the article is about the phenotype
        self.gu.addTriple(
            self.g, article_id,
            self.gu.object_properties['is_about'], phenotype_id)

        return

    def _process_breed_phene_row(self, row):
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']) or
                breed_id is None or phene_id is None):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.name, breed_id, phene_id,
            self.gu.object_properties['has_phenotype'])
        assoc.add_association_to_graph(self.g)

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"   # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.name, breed_id, i,
                    self.gu.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph(self.g)
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                m = re.search(r'\((.*)\)', breed_label)
                if m:
                    sp_label = m.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in '+sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                self.gu.addDescription(self.g, aid, desc)
        return

    def _process_lida_links_row(self, row):
        # lidaurl, omia_id, added_by
        omia_id = 'OMIA:'+row['omia_id']
        lidaurl = row['lidaurl']

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, lidaurl, True)

        return

    def _process_phene_gene_row(self, row):

        gene_id = self.id_hash['gene'].get(row['gene_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if self.testMode and not (
                omia_id in self.test_ids['disease'] and
                row['gene_id'] in self.test_ids['gene']) or\
                gene_id is None or phene_id is None:
            return

        # occasionally some phenes are missing!  (ex: 406)
        if phene_id is None:
            logger.warning("Phene id %s is missing", str(row['phene_id']))
            return

        gene_label = self.label_hash[gene_id]
        # some variant of gene_id has phenotype d
        vl = '_'+re.sub(r'NCBIGene:', '', str(gene_id)) + 'VL'
        if self.nobnodes:
            vl = ':'+vl
        self.geno.addAllele(vl, 'some variant of ' + gene_label)
        self.geno.addAlleleOfGene(vl, gene_id)
        assoc = G2PAssoc(self.name, vl, phene_id)
        assoc.add_association_to_graph(self.g)

        # add the gene id to the set of annotated genes
        # for later lookup by orthology
        self.annotated_genes.add(gene_id)

        return

    def _process_omia_omim_map(self, row):
        """
        Links OMIA groups to OMIM equivalents.
        :param row:
        :return:
        """
        # omia_id, omim_id, added_by

        omia_id = 'OMIA:'+row['omia_id']
        omim_id = 'OMIM:'+row['omim_id']

        # also store this for use when we say that a given animal is
        # a model of a disease
        if omia_id not in self.omia_omim_map:
            self.omia_omim_map[omia_id] = set()
        self.omia_omim_map[omia_id].add(omim_id)

        if self.testMode and omia_id not in self.test_ids['disease']:
            return

        self.gu.addXref(self.g, omia_id, omim_id)

        return

    def map_omia_group_category_to_ontology_id(self, category_num):
        """
        Using the category number in the OMIA_groups table,
        map them to a disease id.
        This may be superseded by other MONDO methods.

        Platelet disorders will be more specific once
        https://github.com/obophenotype/human-disease-ontology/issues/46
        is fulfilled.

        :param category_num: category number (anything ``int()`` accepts),
            or None
        :return: the mapped ontology id, or None when the category is
            missing, unknown, or deliberately mapped to nothing

        """

        category_map = {
            1: 'DOID:0014667',      # Inborn error of metabolism
            2: 'MESH:D004392',      # Dwarfism
            3: 'DOID:1682',         # congenital heart disease
            4: 'DOID:74',           # blood system disease
            5: 'DOID:3211',         # lysosomal storage disease
            6: 'DOID:16',           # integumentary system disease
            # --> retinal degeneration ==> OMIA:000830
            7: 'DOID:8466',         # progressive retinal atrophy
            8: 'DOID:0050572',      # Cone-rod dystrophy
            9: 'MESH:C536122',      # stationary night blindness
            10: 'Orphanet:98553',   # developmental retinal disorder
            11: 'DOID:5679',        # retinal disorder
            12: 'Orphanet:90771',   # Disorder of Sex Development
            #  - what to do about this one?
            13: 'MP:0008762',       # embryonic lethal
            # - not sure what to do with this
            14: None,               # blood group
            # FIXME make me more specific
            15: 'DOID:2218',        # intrinsic platelet disorder
            # FIXME make me more specific
            16: 'DOID:2218',        # extrinsic platelet disorder
            17: None  # transgenic ???
        }

        if category_num is None or int(category_num) not in category_map:
            logger.info(
                "There's a group category I don't know anything about: %s",
                str(category_num))
            return None

        disease_id = category_map[int(category_num)]
        logger.info(
            "Found %s for category %s", str(disease_id), str(category_num))

        return disease_id

    def _process_group_mpo_row(self, row):
        """
        Make OMIA to MP associations
        :param row:
        :return:
        """
        omia_id = 'OMIA:'+row['omia_id']
        mpo_num = int(row['MPO_no'])
        mpo_id = 'MP:'+str(mpo_num).zfill(7)

        assoc = D2PAssoc(self.name, omia_id, mpo_id)
        assoc.add_association_to_graph(self.g)

        return

    def clean_up_omim_genes(self):
        """
        Drop from ``self.omia_omim_map`` every OMIM id that is not kept by
        the OMIM phenotype filter.

        All mapped OMIM ids are gathered and handed to
        ``OMIM.process_entries`` together with
        ``filter_keep_phenotype_entry_ids``; each OMIA entry's id set is
        then replaced by the subset found in the result.

        :return: None
        """
        omim = OMIM()
        # get all the omim ids
        allomimids = set()
        for mapped_ids in self.omia_omim_map.values():
            allomimids.update(mapped_ids)

        entries_that_are_phenotypes = omim.process_entries(
            list(allomimids), filter_keep_phenotype_entry_ids, None, None)
        logger.info(
            "Filtered out %d/%d entries that are genes or features",
            len(allomimids)-len(entries_that_are_phenotypes), len(allomimids))

        # now iterate again and remove those non-phenotype ids
        removed_count = 0
        for omia in self.omia_omim_map:
            cleanids = set()
            for candidate in self.omia_omim_map[omia]:
                if candidate not in entries_that_are_phenotypes:
                    # keep track of how many we've removed
                    removed_count += 1
                    continue
                cleanids.add(candidate)
            self.omia_omim_map[omia] = cleanids

        logger.info(
            "Removed %d omim ids from the omia-to-omim map", removed_count)

        return

    def _make_internal_id(self, prefix, key):

        iid = '_'+''.join(('omia', prefix, 'key', str(key)))
        if self.nobnodes:
            iid = ':'+iid

        return iid

    def make_breed_id(self, key):
        """
        Build the breed curie for a breed key.

        :param key: breed key; converted with ``str()``
        :return: the string ``OMIA-breed:<key>``
        """
        return ''.join(('OMIA-breed:', str(key)))

    @staticmethod
    def _get_omia_id_from_phene_id(phene_id):
        omia_id = None
        if phene_id is not None:
            m = re.match(r'OMIA:\d+', str(phene_id))
            if m:
                omia_id = m.group(0)

        return omia_id

    @staticmethod
    def _map_inheritance_term_id(inheritance_symbol):

        inherit_map = {
            'A':  None,  # Autosomal
            'ACD': 'GENO:0000143',  # Autosomal co-dominant
            'ADV': None,  # autosomal dominant with variable expressivity
            'AID': 'GENO:0000259',  # autosomal incompletely dominant
            'ASD': 'GENO:0000145',  # autosomal semi-dominant
            # autosomal recessive, semi-lethal
            # using generic autosomal recessive
            'ASL': 'GENO:0000150',
            'D': 'GENO:0000147',  # autosomal dominant
            'M': None,  # multifactorial
            'MAT': None,  # Maternal
            # probably autosomal recessive
            # using generic autosomal recessive
            'PR':  'GENO:0000150',
            'R': 'GENO:0000150',  # Autosomal Recessive
            # Recessive Embryonic Lethal
            # using plain recessive
            'REL': 'GENO:0000148',
            # Autosomal Recessive Lethal
            # using plain autosomal recessive
            'RL': 'GENO:0000150',
            'S': 'GENO:0000146',  # Sex-linked   <--using allosomal dominant
            'SLi': None,  # Sex-limited
            'UD': 'GENO:0000144',  # Dominant
            'X': None,  # x-linked    # HP:0001417 ?
            # X-linked Dominant     <-- temp using allosomal dominant  FIXME
            'XLD': 'GENO:0000146',
            # X-linked Recessive    <-- temp using allosomal recessive  FIXME
            'XLR': 'GENO:0000149',
            'Y': None,  # Y-linked
            'Z': None,  # Z-linked
            # Z-linked recessive    <-- temp using allosomal recessive  FIXME
            'ZR': 'GENO:0000149',
            '999': None,  # Z-linked incompletely dominant
        }

        inheritance_id = inherit_map.get(inheritance_symbol)
        if inheritance_id is None and inheritance_symbol is not None:
            logger.warning(
                "No inheritance id is mapped for %s", inheritance_symbol)

        return inheritance_id

    def getTestSuite(self):
        """
        Load the unit tests of ``tests.test_omia.OMIATestCase``.

        :return: the test suite built by ``unittest.TestLoader``
        """
        import unittest
        from tests.test_omia import OMIATestCase

        loader = unittest.TestLoader()

        return loader.loadTestsFromTestCase(OMIATestCase)
Пример #16
0
    def _get_gene2pubmed(self, limit):
        """
        Loops through the gene2pubmed file and adds a simple triple to say
        that a given publication is_about a gene.
        Publications are added as NamedIndividuals.

        These are filtered on the taxon; in test mode they are filtered on
        the gene ids instead, and ``limit`` is ignored.

        Blank lines, comment lines and rows whose gene or publication id is
        the placeholder '-' are skipped.

        :param limit: stop once more than this many rows have been
            processed; None means no limit
        :return: None

        """

        gu = GraphUtils(curie_map.get())
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene2pubmed']['file']))
        logger.info("FILE: %s", myfile)
        assoc_counter = 0
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                line = line.decode().strip()
                # skip blank lines and comments
                if not line or line.startswith('#'):
                    continue
                (tax_num, gene_num, pubmed_num) = line.split('\t')

                # placeholders carry nothing to assert; test for them
                # before the int() conversions below, which would raise
                # ValueError on '-'
                if gene_num == '-' or pubmed_num == '-':
                    continue

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if not self.testMode and int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1
                gene_id = ':'.join(('NCBIGene', gene_num))
                pubmed_id = ':'.join(('PMID', pubmed_num))

                if self.class_or_indiv.get(gene_id) == 'C':
                    gu.addClassToGraph(g, gene_id, None)
                else:
                    gu.addIndividualToGraph(g, gene_id, None)
                # add the publication as a NamedIndividual
                # add type publication
                gu.addIndividualToGraph(g, pubmed_id, None, None)
                r = Reference(
                    pubmed_id, Reference.ref_types['journal_article'])
                r.addRefToGraph(g)
                gu.addTriple(
                    g, pubmed_id, gu.object_properties['is_about'], gene_id)
                assoc_counter += 1
                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        logger.info(
            "Processed %d pub-gene associations", assoc_counter)

        return
Пример #17
0
class Feature():
    """
    Dealing with genomic features here.  By default they are all faldo:Regions.
    We use SO for typing genomic features. At the moment,
    RO:has_subsequence is the default relationship
    between the regions, but this should be tested/verified.

    TODO:
        the graph additions are in the addXToFeature functions,
        but should be separated.
    TODO:
        this will need to be extended to properly deal with
        fuzzy positions in faldo.
    """

    object_properties = {
        'location': 'faldo:location',
        'begin': 'faldo:begin',
        'end': 'faldo:end',
        'reference': 'faldo:reference',
        'gene_product_of': 'RO:0002204',
        'has_gene_product': 'RO:0002205',
        'is_about': 'IAO:00000136',
        'has_subsequence': 'RO:0002524',
        'is_subsequence_of': 'RO:0002525',
        'has_staining_intensity': 'GENO:0000207',
        # was GENO:0000626 (staining_intensity),
        # but changing to has_sequence_attribute
        'upstream_of_sequence_of': 'RO:0002528',
        'downstream_of_sequence_of': 'RO:0002529'

    }

    data_properties = {
        'position': 'faldo:position',
    }

    annotation_properties = {}

    properties = object_properties.copy()
    properties.update(data_properties)
    properties.update(annotation_properties)

    types = {
        'region': 'faldo:Region',
        'Position': 'faldo:Position',
        # big P for Position type.  little p for position property
        'FuzzyPosition': 'faldo:FuzzyPosition',
        'chromosome': 'SO:0000340',
        'chromosome_arm': 'SO:0000105',
        'chromosome_band': 'SO:0000341',
        'chromosome_part': 'SO:0000830',
        'long_chromosome_arm': 'GENO:0000629',
        'short_chromosome_arm': 'GENO:0000628',
        'chromosome_region': 'GENO:0000614',
        'chromosome_subband': 'GENO:0000616',
        'centromere': 'SO:0000577',
        'plus_strand': 'faldo:PlusStrandPosition',
        'minus_strand': 'faldo:MinusStrandPosition',
        'both_strand': 'faldo:BothStrandPosition',
        'score': 'SO:0001685',
        # FIXME - score is not a good solution, too generic
        'reference_genome': 'SO:0001505',
        'genome': 'SO:0001026',
        'assembly_component': 'SO:0000143',
        'SNP': 'SO:0000694',

        # the following are sequence attributes:
        'band_intensity':  'GENO:0000618',
        'gneg': 'GENO:0000620',
        'gpos': 'GENO:0000619',
        'gpos100': 'GENO:0000622',
        'gpos75': 'GENO:0000623',
        'gpos50': 'GENO:0000624',
        'gpos25': 'GENO:0000625',
        'gvar': 'GENO:0000621',
        'gpos33': 'GENO:0000633',
        'gpos66': 'GENO:0000632'
    }

    def __init__(self, id, label, type, description=None):
        """
        Store the identity of the feature; no location is set yet.

        :param id: identifier of the feature
        :param label: label of the feature
        :param type: type of the feature
        :param description: optional description of the feature
        """
        self.id, self.label, self.type = id, label, type
        self.description = description
        self.gu = GraphUtils(curie_map.get())
        # start/stop are filled in by the addFeature*Location methods
        self.start = self.stop = None
        self.nobnodes = True  # TODO remove this before official release

    def addFeatureStartLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds coordinate details for the start of this feature.

        The location built by ``_getLocation`` is stored in ``self.start``.

        :param coordinate: position of the start
        :param reference_id: id of the reference the coordinate is on
        :param strand: strand symbol, passed on to ``_getLocation``
        :param position_types: optional list of additional position types
        :return: None
        """

        # an object for the start, which has:
        # {coordinate : integer, reference : reference_id, types = []}
        start_location = self._getLocation(
            coordinate, reference_id, strand, position_types)
        self.start = start_location

        return

    def addFeatureEndLocation(
            self, coordinate, reference_id, strand=None,
            position_types=None):
        """
        Adds the coordinate details for the end of this feature

        The location built by ``_getLocation`` is stored in ``self.stop``.

        :param coordinate: position of the end
        :param reference_id: id of the reference the coordinate is on
        :param strand: strand symbol, passed on to ``_getLocation``
        :param position_types: optional list of additional position types
        :return: None
        """

        end_location = self._getLocation(
            coordinate, reference_id, strand, position_types)
        self.stop = end_location

        return

    def _getLocation(self, coordinate, reference_id, strand, position_types):
        """
        Make an object for the location, which has:
        {coordinate : integer, reference : reference_id, types = []}
        where the strand is indicated in the type array
        :param coordinate:
        :param reference_id:
        :param strand:
        :param position_types:
        :return:
        """

        loc = {}
        loc['coordinate'] = coordinate
        loc['reference'] = reference_id
        loc['type'] = []
        strand_id = self._getStrandType(strand)
        if strand_id is not None:
            loc['type'].append(strand_id)
        if position_types is not None:
            loc['type'] += position_types
        if position_types == []:
            loc['type'].append(self.types['Position'])

        return loc

    def _getStrandType(self, strand):
        """

        :param strand:
        :return:
        """

        # TODO make this a dictionary/enum:  PLUS, MINUS, BOTH, UNKNOWN
        strand_id = None
        if strand == '+':
            strand_id = self.types['plus_strand']
        elif strand == '-':
            strand_id = self.types['minus_strand']
        elif strand == '.':
            strand_id = self.types['both_strand']
        elif strand is None:  # assume this is Unknown
            pass
        else:
            logger.warning("strand type could not be mapped: %s", str(strand))

        return strand_id

    def addFeatureToGraph(
            self, graph, add_region=True, region_id=None,
            feature_as_class=False):
        """
        Add this feature and its location (if any) to the graph.

        We make the assumption here that all features are instances
        (unless feature_as_class is set, in which case it is added as a class).
        The features are located on a region,
            which begins and ends with faldo:Position
        The feature locations leverage the Faldo model,
            which has a general structure like:
        Triples:
        feature_id a feature_type (individual)
            faldo:location region_id
        region_id a faldo:region
            faldo:begin start_position
            faldo:end end_position
        start_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id
        end_position a
            (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph: the graph handed through to the self.gu helpers
        :param add_region: if True, a separate region node is linked to the
            feature; if False (or if neither start nor stop is set),
            the feature itself is typed as a faldo:Region
        :param region_id: identifier to use for the region;
            when None one is generated from the reference and coordinates
        :param feature_as_class: add the feature as a class
            rather than as an individual
        :return: None
        """

        if feature_as_class:
            self.gu.addClassToGraph(graph, self.id, self.label, self.type,
                                    self.description)
        else:
            self.gu.addIndividualToGraph(graph, self.id, self.label, self.type,
                                         self.description)

        if self.start is None and self.stop is None:
            add_region = False

        if add_region:
            # create a region that has the begin/end positions
            if region_id is None:
                # Only one of start/stop may be defined at this point,
                # so take the reference from whichever one we have
                # (previously a stop-only feature crashed here).
                anchor = self.start if self.start is not None else self.stop
                regionchr = re.sub(r'\w+\:_?', '', anchor['reference'])
                # in case the values are undefined
                # if we know only one of the coordinates,
                # then we'll add an "unknown" other.
                st = sp = 'UN'
                strand = None
                if self.start is not None and \
                        self.start['coordinate'] is not None:
                    st = str(self.start['coordinate'])
                    strand = self._getStrandStringFromPositionTypes(
                        self.start['type'])
                if self.stop is not None and\
                        self.stop['coordinate'] is not None:
                    sp = str(self.stop['coordinate'])
                    # assume that the strand is the same for both start
                    # and stop, so only consult the stop position when the
                    # start did not tell us the strand.
                    # this will need to be fixed in the future
                    if strand is None:
                        strand = self._getStrandStringFromPositionTypes(
                            self.stop['type'])
                region_items = [regionchr, st, sp]
                if strand is not None:
                    region_items += [strand]
                region_id = '-'.join(region_items)
                rid = region_id
                rid = re.sub(r'\w+\:', '', rid, 1)  # replace the id prefix
                rid = '_'+rid+"-Region"
                region_id = rid
                if self.nobnodes:
                    region_id = ':'+region_id
            self.gu.addTriple(graph, self.id, self.properties['location'],
                              region_id)
            self.gu.addIndividualToGraph(
                graph, region_id, None, 'faldo:Region')
        else:
            # no separate region node: the feature doubles as the region
            region_id = self.id
            self.gu.addType(graph, region_id, 'faldo:Region')

        # add the start/end positions to the region
        beginp = endp = None
        if self.start is not None:
            beginp = self._makePositionId(self.start['reference'],
                                          self.start['coordinate'],
                                          self.start['type'])
            self.addPositionToGraph(graph,
                                    self.start['reference'],
                                    self.start['coordinate'],
                                    self.start['type'])

        if self.stop is not None:
            endp = self._makePositionId(self.stop['reference'],
                                        self.stop['coordinate'],
                                        self.stop['type'])
            self.addPositionToGraph(graph,
                                    self.stop['reference'],
                                    self.stop['coordinate'],
                                    self.stop['type'])

        self.addRegionPositionToGraph(graph, region_id, beginp, endp)

        # {coordinate : integer, reference : reference_id, types = []}

        return

    def _getStrandStringFromPositionTypes(self, tylist):
        strand = None
        if self.types['plus_strand'] in tylist:
            strand = 'plus'
        elif self.types['minus_strand'] in tylist:
            strand = 'minus'
        elif self.types['both_strand'] in tylist:
            strand = 'both'
        else:
            strand = None  # it is stranded, but we don't know what it is

        return strand

    def _makePositionId(self, reference, coordinate, types=None):
        """
        Note that positions should have a reference (we will enforce).
            Only exact positions need a coordinate.
        :param reference:
        :param coordinate:
        :param types:
        :return:
        """

        if reference is None:
            logger.error("Trying to make position with no reference.")
            return None

        i = '_'
        if self.nobnodes:
            i = ':'+i
        reference = re.sub(r'\w+\:', '', reference, 1)
        if re.match(r'^_', reference):
            # this is in the case if the reference is a bnode
            reference = re.sub(r'^_', '', reference)
        i += reference
        if coordinate is not None:
            # just in case it isn't a string already
            i = '-'.join((i, str(coordinate)))
        if types is not None:
            tstring = self._getStrandStringFromPositionTypes(types)
            if tstring is not None:
                i = '-'.join((i, tstring))

        return i

    def addRegionPositionToGraph(
            self, graph, region_id, begin_position_id,
            end_position_id):
        """
        Link a region to its begin and/or end position.

        A position that is None is silently skipped.
        :param graph: the graph handed through to self.gu.addTriple
        :param region_id: identifier of the region
        :param begin_position_id: identifier of the begin position, or None
        :param end_position_id: identifier of the end position, or None
        :return: None
        """
        boundaries = (
            ('begin', begin_position_id),
            ('end', end_position_id),
        )
        for property_key, position_id in boundaries:
            if position_id is not None:
                self.gu.addTriple(
                    graph, region_id, self.properties[property_key],
                    position_id)

    def addPositionToGraph(
            self, graph, reference_id, position,
            position_types=None, strand=None):
        """
        Write one position into the graph, following the faldo model.
        When neither a strand nor any position types are given,
            the position is typed as a generic "Position" only.
        Triples:
        my_position a (any of: faldo:(((Both|Plus|Minus)Strand)|Exact)Position)
            faldo:position Integer(numeric position)
            faldo:reference reference_id

        :param graph: graph that receives the triples
        :param reference_id: identifier of the reference the position is on
        :param position: numeric position; may be None
        :param position_types: types to assert for the position; may be None
        :param strand: strand, either already a faldo type or a value
            to be mapped with self._getStrandType; may be None
        :return:  Identifier of the position created
        """

        position_id = self._makePositionId(
            reference_id, position, position_types)
        position_node = self.gu.getNode(position_id)
        position_property = self.gu.getNode(self.properties['position'])
        reference_property = self.gu.getNode(self.properties['reference'])

        if position is not None:
            coordinate = Literal(position, datatype=XSD['integer'])
            graph.add((position_node, position_property, coordinate))
        graph.add(
            (position_node, reference_property,
             self.gu.getNode(reference_id)))

        if position_types is not None:
            for position_type in position_types:
                graph.add(
                    (position_node, RDF['type'],
                     self.gu.getNode(position_type)))

        strand_type = None
        if strand is not None:
            if re.match(r'faldo', strand):
                strand_type = strand
            else:
                # not already mapped to faldo, so expect we need to map it
                strand_type = self._getStrandType(strand)

        # nothing else types this position, so fall back to the generic one
        if strand_type is None and (
                position_types is None or position_types == []):
            strand_type = self.types['Position']

        if strand_type is not None:
            graph.add(
                (position_node, RDF['type'], self.gu.getNode(strand_type)))

        return position_id

    def addSubsequenceOfFeature(self, graph, parentid):
        """
        Relate this feature to a parent in both directions:
        feature is_subsequence_of parent
        parent has_subsequence feature
        :param graph: the graph handed through to self.gu.addTriple
        :param parentid: identifier of the containing (parent) feature
        :return: None
        """
        child_to_parent = self.properties['is_subsequence_of']
        self.gu.addTriple(graph, self.id, child_to_parent, parentid)

        parent_to_child = self.properties['has_subsequence']
        self.gu.addTriple(graph, parentid, parent_to_child, self.id)

    def addTaxonToFeature(self, graph, taxonid):
        """
        Record the taxon on this feature and write the triple:
        feature in_taxon taxonid
        :param graph: the graph handed through to self.gu.addTriple
        :param taxonid: identifier of the taxon
        :return: None
        """
        # TEC: should taxon be set in __init__()?
        self.taxon = taxonid
        in_taxon = Assoc.properties['in_taxon']
        self.gu.addTriple(graph, self.id, in_taxon, self.taxon)

    def loadAllProperties(self, graph):
        """
        Load this object's annotation, object and data properties
        into the graph, each under the matching Assoc property type.
        :param graph: the graph handed through to self.gu.loadProperties
        :return: None
        """
        properties_by_type = {
            Assoc(None).ANNOTPROP: self.annotation_properties,
            Assoc(None).OBJECTPROP: self.object_properties,
            Assoc(None).DATAPROP: self.data_properties
        }

        for property_type, property_map in properties_by_type.items():
            self.gu.loadProperties(graph, property_map, property_type)

    def addFeatureProperty(self, graph, property_type, property):
        """
        Add the triple: feature property_type property
        :param graph: the graph handed through to self.gu.addTriple
        :param property_type: predicate to use for the triple
        :param property: object of the triple
        :return: None
        """
        feature_id = self.id
        self.gu.addTriple(graph, feature_id, property_type, property)

    def setNoBNodes(self, nobnodes):
        """
        Store the nobnodes flag on this object.
        :param nobnodes: value to keep in self.nobnodes
        :return: None
        """
        self.nobnodes = nobnodes
Пример #18
0
    def _get_gene_info(self, limit):
        """
        Currently loops through the gene_info file and creates the genes as
        classes, typed with SO.  It will add their label, any alternate labels
        as synonyms, alternate ids as equivalent classes.  HPRDs get added as
        protein products.  The chromosome and chr band get added as
        blank node regions, and the gene is faldo:located on the chr band.

        Records are kept only when their taxon is in self.tax_ids (and, in
        test mode, when their gene number is in self.gene_ids).
        :param limit: stop after more than this many kept records have been
            processed; None means no limit.  Ignored in test mode.
        :return: None
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        # not unzipping the file
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", myfile)

        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            # tax label can get added elsewhere
            geno.addGenome(tax_id, str(tax_num))
            # label added elsewhere
            gu.addClassToGraph(g, tax_id, None)
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue
                # "chrom" rather than "chr" so the builtin is not shadowed
                (tax_num, gene_num, symbol, locustag,
                 synonyms, xrefs, chrom, map_loc, desc,
                 gtype, authority_symbol, name,
                 nomenclature_status, other_designations,
                 modification_date) = line.split('\t')

                # #### set filter=None in init if you don't want a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and
                #         (int(tax_num) not in self.tax_ids)) or
                #            (self.filter == 'geneids' and
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self._map_type_of_gene(gtype)

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol

                # TODO might have to figure out if things aren't genes,
                # and make them individuals
                gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

                # we have to do special things here for genes,
                # because they're classes not individuals
                # f = Feature(gene_id,label,gene_type_id,desc)

                if name != '-':
                    gu.addSynonym(g, gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        gu.addSynonym(
                            g, gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        gu.addSynonym(
                            g, gene_id, s.strip(),
                            Assoc.annotation_properties['hasRelatedSynonym'])

                # deal with the xrefs
                # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
                if xrefs.strip() != '-':
                    for r in xrefs.strip().split('|'):
                        fixedr = self._cleanup_id(r)
                        if fixedr is not None and fixedr.strip() != '':
                            if re.match(r'HPRD', fixedr):
                                # proteins are not == genes.
                                gu.addTriple(
                                    g, gene_id,
                                    self.properties['has_gene_product'],
                                    fixedr)
                            else:
                                # skip some of these for now
                                if fixedr.split(':')[0] not in \
                                        ['Vega', 'IMGT/GENE-DB']:
                                    gu.addEquivalentClass(g, gene_id, fixedr)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # 419     ART3      4    with   4q21.1|4p15.1-p14   # no idea why there's two bands listed - possibly 2 assemblies
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3   #this is of "unknown" type == susceptibility
                # 101928066       LOC101928066    1|Un    -         # unlocated scaffold
                # 11435   Chrna1  2       2 C3|2 43.76 cM           # mouse --> 2C3
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM       # mouse --> 11B1.1
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table
                # when there is > 1 listed
                # with the exception of human X|Y,
                # i will only take those that align to one chr

                # FIXME remove the chr mapping below
                # when we pull in the genomic coords
                if chrom != '-' and chrom != '':
                    if '|' in chrom and chrom not in ['X|Y', 'X; Y']:
                        # this means that there's uncertainty in the mapping.
                        # skip it
                        # TODO we'll need to figure out
                        # how to deal with >1 loc mapping
                        # NOTE(review): this also skips the addTaxon call and
                        # the limit check below for this gene - confirm that
                        # is intended.
                        logger.info(
                            '%s is non-uniquely mapped to %s.  Skipping for now.',
                            gene_id, chrom)
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    # if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                    #    print('odd chr=',str(chr))
                    if chrom == 'X; Y':
                        chrom = 'X|Y'  # rewrite the PAR regions for processing

                    # the band pattern does not depend on the chromosome,
                    # so test the map location once per record
                    band_match = re.match(
                        r'[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)

                    # do this in a loop to allow PAR regions like X|Y
                    for c in chrom.split('|'):
                        # assume that the chromosome label
                        # will get added elsewhere
                        geno.addChromosomeClass(c, tax_id, None)
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        # temporarily use the taxnum
                        # for the disambiguating label
                        mychrom_syn = makeChromLabel(c, tax_num)
                        gu.addSynonym(g, mychrom,  mychrom_syn)
                        if band_match is not None:
                            # if tax_num != '9606':
                            #     continue
                            # this matches the regular kind of chrs,
                            # so make that kind of band
                            # not sure why this matches?
                            # chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex
                            # per organism
                            # the maploc_id already has the numeric chromosome
                            # in it, strip it first.  The chromosome name is
                            # escaped so it is always treated as literal text.
                            bid = re.sub('^' + re.escape(c), '', map_loc)
                            # the generic location (no coordinates)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')
                            # print(map_loc,'-->',bid,'-->',maploc_id)
                            # Assume it's type will be added elsewhere
                            band = Feature(maploc_id, None, None)
                            band.addFeatureToGraph(g)
                            # add the band as the containing feature
                            gu.addTriple(
                                g, gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                maploc_id)
                        else:
                            # TODO handle these cases
                            # examples are: 15q11-q22, Xp21.2-p11.23,
                            # 15q22-qter, 10q11.1-q24,
                            # 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,
                            # 12cen-q21, 22q13.3|22q13.3
                            logger.debug(
                                'not regular band pattern for %s: %s',
                                gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            gu.addTriple(
                                g, gene_id,
                                Feature.object_properties['is_subsequence_of'],
                                mychrom)

                geno.addTaxon(tax_id, gene_id)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

            gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
            gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
            gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
            gu.loadAllProperties(g)

        return
Пример #19
0
    def _process_phenotype_data(self, limit):
        """
        Read the catalog file, add the strains, their allele-to-phenotype
        associations and references, and then build a genotype per strain
        from the variants and genes collected along the way.

        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit: stop reading once more than this many lines have been
            read; None means no limit.  Ignored in test mode.
        :return: None

        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        gu = GraphUtils(curie_map.get())
        fname = '/'.join((self.rawdir, self.files['catalog']['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = 'CL:0000034'
        mouse_taxon = 'NCBITaxon:10090'
        geno = Genotype(g)
        with open(fname, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            for row in filereader:
                line_counter += 1
                # skip the first 3 lines which are header, etc.
                if line_counter < 4:
                    continue

                (strain_id, strain_label, strain_type_symbol, strain_state,
                 mgi_allele_id, mgi_allele_symbol, mgi_allele_name,
                 mutation_type, chrom, mgi_gene_id, mgi_gene_symbol,
                 mgi_gene_name, sds_url, accepted_date, mp_ids, pubmed_nums,
                 research_areas) = row

                if self.testMode and (strain_id not in self.test_ids):
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {'variants': set(),
                                                   'genes': set()}

                # clean up the bad one
                if mgi_allele_id == 'multiple mutation':
                    logger.error("Erroneous gene id: %s", mgi_allele_id)
                    mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the
                    # sequence alteration types
                    # var_type =
                    #   self._get_variant_type_from_abbrev(mutation_type)
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id,
                    #                                          mgi_allele_id)

                # scrub out any spaces
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id.strip() != '':
                    if re.match(r'Gene\s*ID:', mgi_gene_id, re.I):
                        # the prefix is recognized case-insensitively,
                        # so it must be rewritten case-insensitively too;
                        # otherwise e.g. "geneid:123" is left as-is
                        mgi_gene_id = re.sub(r'Gene\s*ID:\s*', 'NCBIGene:',
                                             mgi_gene_id, flags=re.I)
                    elif not re.match(r'MGI', mgi_gene_id):
                        logger.info("Gene id not recognized: %s", mgi_gene_id)
                        if re.match(r'\d+$', mgi_gene_id):
                            # assume that if it's all numbers, then it's MGI
                            mgi_gene_id = 'MGI:'+str(mgi_gene_id)
                            logger.info("Assuming numerics are MGI.")
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors -
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol.strip() != '' and mgi_gene_id == '':
                    logger.error(
                        "Gene label with no identifier for strain %s: %s",
                        strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol.strip())
                    # make a temp id for genes that aren't identified
                    # tmp_gene_id = '_'+mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mp_ids are now a comma delimited list
                # with MP terms in brackets
                phenotype_ids = []
                if mp_ids != '':
                    for i in re.split(r',', mp_ids):
                        i = i.strip()
                        mps = re.search(r'\[(.*)\]', i)
                        if mps is not None:
                            mp_id = mps.group(1).strip()
                            phenotype_ids.append(mp_id)

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums.strip() != '':
                    # str.split() with no separator ignores leading and
                    # trailing whitespace, so padding around the list
                    # cannot produce empty "PMID:" references
                    for i in pubmed_nums.split():
                        pmid = 'PMID:'+i.strip()
                        pubmed_ids.append(pmid)
                        r = Reference(pmid,
                                      Reference.ref_types['journal_article'])
                        r.addRefToGraph(g)

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                gu.addClassToGraph(g, mouse_taxon, None)
                if research_areas.strip() == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: '+research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                gu.addIndividualToGraph(
                    g, strain_id, strain_label, strain_type,
                    research_areas)  # an inst of mouse??
                gu.makeLeader(g, strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in the ontology
                    gu.addClassToGraph(g, pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(self.name, mgi_allele_id, pid,
                                         gu.object_properties['has_phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph(g)
                    else:
                        logger.info("Phenotypes and no allele for %s",
                                    strain_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if len(variants) > 0:
                    for v in variants:
                        vl_id = v
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        vl_set.add(vl_id)
                        # the allele can only be tied to a gene when the
                        # strain lists exactly one of each
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_'+gene+'-VL'
                        vl_id = re.sub(r':', '', vl_id)
                        if self.nobnodes:
                            vl_id = ':'+vl_id
                        vl_symbol = self.id_label_hash[gene]+'<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       geno.genoparts['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = '_'+re.sub(r'^_', '', vl)+'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    if self.nobnodes:
                        vslc_id = ':' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(
                        vslc_id, vl, None, geno.zygosity['indeterminate'],
                        geno.object_properties['has_alternate_part'], None)
                    gu.addIndividualToGraph(
                        g, vslc_id, vslc_label,
                        geno.genoparts['variant_single_locus_complement'])
                if len(vslc_list) > 0:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r':', '', gvc_id)
                        if self.nobnodes:
                            gvc_id = ':'+gvc_id
                        gvc_label = \
                            '; '.join(self.id_label_hash[v] for v in vslc_list)
                        gu.addIndividualToGraph(
                            g, gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = \
                        '_' + re.sub(r':', '', '-'.join(
                            (geno.genoparts['unspecified_genomic_background'],
                             s)))
                    # the genotype id is built from the background id
                    # before any nobnodes ':' prefix is put on it
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    if self.nobnodes:
                        bkgd_id = ':'+bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified ('+s+')',
                        geno.genoparts['unspecified_genomic_background'],
                        "A placeholder for the " +
                        "unspecified genetic background for "+s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        geno.genoparts['unspecified_genomic_background'])
                    geno.addParts(
                        gvc_id, genotype_id,
                        geno.object_properties['has_alternate_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    gu.addTriple(
                        g, s, geno.object_properties['has_genotype'],
                        genotype_id)
                else:
                    # logger.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            gu.loadProperties(
                g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
            gu.loadProperties(
                g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
            gu.loadProperties(
                g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
            gu.loadAllProperties(g)

            logger.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))

        return
Пример #20
0
    def _get_var_citations(self, limit):
        """
        Add the citations for ClinVar variants to the graph, as
        "reference is_about variant" triples.

        The source file is generated weekly, the first of the week.
        It is a tab-delimited report of citations associated with data in
        ClinVar, connected to the AlleleID, the VariationID, and either
        rs# from dbSNP or nsv in dbVar.  Columns:

        AlleleID         integer value as stored in the AlleleID field
                         in ClinVar  (//Measure/@ID in the XML)
        VariationID      The identifier ClinVar uses to anchor its default
                         display. (in the XML,  //MeasureSet/@ID)
        rs               rs identifier from dbSNP
        nsv              nsv identifier from dbVar
        citation_source  The source of the citation, either PubMed,
                         PubMedCentral, or the NCBI Bookshelf
        citation_id      The identifier used by that source

        Only PubMed and PubMedCentral citations are added;
        rows from any other source are read but produce no triples.

        :param limit: stop after more than this many data rows have been
            read; None means no limit.  Ignored in test mode.
        :return: None
        """

        gu = GraphUtils(curie_map.get())
        logger.info("Processing Citations for variants")
        line_counter = 0
        myfile = \
            '/'.join((self.rawdir, self.files['variant_citations']['file']))
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        with open(myfile, 'r', encoding="utf8") as f:
            filereader = csv.reader(f, delimiter='\t', quotechar='\"')

            for line in filereader:
                # csv.reader yields an empty list for a blank line,
                # so skip those along with the comment lines
                if not line or re.match(r'^#', line[0]):
                    continue
                (allele_num, variant_num, rs_num, nsv_num,
                 citation_source, citation_id) = line

                line_counter += 1

                if self.testMode:
                    if int(variant_num) not in self.variant_ids:
                        continue

                if citation_id.strip() == '':
                    logger.info(
                        "Skipping blank citation for ClinVarVariant:%s",
                        str(variant_num))
                    continue

                # the citation for a variant is made to some kind of
                # combination of the ids here, but we don't know what the
                # citation is for exactly, other than the variant.
                # so use mentions

                var_id = 'ClinVarVariant:'+variant_num

                # citation source: PubMed | PubMedCentral | citation_source
                # citation id:
                # format the citation id:
                ref_id = None
                if citation_source == 'PubMed':
                    ref_id = 'PMID:'+str(citation_id)
                elif citation_source == 'PubMedCentral':
                    ref_id = 'PMCID:'+str(citation_id)
                if ref_id is not None:
                    r = Reference(
                        ref_id, Reference.ref_types['journal_article'])
                    r.addRefToGraph(g)
                    gu.addTriple(
                        g, ref_id, self.properties['is_about'], var_id)

                if not self.testMode and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Finished processing citations for variants")

        return
Пример #21
0
    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

            line_id a CL_0000057,  #fibroblast line
                derives_from patient_id
                part_of :NIGMSrepository
                RO:model_of OMIM:disease_id

            patient id a foaf:person,
                label: "fibroblast from patient 12345 with disease X"
                member_of family_id  #what is the right thing here?
                SIO:race EFO:caucasian  #subclass of EFO:0001799
                in_taxon NCBITaxon:9606
                dc:description Literal(remark)
                RO:has_phenotype OMIM:disease_id
                GENO:has_genotype genotype_id

            family_id a owl:NamedIndividual
                foaf:page "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

            genotype_id a intrinsic_genotype
                GENO:has_alternate_part allelic_variant_id
                we don't necessarily know much about the genotype,
                other than the allelic variant. also there's the sex here

            pub_id mentions cell_line_id

        :param raw: path of the comma-separated Coriell file to read
            (opened as iso-8859-1; the first row is treated as a header)
        :param limit: when not in test mode, stop after the counter of
            non-blank rows exceeds this value; None means no limit
        :return: None
        """
        logger.info("Processing Data from %s", raw)
        gu = GraphUtils(curie_map.get())

        if self.testMode:      # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        gu.loadProperties(g, geno.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    # blank line; nothing to process (and not counted)
                    continue
                line_counter += 1

                (catalog_id, description, omim_number, sample_type,
                 cell_line_available, dna_in_stock, dna_ref, gender, age,
                 race, ethnicity, affected, karyotype, relprob, mutation,
                 gene, family_id, collection, url, cat_remark, pubmed_ids,
                 family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                # 2,,18343,Homo sapiens

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:'+catalog_id.strip()

                # Map the cell/sample type
                cell_type = self._map_cell_type(sample_type)

                # Make a cell line label
                line_label = \
                    collection.partition(' ')[0]+'-'+catalog_id.strip()

                # Map the repository/collection
                repository = self._map_collection(collection)

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID (an anonymous patient)
                patient_id = '_person'
                if self.nobnodes:
                    patient_id = ':'+patient_id
                if family_id != '':
                    patient_id = \
                        '-'.join((patient_id, family_id, family_member))
                else:
                    # no family known; key the patient on the cell line
                    patient_id = '-'.join((patient_id, catalog_id.strip()))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                short_desc = (description.split(';')[0]).capitalize()
                if affected == 'Yes':
                    affected = 'affected'
                elif affected == 'No':
                    affected = 'unaffected'
                gender = gender.lower()
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = \
                        ' '.join(
                            (patient_label.strip(), 'of proband with',
                             short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = 'CLO:0000031'

                gu.addIndividualToGraph(
                    g, cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:'+dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    gu.addIndividualToGraph(
                        g, equiv_cell_line, None, cell_line_reagent_id)
                    gu.addSameIndividual(g, cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                gu.addMember(g, repository, cell_line_id)

                if cat_remark != '':
                    gu.addDescription(g, cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   g,age_id,age,self.terms['age'])
                    # gu.addTriple(
                    #   g,age_id,self.properties['has_measurement'],age,
                    #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                gu.addPerson(g, patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self._map_race(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.terms['race'],mapped_race)
                #        gu.addSubclass(
                #           g,self.terms['ethnic_group'],mapped_race)

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if family_id != '':
                    family_comp_id = 'CoriellFamily:'+family_id

                    family_label = \
                        ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    gu.addIndividualToGraph(
                        g, family_comp_id, family_label,
                        geno.genoparts['family'])

                    # Add the patient as a member of the family
                    gu.addMemberOf(g, patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                if species is None or species == '':
                    species = 'Homo sapiens'
                taxon = self._map_species(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:'+dbsnp_id.strip()

                # will hold gene/locus id to variant list; it is also used
                # further down to remove "genes" from the "phenotypes"
                omim_map = {}
                gvc_id = None
                gvc_label = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = du.remove_control_characters(karyotype)
                normal_karyotype = self._is_normal_karyotype(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = \
                        '_'+re.sub('MONARCH:', '', self.make_id(karyotype))
                    if self.nobnodes:
                        karyotype_id = ':'+karyotype_id
                    # add karyotype as karyotype_variation_complement
                    gu.addIndividualToGraph(
                        g, karyotype_id, karyotype,
                        geno.genoparts['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = \
                        self._get_affected_chromosomes_from_karyotype(
                            karyotype)
                    for c in karyo_chrs:
                        chr_id = makeChromID(c, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, c))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr'+str(c)
                        f = Feature(
                            karyotype_feature_id, karyotype_feature_label,
                            geno.genoparts['sequence_alteration'])
                        f.addFeatureStartLocation(None, chr_id)
                        f.addFeatureToGraph(g)
                        f.loadAllProperties(g)
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            geno.object_properties['has_alternate_part'])

                # Label for this row's variant(s).  It must be reset for
                # every row: otherwise a row without a gene symbol would
                # pick up the label computed for an earlier row.
                variant_label = None
                if gene != '':
                    variant_label = gene+'('+mutation+')'
                elif mutation.strip() != '':
                    variant_label = mutation.strip()

                # fix the variant_id so it's always in the same order
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not normal_karyotype:
                    mutation = mutation.strip()
                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_' + variant_id.replace(';', '-') + '-' \
                                + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        # a non-blank mutation guarantees a variant_label
                        gvc_label = '; '.join((variant_label, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_' + variant_id.replace(';', '-')
                    gvc_label = variant_label
                else:
                    # wildtype?
                    pass

                if gvc_id is not None and gvc_id != karyotype_id \
                        and self.nobnodes:
                    gvc_id = ':'+gvc_id

                # add an abnormal karyotype to the gvc as an alternate part.
                # (a normal karyotype is never attached to the gvc here)
                if karyotype_id is not None \
                        and not normal_karyotype \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(
                        karyotype_id, gvc_id,
                        geno.object_properties['has_alternate_part'])

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    for v in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        m = re.match(r'(\d+)\.+(.*)', v.strip())
                        if m is None:
                            # not <locus>.<variant>; there is no locus to
                            # file it under, so leave it out of the map
                            if v.strip() != '':
                                logger.warning(
                                    "Could not parse variant id '%s' "
                                    "for %s; skipping it",
                                    v, cell_line_id)
                            continue
                        (locus_num, var_num) = m.groups()
                        omim_map.setdefault(locus_num, []).append(var_num)

                    for locus, allele_nums in omim_map.items():
                        # gene_id = 'OMIM:' + locus  # TODO unused
                        vslc_id = \
                            '_' + '-'.join(
                                [locus + '.' + a for a in allele_nums])
                        if self.nobnodes:
                            vslc_id = ':'+vslc_id
                        vslc_label = variant_label
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        gu.addIndividualToGraph(
                            g, vslc_id, vslc_label,
                            geno.genoparts[
                                'variant_single_locus_complement'])
                        for allele_num in allele_nums:
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:'+locus+'.'+allele_num
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                geno.zygosity['indeterminate'],
                                geno.object_properties[
                                    'has_alternate_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    gu.addType(g, patient_id, geno.genoparts['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id
                    genotype_id = '_geno'+catalog_id.strip()
                    if self.nobnodes:
                        genotype_id = ':'+genotype_id

                # add the gvc
                if gvc_id is not None:
                    gu.addIndividualToGraph(
                        g, gvc_id, gvc_label,
                        geno.genoparts['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = \
                                geno.object_properties[
                                    'has_reference_part']
                        else:
                            rel = \
                                geno.object_properties[
                                    'has_alternate_part']
                        geno.addParts(gvc_id, genotype_id, rel)
                    if karyotype_id is not None and normal_karyotype:
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = \
                                '; '.join((gvc_label, karyotype))
                        else:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                geno.object_properties[
                                    'has_reference_part'])
                    else:
                        genotype_label = gvc_label
                    # use the catalog id as the background; tolerate a
                    # missing label (no gene and no mutation on the row)
                    genotype_label = ' '.join(
                        filter(None, (genotype_label,
                                      '['+catalog_id.strip()+']')))

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        geno.genoparts['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    gu.addTriple(
                        g, patient_id,
                        geno.properties['has_genotype'], genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)

                # #############    DEAL WITH THE DISEASES   #############

                # we associate the disease to the patient
                if affected == 'affected' and omim_number != '':
                    for d in omim_number.split(';'):
                        # strip before comparing, so the test against
                        # omim_map and the id built below use the same key
                        d = d.strip()
                        if d == '':
                            continue
                        # if the omim number is in omim_map,
                        # then it is a gene not a pheno
                        if d in omim_map:
                            logger.info(
                                'removing %s from disease list ' +
                                'since it is a gene', d)
                            continue

                        disease_id = 'OMIM:'+d
                        # assume the label is taken care of
                        gu.addClassToGraph(g, disease_id, None)

                        # add the association:
                        #   the patient has the disease
                        assoc = G2PAssoc(
                            self.name, patient_id, disease_id)
                        assoc.add_association_to_graph(g)

                        # this line is a model of this disease
                        # TODO abstract out model into
                        # it's own association class?
                        gu.addTriple(
                            g, cell_line_id,
                            gu.properties['model_of'],
                            disease_id)

                # #############    ADD PUBLICATIONS   #############

                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        s = s.strip()
                        if s == '':
                            # e.g. a trailing ';' - no id to reference
                            continue
                        pubmed_id = 'PMID:'+s
                        ref = Reference(pubmed_id)
                        ref.setType(Reference.ref_types['journal_article'])
                        ref.addRefToGraph(g)
                        gu.addTriple(
                            g, pubmed_id, gu.properties['mentions'],
                            cell_line_id)

                if not self.testMode \
                        and (limit is not None and line_counter > limit):
                    break

            Assoc(self.name).load_all_properties(g)

        return
Пример #22
0
    def _add_variant_protein_variant_assoc_to_graph(self, row):
        """
        Generates relationships between variants and protein variants
        given a row of data.

        Adds the variant, its transcript and (when a mapping is available)
        the NCBI and/or UniProt polypeptides to ``self.graph``.  For missense
        variants whose amino acid change parses as ``p.<ref><pos><alt>``,
        also adds the reference/altered amino acids and a one-position
        region on the amino acid sequence.

        :param row: sequence of data, see add_variant_info_to_graph()
                    docstring for expected structure; only the first
                    11 fields are read
        :return None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_literal = True

        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source) = row[0:11]

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        transcript_curie = self._make_transcript_curie(transcript_id)
        uniprot_curie = self._make_uniprot_polypeptide_curie(transcript_id)
        ncbi_protein_curie = self._make_ncbi_polypeptide_curie(transcript_id)

        geno.addGenotype(variant_id, variant_label,
                         geno.genoparts['sequence_alteration'])

        # Make fake amino acid sequence in case we
        # can't get a CCDS to Uniprot and/or NCBI Protein mapping.
        # It is replaced below as soon as either mapping is available.
        aa_seq_id = self.make_cgd_id(
            'transcript{0}'.format(amino_acid_variant))

        # Add Transcript:
        geno.addTranscript(variant_id, transcript_curie, transcript_id,
                           geno.genoparts['transcript'])

        # Add polypeptide
        if ncbi_protein_curie is not None:
            geno.addPolypeptide(ncbi_protein_curie,
                                self.transcript_xrefs['RefSeq'][transcript_id],
                                transcript_curie)
            aa_seq_id = ncbi_protein_curie
        if uniprot_curie is not None:
            geno.addPolypeptide(uniprot_curie,
                                self.transcript_xrefs['UniProt'][transcript_id],
                                transcript_curie)
            # Overrides ncbi_protein_curie,
            # but we set them as equal individuals below
            aa_seq_id = uniprot_curie

        # Note: no fallback branch here. The fake id is already the default,
        # and resetting it when only one mapping exists would throw away
        # the real polypeptide curie chosen above.
        if ncbi_protein_curie is not None and uniprot_curie is not None:
            gu.addSameIndividual(self.graph, ncbi_protein_curie, uniprot_curie)

        is_missense = \
            protein_variant_type == 'nonsynonymous - missense' \
            or re.search(r'missense', variant_label) is not None
        if is_missense:
            geno.addGenotype(variant_id, variant_label,
                             geno.genoparts['missense_variant'])

        # Get gene ID from gene map
        self._add_variant_gene_relationship(variant_id, transcript_gene)

        # p.<ref aa><position><altered aa>, amino acids as 1-3 letter codes
        amino_acid_regex = re.compile(
            r'^p\.([A-Za-z]{1,3})(\d+)([A-Za-z]{1,3})$')

        # only missense variants are parsed for an amino acid change
        match = None
        if is_missense:
            match = amino_acid_regex.match(amino_acid_variant.rstrip())

        if match is None:
            logger.debug(
                "Could not parse amino acid information"
                " from %s variant:"
                " %s type: %s",
                amino_acid_variant, variant_label, protein_variant_type)
            return

        ref_amino_acid = match.group(1)
        position = match.group(2)
        altered_amino_acid = match.group(3)

        # Add amino acid change to model
        gu.addTriple(self.graph, variant_id,
                     geno.properties['reference_amino_acid'],
                     ref_amino_acid, is_literal)
        gu.addTriple(self.graph, variant_id,
                     geno.properties['results_in_amino_acid_change'],
                     altered_amino_acid, is_literal)

        # the change covers a single residue, so start == end == position
        aa_region_id = ":_{0}{1}{2}Region".format(
            position, position, aa_seq_id)
        self._add_feature_with_coords(variant_id, position,
                                      position, aa_seq_id, aa_region_id)

        return
Пример #23
0
class Environment():
    """
    Convenience methods for adding items related to an experimental
    environment and its parts to a supplied graph.

    This is a stub ready for expansion.
    """

    # environment-related classes that we explicitly reference here,
    # keyed by a readable name and mapped to their ontology curies
    environment_parts = {
        'environmental_system': 'ENVO:01000254',
        'environmental_condition': 'XCO:0000000',
        # NOTE(review): key is spelled 'morpholio' (presumably meant
        # 'morpholino'); kept unchanged because callers may look it up
        # by this exact spelling.
        'morpholio_reagent': 'REO:0000042',
        'talen_reagent': 'REO:0001022',
        # placeholder curie, no real term assigned yet
        'crispr_reagent': 'REO:crispr_TBD'
    }

    # object properties used by this class; loaded into the graph
    # by __init__
    object_properties = {
        'has_part': 'BFO:0000051',
    }

    # no annotation properties are defined yet
    annotation_properties = {
    }

    # union of the object and annotation properties
    properties = object_properties.copy()
    properties.update(annotation_properties)

    def __init__(self, graph):
        """
        Bind the helper to a graph and load this class's object
        properties into it.

        :param graph: the graph that every add* method writes to
        """
        self.gu = GraphUtils(curie_map.get())
        self.graph = graph

        self.gu.loadProperties(
            self.graph, self.object_properties, self.gu.OBJPROP)

    def addEnvironment(
            self, env_id, env_label, env_type=None, env_description=None):
        """
        Add an environment as an individual to the graph.

        :param env_id: identifier of the environment
        :param env_label: label of the environment
        :param env_type: (Optional) type of the environment; defaults to
                         environment_parts['environmental_system']
        :param env_description: (Optional) description of the environment
        :return: None
        """
        if env_type is None:
            env_type = self.environment_parts['environmental_system']

        self.gu.addIndividualToGraph(
            self.graph, env_id, env_label, env_type, env_description)

    def addEnvironmentalCondition(
            self, cond_id, cond_label, cond_type=None, cond_description=None):
        """
        Add an environmental condition as an individual to the graph.

        :param cond_id: identifier of the condition
        :param cond_label: label of the condition
        :param cond_type: (Optional) type of the condition; defaults to
                          environment_parts['environmental_condition']
        :param cond_description: (Optional) description of the condition
        :return: None
        """
        if cond_type is None:
            cond_type = self.environment_parts['environmental_condition']

        self.gu.addIndividualToGraph(
            self.graph, cond_id, cond_label, cond_type, cond_description)

    def addComponentToEnvironment(self, env_id, component_id):
        """
        Link a component to an environment with a has_part triple.

        The predicate comes from this class's own object_properties,
        i.e. the same table that __init__ loads into the graph.

        :param env_id: identifier of the environment (triple subject)
        :param component_id: identifier of the component (triple object)
        :return: None
        """
        self.gu.addTriple(
            self.graph, env_id,
            self.object_properties['has_part'],
            component_id)

    def addComponentAttributes(
            self, component_id, entity_id, value=None, unit=None):
        """
        Link an entity to a component with a has_part triple.

        :param component_id: identifier of the component (triple subject)
        :param entity_id: identifier of the entity (triple object)
        :param value: (Optional) currently unused
        :param unit: (Optional) currently unused
        :return: None
        """
        self.gu.addTriple(
            self.graph, component_id,
            self.object_properties['has_part'],
            entity_id)
        # TODO add value and units
Пример #24
0
    def _add_variant_cdna_variant_assoc_to_graph(self, row):
        """
        Add the genomic and cDNA description of a variant to the graph,
        given one row of data: its gene, its coordinates on the
        chromosome and on the transcript, its reference/altered
        nucleotides, and any COSMIC / dbSNP cross references.

        :param row: row of data, see add_variant_info_to_graph()
                    docstring for expected structure.
                    Only applicable for structure 2.
        :return: None
        """
        gu = GraphUtils(curie_map.get())
        geno = Genotype(self.graph)
        is_literal = True

        # The row must unpack into exactly these 27 columns, in order;
        # many of them are not used by this method.
        (variant_key, variant_label, amino_acid_variant, amino_acid_position,
         transcript_id, transcript_priority, protein_variant_type,
         functional_impact, stop_gain_loss, transcript_gene,
         protein_variant_source, variant_gene, bp_pos, variant_cdna,
         cosmic_id, db_snp_id, genome_pos_start, genome_pos_end, ref_base,
         variant_base, primary_transcript_exons,
         primary_transcript_variant_sub_types, variant_type, chromosome,
         genome_build, build_version, build_date) = row

        variant_id = self.make_cgd_id('variant{0}'.format(variant_key))

        # Variant -> gene
        self._add_variant_gene_relationship(variant_id, variant_gene)

        # The transcript is the reference for the nucleotide position
        transcript_curie = self._make_transcript_curie(transcript_id)

        # Identifiers for the two regions the variant is located on
        cdna_region_id = ":_{0}Region".format(transcript_curie)
        chrom_region_id = ":_{0}{1}Region-{2}-{3}".format(
            genome_build, chromosome, genome_pos_start, genome_pos_end)

        # Genome and the reference build of that genome
        human_taxon = 'NCBITaxon:9606'
        build_id = "UCSC:{0}".format(genome_build)
        geno.addGenome(human_taxon, "Human")
        geno.addReferenceGenome(build_id, genome_build, human_taxon)

        # Chromosome: a generic class id (in the taxon) and an
        # instance id (in the given build)
        chrom_class_id = makeChromID(chromosome, '9606', 'CHR')
        chrom_instance_id = makeChromID(chromosome, build_id, 'MONARCH')
        geno.addChromosomeClass(chromosome, human_taxon, 'Human')
        geno.addChromosomeInstance(
            chromosome, build_id, genome_build, chrom_class_id)

        # Variant coordinates relative to the chromosome ...
        self._add_feature_with_coords(
            variant_id, genome_pos_start, genome_pos_end,
            chrom_instance_id, chrom_region_id)

        # ... and relative to the transcript (single position)
        self._add_feature_with_coords(
            variant_id, bp_pos, bp_pos, transcript_curie, cdna_region_id)

        # Reference and altered nucleotide, both stored as literals
        nucleotide_triples = (
            ('reference_nucleotide', ref_base),
            ('altered_nucleotide', variant_base),
        )
        for prop_key, base in nucleotide_triples:
            gu.addTriple(self.graph, variant_id,
                         geno.properties[prop_key], base, is_literal)

        # Here we update any internal cgd variant IDs with a cosmic ID
        # or dbSNP ID.  Alternatively we could do this using sql rather
        # than a sparql update which may be safer

        # COSMIC xrefs: cosmic_id may hold several ids separated by ', '
        has_cosmic = cosmic_id is not None
        cosmic_curies = []
        if has_cosmic:
            for raw_cosmic in cosmic_id.split(', '):
                curie = re.sub(r'COSM(\d+)', r'COSMIC:\1', raw_cosmic)
                cosmic_curies.append(curie)
                gu.addIndividualToGraph(self.graph, curie, raw_cosmic,
                                        geno.genoparts['missense_variant'])

            # If there are multiple ids set them equivalent to the first
            primary_cosmic = cosmic_curies[0]
            for other_cosmic in cosmic_curies[1:]:
                gu.addSameIndividual(self.graph, primary_cosmic, other_cosmic)

            self._replace_entity(
                self.graph, variant_id, primary_cosmic, self.bindings)

        if db_snp_id is None:
            return

        # dbSNP xref
        db_snp_curie = re.sub(r'rs(\d+)', r'dbSNP:\1', db_snp_id)
        gu.addIndividualToGraph(self.graph, db_snp_curie, db_snp_id,
                                geno.genoparts['missense_variant'])

        if has_cosmic:
            # The variant id was already replaced by the first COSMIC
            # curie above, so tie every COSMIC curie to the dbSNP curie
            for curie in cosmic_curies:
                gu.addSameIndividual(self.graph, curie, db_snp_curie)
        else:
            self._replace_entity(
                self.graph, variant_id, db_snp_curie, self.bindings)

        return