Пример #1
0
    def _process_article_row(self, row):
        """
        Add one article row to the graph as a Reference.

        The internal article id is stored in self.id_hash['article'] under
        the row's article_id.  When the row has a pubmed_id, that entry is
        overwritten with the PMID curie, the PMID is declared the same
        individual as the internal id, and a comment holding the internal
        id (without its "_:" prefix) is attached to the PMID.

        :param row: mapping with the keys article_id, journal, title,
            year and pubmed_id
        :return: None
        """
        model = Model(self.g)
        # nothing is added to the graph in test mode
        if self.testMode:
            return

        article_key = row['article_id']
        iarticle_id = self._make_internal_id('article', article_key)
        self.id_hash['article'][article_key] = iarticle_id

        # only rows naming a journal get an explicit reference type
        rtype = (
            Reference.ref_types['journal_article']
            if row['journal'] != '' else None)
        ref = Reference(self.g, iarticle_id, rtype)

        title = row['title']
        if title is not None:
            ref.setTitle(title.strip())
        year = row['year']
        if year is not None:
            ref.setYear(year)
        ref.addRefToGraph()

        pubmed_id = row['pubmed_id']
        if pubmed_id is None:
            return

        # prefer the PMID over the internal id in the lookup table
        pmid_curie = 'PMID:' + str(pubmed_id)
        self.id_hash['article'][article_key] = pmid_curie
        model.addSameIndividual(iarticle_id, pmid_curie)
        model.addComment(pmid_curie, iarticle_id.replace("_:", ''))
Пример #2
0
    def _process_article_row(self, row):
        """
        Add one article row to self.g as a Reference.

        The internal article id is stored in self.id_hash['article'] under
        the row's article_id.  When the row has a pubmed_id, that entry is
        overwritten with the PMID curie, the PMID is declared the same
        individual as the internal id, and the internal id is attached to
        the PMID as a comment.

        :param row: mapping with the keys article_id, journal, title,
            year and pubmed_id
        :return: None
        """
        # nothing is added to the graph in test mode
        if self.testMode:
            return

        article_key = row['article_id']
        iarticle_id = self._make_internal_id('article', article_key)
        self.id_hash['article'][article_key] = iarticle_id

        # only rows naming a journal get an explicit reference type
        rtype = (
            Reference.ref_types['journal_article']
            if row['journal'] != '' else None)
        ref = Reference(iarticle_id, rtype)

        title = row['title']
        if title is not None:
            ref.setTitle(title.strip())
        year = row['year']
        if year is not None:
            ref.setYear(year)
        ref.addRefToGraph(self.g)

        pubmed_id = row['pubmed_id']
        if pubmed_id is None:
            return

        # prefer the PMID over the internal id in the lookup table
        pmid_curie = 'PMID:' + str(pubmed_id)
        self.id_hash['article'][article_key] = pmid_curie
        self.gu.addSameIndividual(self.g, iarticle_id, pmid_curie)
        self.gu.addComment(self.g, pmid_curie, iarticle_id)
Пример #3
0
    def _get_pubs(self, entry, g):
        """
        Extract mentioned publications from the reference list

        A reference carrying a 'pubmedID' becomes a 'PMID:' journal-article
        Reference.  Any other reference becomes a blank-node Reference
        ('_:OMIM<mimNumber>ref<referenceNumber>') whose title, author list
        and short citation are filled in from whatever the entry provides.
        Every publication is added to the graph together with a
        'mentions' triple from the OMIM entry to the publication.

        :param entry: mapping that holds 'mimNumber' and, optionally,
            'referenceList' (items are read through their 'reference' key)
        :param g: graph handed to Reference/Model and used for addTriple
        :return: dict mapping each referenceNumber to the publication id
            that was created for it
        """

        ref_to_pmid = {}
        entry_num = entry['mimNumber']
        model = Model(g)
        # subject of every 'mentions' triple; does not change per reference
        omim_id = 'OMIM:' + str(entry_num)
        if 'referenceList' in entry:
            reflist = entry['referenceList']
            for r in reflist:
                if 'pubmedID' in r['reference']:
                    pub_id = 'PMID:' + str(r['reference']['pubmedID'])
                    ref = \
                        Reference(
                            g, pub_id,
                            Reference.ref_types['journal_article'])
                else:
                    # make blank node for internal reference
                    pub_id = \
                        '_:OMIM' + str(entry_num) + 'ref' + \
                        str(r['reference']['referenceNumber'])

                    ref = Reference(g, pub_id)
                    title = author_list = source = citation = None
                    if 'title' in r['reference']:
                        title = r['reference']['title']
                        ref.setTitle(title)
                    if 'authors' in r['reference']:
                        author_list = r['reference']['authors']
                        ref.setAuthorList(author_list)
                        # first author (text before the first ".,") + et al
                        citation = re.split(r'\.\,', author_list)[0] + ' et al'
                    if 'source' in r['reference']:
                        source = r['reference']['source']
                    # Drop the missing pieces with an explicit None test.
                    # filter(None.__ne__, ...) only worked because
                    # NotImplemented is truthy, which is deprecated.
                    citation = '; '.join(
                        [part for part in (citation, title, source)
                         if part is not None])
                    ref.setShortCitation(citation)
                ref.addRefToGraph()
                ref_to_pmid[r['reference']['referenceNumber']] = pub_id

                # add is_about for the pub
                g.addTriple(omim_id, model.object_properties['mentions'],
                            pub_id)

        return ref_to_pmid
Пример #4
0
    def _get_pubs(self, entry, graph):
        """
        Collect the publications listed in an entry's reference list.

        A reference carrying a 'pubmedID' becomes a 'PMID:' Reference typed
        with self.globaltt['journal article'].  Any other reference becomes
        a blank-node Reference ('_:OMIM<mimNumber>ref<referenceNumber>')
        with title, author list and short citation taken from the entry.
        Each publication is added to the graph along with a 'mentions'
        triple from the OMIM entry to the publication.

        :param entry: mapping that holds 'mimNumber' and, optionally,
            'referenceList' (items are read through their 'reference' key)
        :param graph: graph handed to Reference and used for addTriple
        :return: dict mapping each referenceNumber to its publication id
        """
        ref_to_pmid = {}
        entry_num = entry['mimNumber']

        # entries without a reference list contribute nothing
        if 'referenceList' not in entry:
            return ref_to_pmid

        for item in entry['referenceList']:
            if 'pubmedID' in item['reference']:
                pub_id = 'PMID:' + str(item['reference']['pubmedID'])
                ref = Reference(
                    graph, pub_id, self.globaltt['journal article'])
            else:
                # no PubMed id: mint a blank node for the reference
                ref_number = str(item['reference']['referenceNumber'])
                pub_id = '_:OMIM' + str(entry_num) + 'ref' + ref_number

                ref = Reference(graph, pub_id)
                citation = None
                title = None
                source = None
                if 'title' in item['reference']:
                    title = item['reference']['title']
                    ref.setTitle(title)
                if 'authors' in item['reference']:
                    authors = item['reference']['authors']
                    ref.setAuthorList(authors)
                    # first author (text before the first ".,") + et al
                    first_author = re.split(r'\.\,', authors)[0]
                    citation = first_author + ' et al'
                if 'source' in item['reference']:
                    source = item['reference']['source']
                # only the pieces that are present make it into the citation
                present = (
                    piece for piece in (citation, title, source)
                    if piece is not None)
                citation = '; '.join(present)
                ref.setShortCitation(citation)

            ref.addRefToGraph()
            ref_to_pmid[item['reference']['referenceNumber']] = pub_id

            # link the OMIM entry to the publication it mentions
            omim_id = 'OMIM:' + str(entry_num)
            graph.addTriple(omim_id, self.globaltt['mentions'], pub_id)

        return ref_to_pmid
Пример #5
0
    def _get_pubs(self, entry, g):
        """
        Extract mentioned publications from the reference list

        A reference carrying a 'pubmedID' becomes a 'PMID:' journal-article
        Reference.  Any other reference gets an internal id
        ('_OMIM<mimNumber>ref<referenceNumber>', prefixed with ':' when
        self.nobnodes is set) and has its title, author list and short
        citation filled in from whatever the entry provides.  Every
        publication is added to the graph together with a 'mentions'
        triple from the OMIM entry to the publication.

        :param entry: mapping that holds 'mimNumber' and, optionally,
            'referenceList' (items are read through their 'reference' key)
        :param g: graph the references and 'mentions' triples are added to
        :return: dict mapping each referenceNumber to the publication id
            that was created for it
        """

        ref_to_pmid = {}
        du = DipperUtil()
        entry_num = entry['mimNumber']
        gu = GraphUtils(curie_map.get())
        # subject of every 'mentions' triple; does not change per reference
        omim_id = 'OMIM:'+str(entry_num)
        if 'referenceList' in entry:
            reflist = entry['referenceList']
            for r in reflist:
                if 'pubmedID' in r['reference']:
                    pub_id = 'PMID:' + str(r['reference']['pubmedID'])
                    ref = Reference(
                        pub_id, Reference.ref_types['journal_article'])
                else:
                    # make blank node for internal reference
                    pub_id = \
                        '_OMIM' + str(entry_num) + 'ref' + \
                        str(r['reference']['referenceNumber'])
                    if self.nobnodes:
                        pub_id = ':' + pub_id
                    ref = Reference(pub_id)
                    title = author_list = source = citation = None
                    if 'title' in r['reference']:
                        title = r['reference']['title']
                        ref.setTitle(title)
                    if 'authors' in r['reference']:
                        author_list = r['reference']['authors']
                        ref.setAuthorList(author_list)
                        # Raw string: '\.' and '\,' are invalid escapes in
                        # a normal literal and trigger a warning at
                        # compile time; the pattern itself is unchanged.
                        citation = re.split(r'\.\,', author_list)[0] + ' et al'
                    if 'source' in r['reference']:
                        source = r['reference']['source']
                    citation = '; '.join(
                        du.flatten([citation, title, source]))
                    ref.setShortCitation(citation)
                ref.addRefToGraph(g)
                ref_to_pmid[r['reference']['referenceNumber']] = pub_id

                # add is_about for the pub
                gu.addTriple(
                    g, omim_id, gu.object_properties['mentions'], pub_id)

        return ref_to_pmid
Пример #6
0
    def _process_allele_phenotype(self, limit):
        r"""
        Make allele to phenotype associations using derived_pheno_class
        and derived_pheno_manifest cvterm in the flybase db, an example entry is:

        FBal0257663    @FBcv0000351:lethal@ | @FBcv0000308:female limited@,
                       with @FBal0130657:Scer\GAL4<up>dome-PG14</up>@

        The first term is the phenotype, and all follow up terms are
        qualifiers, attached to the association with
        self.globaltt['has_qualifier']

        Our previous approach was to use the genotype id associated with
        FBal0257663/FBal0130657 , however, this required us to create blank
        nodes and was considered unnecessarily granular

        Note that sometimes identifiers do not exist for a term, eg
        @:heat sensitive | tetracycline conditional@

        derived_pheno_class - FBcv terms, these are phenotypes
        derived_pheno_manifest -  Anatomy terms FBbt, we currently
        make phenotype IRI equivalents that end up in UPheno, but
        this is being developed and updated, see
        https://github.com/monarch-initiative/dipper/issues/770
        (rows of this type are skipped by this method for now)

        Adds triples to self.graph

        :param limit: reading stops after the first row at which
            reader.line_num (the header line included) exceeds this value;
            None reads the whole file
        :return: None
        :raises ValueError: if a term id cannot be parsed or a row has an
            unexpected pheno_type

        """
        src_key = 'allele_phenotype'
        raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
        LOG.info("processing allele phenotype associations")
        col = self.queries[src_key]['columns']

        transgenic_alleles = self._get_foreign_transgenic_alleles()

        # a alphanumeric id followed by a colon then
        # any character but a colon bordered by @s
        term_regex = re.compile(r'@([\w]*):([^:@]*)@')
        id_regex = re.compile(r'([a-zA-Z]+)(\d+)')

        with open(raw, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            row = next(reader)  # headers
            self.check_fileheader(col, row)

            # column positions are the same for every row; look them up once
            col_idx = {
                name: col.index(name)
                for name in ('allele_id', 'pheno_desc', 'pheno_type',
                             'pub_id', 'pub_title', 'pmid_id')
            }

            for row in reader:
                self._add_allele_phenotype_row(
                    row, col_idx, transgenic_alleles, term_regex, id_regex)

                # Checked for every row, including skipped ones.  The
                # check used to sit behind the `continue` statements, so a
                # run of skipped rows could read far beyond the limit.
                if limit is not None and reader.line_num > limit:
                    break

    def _add_allele_phenotype_row(
            self, row, col_idx, transgenic_alleles, term_regex, id_regex):
        """
        Add the allele to phenotype association for one row to self.graph

        Returns without adding anything when the allele is transgenic,
        when no @id:label@ terms are found in the description, when a
        derived_pheno_class row does not start with an FBcv term, or when
        the row is of type derived_pheno_manifest.

        :param row: list of column values for one line of the file
        :param col_idx: dict of column name -> position in row
        :param transgenic_alleles: collection of allele ids to skip
        :param term_regex: compiled pattern yielding (id, label) pairs
        :param id_regex: compiled pattern splitting an id into
            (alphabetic prefix, digits)
        :return: None
        :raises ValueError: if the first term id or a non-empty qualifier
            id cannot be parsed, or pheno_type is neither
            derived_pheno_class nor derived_pheno_manifest
        """
        # flybase terms - terms we prefix with FlyBase:
        fly_prefixes = ('FBal', 'FBti', 'FBab', 'FBba', 'FBtp')

        allele_id = row[col_idx['allele_id']]
        pheno_desc = row[col_idx['pheno_desc']]
        pheno_type = row[col_idx['pheno_type']]
        pub_id = row[col_idx['pub_id']]
        pub_title = row[col_idx['pub_title']]
        pmid_id = row[col_idx['pmid_id']]

        # Don't get phenotypes for transgenic alleles
        if allele_id in transgenic_alleles:
            return

        allele_curie = 'FlyBase:' + allele_id

        terms = term_regex.findall(pheno_desc)
        if not terms:
            LOG.warning('Could not @terms@ in description: %s',
                        pheno_desc)
            return

        # only the ids are used; the labels are dropped
        term_ids, _ = zip(*terms)
        id_match = id_regex.match(term_ids[0])
        if id_match is None:
            raise ValueError("Could not parse id {}".format(
                term_ids[0]))
        prefix, local_id = id_match.group(1, 2)

        # Create phenotype curie
        if pheno_type == 'derived_pheno_class':
            # derived_pheno_class should all start with a FBcv term
            if prefix != 'FBcv':
                LOG.warning(
                    'derived_pheno_class does not '
                    'start with FBcv: %s', pheno_desc)
                return
            phenotype_curie = prefix + ':' + local_id
        elif pheno_type == 'derived_pheno_manifest':
            # These are not proper FBcv phenotype terms
            # but rather anatomical entities, go terms, sometimes free text
            # skip parsing for now
            return
        else:
            raise ValueError(
                "Unexpected phenotype type: {}".format(pheno_type))

        if pmid_id:
            ref_curie = 'PMID:' + pmid_id
        else:
            # no PMID: describe the FlyBase publication in the graph
            ref_curie = 'FlyBase:' + pub_id
            pub_ref = Reference(self.graph, ref_curie)
            pub_ref.setTitle(pub_title)
            pub_ref.addRefToGraph()

        assoc = G2PAssoc(self.graph, self.name, allele_curie,
                         phenotype_curie,
                         self.globaltt['has phenotype'])
        assoc.add_source(ref_curie)
        # Associations need to be disambiguated via their qualifiers
        # see http://flybase.org/reports/FBal0207398 as an example
        assoc.set_association_id(
            assoc.make_association_id(self.name, allele_curie,
                                      self.globaltt['has phenotype'],
                                      phenotype_curie, term_ids[1:]))
        assoc.add_association_to_graph()
        assoc_id = assoc.get_association_id()

        # add the rest as qualifiers
        for term in term_ids[1:]:
            if not term:
                # There is not an id for a term,
                # eg @:heat sensitive | tetracycline conditional@
                continue

            # FBal, GO, FBti, FBab ...
            id_match = id_regex.match(term)
            if id_match is None:
                raise ValueError(
                    "Could not parse id {}".format(term))
            prefix, local_id = id_match.group(1, 2)
            if prefix in fly_prefixes:
                term_curie = 'FlyBase:' + term
            else:
                term_curie = prefix + ':' + local_id

            self.graph.addTriple(assoc_id,
                                 self.globaltt['has_qualifier'],
                                 term_curie)