Exemplos de text em Python, exemplos de utopia.document.text em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: gpcrdb.py Projeto: adhalanay/utopia-documents-mirror

    def on_activate_event(self, document):
        """Annotate ``document`` with GPCRDB mentions, at most once.

        If ``document.annotations('GPCRDB cache')`` is non-empty the document
        is treated as already processed and nothing happens.  Otherwise the
        document text is handed to ``self.getMentions``, the returned
        mentions (except those whose ``mentionType`` is 'SPECIES') are
        grouped by their ``html`` attribute, and one annotation per group is
        added to the document.  Finally an empty annotation is added with
        'GPCRDB cache' as the second argument so later activations are
        skipped.

        :param document: the document being activated; must provide
            ``annotations()``, ``text()`` and ``addAnnotation()``.
        """
        if len(document.annotations('GPCRDB cache')) > 0:
            # The marker annotation is present: this document was handled.
            return

        print('annotating stuff . . .')

        pubmedId = utopialib.utils.metadata(document, 'identifiers[pubmed]')
        if pubmedId is not None:
            print('found pubmed id: ' + pubmedId)
        else:
            print('did not find pubmed id')

        # Group the mentions by the HTML they carry so that each distinct
        # HTML fragment ends up in exactly one annotation.
        mention_cache = {}
        for mention in self.getMentions(document.text(), pubmedId):
            if mention.mentionType != 'SPECIES':
                mention_cache.setdefault(mention.html, []).append(mention)

        for html, mentions in mention_cache.items():
            annotation = self.createAnnotation(document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)

        # Leave the marker that the guard at the top of this method checks.
        document.addAnnotation(spineapi.Annotation(), 'GPCRDB cache')

Exemplo n.º 2

0

Exibir arquivo

Arquivo: nucleardb.py Projeto: project-renard-survey/utopia-documents-mirror

    def on_activate_event(self, document):
        """Annotate ``document`` with NucleaRDB mentions, at most once.

        If ``document.annotations('NucleaRDB cache')`` is non-empty the
        document is treated as already processed and nothing happens.
        Otherwise the document text is handed to ``self.getMentions``, the
        returned mentions (except those whose ``mentionType`` is 'SPECIES')
        are grouped by their ``html`` attribute, and one annotation per group
        is added to the document.  Finally an empty annotation is added with
        'NucleaRDB cache' as the second argument so later activations are
        skipped.

        :param document: the document being activated; must provide
            ``annotations()``, ``text()`` and ``addAnnotation()``.
        """
        if len(document.annotations('NucleaRDB cache')) > 0:
            # The marker annotation is present: this document was handled.
            return

        print('annotating stuff . . .')

        pubmedId = common.utils.metadata(document, 'pmid')
        if pubmedId is not None:
            print('found pubmed id: ' + pubmedId)
        else:
            print('did not find pubmed id')

        # Group the mentions by the HTML they carry so that each distinct
        # HTML fragment ends up in exactly one annotation.
        mention_cache = {}
        for mention in self.getMentions(document.text(), pubmedId):
            if mention.mentionType != 'SPECIES':
                mention_cache.setdefault(mention.html, []).append(mention)

        for html, mentions in mention_cache.items():
            annotation = self.createAnnotation(document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)

        # Leave the marker that the guard at the top of this method checks.
        document.addAnnotation(spineapi.Annotation(), 'NucleaRDB cache')

Exemplo n.º 3

0

Exibir arquivo

    def on_activate_event(self, document, data=None):
        """Run 3DM annotation for one domain when asked to.

        ``self.annotatedDomains`` is initialised to an empty list if it is
        still ``None``.  Nothing else happens unless ``data['action']`` is
        'annotate'; in that case the document text is sent to
        ``self.getMentions`` for ``data['domain']``, every returned mention
        whose 'mentionType' is neither 'SPECIES' nor 'PDB' gets 'html',
        'css' and 'js' entries built by ``self.buildHtml``, and one
        annotation per distinct HTML string is added to the document,
        followed by an empty annotation added with the domain as second
        argument.

        :param document: the document being activated; must provide
            ``text()`` and ``addAnnotation()``.
        :param data: optional mapping with 'action' and 'domain' keys;
            ``None`` (the default) is treated as an empty mapping.
        """
        # A None sentinel avoids sharing one dict object between all calls.
        if data is None:
            data = {}
        action = data.get('action')
        domain = data.get('domain')

        if self.annotatedDomains is None:
            self.annotatedDomains = []

        if action != 'annotate':
            return

        print('starting 3DM annotation . . .')
        pubmedId = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')
        if pubmedId is None:
            pubmedId = '0'
        # The PubMed id is only reported here; it is not sent to getMentions.
        print('sending text to remote server (' + pubmedId + '). . .')
        textMentions = self.getMentions(domain, document.text())
        print('received response, adding annotations for domain ' + domain + ' . . .')

        # Group the mentions by their rendered HTML (UTF-8 encoded).
        mention_cache = {}
        for mention in textMentions:
            if mention['mentionType'] not in ('SPECIES', 'PDB'):
                html, css, js = self.buildHtml(domain, mention)
                mention['html'] = html.encode('utf-8')
                mention['css'] = css.encode('utf-8')
                mention['js'] = js.encode('utf-8')
                mention_cache.setdefault(mention['html'], []).append(mention)

        for html, mentions in mention_cache.items():
            annotation = self.createAnnotation(domain, document, html, mentions)
            annotation['displayRelevance'] = '2000'
            annotation['displayRank'] = '2000'
            document.addAnnotation(annotation)

        document.addAnnotation(Annotation(), domain)
        print('done adding annotations.')

Exemplo n.º 4

0

Exibir arquivo

    def on_activate_event(self, document):
        """Annotate ``document`` with term definitions from ScienceWISE.

        The UTF-8 encoded document text and its MD5 hex digest are POSTed to
        the ScienceWISE API.  The JSON reply is iterated as a sequence of
        mappings; for every entry that has a non-empty 'value' (the term)
        and a 'link', the term is located with ``document.findInContext``
        using the entry's 'before'/'after' context, and, if it is found, an
        annotation carrying the link, the term and its definitions is
        created with one extent per match.  All annotations are added to the
        document in a single ``addAnnotations`` call.

        :param document: the document being activated; must provide
            ``text()``, ``findInContext()`` and ``addAnnotations()``.
        """
        # Function-scope import so the method does not rely on a file-level one.
        from contextlib import closing

        text = document.text().encode('utf8')
        text_hash = hashlib.md5(text).hexdigest()

        url = 'http://beta.sciencewise.info/api/utopia'
        payload = urllib.urlencode({'text': text, 'chksum': text_hash})
        # urllib2 responses are not context managers; closing() guarantees
        # the connection is released even if read() or the parser raises.
        with closing(urllib2.urlopen(url, payload, timeout=8)) as response:
            results = json.loads(response.read())

        annotations = []
        for result in results:
            context = result.get('context', {})
            before = context.get('before', '')
            term = result.get('value', '')
            after = context.get('after', '')
            link = result.get('link')

            # An entry is only usable with both a term and a target link.
            if len(term) == 0 or link is None:
                continue

            matches = document.findInContext(before, term, after)
            if len(matches) == 0:
                continue

            annotation = spineapi.Annotation()
            annotation['concept'] = 'ScienceWISE'
            annotation['property:webpageUrl'] = link
            annotation['property:term'] = term
            annotation['property:name'] = 'Definitions of {0}'.format(term)
            annotation['property:description'] = 'ScienceWISE ontology definitions'
            annotation['property:sourceDatabase'] = 'sciencewise'
            annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides phycists with article annotation and scientific bookmarking.</p>'
            # Each definition is stored as "<url> <title>".
            for definition in result.get('definitions', []):
                annotation.insertProperty(
                    'property:definitions',
                    '{0} {1}'.format(definition.get('url'),
                                     definition.get('title')))
            for match in matches:
                annotation.addExtent(match)
            annotations.append(annotation)

        if len(annotations) > 0:
            document.addAnnotations(annotations)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: sciencewise.py Projeto: project-renard-survey/utopia-documents-mirror

    def on_activate_event(self, document):
        """Annotate ``document`` with term definitions from ScienceWISE.

        The UTF-8 encoded document text and its MD5 hex digest are POSTed to
        the ScienceWISE API.  The JSON reply is iterated as a sequence of
        mappings; for every entry that has a non-empty 'value' (the term)
        and a 'link', the term is located with ``document.findInContext``
        using the entry's 'before'/'after' context, and, if it is found, an
        annotation carrying the link, the term and its definitions is
        created with one extent per match.  All annotations are added to the
        document in a single ``addAnnotations`` call.

        :param document: the document being activated; must provide
            ``text()``, ``findInContext()`` and ``addAnnotations()``.
        """
        # Function-scope import so the method does not rely on a file-level one.
        from contextlib import closing

        text = document.text().encode('utf8')
        text_hash = hashlib.md5(text).hexdigest()

        url = 'http://beta.sciencewise.info/api/utopia'
        payload = urllib.urlencode({ 'text': text, 'chksum': text_hash })
        # urllib2 responses are not context managers; closing() guarantees
        # the connection is released even if read() or the parser raises.
        with closing(urllib2.urlopen(url, payload, timeout=8)) as response:
            results = json.loads(response.read())

        annotations = []
        for result in results:
            context = result.get('context', {})
            before = context.get('before', '')
            term = result.get('value', '')
            after = context.get('after', '')
            link = result.get('link')

            # An entry is only usable with both a term and a target link.
            if len(term) == 0 or link is None:
                continue

            matches = document.findInContext(before, term, after)
            if len(matches) == 0:
                continue

            annotation = spineapi.Annotation()
            annotation['concept'] = 'ScienceWISE'
            annotation['property:webpageUrl'] = link
            annotation['property:term'] = term
            annotation['property:name'] = 'Definitions of {0}'.format(term)
            annotation['property:description'] = 'ScienceWISE ontology definitions'
            annotation['property:sourceDatabase'] = 'sciencewise'
            annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides phycists with article annotation and scientific bookmarking.</p>'
            # Each definition is stored as "<url> <title>".
            for definition in result.get('definitions', []):
                annotation.insertProperty('property:definitions', '{0} {1}'.format(definition.get('url'), definition.get('title')))
            for match in matches:
                annotation.addExtent(match)
            annotations.append(annotation)

        if len(annotations) > 0:
            document.addAnnotations(annotations)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: Annotator3dmNG.py Projeto: project-renard-survey/utopia-documents-mirror

    def on_activate_event(self, document, data=None):
        """Run 3DM annotation for one domain when asked to.

        ``self.annotatedDomains`` is initialised to an empty list if it is
        still ``None``.  Nothing else happens unless ``data["action"]`` is
        "annotate"; in that case the document text and PubMed id are sent to
        ``self.getMentions`` for ``data["domain"]``.  Every returned mention
        whose ``mentionType`` is neither "SPECIES" nor "PDB" has its ``data``
        replaced by ``self.rewriteData(mention)`` and gets ``html``, ``css``
        and ``js`` attributes built by ``self.buildHtml``.  One annotation
        per distinct HTML string is then added to the document, followed by
        an empty annotation added with the domain as second argument.

        :param document: the document being activated; must provide
            ``text()`` and ``addAnnotation()``.
        :param data: optional mapping with "action" and "domain" keys;
            ``None`` (the default) is treated as an empty mapping.
        """
        # A None sentinel avoids sharing one dict object between all calls.
        if data is None:
            data = {}
        action = data.get("action")
        domain = data.get("domain")

        if self.annotatedDomains is None:
            self.annotatedDomains = []

        if action != "annotate":
            return

        print("starting 3DM anntotation . . .")
        pubmedId = common.utils.metadata(document, "pmid")
        if pubmedId is None:
            # "0" is what gets sent when no PubMed id is known.
            pubmedId = "0"
        print("sending text to remote server (" + pubmedId + "). . .")
        textMentions = self.getMentions(domain, document.text(), pubmedId)
        print("recieved response, adding annotations for domain " + domain + " . . .")

        # Group the mentions by their rendered HTML (UTF-8 encoded).
        mention_cache = {}
        for mention in textMentions:
            if mention.mentionType not in ("SPECIES", "PDB"):
                mention.data = self.rewriteData(mention)
                html, css, js = self.buildHtml(domain, mention)
                mention.html = html.encode("utf-8")
                mention.css = css.encode("utf-8")
                mention.js = js.encode("utf-8")
                mention_cache.setdefault(mention.html, []).append(mention)

        for html, mentions in mention_cache.items():
            annotation = self.createAnnotation(domain, document, html, mentions)
            annotation["displayRelevance"] = "2000"
            annotation["displayRank"] = "2000"
            document.addAnnotation(annotation)

        document.addAnnotation(Annotation(), domain)
        print("done adding annotations.")

Exemplo n.º 7

0

Exibir arquivo

    def after_ready_event(self, document):
        """Hyperlink compounds and ontology terms using RSC article metadata.

        Only the first annotation whose concept is 'RSCMetadataLink' is
        considered.  Its DOI is sent to the RSC federated search; if an
        article with the same DOI (compared upper-cased) is returned, a
        'Hyperlink' annotation is added for every compound and every
        ontology annotation that is both referenced in the article body and
        defined in the article back matter.  Each hyperlink gets one extent
        per whole-word, case-insensitive match of the referenced text in the
        document.

        :param document: the loaded document; must provide
            ``annotations()``, ``search()`` and ``addAnnotation()``.
        """
        # Function-scope import so the method does not rely on a file-level one.
        from contextlib import closing

        # XML namespace prefix shared by every element looked up below.
        ns = '{http://www.rsc.org/schema/rscart38}'

        for annotation in document.annotations():
            if annotation.get('concept') != 'RSCMetadataLink':
                continue

            doi = annotation['property:doi'].upper()

            # NOTE(review): the DOI is interpolated into the XML unescaped;
            # confirm DOIs containing '&' or '<' cannot reach this point.
            xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi

            baseurl = 'http://pubs.rsc.org/en/federated/search'
            params = {
                'federatedsearchname': 'Utopia',
                'inputxml': xmlquery,
            }
            url = baseurl + '?%s' % urllib.urlencode(params)
            # urllib2 responses are not context managers; closing()
            # guarantees the connection is released even if read() raises.
            with closing(urllib2.urlopen(url, timeout=14)) as response:
                searchresult = response.read()
            root = etree.fromstring(searchresult)

            # The search can return more than one article, so pick out only
            # the one with the correct DOI.
            thearticle = None
            articleID = None
            for article in root.findall('./%sarticle' % ns):
                found_doi = article.findtext(
                    "./%smetainfo/%smeta[@field='doi']" % (ns, ns))
                if found_doi is None:
                    found_doi = article.findtext(
                        './%sart-admin/%sdoi' % (ns, ns))
                if found_doi is not None and found_doi.upper() == doi:
                    thearticle = article
                    articleIDelem = article.find(
                        "./%smetainfo/%smeta[@field='docid']" % (ns, ns))
                    if articleIDelem is not None:
                        articleID = articleIDelem.text
                    break

            if thearticle is not None:
                search_options = spineapi.WholeWordsOnly + spineapi.IgnoreCase

                # Text of every compound mentioned in the article body,
                # keyed by the value of its 'idrefs' attribute.
                compoundText = {}
                for compname in thearticle.findall(
                        './%sart-body/%scompname' % (ns, ns)):
                    # This removes (erroneous?) elements from inside the XML.
                    etree.strip_elements(compname, ns + 'compound',
                                         with_tail=False)
                    compoundText[compname.attrib['idrefs']] = etree.tounicode(
                        compname, method='text')

                # Same for the ontology annotation references in the body.
                annotationText = {}
                for annref in thearticle.findall(
                        './%sart-body/%sannref' % (ns, ns)):
                    # This removes (erroneous?) elements from inside the XML.
                    etree.strip_elements(annref, ns + 'annotation',
                                         with_tail=False)
                    annotationText[annref.attrib['idrefs']] = etree.tounicode(
                        annref, method='text')

                # Link every compound defined in the article back matter
                # that is also mentioned in the body.
                for compound in thearticle.findall(
                        './%sart-back/%scompound' % (ns, ns)):
                    compound_id = compound.attrib['id']
                    if compound_id not in compoundText:
                        continue

                    csid = compound.find("./%slink[@type='CSID']" % ns)
                    if csid is not None and csid.text is not None:
                        # A compound with a CSID links to its ChemSpider
                        # page; the first five characters of the link text
                        # are dropped (presumably a 'CSID:' prefix - confirm).
                        url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:]
                    else:
                        # Otherwise, use the RSC landing page.
                        url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (
                            compound_id[4:], compoundText[compound_id],
                            articleID)

                    matches = document.search(compoundText[compound_id],
                                              search_options)
                    hyperlink = spineapi.Annotation()
                    hyperlink['concept'] = 'Hyperlink'
                    hyperlink['property:webpageUrl'] = url
                    for match in matches:
                        hyperlink.addExtent(match)
                    document.addAnnotation(hyperlink)

                # Similarly, for all the ontology annotations.
                for definition in thearticle.findall(
                        './%sart-back/%sannotation' % (ns, ns)):
                    definition_id = definition.attrib['id']
                    if definition_id not in annotationText:
                        continue

                    # Without a link there is nothing to point at.
                    link = definition.findtext('./%slink' % ns)
                    if link is None:
                        continue

                    if link.startswith('AU:'):
                        url = 'http://goldbook.iupac.org/%s.html' % link[3:]
                    else:
                        # RSC ontology landing page.
                        url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (
                            link, articleID)

                    matches = document.search(annotationText[definition_id],
                                              search_options)
                    hyperlink = spineapi.Annotation()
                    hyperlink['concept'] = 'Hyperlink'
                    hyperlink['property:webpageUrl'] = url
                    for match in matches:
                        hyperlink.addExtent(match)
                    document.addAnnotation(hyperlink)

            # Only the first RSCMetadataLink annotation is processed.
            break

Exemplo n.º 8

0

Exibir arquivo

Arquivo: rsc.py Projeto: project-renard-survey/utopia-documents-mirror

    def after_ready_event(self, document):
        """Hyperlink compounds and ontology terms using RSC article metadata.

        Only the first annotation whose concept is 'RSCMetadataLink' is
        considered.  Its DOI is sent to the RSC federated search; if an
        article with the same DOI (compared upper-cased) is returned, a
        'Hyperlink' annotation is added for every compound and every
        ontology annotation that is both referenced in the article body and
        defined in the article back matter.  Each hyperlink gets one extent
        per whole-word, case-insensitive match of the referenced text in the
        document.

        :param document: the loaded document; must provide
            ``annotations()``, ``search()`` and ``addAnnotation()``.
        """
        # Function-scope import so the method does not rely on a file-level one.
        from contextlib import closing

        # XML namespace prefix shared by every element looked up below.
        ns = '{http://www.rsc.org/schema/rscart38}'

        for annotation in document.annotations():
            if annotation.get('concept') != 'RSCMetadataLink':
                continue

            doi = annotation['property:doi'].upper()

            # NOTE(review): the DOI is interpolated into the XML unescaped;
            # confirm DOIs containing '&' or '<' cannot reach this point.
            xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi

            baseurl = 'http://pubs.rsc.org/en/federated/search'
            params = { 'federatedsearchname': 'Utopia',
                       'inputxml': xmlquery }
            url = baseurl + '?%s' % urllib.urlencode(params)
            # urllib2 responses are not context managers; closing()
            # guarantees the connection is released even if read() raises.
            with closing(urllib2.urlopen(url, timeout=14)) as response:
                searchresult = response.read()
            root = etree.fromstring(searchresult)

            # The search can return more than one article, so pick out only
            # the one with the correct DOI.
            thearticle = None
            articleID = None
            for article in root.findall('./%sarticle' % ns):
                found_doi = article.findtext("./%smetainfo/%smeta[@field='doi']" % (ns, ns))
                if found_doi is None:
                    found_doi = article.findtext('./%sart-admin/%sdoi' % (ns, ns))
                if found_doi is not None and found_doi.upper() == doi:
                    thearticle = article
                    articleIDelem = article.find("./%smetainfo/%smeta[@field='docid']" % (ns, ns))
                    if articleIDelem is not None:
                        articleID = articleIDelem.text
                    break

            if thearticle is not None:
                search_options = spineapi.WholeWordsOnly + spineapi.IgnoreCase

                # Text of every compound mentioned in the article body,
                # keyed by the value of its 'idrefs' attribute.
                compoundText = {}
                for compname in thearticle.findall('./%sart-body/%scompname' % (ns, ns)):
                    # This removes (erroneous?) elements from inside the XML.
                    etree.strip_elements(compname, ns + 'compound', with_tail=False)
                    compoundText[compname.attrib['idrefs']] = etree.tounicode(compname, method='text')

                # Same for the ontology annotation references in the body.
                annotationText = {}
                for annref in thearticle.findall('./%sart-body/%sannref' % (ns, ns)):
                    # This removes (erroneous?) elements from inside the XML.
                    etree.strip_elements(annref, ns + 'annotation', with_tail=False)
                    annotationText[annref.attrib['idrefs']] = etree.tounicode(annref, method='text')

                # Link every compound defined in the article back matter
                # that is also mentioned in the body.
                for compound in thearticle.findall('./%sart-back/%scompound' % (ns, ns)):
                    compound_id = compound.attrib['id']
                    if compound_id not in compoundText:
                        continue

                    csid = compound.find("./%slink[@type='CSID']" % ns)
                    if csid is not None and csid.text is not None:
                        # A compound with a CSID links to its ChemSpider
                        # page; the first five characters of the link text
                        # are dropped (presumably a 'CSID:' prefix - confirm).
                        url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:]
                    else:
                        # Otherwise, use the RSC landing page.
                        url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (compound_id[4:], compoundText[compound_id], articleID)

                    matches = document.search(compoundText[compound_id], search_options)
                    hyperlink = spineapi.Annotation()
                    hyperlink['concept'] = 'Hyperlink'
                    hyperlink['property:webpageUrl'] = url
                    for match in matches:
                        hyperlink.addExtent(match)
                    document.addAnnotation(hyperlink)

                # Similarly, for all the ontology annotations.
                for definition in thearticle.findall('./%sart-back/%sannotation' % (ns, ns)):
                    definition_id = definition.attrib['id']
                    if definition_id not in annotationText:
                        continue

                    # Without a link there is nothing to point at.
                    link = definition.findtext('./%slink' % ns)
                    if link is None:
                        continue

                    if link.startswith('AU:'):
                        url = 'http://goldbook.iupac.org/%s.html' % link[3:]
                    else:
                        # RSC ontology landing page.
                        url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (link, articleID)

                    matches = document.search(annotationText[definition_id], search_options)
                    hyperlink = spineapi.Annotation()
                    hyperlink['concept'] = 'Hyperlink'
                    hyperlink['property:webpageUrl'] = url
                    for match in matches:
                        hyperlink.addExtent(match)
                    document.addAnnotation(hyperlink)

            # Only the first RSCMetadataLink annotation is processed.
            break