def on_load_event(self, document): # Email links for match in document.search(self.email, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp): if not areas_intersect(match.areas(), self.existing_areas): annotation = spineapi.Annotation() annotation['concept'] = 'Hyperlink' annotation['property:webpageUrl'] = 'mailto:%s' % match.text() annotation['session:volatile'] = '1' annotation.addExtent(match) document.addAnnotation(annotation) else: print 'ignoring clashing email link text:', match.text().encode('utf8') # HTTP(S) links for match in document.search(self.http, spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp): if not areas_intersect(match.areas(), self.existing_areas): if match.begin().lineArea()[1] == 0: # Only while vertical links are rendered wrongly FIXME url = match.text() if not url.startswith('http'): url = 'http://' + url annotation = spineapi.Annotation() annotation['concept'] = 'Hyperlink' annotation['property:webpageUrl'] = '%s' % url annotation['session:volatile'] = '1' annotation.addExtent(match) document.addAnnotation(annotation) else: print 'ignoring clashing http link text:', match.text().encode('utf8')
def on_load_event(self, document):
    """Overlay volatile hyperlink annotations on email and HTTP(S) addresses.

    Matches whose on-page areas clash with already-annotated regions
    (self.existing_areas) are skipped and logged instead.
    """
    flags = spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp
    # Email links
    for match in document.search(self.email, flags):
        if not areas_intersect(match.areas(), self.existing_areas):
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Hyperlink'
            annotation['property:webpageUrl'] = 'mailto:%s' % match.text()
            annotation['session:volatile'] = '1'  # do not persist this annotation
            annotation.addExtent(match)
            document.addAnnotation(annotation)
        else:
            # Single-argument print: the original two-argument print(...) form
            # prints a tuple repr under Python 2 (this file uses Python 2
            # syntax elsewhere) unless print_function has been imported.
            print('ignoring clashing email link text: %s' % match.text().encode('utf8'))
    # HTTP(S) links
    for match in document.search(self.http, flags):
        if not areas_intersect(match.areas(), self.existing_areas):
            if match.begin().lineArea()[1] == 0: # Only while vertical links are rendered wrongly FIXME
                url = match.text()
                if not url.startswith('http'):
                    url = 'http://' + url
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Hyperlink'
                annotation['property:webpageUrl'] = '%s' % url
                annotation['session:volatile'] = '1'
                annotation.addExtent(match)
                document.addAnnotation(annotation)
        else:
            print('ignoring clashing http link text: %s' % match.text().encode('utf8'))
def on_activate_event(self, document):
    """Hyperlink every occurrence of 'functional abnormalities' to NeuroSynth.

    One annotation is shared per distinct sanitised match text; each
    occurrence is attached to it as an extent.
    """
    pattern = r'(functional abnormalities)'
    pending = {}  # sanitised match text -> annotation awaiting insertion
    for hit in document.search(pattern, spineapi.RegExp + spineapi.WholeWordsOnly):
        # Sanitise matched text so it can act as a dictionary key
        key = hit.text().lower().strip()
        if key not in pending:
            # First sighting of this text: build its annotation
            note = spineapi.Annotation()
            note['concept'] = 'NeuroSynthAnnotation'
            note['property:name'] = key
            note['property:description'] = 'Link to NeuroSynth'
            note['property:webpageUrl'] = 'http://beta.neurosynth.org/features/{0}/'.format(urllib.quote(key))
            note['session:overlay'] = 'hyperlink'
            note['session:color'] = '#00AA00' # Green
            pending[key] = note
        # Every occurrence extends the shared annotation
        pending[key].addExtent(hit)
    # Finally, add the annotations to the document
    document.addAnnotations(pending.values())
def _populate(self, document):
    """Trigger self.annotate() when the document mentions GPCRs."""
    sep = r'[\s\p{Pd}]'  # whitespace or any dash punctuation between words
    pattern = r'(gpcr|g{0}protein{0}coupled{0}receptor)'.format(sep)
    hits = document.search(pattern, spineapi.IgnoreCase + spineapi.RegExp)
    # Annotate only when at least one mention is present
    if len(hits) > 0:
        self.annotate(document)
def on_ready_event(self, document):
    """Annotate AGI (gene identifier) matches with plant gene database links.

    Groups all matches of self.agiRegex by their matched text and creates one
    'AGI' annotation per distinct identifier, branded per journal when the
    document's DOI indicates Plant Physiology or The Plant Cell.
    """
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')

    # Find and aggregate AGI instances in the document
    matches_by_agi = {}
    for match in document.search(self.agiRegex, spineapi.RegExp + spineapi.WholeWordsOnly):
        matches_by_agi.setdefault(match.text(), []).append(match)

    # One annotation per distinct AGI, spanning all of its matches.
    # (An unused inline HTML template local was removed here.)
    for agi, matches in matches_by_agi.iteritems():
        annotation = spineapi.Annotation()
        annotation['concept'] = 'AGI'
        annotation['property:agi'] = agi
        annotation['property:name'] = 'Plant gene databases'
        annotation['property:description'] = 'American Society of Plant Biologists'
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
        # Tailor branding when the paper itself is from an ASPB journal
        if doi is not None:
            if doi.startswith('10.1104/'):  # Plant Physiology DOI prefix
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                annotation['property:description'] = 'From Plant Physiology'
            elif doi.startswith('10.1105/'):  # The Plant Cell DOI prefix
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                annotation['property:description'] = 'From Plant Cell'
        annotation['property:sourceDescription'] = ''' <p> The <a href="http://www.aspb.org/">American Society of Plant Biologists</a> have deemed these linked databases important sources of information. </p> '''
        annotation.addExtents(matches)
        document.addAnnotation(annotation)
def on_ready_event(self, document):
    """Annotate AGI (gene identifier) matches with plant gene database links.

    Groups all matches of self.agiRegex by their matched text and creates one
    'AGI' annotation per distinct identifier, branded per journal when the
    document's DOI indicates Plant Physiology or The Plant Cell.
    """
    # Empty-string default keeps the startswith() checks below safe
    doi = common.utils.metadata(document, 'doi', '')

    # Find and aggregate AGI instances in the document
    matches_by_agi = {}
    for match in document.search(self.agiRegex, spineapi.RegExp + spineapi.WholeWordsOnly):
        matches_by_agi.setdefault(match.text(), []).append(match)

    # One annotation per distinct AGI, spanning all of its matches.
    # (An unused inline HTML template local was removed here.)
    for agi, matches in matches_by_agi.iteritems():
        annotation = spineapi.Annotation()
        annotation['concept'] = 'AGI'
        annotation['property:agi'] = agi
        annotation['property:name'] = 'Plant gene databases'
        annotation['property:description'] = 'American Society of Plant Biologists'
        annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
        # Tailor branding when the paper itself is from an ASPB journal
        if doi.startswith('10.1104/'):  # Plant Physiology DOI prefix
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
            annotation['property:description'] = 'From Plant Physiology'
        elif doi.startswith('10.1105/'):  # The Plant Cell DOI prefix
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
            annotation['property:description'] = 'From Plant Cell'
        annotation['property:sourceDescription'] = ''' <p> The <a href="http://www.aspb.org/">American Society of Plant Biologists</a> have deemed these linked databases important sources of information. </p> '''
        annotation.addExtents(matches)
        document.addAnnotation(annotation)
def on_activate_event(self, document):
    """Scan the document for this plugin's terms and build hyperlink annotations.

    One annotation is created per distinct (lower-cased, stripped) match text;
    every occurrence of that text is added to it as an extent.
    """
    _debug('activate base')
    # Scan the document for some regular expression
    matches = document.search(
        self.get_terms_regex(),
        spineapi.RegExp + spineapi.WholeWordsOnly + spineapi.IgnoreCase
    )
    to_add = {}  # Dictionary of annotations to add, keyed by sanitised match text
    try:
        for match in matches:
            _debug("Match %s" % str(match))
            # Sanitise matches found in document for dict keys
            match_text = match.text().lower().strip()
            # Has same text already been annotated?
            annotation = to_add.get(match_text, None)
            if annotation is None:
                # If no, create new annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Annotation %s' % self._name
                annotation['property:name'] = match_text
                annotation['property:description'] = 'Link to %s' % self._name
                annotation['property:webpageUrl'] = self.get_url(match_text)
                annotation['session:overlay'] = 'hyperlink'
                annotation['session:color'] = '#00AA00' # Green
                to_add[match_text] = annotation
            # Add the match to the annotation, in any case
            # (annotation is always non-None here, so the old guard was redundant)
            _debug("Added %s" % str(annotation))
            annotation.addExtent(match)
    except Exception as e:  # was 'except Exception, e' — Python-2-only syntax
        _debug("ERROR: %s" % str(e))
    # NOTE(review): to_add is built but never attached to the document in this
    # block — presumably a subclass or caller consumes it; confirm before relying.
def on_activate_event(self, document):
    """Scan the document for this plugin's terms and build hyperlink annotations.

    Duplicate of the variant above; one annotation per distinct sanitised
    match text, each occurrence attached as an extent.
    """
    _debug('activate base')
    # Scan the document for some regular expression
    matches = document.search(
        self.get_terms_regex(),
        spineapi.RegExp + spineapi.WholeWordsOnly + spineapi.IgnoreCase)
    to_add = {}  # Dictionary of annotations to add, keyed by sanitised match text
    try:
        for match in matches:
            _debug("Match %s" % str(match))
            # Sanitise matches found in document for dict keys
            match_text = match.text().lower().strip()
            # Has same text already been annotated?
            annotation = to_add.get(match_text, None)
            if annotation is None:
                # If no, create new annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Annotation %s' % self._name
                annotation['property:name'] = match_text
                annotation['property:description'] = 'Link to %s' % self._name
                annotation['property:webpageUrl'] = self.get_url(match_text)
                annotation['session:overlay'] = 'hyperlink'
                annotation['session:color'] = '#00AA00' # Green
                to_add[match_text] = annotation
            # Add the match to the annotation, in any case
            # (annotation is always non-None here, so the old guard was redundant)
            _debug("Added %s" % str(annotation))
            annotation.addExtent(match)
    except Exception as e:  # was 'except Exception, e' — Python-2-only syntax
        _debug("ERROR: %s" % str(e))
def on_ready_event(self, document):
    """Scrape the Highwire full-text page for this document's DOI and annotate
    the document with its bibliography, in-text citation contexts and tables.

    NOTE(review): reformatted from a whitespace-mangled source; the statement
    grouping has been reconstructed and should be confirmed against history.
    """
    doi = common.utils.metadata(document, 'doi')
    if doi is not None:
        info = {}
        # Resolve the DOI to find the publisher's website
        response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)
        # Parse page to find (if there) the full text URL
        parser = etree.HTMLParser()
        html = etree.parse(response, parser)
        # Only continue if this is a highwire HTML page
        if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
            return
        # Now make sure we have the full text XHTML
        citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
        if len(citation_fulltext_html_url) > 0:
            citation_fulltext_html_url = citation_fulltext_html_url[0]
            # Fetch that full text page (if different to the current one)
            if citation_fulltext_html_url != response.geturl():
                response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                html = etree.parse(response, parser)
                #print etree.tostring(html, pretty_print=True, encoding='utf8')
            # Now parse out the bibliography
            info['citations'] = []
            info['citations_by_id'] = {}
            for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                # Pull each citation field out of the Highwire markup by class name
                citation = query(bibitem, {
                    'id': 'a/@id',
                    'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                    'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                    'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                    'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                    'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                    'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                    'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                    'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                    'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                    'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                    'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                })
                authors = []
                for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                    surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                    given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                    if len(surname) > 0 and len(given_names) > 0:
                        authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                if len(authors) > 0:
                    citation['authors'] = authors
                citation['contexts'] = []
                citation['displayText'] = common.utils.format_citation(citation)
                info['citations'].append(citation)
                info['citations_by_id'][citation['id']] = citation
                #print citation
            #######################################################################################
            # Parse in-text citations if present
            min_length = 10
            max_length = 20
            for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                # text_stack and xref_stack are kept in lockstep: odd indices hold
                # citation-link text / xref lists, even indices the text between them
                text_stack = [paragraph.text or '']
                xref_stack = [None]
                for elem in paragraph:
                    if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                        text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                        text_stack.append(elem.tail or '')
                        xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                        if xref is not None:
                            xref_stack += [[xref], None]
                        else:
                            xref_stack += [[], None]
                    elif isinstance(elem, etree._Entity):
                        points = entities.get(elem.text[1:-1])
                        if points is not None:
                            text_stack[-1] += ''.join((unichr(p) for p in points))
                        else:
                            text_stack[-1] += etree.tostring(elem, encoding=unicode)
                    else:
                        if elem.get('position') == 'float':
                            text_stack[-1] += elem.tail or ''
                        else:
                            text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)
                # Find and collapse ranges in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                    # if this text is a dash, we need to coalesce the text fragments
                    if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                #for text in text_stack:
                #    print text.encode('utf8')
                # Then make sure we resolve the implied citations
                for i in xrange(1, len(xref_stack), 2):
                    # Get actual cross references
                    xrefs = xref_stack[i]
                    # Expand cross references
                    try:
                        if len(xrefs) == 2:
                            labelfrom = int(xrefs[0].get('label'))
                            labelto = int(xrefs[1].get('label'))
                            candidates = {}
                            midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom+1, labelto)]
                            for candidate in info['citations']:
                                if candidate.get('label') in midlabels:
                                    candidates[int(candidate.get('label'))] = candidate
                            xrefs[1:-1] = candidates.values()
                    except:
                        raise
                # Find and collapse lists in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a comma, we need to coalesce the text fragments
                    if len(text) == 1 and text == ',':
                        text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                        xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                # Expand citations to include brackets (on both sides)
                for i in xrange(len(xref_stack) - 2, 0, -2):
                    before = text_stack[i-1].strip()[-1:]
                    text = text_stack[i].strip()
                    after = text_stack[i+1].strip()[:1]
                    # if this text is a comma, we need to coalesce the text fragments
                    #print before.encode('utf'), after.encode('utf')
                    if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                        text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1])
                        text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1])
                        text_stack[i] = before + text_stack[i] + after
                #print repr(text_stack)
                for i in xrange(1, len(xref_stack), 2):
                    # Get context
                    before = u' '.join(text_stack[:i]).strip()
                    label = text_stack[i].strip()
                    after = u' '.join(text_stack[i+1:]).strip()
                    # Strip out extraneous brackets
                    if len(xref_stack[i]) > 1:
                        # Hack to differentiate single / multiple citations
                        # as multiple numbers tend not to have spaces between them
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                    else:
                        label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)
                    # Normalise context
                    before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                    label = re.sub(r'\s+', ' ', label)
                    after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                    #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                    if len(before + after) > min_length:
                        for xref in xref_stack[i]:
                            xref['contexts'].append((before, label, after))
                    #print xref_stack[i]
            #######################################################################################
            # Parse tables if present
            info['tables'] = {}
            for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                #print table_url
                response = urllib2.urlopen(table_url, timeout=8)
                table_html = etree.parse(response, parser)
                for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                    id = table_expansion.get('id')
                    table = {}
                    table['xml'] = table_expansion.xpath('.//table[1]')[0]
                    table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                    if 'caption' not in table and 'caption_raw' in table:
                        table['caption'] = table['caption_raw']
                    if 'caption' in table:
                        table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                    if 'xml' in table:
                        table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                    info['tables'][id] = table
                    #print table
        #print info
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy any identifiers PubMed knows about that we do not
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id
            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)
            # Create Metadata link annotation
            link = document.newAccList('metadata', 90)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = ''' <p>This information was provided by {0}{1}.</p> '''.format(publisher, journalTitleSuffix)
            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])
            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])
            #######################################################################################
            # Apply parsed data to document
            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    #print (pre, label, post)
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'ForwardCitation'
                            annotation['property:state'] = 'found'
                            if 'title' in citation:
                                annotation['property:title'] = citation['title']
                            if 'id' in citation:
                                annotation['property:bibid'] = citation['id']
                            # PLOS articles expose a direct PDF download URL
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                if k in citation:
                                    annotation['property:{0}'.format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                            #print citation
                        except:
                            raise
                            pass # FIXME
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict = True)
                    #print regex
                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        # Translate OASIS column spans into HTML colspan/rowspan
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                        table['xml'] = etree.tostring(xml, encoding='utf8')
                    # Locate the table's caption in the rendered document
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    """Recognise eLife articles by DOI, hyperlink their sub-DOIs, then fetch
    the article's NLM XML from eLife and annotate metadata, references,
    in-text citation contexts and tables.

    NOTE(review): reformatted from a whitespace-mangled source; the statement
    grouping has been reconstructed and should be confirmed against history.
    """
    doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        match = self.splitRegEx.match(doi)
        if match is not None:
            articleNumber = match.group('number')
            # Brand the document as an eLife publication
            annotation = spineapi.Annotation()
            annotation['concept'] = 'PublisherIdentity'
            annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
            annotation['property:title'] = 'eLife'
            annotation['property:webpageUrl'] = 'http://www.elifesciences.org/'
            document.addAnnotation(annotation, 'PublisherMetadata')
            # Turn all the DOIs that are sub-DOIs of this document into links
            regex = r'{0}\.\d+'.format(re.escape(doi))
            for match in document.search(regex, spineapi.RegExp):
                url = 'http://dx.doi.org/{0}'.format(match.text())
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Hyperlink'
                annotation['property:webpageUrl'] = url
                annotation['session:volatile'] = '1'
                annotation.addExtent(match)
                document.addAnnotation(annotation)
            # Try to get the NLM directly from eLife
            url = 'http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}'
            url = url.format(articleNumber)
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                # Best effort: no NLM means nothing more to annotate
                return
            info = utopia.tools.nlm.parse(nlm)
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(utopia.tools.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            # Copy any identifiers PubMed knows about that we do not
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id
                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/annotation_icon.png', 'image/png')
                link['property:sourceTitle'] = 'eLife'
                link['property:sourceDescription'] = ''' <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p> '''
                # Create Metadata annotation
                annotation = utopia.tools.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])
                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopia.tools.utils.citation_to_annotation(citation)
                    document.addAnnotation(annotation, link['scratch'])
                #######################################################################################
                # Apply parsed data to document
                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopia.tools.utils.citation_to_annotation(citation, concept='ForwardCitation')
                                # PLOS articles expose a direct PDF download URL
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                            except:
                                raise
                # Tables: match each parsed table's caption in the document
                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        print regex
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
def on_ready_event(self, document):
    """Resolve Portland Press documents (via their CrossRef record) to volume
    and page, fetch the article's NLM XML from the utopia ext service, and
    annotate metadata, references, in-text citation contexts and tables.

    NOTE(review): reformatted from a whitespace-mangled source; the statement
    grouping has been reconstructed and should be confirmed against history.
    """
    volume, page = None, None
    # Only send if the DOI has a Portland prefix
    doi = common.utils.metadata(document, 'doi')
    if doi is not None and doi[:7] in registrants:
        crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')
        if crossref_unixref is not None:
            # Parse CrossRef redirect URL
            dom = etree.fromstring(crossref_unixref.encode('utf8'))
            resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource')
            if resource is not None:
                match = self.resourceRegExp.match(resource)
                if match is not None:
                    volume, page = match.groups()
        ### FIXME What information should be shown? Portland? BJ?
        #annotation = spineapi.Annotation()
        #annotation['concept'] = 'PublisherIdentity'
        #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
        #annotation['property:title'] = 'Portland Press Limited'
        #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
        #document.addAnnotation(annotation, 'PublisherMetadata')
    # If this document was resolved, off we go to fetch the NLM
    if None not in (volume, page):
        # Make a request to the utopia ext web service
        url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
        url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except:
            raise
            # NOTE(review): unreachable after 'raise' — possibly the intent was
            # to return (best-effort) instead of re-raising; confirm.
            return
        info = common.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy any identifiers PubMed knows about that we do not
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id
            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png')
            link['property:sourceTitle'] = 'Portland'
            link['property:sourceDescription'] = ''' <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p> '''
            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])
            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])
            #######################################################################################
            # Apply parsed data to document
            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'ForwardCitation'
                            annotation['property:state'] = 'found'
                            if 'title' in citation:
                                annotation['property:title'] = citation['title']
                            if 'id' in citation:
                                annotation['property:bibid'] = citation['id']
                            # PLOS articles expose a direct PDF download URL
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                if k in citation:
                                    annotation['property:{0}'.format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                            #print citation
                        except:
                            raise
                            pass # FIXME
            # Tables: match each parsed table's caption in the document
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict = True)
                    #print regex
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    '''Fetch information from the Lazarus service'''
    # Honour the user's opt-in preference before contacting the server.
    permission = self.get_config('permission', False)
    if permission:
        # If an outline already exists, don't make a new one
        needs_outline = True
        for annotation in document.annotations():
            if annotation.get('concept') == 'OutlineItem':
                needs_outline = False
                break

        # The Lazarus server needs to know what this document is
        document_id = utopia.tools.utils.metadata(document, 'identifiers[utopia]')
        this_doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if this_doi is not None:
            this_doi = u'doi:' + this_doi

        # Speak to server
        params = {'fingerprint': document.fingerprints()}
        url = '{0}?{1}'.format(laz_docUrl, urllib.urlencode(params, doseq=True))
        response = urllib2.urlopen(url, timeout=60)
        # A 204 response means the server has nothing for this fingerprint,
        # so POST the PDF data itself and use that response instead.
        if response.getcode() == 204:
            request = urllib2.Request(
                url,
                data=document.data(),
                headers={'Content-Type': 'application/pdf'})
            response = urllib2.urlopen(request, timeout=60)
        #response = open('/Users/dave/Desktop/ananiadou_tibtech06.pdf-response.xml', 'r')

        # Create Metadata link annotation
        link = document.newAccList('metadata', 50)
        link['property:sourceDatabase'] = 'lazarus'
        link['property:sourceTitle'] = 'Lazarus'
        link['property:sourceDescription'] = self.sourceDescription
        link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
            'images/lazarus-prefs-logo.png', 'image/png')

        # Buckets for the different kinds of annotation the server returns.
        headers = []
        pos = []
        refs = []
        annotations = []
        concepts = {}
        hits = []
        expression_annotations = []

        # Convert each server-side (kend) annotation into a spineapi one.
        for kAnnotation in kend.converter.XML.parse(
                response, kend.model.Document):
            #print kend.converter.XML.serialise(kAnnotation)[0]
            try:
                annotations.append(
                    utopia.tools.converters.Annotation.kend2spineapi(
                        kAnnotation, document))
            except:
                # NOTE(review): unconvertible annotations are silently dropped.
                pass
        annotations.sort(key=lambda a: int(a.get('structure:order', 0)))

        # Dispatch each annotation according to its concept.
        for sAnnotation in annotations:
            if sAnnotation['concept'] == 'structure_element':
                role, level = self.getHeaderRole(sAnnotation)
                if role is not None and needs_outline:
                    # Maintain a dotted positional counter (e.g. 2.1.3)
                    # tracking the current depth in the outline.
                    while len(pos) < level:
                        pos.append(0)
                    while len(pos) > level:
                        pos.pop()
                    pos[-1] += 1
                    outline = u'.'.join([unicode(i) for i in pos])
                    anchor_name = '#lazarus.outline.{0}'.format(outline)
                    # Anchor marks the position in the document...
                    anchor = spineapi.Annotation()
                    anchor['concept'] = 'Anchor'
                    anchor['property:anchor'] = anchor_name
                    anchor.addExtents(sAnnotation.extents())
                    anchor.addAreas(sAnnotation.areas())
                    document.addAnnotation(anchor)
                    # ...and the OutlineItem points at that anchor.
                    header = spineapi.Annotation()
                    header['concept'] = 'OutlineItem'
                    header['property:outlinePosition'] = outline
                    header['property:outlineTitle'] = u' '.join(
                        [e.text() for e in sAnnotation.extents()])
                    header['property:destinationAnchorName'] = anchor_name
                    document.addAnnotation(header)
                    print((u' ' * level + u'.'.join([unicode(i) for i in pos]) +
                           u' ' + u' '.join([
                               e.text() for e in sAnnotation.extents()
                           ])).encode('utf8'))
                elif 'bibitem' in sAnnotation.getAllProperties('structure:role'):
                    #refs.append(sAnnotation)
                    pass
            elif sAnnotation['concept'] == 'Citation':
                # Hack to fix a mistake in authors property name
                if 'property:author' in sAnnotation and not 'property:authors' in sAnnotation:
                    sAnnotation['property:authors'] = sAnnotation.getAllProperties(
                        'property:author')
                refs.append(sAnnotation)
            elif sAnnotation['concept'] == 'LazarusConcept':
                concept_id = sAnnotation.get('property:identifier')
                if concept_id is not None:
                    # Give each concept a fresh UUID so hits can refer to it.
                    sAnnotation['id'] = str(uuid.uuid4())
                    concepts[concept_id] = sAnnotation
                document.addAnnotation(sAnnotation, 'Lazarus Concept')
            elif sAnnotation['concept'] == 'LazarusConceptHit':
                hits.append(sAnnotation)
            elif sAnnotation['concept'] == 'LazarusSentenceExpression':
                expression_annotations.append(sAnnotation)
            else:
                document.addAnnotation(sAnnotation)

        for ref in refs:
            #print(ref.get('structure:order', '0'))
            pass
        refs = sorted(refs, key=lambda ref: int(ref.get('property:order', '0')))
        for ref in refs:
            #print(ref.get('structure:order', '0'))
            pass
        for ref in refs:
            # Create Bibliography annotations
            #citation = {'unstructured': u' '.join([e.text() for e in ref.extents()])}
            #annotation = utopia.tools.utils.citation_to_annotation(citation)
            #annotation['property:order'] = ref.get('structure:order')
            #annotation.addExtents(ref.extents())
            #annotation.addAreas(ref.areas())
            #document.addAnnotation(annotation, link['scratch'])
            document.addAnnotation(ref, link['scratch'])

        # Now link hits to concepts
        for i, hit in enumerate(hits):
            concept_id = hit.get('property:identifier')
            concept = concepts.get(concept_id)
            if concept is not None:
                concept_uuid = concept.get('id')
                hit['property:concept_id'] = concept_uuid
                identifier = concept.get('property:identifier')
                name = concept.get('property:name', '???')
                # externalSources is a JSON list serialised with a 'json:' prefix.
                sources = concept.get('property:externalSources', 'json:[]')
                if sources.startswith('json:'):
                    sources = json.loads(sources[5:])
                # Chemical identifiers are folded in as pseudo-sources; the
                # leading space in the database name appears deliberate
                # (sorts first / matched again below) — TODO confirm.
                if 'property:stdInchiKey' in concept:
                    sources.append({
                        'database': ' InchiKey',
                        'identifier': concept['property:stdInchiKey']
                    })
                if 'property:canonicalSmiles' in concept:
                    sources.append({
                        'database': ' SMILES',
                        'identifier': concept['property:canonicalSmiles']
                    })
                kind = concept.get('property:kind')
                kind = self.dbs.get(kind, {}).get('title', kind)
                # Group rendered HTML snippets by link category.
                links = {}
                for source in sources:
                    uri = source.get('uri')
                    if 'primary' in source.get('relationship', []):
                        links.setdefault('definition', [])
                        links['definition'].append(u'''
                            <a href="{uri}" title="{uri}">{database}</a>
                        '''.format(**source))
                    elif uri is None:
                        if source.get('database') in (' InchiKey', ' SMILES'):
                            links.setdefault('main', [])
                            links['main'].append(u'''
                                <tr><td>{database}:</td><td>{identifier}</td></tr>
                            '''.format(**source))
                    else:
                        identifier = source.get('identifier')
                        links_category = 'xref'
                        # NOTE(review): 'or uri is None' looks unreachable here
                        # given the elif above — defensive leftover?
                        if 'seeAlso' in source.get('relationship', []) or uri is None:
                            links_category = 'seeAlso'
                        links.setdefault(links_category, [])
                        if identifier is not None:
                            links[links_category].append(u'''
                                <a href="{uri}" title="{uri}">{name}...</a> ({identifier})
                            '''.format(**source))
                        else:
                            links[links_category].append(u'''
                                <a href="{uri}" title="{uri}">{name}...</a>
                            '''.format(**source))
                style = u'''
                    <style>
                    .lazarus-table tbody { border: none; }
                    .lazarus-table td:first-of-type { text-align: right; font-weight: bold; }
                    .lazarus-table td { vertical-align: top; }
                    .lazarus-table td:first-of-type { white-space: nowrap; }
                    .lazarus-table td:not(:first-of-type) { word-break: break-all; }
                    .lazarus-table tr td { padding-top: 0ex; padding-bottom: 0ex; }
                    .lazarus-table tbody:not(:first-of-type) tr:first-of-type td { padding-top: 1ex; }
                    </style>
                '''
                html = u'''
                    <table class="lazarus-table">
                    <tr><td>Name:</td><td>{name}</td></tr>
                '''.format(**{'name': name})
                categories = {
                    'xref': 'Related:',
                    'seeAlso': 'See also:',
                    'definition': 'Defined in:'
                }
                for links_category in ('main', 'xref', 'seeAlso', 'definition'):
                    links_title = categories.get(links_category)
                    these_links = sorted(
                        list(set(links.get(links_category, []))))
                    if len(these_links) > 0:
                        html += '<tbody>'
                        if links_category != 'main':
                            html += u'<tr><td>{0}</td><td>'.format(links_title)
                            html += u'<br>'.join(these_links)
                            html += '</td></tr>'
                        else:
                            html += ''.join(these_links)
                        html += '</tbody>'
                #pprint('------------------------')
                html += u'''
                    </table>
                '''
                #print(html)
                hasLinks = len(
                    links.get('xref', []) + links.get('seeAlso', [])) > 0
                ann = spineapi.Annotation()
                ann['concept'] = 'Collated'
                ann['property:name'] = u'{0}'.format(name)
                ann['property:description'] = 'Lazarus Concept'
                ann['session:semanticTerm'] = name
                ann['property:html'] = [style, html]
                ann['property:sourceDescription'] = self.sourceDescription
                ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/lazarus-prefs-logo.png', 'image/png')
                ann['session:overlay'] = 'hyperlink'
                ann['session:color'] = '#880000'
                count = 0
                # NOTE(review): looks like leftover debug output.
                print('====', 7)
                # hitFragments encode the match as 'pre{!match!}post'.
                if 'property:hitFragments' in hit:
                    hitFragments = hit.getAllProperties(
                        'property:hitFragments') or []
                    #pprint(hitFragments)
                    for hitFragment in hitFragments:
                        pre, _, rest = hitFragment.partition('{!')
                        match, _, post = rest.partition('!}')
                        #pprint((pre, match, post))
                        matches = document.findInContext(pre, match, post,
                                                         fuzzy=True)
                        count += len(matches)
                        ann.addExtents(matches)
                # Only annotate concepts that both matched in the document
                # and have something to link to.
                if hasLinks and count > 0:
                    document.addAnnotation(ann)

        style = u'''
            <style>
            .lazarus-expression .box { background-color: #FFF0E8; border-color: #EEE0D8; }
            .lazarus-related { padding-left: 42px; background-image: url(%s); background-repeat: no-repeat; background-position: top left; background-size: 37px 48px; min-height: 53px; }
            .lazarus-related + .lazarus-related { margin-top: 5px; border-top: 1px dotted #aaa; padding-top: 5px; background-position-y: 5px; min-height: 58px; }
            .lazarus-sentence { padding-left: 0.5em; color: black; }
            .lazarus-sentence.negative { border-left: solid 5px #bb0000; }
            .lazarus-sentence.positive { border-left: solid 5px #008800; }
            .lazarus-sentence.negative a { color: #bb0000; }
            .lazarus-sentence.positive a { color: #008800; }
            </style>
        ''' % utopia.get_plugin_data_as_url('images/pdf-page-icon.png',
                                            'image/png')
        # Collect summarising expressions, anchoring each matched sentence
        # in the document so the sidebar can link back to it.
        expressions = []
        for sAnnotation in expression_annotations:
            exp = sAnnotation.get('property:expressions', 'json:{}')
            if exp.startswith('json:'):
                exp = json.loads(exp[5:])
            context = sAnnotation.get('property:context')
            if context is not None:
                if exp.get('negative', False):
                    exp['posneg'] = 'negative'
                else:
                    exp['posneg'] = 'positive'
                pprint(context)
                pprint(exp)
                matched_context = exp.get('context')
                matches = []
                if matched_context is not None:
                    matches = document.search(
                        re.sub(r'\s+', ' ', matched_context))
                if len(matches) > 0:
                    anchor_id = str(uuid.uuid4())[1:-1]
                    anchor = spineapi.Annotation()
                    anchor['concept'] = 'Anchor'
                    anchor['property:anchor'] = anchor_id
                    anchor.addExtents(matches)
                    document.addAnnotation(anchor)
                    exp.update({
                        'anchor_id': anchor_id,
                        'sentence': context
                    })
                    expressions.append(exp)
        # Client-side behaviour for the expressions pane; the %s slots are
        # filled with JSON-encoded data below.
        js = u'''
            <script>
              $(document).on('DOMNodeInserted', function(e) {
                  var element = e.target;
                  $(element).filter('a[target="tab"]').add('a[target="tab"]', element).each(function () {
                      var fragment = $(this).closest('.-papyro-internal-citation').data('citation')['userdef']['first_fragment'];
                      $(this).attr('target', 'pdf; show=highlight; text=[' + encodeURIComponent(fragment) + ']');
                  });
              });
              $(function () {
                  var lazarus = {
                      expressions: %s,
                      fingerprints: %s,
                      relUrl: %s
                  };
                  var more_expressions_link = $('#lazarus-expression > p.more').hide();
                  var more_expressions_spinner = $('#lazarus-expression > div.spinner');
                  Spinners.create(more_expressions_spinner);
                  Spinners.play(more_expressions_spinner);
                  var exp_divs = [];
                  var identifiers = [];
                  for (var e = 0; e < lazarus.expressions.length; e++) {
                      var expression = lazarus.expressions[e];
                      var exp_div = $('<div class="box"></div>');
                      exp_div.data('expression', expression);
                      exp_div.hide();
                      exp_divs.push(exp_div);
                      identifiers.push(expression.identifiers);
                  }
                  var params = { fingerprint: lazarus.fingerprints };
                  var url = lazarus.relUrl + '?' + $.param(params, traditional=true);
                  $.ajax({
                      url: url,
                      type: 'POST',
                      dataType: 'json',
                      data: JSON.stringify(identifiers),
                      contentType: "application/json",
                      error: function (xhr, ajaxOptions, thrownError) {
                          console.log(xhr.statusText);
                          console.log(xhr.responseText);
                          console.log(xhr.status);
                          console.log(thrownError);
                          // FIXME do something here
                          Spinners.remove(more_expressions_spinner);
                      },
                      success: function (related) {
                          // Sort related according to the number of articles found
                          related.results.sort(function (l, r) {
                              var lv = Object.keys(l.related).length;
                              var rv = Object.keys(r.related).length;
                              return (lv > rv) ? -1 : (lv < rv) ? 1 : 0;
                          });
                          $.each(related.results, function (idx, result) {
                              var exp_div = exp_divs[idx];
                              var expression = exp_div.data('expression');
                              expression.related = result.related;
                              delete expression.related[%s];
                              split = expression.sentence.split(expression.context);
                              pre = split[0];
                              pre = pre.replace(/(\w)$/, '$1 ');
                              pre = pre.replace(/^\s*/, '');
                              match = expression.context;
                              post = split[1];
                              post = post.replace(/^(\w)/, ' $1');
                              post = post.replace(/\s*$/, '');
                              expression.pre = pre;
                              expression.match = match;
                              expression.post = post;
                              // Create expression element
                              exp_div.append('<p class="lazarus-sentence ' + expression.posneg + '">“' + expression.pre + '<a target="pdf; show=select; anchor=' + expression.anchor_id + '"><strong>' + expression.match + '</strong></a>' + expression.post + '”</p>');
                              exp_div.data('expression', expression);
                              $('#lazarus-expression > .content').append(exp_div);
                              if (Object.keys(expression.related).length > 0) {
                                  var related_div = $('<div class="expandable" title="Related expressions elsewhere"></div>');
                                  var related_div_content = $('<div></div>').appendTo(related_div);
                                  function on_expand() {
                                      related_div.off('papyro:expandable:expand', on_expand);
                                      $.each(expression.related, function (idx, obj) {
                                          fragments = [];
                                          $.each(obj, function (id, obj) {
                                              fragments.push(obj.context);
                                          });
                                          fragments.join('\\n');
                                          related_div_content.append($('<div class="lazarus-related unprocessed"></div>').append('<p><strong>“…'+fragments+'…”</strong></p>').hide().data('citation', {identifiers:{doi:idx},userdef:{first_fragment:fragments[0]}}));
                                          // .append(utopia.citation.render({identifiers:{doi:idx},first_fragment:fragments[0]}, true, true))
                                      });
                                      expression.related.length = 0; // empty for future
                                      if ($('.lazarus-related.unprocessed', exp_div).length > 0) {
                                          var more = $('<p class="more right"><a class="more">More related articles...</a></p>');
                                          related_div_content.append(more);
                                          function show_five_related(e) {
                                              e.preventDefault();
                                              $('.lazarus-related.unprocessed', exp_div).slice(0, 5).each(function (idx, obj) {
                                                  var citation = $(obj).data('citation');
                                                  $(obj).append(utopia.citation.render(citation, true, true));
                                                  $(obj).show().removeClass('unprocessed');
                                              });
                                              if ($('.lazarus-related.unprocessed', exp_div).length == 0) {
                                                  more.remove();
                                              }
                                          }
                                          more.on('click', show_five_related).click();
                                      }
                                  }
                                  related_div.on('papyro:expandable:expand', on_expand);
                                  exp_div.append(related_div);
                                  utopia.processNewContent(related_div);
                              }
                          });
                          Spinners.remove(more_expressions_spinner);
                          more_expressions_link.show();
                          $('a.more', more_expressions_link).click();
                      }
                  });
                  function append_five(e) {
                      e.preventDefault();
                      // Show the next five
                      $('#lazarus-expression > .content').children().filter(':hidden').slice(0,5).show();
                      // Hide the 'more' link if everything is now visible
                      if ($('#lazarus-expression > .content').children().filter(':hidden').length == 0) {
                          more_expressions_link.hide();
                      }
                  }
                  // Hook up 'more' link
                  $('#lazarus-expression > p.more > a.more').on('click', append_five).click();
              });
            </script>
        ''' % (json.dumps(expressions), json.dumps(document.fingerprints()),
               json.dumps(laz_docRelUrl), json.dumps(this_doi))
        #print(js.encode('utf8'))
        html = u'''
            <div id="lazarus-expression"><div class="content"></div><div class="spinner"></div><p class="more"><a class="more">More expressions...</a></p></div>
        '''
        if len(expressions) > 0:
            ann = spineapi.Annotation()
            ann['concept'] = 'Collated'
            ann['property:name'] = 'Lazarus Expressions'
            ann['property:description'] = u'Summarizing expression(s)'
            ann['property:html'] = [js, style, html]
            ann['property:sourceDescription'] = self.sourceDescription
            ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                'images/lazarus-prefs-logo.png', 'image/png')
            document.addAnnotation(ann)
    else:
        # no permission
        # Unless suppressed, show a pane explaining that Lazarus is off.
        noprompt = self.get_config('noprompt', False)
        if not noprompt:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Collated'
            params = {
                'uuid': self.uuid(),
            }
            annotation['property:html'] = utopia.get_plugin_data(
                'tpl/denied.html').format(**params)
            annotation['property:name'] = 'Lazarus'
            annotation['property:description'] = 'Lazarus functionality is turned off'
            annotation['property:sourceDescription'] = self.sourceDescription
            annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                'images/lazarus-prefs-logo.png', 'image/png')
            annotation['session:default'] = '1'
            document.addAnnotation(annotation)
def on_ready_event(self, document):
    # Scrape a Highwire-hosted full-text HTML page for bibliography,
    # in-text citation contexts and tables, then annotate the document.
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        info = {}

        # Resolve the DOI to find the publisher's website
        response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi),
                                   timeout=8)

        # Parse page to find (if there) the full text URL
        parser = etree.HTMLParser()
        html = etree.parse(response, parser)

        # Only continue if this is a highwire HTML page
        if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
            return

        # Now make sure we have the full text XHTML
        citation_fulltext_html_url = html.xpath(
            "/html/head/meta[@name='citation_fulltext_html_url']/@content")
        if len(citation_fulltext_html_url) > 0:
            citation_fulltext_html_url = citation_fulltext_html_url[0]

            # Fetch that full text page (if different to the current one)
            if citation_fulltext_html_url != response.geturl():
                response = urllib2.urlopen(citation_fulltext_html_url,
                                           timeout=8)
                html = etree.parse(response, parser)
                #print etree.tostring(html, pretty_print=True, encoding='utf8')

            # Now parse out the bibliography
            info['citations'] = []
            info['citations_by_id'] = {}
            for bibitem in html.xpath(
                    "//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"
            ):
                # Pull the structured fields out of each reference item.
                citation = query(
                    bibitem, {
                        'id':
                        'a/@id',
                        'label':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                        'title':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                        'year':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                        'publication-title':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                        'volume':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                        'issue':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                        'pagefrom':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                        'pageto':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                        'pmid':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                        'doi':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                        'etree':
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                    })
                # Authors are assembled as 'Surname, Given Names' strings.
                authors = []
                for a in bibitem.xpath(
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"
                ):
                    surname = a.xpath(
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()"
                    )
                    given_names = a.xpath(
                        ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()"
                    )
                    if len(surname) > 0 and len(given_names) > 0:
                        authors.append(u'{0}, {1}'.format(
                            surname[0], given_names[0]).strip(', '))
                if len(authors) > 0:
                    citation['authors'] = authors
                citation['contexts'] = []
                citation['displayText'] = utopia.citation.format(citation)
                info['citations'].append(citation)
                info['citations_by_id'][citation['id']] = citation
                #print citation

            #######################################################################################
            # Parse in-text citations if present
            # A citation context needs at least min_length characters of
            # surrounding text, capped at max_length on each side.
            min_length = 10
            max_length = 20
            for paragraph in html.xpath(
                    "//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"
            ):
                # Build parallel stacks: text fragments at even positions,
                # citation cross-references at odd positions.
                text_stack = [paragraph.text or '']
                xref_stack = [None]
                for elem in paragraph:
                    if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                        text_stack.append(
                            etree.tostring(elem,
                                           method='text',
                                           encoding=unicode,
                                           with_tail=False))
                        text_stack.append(elem.tail or '')
                        xref = info['citations_by_id'].get(
                            elem.get('href', '')[1:])
                        if xref is not None:
                            xref_stack += [[xref], None]
                        else:
                            xref_stack += [[], None]
                    elif isinstance(elem, etree._Entity):
                        # Resolve character entities via the entities table.
                        points = entities.get(elem.text[1:-1])
                        if points is not None:
                            text_stack[-1] += ''.join(
                                (unichr(p) for p in points))
                        else:
                            text_stack[-1] += etree.tostring(
                                elem, encoding=unicode)
                    else:
                        # Floating elements contribute only their tail text.
                        if elem.get('position') == 'float':
                            text_stack[-1] += elem.tail or ''
                        else:
                            text_stack[-1] += etree.tostring(
                                elem, method='text', encoding=unicode)
                # Find and collapse ranges in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                    # if this text is a dash, we need to coalesce the text fragments
                    if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                        text_stack[i - 1:i + 2] = [
                            u''.join(text_stack[i - 1:i + 2])
                        ]
                        xref_stack[i - 1:i + 2] = [
                            xref_stack[i - 1] + xref_stack[i + 1]
                        ]
                #for text in text_stack:
                #    print text.encode('utf8')
                # Then make sure we resolve the implied citations
                for i in xrange(1, len(xref_stack), 2):
                    # Get actual cross references
                    xrefs = xref_stack[i]
                    # Expand cross references
                    try:
                        if len(xrefs) == 2:
                            # A coalesced range [from, to] is filled in with
                            # the citations whose labels fall in between.
                            labelfrom = int(xrefs[0].get('label'))
                            labelto = int(xrefs[1].get('label'))
                            candidates = {}
                            midlabels = [
                                unicode(midlabel) for midlabel in xrange(
                                    labelfrom + 1, labelto)
                            ]
                            for candidate in info['citations']:
                                if candidate.get('label') in midlabels:
                                    candidates[int(candidate.get(
                                        'label'))] = candidate
                            xrefs[1:-1] = candidates.values()
                    except:
                        raise
                # Find and collapse lists in the text
                for i in xrange(len(xref_stack) - 3, 1, -2):
                    text = text_stack[i].strip()
                    # if this text is a comma, we need to coalesce the text fragments
                    if len(text) == 1 and text == ',':
                        text_stack[i - 1:i + 2] = [
                            u''.join(text_stack[i - 1:i + 2])
                        ]
                        xref_stack[i - 1:i + 2] = [
                            xref_stack[i - 1] + xref_stack[i + 1]
                        ]
                # Expand citations to include brackets (on both sides)
                for i in xrange(len(xref_stack) - 2, 0, -2):
                    before = text_stack[i - 1].strip()[-1:]
                    text = text_stack[i].strip()
                    after = text_stack[i + 1].strip()[:1]
                    # if this text is a comma, we need to coalesce the text fragments
                    #print before.encode('utf'), after.encode('utf')
                    if len(before) > 0 and before in '({[' and len(
                            after) > 0 and after in ')}]':
                        text_stack[i - 1] = re.sub(r'[({[](\s*)$', r'\1',
                                                   text_stack[i - 1])
                        text_stack[i + 1] = re.sub(r'^(\s*)[)}\]]', r'\1',
                                                   text_stack[i + 1])
                        text_stack[i] = before + text_stack[i] + after
                #print repr(text_stack)
                for i in xrange(1, len(xref_stack), 2):
                    # Get context
                    before = u' '.join(text_stack[:i]).strip()
                    label = text_stack[i].strip()
                    after = u' '.join(text_stack[i + 1:]).strip()
                    # Strip out extraneous brackets
                    if len(
                            xref_stack[i]
                    ) > 1:  # Hack to differentiate single / multiple citations
                        # as multiple numbers tend not to have spaces between them
                        label = re.sub(
                            ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?',
                            r'\1', label)
                    else:
                        label = re.sub(
                            ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?',
                            r'\1', label)
                    # Normalise context
                    before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                    label = re.sub(r'\s+', ' ', label)
                    after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                    #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                    if len(before + after) > min_length:
                        for xref in xref_stack[i]:
                            xref['contexts'].append((before, label, after))
                    #print xref_stack[i]

            #######################################################################################
            # Parse tables if present
            info['tables'] = {}
            for table_url in html.xpath(
                    "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"
            ):
                table_url = urlparse.urljoin(citation_fulltext_html_url,
                                             table_url)
                #print table_url
                response = urllib2.urlopen(table_url, timeout=8)
                table_html = etree.parse(response, parser)
                for table_expansion in table_html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"
                ):
                    id = table_expansion.get('id')
                    table = {}
                    table['xml'] = table_expansion.xpath('.//table[1]')[0]
                    table['caption_raw'] = table_expansion.xpath(
                        ".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]"
                    )[0]
                    if 'caption' not in table and 'caption_raw' in table:
                        table['caption'] = table['caption_raw']
                    if 'caption' in table:
                        # Flatten the caption element to whitespace-normalised text.
                        table['caption'] = re.sub(
                            r'\s+', ' ',
                            etree.tostring(table['caption'],
                                           method='text',
                                           encoding=unicode).strip())
                    if 'xml' in table:
                        table['xml'] = etree.tostring(table['xml'],
                                                      encoding='utf8')
                    info['tables'][id] = table
                    #print table

        #print info

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True,
                                     recover=True,
                                     remove_blank_text=True,
                                     encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                            retmode='xml',
                                            rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath(
                        'PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'),
                                                  ('pmcid', 'pmc'),
                                                  ('pii', 'pii')):
                            id = idList.findtext(
                                'ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 90)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(
                info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    #print (pre, label, post)
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation = utopialib.utils.citation_to_annotation(
                                citation, concept='ForwardCitation')
                            # Known direct-PDF URL patterns for PLOS and PMC.
                            if 'doi' in citation and citation[
                                    'doi'].startswith('10.1371/'):
                                citation[
                                    'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                        'info:doi/{0}'.format(
                                            citation['doi']))
                            if 'pmcid' in citation:
                                citation[
                                    'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                        citation['pmcid'])
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation,
                                                   link['scratch'])
                        except:
                            raise

            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    # Build a fuzzy regex from the caption so the table can be
                    # located in the rendered PDF text.
                    regex = fuzz(table['caption'], strict=True)
                    #print regex

                    # convert oasis tables
                    ns = {
                        'oasis':
                        'http://docs.oasis-open.org/ns/oasis-exchange/table'
                    }
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup',
                                                namespaces=ns):
                            # Map column names to their positions so spans
                            # can be converted to HTML colspan attributes.
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec',
                                                        namespaces=ns):
                                columns[colspec.get('colname')] = int(
                                    colspec.get('colnum'))
                            for section in tgroup.xpath(
                                    './oasis:thead|./oasis:tbody',
                                    namespaces=ns):
                                isHead = (
                                    section.tag == '{{{0}}}thead'.format(
                                        ns['oasis']))
                                for row in section.xpath('./oasis:row',
                                                         namespaces=ns):
                                    for entry in row.xpath('./oasis:entry',
                                                           namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[
                                                colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set(
                                                'colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set(
                                                'rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                    #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                    table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(
                        regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation[
                            'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    """Annotate the document from publisher-hosted NLM keyed on its DOI.

    Looks up the document's DOI against the Utopia 'hosted NLM' web
    service, and when NLM metadata comes back: enriches citations with
    identifiers fetched from PubMed, adds publisher-identity branding for
    ASPB journals (DOI prefixes 10.1104/10.1105), creates document
    metadata and bibliography annotations, links in-text citation
    contexts, and converts OASIS-format tables for display.

    Network failures are deliberately swallowed (best-effort enrichment).
    """
    # See if we have any publishers' NLM hosted for this DOI
    doi = common.utils.metadata(document, 'doi')
    #print '----- DOI', doi
    if doi is not None:
        info = None
        try:
            url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
            url += urllib.urlencode({'doi': doi.encode('utf8')})
            nlm = urllib2.urlopen(url, timeout=8).read()
            info = common.nlm.parse(nlm)
        except (urllib2.URLError, socket.timeout):
            # info will remain None
            pass
        #print info
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
            # Map pmid -> local citation id for every citation that has both
            pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy doi/pmcid/pii onto the citation when missing
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id
                                #print 'KEY', key_name, id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Publisher identity (ASPB journals: Plant Physiology / The Plant Cell)
            if doi[:8] in ('10.1104/', '10.1105/'):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'PublisherIdentity'
                logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                webpageUrl = 'http://www.aspb.org/'
                title = publisher
                #print '====', publisher, '---', journalTitle, '---', webpageUrl
                if doi.startswith('10.1104/'):
                    logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantphysiol.org/'
                elif doi.startswith('10.1105/'):
                    logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantcell.org/'
                annotation['property:logo'] = logo
                annotation['property:title'] = title
                annotation['property:webpageUrl'] = webpageUrl
                document.addAnnotation(annotation, 'PublisherMetadata')
                link['property:sourceIcon'] = logo
                link['property:sourceTitle'] = title

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'DocumentMetadata'
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation['property:{0}'.format(k)] = v
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentReference'
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'ForwardCitation'
                            annotation['property:state'] = 'found'
                            if 'title' in citation:
                                annotation['property:title'] = citation['title']
                            if 'id' in citation:
                                annotation['property:bibid'] = citation['id']
                            # Known direct-PDF URL patterns for PLOS and PMC
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                if k in citation:
                                    annotation['property:{0}'.format(k)] = citation[k]
                            #print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                            #print citation
                        except:
                            raise
                            pass # FIXME

            # Tables: convert OASIS exchange-table markup to plain XHTML tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    #print regex

                    # convert oasis tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            # colname -> column number, for computing colspans
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colname = entry.get('colname')
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        # namest/nameend give a column range; translate to colspan
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        # morerows means "spans this many extra rows"
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                # Re-attach the section directly under the table element
                                xml.append(section)
                        xml.tag = 'table'
                        #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    # Anchor the table annotation on a unique caption match only
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id
def on_ready_event(self, document):
    """Annotate GEO accession numbers (GSEnnn) found in the document.

    Finds distinct GSE identifiers in the text, fetches each entry's
    metadata via self._fetchGEO(), then adds: a document-level GEO
    annotation when the entry's pubmed_id matches this document's PubMed
    id, and a localised GEO annotation over every textual occurrence of
    the accession.

    Fix: removed a leftover debug `print entry` that dumped every fetched
    GEO record to stdout.
    """
    # Find distinguishing ID (this document's PubMed id, if known)
    pmid = utopia.tools.utils.metadata(document, 'identifiers[pubmed]')

    # Compile distinct GEO IDs in the text, keeping all match extents per id
    matches = {}
    for match in document.search(r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        matches.setdefault(match.text(), [])
        matches[match.text()].append(match)

    # Create annotations for each GEO ID
    for gse, extents in matches.iteritems():
        entry = self._fetchGEO(gse)
        dates = u'Submitted {0}'.format(entry['submission_date'])
        if 'last_update_date' in entry:
            dates += u'; last updated {0}'.format(entry['last_update_date'])
        dates += '.'
        dataCitation = u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format(
            entry['contributors'],
            entry['title'],
            entry.get('overall_design', ''),
            entry['type'],
            dates)
        # {{0}} leaves a placeholder for the (optional) summary text below
        xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format(dataCitation, entry['GEO_url'], gse)
        # Removed broken InSilicoDB link
        #xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url'])
        srcdesc = '''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene Expression Omnibus (GEO)</a> is a public repository that archives and freely distributes microarray, next-generation sequencing, and other forms of high-throughput functional genomic data submitted by the scientific community.</p>'''
        if entry.get('pubmed_id') == pmid:
            # add a global annotation (data associated with this article)
            annotation = spineapi.Annotation()
            annotation['concept'] = 'GEO'
            annotation['property:name'] = 'Gene Expression Omnibus'
            annotation['property:sourceDatabase'] = 'geo'
            annotation['property:description'] = '{0} (Data associated with this article)'.format(gse)
            annotation['property:sourceDescription'] = srcdesc
            annotation['property:xhtml'] = xhtml.format('') # Keep summary blank
            document.addAnnotation(annotation)

        # Generate summary: first 32 words shown, remainder behind "read more"
        summary = entry.get('summary', '')
        if len(summary) > 0:
            summary_words = summary.split(' ')
            summary = u'<p><em>Summary:</em> '
            summary += u'{0}'.format(' '.join(summary_words[:32]))
            if len(summary_words) > 32:
                summary += u' <span class="readmore">{0}</span>'.format(' '.join(summary_words[32:]))
            summary += u'</p>'

        # local annotation anchored on every occurrence of the accession
        annotation = spineapi.Annotation()
        annotation['concept'] = 'GEO'
        annotation['property:name'] = 'Gene Expression Omnibus'
        annotation['property:sourceDatabase'] = 'geo'
        annotation['property:description'] = gse
        annotation['property:sourceDescription'] = srcdesc
        annotation['property:xhtml'] = xhtml.format(summary)
        for extent in extents:
            annotation.addExtent(extent)
        document.addAnnotation(annotation)
def on_ready_event(self, document): doi = common.utils.metadata(document, "doi", "") match = self.splitRegEx.match(doi) if match is not None: articleNumber = match.group("number") annotation = spineapi.Annotation() annotation["concept"] = "PublisherIdentity" annotation["property:logo"] = utopia.get_plugin_data_as_url("images/logo.png", "image/png") annotation["property:title"] = "eLife" annotation["property:webpageUrl"] = "http://www.elifesciences.org/" document.addAnnotation(annotation, "PublisherMetadata") # Turn all the DOIs that are sub-DOIs of this document into links regex = r"{0}\.\d+".format(re.escape(doi)) for match in document.search(regex, spineapi.RegExp): url = "http://dx.doi.org/{0}".format(match.text()) annotation = spineapi.Annotation() annotation["concept"] = "Hyperlink" annotation["property:webpageUrl"] = url annotation["session:volatile"] = "1" annotation.addExtent(match) document.addAnnotation(annotation) # Try to get the NLM directly from eLife url = "http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}" url = url.format(articleNumber) try: nlm = urllib2.urlopen(url, timeout=8).read() except (urllib2.URLError, socket.timeout): return info = common.nlm.parse(nlm) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8") pmids = dict( ( (citation["pmid"], citation["id"]) for citation in info["citations"] if "pmid" in citation and "id" in citation ) ) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser ) for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"): # print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info["citations_by_id"][pmids[pmid]] for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")): id = 
idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and id is not None: citation[key_name] = id # print 'KEY', key_name, id # Create Metadata link annotation link = document.newAccList("metadata", 100) link["property:sourceIcon"] = utopia.get_plugin_data_as_url("images/annotation_icon.png", "image/png") link["property:sourceTitle"] = "eLife" link[ "property:sourceDescription" ] = """ <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p> """ # Create Metadata annotation annotation = spineapi.Annotation() annotation["concept"] = "DocumentMetadata" for k in self.keys: v = info.get(k) if v is not None: annotation["property:{0}".format(k)] = v document.addAnnotation(annotation, link["scratch"]) # Create Bibliography annotations for citation in info.get("citations", []): annotation = spineapi.Annotation() annotation["concept"] = "DocumentReference" for k in self.keys: v = citation.get(k) if v is not None: annotation["property:{0}".format(k)] = v document.addAnnotation(annotation, link["scratch"]) ####################################################################################### # Apply parsed data to document # Citations for citation in info["citations"]: # Find cross refs for pre, label, post in citation.get("contexts", []): matches = document.findInContext(pre, label, post) # print matches if len(matches) > 0: try: annotation = spineapi.Annotation() annotation["concept"] = "ForwardCitation" annotation["property:state"] = "found" if "title" in citation: annotation["property:title"] = citation["title"] if "id" in citation: annotation["property:bibid"] = citation["id"] if "doi" in citation and citation["doi"].startswith("10.1371/"): citation[ "pdf" ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format( "info:doi/{0}".format(citation["doi"]) ) if "pmcid" in citation: citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format( 
citation["pmcid"] ) for k in ( "displayText", "label", "pdf", "pmid", "pmc", "pii", "doi", "first_author_surname", "year", "journal", "volume", "page_from", ): if k in citation: annotation["property:{0}".format(k)] = citation[k] # print annotation.get('property:label'), annotation.get('property:pdf') for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link["scratch"]) # print citation except: raise pass # FIXME for id, table in info.get("tables", {}).iteritems(): if "caption" in table and "xml" in table: regex = fuzz(table["caption"], strict=True) print regex matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation["concept"] = "Table" annotation[ "session:upload_files" ] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"]) annotation["session:volatile"] = "1" annotation.addExtent(matches[0]) document.addAnnotation(annotation, link["scratch"]) else: print "*********** failed to match table:", id
def on_ready_event(self, document):
    """Annotate the document from its previously-stored PMC NLM record.

    Parses the 'raw_pmc_nlm' metadata attached to the document, enriches
    its citations with identifiers from PubMed, and attaches metadata,
    bibliography, forward-citation, and table annotations under a
    'PubMed Central' accolade list.
    """
    info = common.nlm.parse(common.utils.metadata(document, "raw_pmc_nlm"))
    if info is not None and len(info) > 0:
        # Enrich citation information with identifiers from PMC
        parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
        # Map pmid -> local citation id for citations carrying both
        pmids = dict(
            (
                (citation["pmid"], citation["id"])
                for citation in info["citations"]
                if "pmid" in citation and "id" in citation
            )
        )
        if len(pmids) > 0:
            pubmed_abstracts = etree.fromstring(
                common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser
            )
            for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                # print etree.tostring(idList)
                pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                if pmid in pmids:
                    citation = info["citations_by_id"][pmids[pmid]]
                    # Copy doi/pmcid/pii onto the citation when missing
                    for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                        id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                        if key_name not in citation and id is not None:
                            citation[key_name] = id

        # Create Metadata link annotation
        link = document.newAccList("metadata", 50)
        link["property:sourceDatabase"] = "pmc"
        link["property:sourceTitle"] = "PubMed Central"
        link[
            "property:sourceDescription"
        ] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

        # Create Metadata annotation
        annotation = spineapi.Annotation()
        annotation["concept"] = "DocumentMetadata"
        for k in self.keys:
            v = info.get(k)
            if v is not None:
                annotation["property:{0}".format(k)] = v
        document.addAnnotation(annotation, link["scratch"])

        # Create Bibliography annotations
        for citation in info.get("citations", []):
            annotation = spineapi.Annotation()
            annotation["concept"] = "DocumentReference"
            for k in self.keys:
                v = citation.get(k)
                if v is not None:
                    annotation["property:{0}".format(k)] = v
            document.addAnnotation(annotation, link["scratch"])

        # Citations
        for citation in info["citations"]:
            # Find cross refs
            for pre, label, post in citation.get("contexts", []):
                matches = document.findInContext(pre, label, post)
                # print matches
                if len(matches) > 0:
                    try:
                        annotation = spineapi.Annotation()
                        annotation["concept"] = "ForwardCitation"
                        annotation["property:state"] = "found"
                        if "title" in citation:
                            annotation["property:title"] = citation["title"]
                        if "id" in citation:
                            annotation["property:bibid"] = citation["id"]
                        # Known direct-PDF URL patterns for PLOS and PMC
                        if "doi" in citation and citation["doi"].startswith("10.1371/"):
                            citation[
                                "pdf"
                            ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format(
                                "info:doi/{0}".format(citation["doi"])
                            )
                        if "pmcid" in citation:
                            citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(
                                citation["pmcid"]
                            )
                        # print citation
                        for k in self.keys + ("authors", "pdf", "first_author_surname"):
                            if k in citation:
                                annotation["property:{0}".format(k)] = citation[k]
                        # print annotation.get('property:label'), annotation.get('property:pdf')
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link["scratch"])
                        # print citation
                    except:
                        raise
                        pass  # FIXME

        # Tables: anchor on a unique caption match only
        for id, table in info.get("tables", {}).iteritems():
            if "caption" in table and "xml" in table:
                regex = fuzz(table["caption"], strict=True)
                matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                if len(matches) == 1:
                    annotation = spineapi.Annotation()
                    annotation["concept"] = "Table"
                    annotation[
                        "session:upload_files"
                    ] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                    annotation.addExtent(matches[0])
                    document.addAnnotation(annotation, link["scratch"])
                else:
                    print "*********** failed to match table:", id
def on_ready_event(self, document): # Only send if the DOI has a Springer prefix doi = utopialib.utils.metadata(document, 'identifiers[doi]') if doi is not None and doi[:7] in registrants: annotation = spineapi.Annotation() annotation['concept'] = 'PublisherIdentity' if False and doi.startswith( '10.1186/'): # This turns out not to be reliable annotation['property:logo'] = utopia.get_plugin_data_as_url( 'images/gigascience_logo.png', 'image/png') annotation['property:title'] = 'Giga Science' annotation[ 'property:webpageUrl'] = 'http://www.gigasciencejournal.com/' else: annotation['property:logo'] = utopia.get_plugin_data_as_url( 'images/logo.png', 'image/png') annotation['property:title'] = 'Springer' annotation['property:webpageUrl'] = 'http://www.springer.com/' document.addAnnotation(annotation, 'PublisherMetadata') # Make a request to the utopia ext web service url = 'https://utopia.cs.manchester.ac.uk/ext/springer/nlm?{0}' url = url.format(urllib.urlencode({'doi': doi})) try: nlm = urllib2.urlopen(url, timeout=8).read() except (urllib2.URLError, socket.timeout): return info = utopialib.nlm.parse(nlm) if info is not None and len(info) > 0: # Enrich citation information with identifiers from PMC parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8') pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation)) if len(pmids) > 0: pubmed_abstracts = etree.fromstring( utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser) for idList in pubmed_abstracts.xpath( 'PubmedArticle/PubmedData/ArticleIdList'): #print etree.tostring(idList) pmid = idList.findtext('ArticleId[@IdType="pubmed"]') if pmid in pmids: citation = info['citations_by_id'][pmids[pmid]] for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')): id = idList.findtext( 'ArticleId[@IdType="{0}"]'.format(id_name)) if key_name not in citation and 
id is not None: citation[key_name] = id #print 'KEY', key_name, id # Create Metadata link annotation link = document.newAccList('metadata', 100) link['property:sourceIcon'] = utopia.get_plugin_data_as_url( 'images/annotation_icon.png', 'image/png') link['property:sourceTitle'] = 'Springer' link['property:sourceDescription'] = ''' <p><a href="http://www.springer.com/">Springer</a> publishing company.</p> ''' # Create Metadata annotation annotation = utopialib.utils.citation_to_annotation( info.get('self', {}), 'DocumentMetadata') document.addAnnotation(annotation, link['scratch']) # Create Bibliography annotations for citation in info.get('citations', []): annotation = utopialib.utils.citation_to_annotation( citation) document.addAnnotation(annotation, link['scratch']) ####################################################################################### # Apply parsed data to document # Citations for citation in info['citations']: # Find cross refs for pre, label, post in citation.get('contexts', []): matches = document.findInContext(pre, label, post) #print matches if len(matches) > 0: try: annotation = utopialib.utils.citation_to_annotation( citation, concept='ForwardCitation') if 'doi' in citation and citation[ 'doi'].startswith('10.1371/'): citation[ 'pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format( 'info:doi/{0}'.format( citation['doi'])) if 'pmcid' in citation: citation[ 'pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format( citation['pmcid']) for match in matches: annotation.addExtent(match) document.addAnnotation(annotation, link['scratch']) except: raise for id, table in info.get('tables', {}).iteritems(): if 'caption' in table and 'xml' in table: regex = fuzz(table['caption'], strict=True) print regex matches = document.search( regex, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) == 1: annotation = spineapi.Annotation() annotation['concept'] = 'Table' annotation[ 
'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode( table['xml']) annotation['session:volatile'] = '1' annotation.addExtent(matches[0]) document.addAnnotation(annotation, link['scratch']) else: print '*********** failed to match table:', id
def on_activate_event(self, document): ns = {'r': 'Reflect'} maxTextFragmentSize = 1000000 textFragments = [] seenItemNames = set() ignoredEntityTypes = [-11] # Retrieve the full text of the document, split into fragments for page in document.pages(): pageText = re.sub(r'\s+', r' ', page.pageText()) if len(textFragments) == 0 or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize: textFragments.append([pageText, page]) else: textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText for text, page in textFragments: # Package it as URL encoded form encoding payload = 'document=%s' % urllib.quote(text.encode('utf8')) # Send it off to the reflect server response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8) # Parse response root = etree.fromstring(response.read(), self.parser) reflections = {} annotations = {} for item in root.xpath('//r:item', namespaces = ns): itemName = item.findtext('{%s}name' % ns['r']).lower().strip() if itemName not in seenItemNames: for entity in item.xpath('.//r:entity', namespaces = ns): entityType = entity.findtext('{%s}type' % ns['r']) if entityType is not None: entityType = int(entityType) if entityType not in ignoredEntityTypes: entityIdentifier = entity.findtext('{%s}identifier' % ns['r']) if itemName not in reflections: reflections[itemName] = set() reflections[itemName].add((entityType, entityIdentifier)) # For each match, create an annotation that the UI will handle later regex = '(%s)' % '|'.join([re.escape(key) for key in reflections.iterkeys()]) matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start = page) for match in matches: if match.begin().wordArea()[1] == 0: itemName = match.text().lower().strip() annotation = annotations.get(itemName, None) if annotation is None and itemName in reflections: annotation = Annotation() annotation['concept'] = 'Reflection' annotation['property:webpageUrl'] = \ 'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \ 
';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]]) annotation['property:name'] = itemName annotation['session:overlay'] = 'hyperlink' annotation['session:color'] = '#0A0' annotations[itemName] = annotation seenItemNames.add(itemName) if annotation is not None: annotation.addExtent(match) else: print "ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8') print reflections.keys() document.addAnnotations(annotations.values())
def on_ready_event(self, document):
    """Annotate the document from its previously-stored PMC NLM record.

    Parses the 'raw_pmc_nlm' metadata attached to the document, enriches
    its citations with identifiers from PubMed, and attaches metadata,
    bibliography, forward-citation, and table annotations under a
    'PubMed Central' accolade list.
    """
    info = utopialib.nlm.parse(utopialib.utils.metadata(document, 'raw_pmc_nlm'))
    if info is not None and len(info) > 0:
        # Enrich citation information with identifiers from PMC
        parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
        # Map pmid -> local citation id for citations carrying both
        pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
        if len(pmids) > 0:
            pubmed_abstracts = etree.fromstring(utopialib.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
            for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                #print etree.tostring(idList)
                pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                if pmid in pmids:
                    citation = info['citations_by_id'][pmids[pmid]]
                    # Copy doi/pmcid/pii onto the citation when missing
                    for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                        id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                        if key_name not in citation and id is not None:
                            citation[key_name] = id

        # Create Metadata link annotation
        link = document.newAccList('metadata', 50)
        link['property:sourceDatabase'] = 'pmc'
        link['property:sourceTitle'] = 'PubMed Central'
        link['property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

        # Create Metadata annotation
        annotation = utopialib.utils.citation_to_annotation(info.get('self', {}), 'DocumentMetadata')
        document.addAnnotation(annotation, link['scratch'])

        # Create Bibliography annotations
        for citation in info.get('citations', []):
            annotation = utopialib.utils.citation_to_annotation(citation)
            document.addAnnotation(annotation, link['scratch'])

        # Citations
        for citation in info['citations']:
            # Find cross refs
            for pre, label, post in citation.get('contexts', []):
                matches = document.findInContext(pre, label, post)
                #print matches
                if len(matches) > 0:
                    try:
                        annotation = utopialib.utils.citation_to_annotation(citation, concept='ForwardCitation')
                        # Known direct-PDF URL patterns for PLOS and PMC
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])
                    except:
                        raise

        # Tables
        for id, table in info.get('tables', {}).iteritems():
            if 'caption' in table and 'xml' in table:
                regex = fuzz(table['caption'], strict=True)
                # Anchor the table annotation on a unique caption match only
                matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                if len(matches) == 1:
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'Table'
                    annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                    annotation.addExtent(matches[0])
                    document.addAnnotation(annotation, link['scratch'])
                else:
                    print '*********** failed to match table:', id
def after_ready_event(self, document):
    """Link RSC compound and ontology terms in the document text.

    If an 'RSCMetadataLink' annotation is present, queries the RSC
    federated search for the article matching its DOI, then hyperlinks
    every compound mention (to ChemSpider or the RSC landing page) and
    every ontology/annotation mention (to the IUPAC Gold Book or the RSC
    ontology page) found in the article XML.
    """
    # Get (if present) the RSCMetadataLink annotation
    for annotation in document.annotations():
        if annotation.get('concept') == 'RSCMetadataLink':
            text = document.text()
            doi = annotation['property:doi'].upper()
            rscId = annotation['property:rscId'].upper()
            # Free-text federated search on the quoted DOI
            xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi
            baseurl = 'http://pubs.rsc.org/en/federated/search'
            params = { 'federatedsearchname': 'Utopia', 'inputxml': xmlquery }
            url = baseurl + '?%s' % urllib.urlencode(params)
            searchresult = urllib2.urlopen(url, timeout=14).read()
            root = etree.fromstring(searchresult)
            #print etree.tostring(root, pretty_print=True, encoding='utf8')
            articles = root.findall('./{http://www.rsc.org/schema/rscart38}article')
            #print articles

            # the search use above can return more than one article, so select out only the one with
            # the correct doi
            thearticle = None
            articleID = None
            for article in articles:
                found_doi = article.findtext("./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='doi']")
                if found_doi is None:
                    # Fall back to the art-admin DOI element
                    found_doi = article.findtext("./{http://www.rsc.org/schema/rscart38}art-admin/{http://www.rsc.org/schema/rscart38}doi")
                if found_doi is not None and found_doi.upper() == doi:
                    thearticle = article
                    articleIDelem = article.find("./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='docid']")
                    if articleIDelem is not None:
                        articleID = articleIDelem.text
                    break

            # if we get back a single valid article...
            if thearticle != None:
                #print articleID
                compoundsInArticle = []
                compoundText = {}
                annotationsInArticle = []
                annotationText = {}

                # create a list of all the compounds that are mentioned in the article body
                compnames = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}compname')
                #print compnames
                for compname in compnames:
                    # This line removes (erroneous?) elements from inside the XML
                    etree.strip_elements(compname, '{http://www.rsc.org/schema/rscart38}compound', with_tail=False)
                    #print compname.attrib['idrefs'], compname.text
                    compoundsInArticle.append(compname.attrib['idrefs'])
                    compoundText[compname.attrib['idrefs']] = etree.tounicode(compname, method='text')

                annotationnames = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}annref')
                #print annotationnames
                for annotationname in annotationnames:
                    # This line removes (erroneous?) elements from inside the XML
                    etree.strip_elements(annotationname, '{http://www.rsc.org/schema/rscart38}annotation', with_tail=False)
                    #print annotationname.attrib['idrefs'], annotationname.text
                    annotationsInArticle.append(annotationname.attrib['idrefs'])
                    annotationText[annotationname.attrib['idrefs']] = etree.tounicode(annotationname, method='text')

                #print compoundText, annotationText
                #sprint annotationsInArticle

                # then for all the compounds that are defined in the article back
                compounds = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}compound')
                for compound in compounds:
                    id = compound.attrib['id']
                    if id in compoundsInArticle:
                        url = None
                        id = compound.attrib['id']
                        # if the compound has a CSID, then the URL links to the chemspider page
                        csid = compound.find("./{http://www.rsc.org/schema/rscart38}link[@type='CSID']" )
                        # if the compound has a CSID, create a Chemspider URL for it
                        if csid is not None and csid.text is not None:
                            url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:]
                        else:
                            # otherwise, use the RSC landing page
                            url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (id[4:], compoundText[id], articleID)
                        if url is not None:
                            # Hyperlink every whole-word occurrence of the compound text
                            options = spineapi.WholeWordsOnly + spineapi.IgnoreCase
                            matches = document.search(compoundText[id], options)
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Hyperlink'
                            annotation['property:webpageUrl'] = url
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation)

                # similarly, for all the annotations
                annotations = thearticle.findall('./{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}annotation')
                for annotation in annotations:
                    id = annotation.attrib['id']
                    url = None
                    if id in annotationsInArticle:
                        id = annotation.attrib['id']
                        # get the link id
                        link = annotation.findtext("./{http://www.rsc.org/schema/rscart38}link" )
                        # if the compound has a link, create an RSC ontology landing page for it
                        if link is not None:
                            # AU: prefixed links point at IUPAC Gold Book entries
                            if link[:3] == 'AU:':
                                url = 'http://goldbook.iupac.org/%s.html' % link[3:]
                            else:
                                url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (link, articleID)
                        if url is not None:
                            matches = document.search(annotationText[id], spineapi.IgnoreCase + spineapi.WholeWordsOnly)
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Hyperlink'
                            annotation['property:webpageUrl'] = url
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation)
            # Only one RSCMetadataLink is ever processed
            break
def on_activate_event(self, document):
    """Annotate the document with entities recognised by the Reflect service.

    The document's full text is sent (in fragments of at most one million
    characters) to http://reflect.ws, and each recognised entity name is
    turned into a 'Reflection' annotation covering every match of that name
    in the document.
    """
    ns = {'r': 'Reflect'}
    maxTextFragmentSize = 1000000
    textFragments = []
    seenItemNames = set()
    # Entity types the Reflect service returns that we deliberately skip.
    ignoredEntityTypes = [-11]

    # Retrieve the full text of the document, split into fragments no
    # larger than maxTextFragmentSize. Each fragment remembers the first
    # page it covers so searching can start from there.
    for page in document.pages():
        pageText = re.sub(r'\s+', r' ', page.pageText())
        if len(textFragments) == 0 or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize:
            textFragments.append([pageText, page])
        else:
            textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText

    for text, page in textFragments:
        # Package it as URL encoded form encoding
        payload = 'document=%s' % urllib.quote(text.encode('utf8'))
        # Send it off to the reflect server
        response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8)
        # Parse response
        root = etree.fromstring(response.read(), self.parser)
        reflections = {}
        annotations = {}
        for item in root.xpath('//r:item', namespaces=ns):
            itemName = etree.tostring(item.find('{%s}name' % ns['r']),
                                      method="text",
                                      encoding=unicode,
                                      with_tail=False).lower().strip()
            if itemName not in seenItemNames:
                for entity in item.xpath('.//r:entity', namespaces=ns):
                    entityType = entity.findtext('{%s}type' % ns['r'])
                    if entityType is not None:
                        entityType = int(entityType)
                        if entityType not in ignoredEntityTypes:
                            entityIdentifier = entity.findtext('{%s}identifier' % ns['r'])
                            if itemName not in reflections:
                                reflections[itemName] = set()
                            reflections[itemName].add((entityType, entityIdentifier))

        # For each match, create an annotation that the UI will handle later.
        # NOTE(review): if reflections is empty this regex becomes '()' which
        # matches everywhere — presumably the service always returns items;
        # verify against the Reflect API.
        regex = '(%s)' % '|'.join([re.escape(key) for key in reflections.iterkeys()])
        matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start=page)
        for match in matches:
            # Only accept matches that begin at the start of a word
            if match.begin().wordArea()[1] == 0:
                itemName = match.text().lower().strip()
                annotation = annotations.get(itemName, None)
                if annotation is None and itemName in reflections:
                    annotation = Annotation()
                    annotation['concept'] = 'Reflection'
                    annotation['property:webpageUrl'] = \
                        'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % \
                        ';'.join(['%d.%s' % (t, id) for (t, id) in reflections[itemName]])
                    annotation['property:name'] = itemName
                    annotation['session:overlay'] = 'hyperlink'
                    annotation['session:color'] = '#0A0'
                    annotations[itemName] = annotation
                    seenItemNames.add(itemName)
                if annotation is not None:
                    annotation.addExtent(match)
                else:
                    print("ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8'))
                    print(reflections.keys())
        document.addAnnotations(annotations.values())
def after_ready_event(self, document):
    """Link RSC compound and ontology-annotation mentions to their web pages.

    If the document carries an RSCMetadataLink annotation, its DOI is used
    to look up the article via the RSC federated search. Compound and
    annotation references found in the article body are then turned into
    'Hyperlink' annotations pointing at ChemSpider / RSC landing pages.
    Only the first RSCMetadataLink annotation is processed.
    """
    # Get (if present) the RSCMetadataLink annotation
    for annotation in document.annotations():
        if annotation.get('concept') == 'RSCMetadataLink':
            doi = annotation['property:doi'].upper()
            xmlquery = '<SearchCriteria><SearchTerm><Category>Journal</Category><ContentType>All</ContentType><Criterias><NameValue><Name>FreeText</Name><Value>"%s"</Value></NameValue></Criterias><Source>Utopia</Source></SearchTerm><PageNo>1</PageNo><PageSize>10</PageSize><SortBy>Relevance</SortBy></SearchCriteria>' % doi
            baseurl = 'http://pubs.rsc.org/en/federated/search'
            params = {
                'federatedsearchname': 'Utopia',
                'inputxml': xmlquery
            }
            url = baseurl + '?%s' % urllib.urlencode(params)
            searchresult = urllib2.urlopen(url, timeout=14).read()
            root = etree.fromstring(searchresult)
            articles = root.findall('./{http://www.rsc.org/schema/rscart38}article')

            # The search used above can return more than one article, so
            # select out only the one with the correct DOI.
            thearticle = None
            articleID = None
            for article in articles:
                found_doi = article.findtext(
                    "./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='doi']")
                if found_doi is None:
                    found_doi = article.findtext(
                        "./{http://www.rsc.org/schema/rscart38}art-admin/{http://www.rsc.org/schema/rscart38}doi")
                if found_doi is not None and found_doi.upper() == doi:
                    thearticle = article
                    articleIDelem = article.find(
                        "./{http://www.rsc.org/schema/rscart38}metainfo/{http://www.rsc.org/schema/rscart38}meta[@field='docid']")
                    if articleIDelem is not None:
                        articleID = articleIDelem.text
                    break

            # if we get back a single valid article...
            if thearticle is not None:
                compoundsInArticle = []
                compoundText = {}
                annotationsInArticle = []
                annotationText = {}

                # Create a list of all the compounds that are mentioned in
                # the article body.
                compnames = thearticle.findall(
                    './{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}compname')
                for compname in compnames:
                    # This line removes (erroneous?) elements from inside the XML
                    etree.strip_elements(compname,
                                         '{http://www.rsc.org/schema/rscart38}compound',
                                         with_tail=False)
                    compoundsInArticle.append(compname.attrib['idrefs'])
                    compoundText[compname.attrib['idrefs']] = etree.tounicode(compname, method='text')

                annotationnames = thearticle.findall(
                    './{http://www.rsc.org/schema/rscart38}art-body/{http://www.rsc.org/schema/rscart38}annref')
                for annotationname in annotationnames:
                    # This line removes (erroneous?) elements from inside the XML
                    etree.strip_elements(annotationname,
                                         '{http://www.rsc.org/schema/rscart38}annotation',
                                         with_tail=False)
                    annotationsInArticle.append(annotationname.attrib['idrefs'])
                    annotationText[annotationname.attrib['idrefs']] = etree.tounicode(annotationname, method='text')

                # Then for all the compounds that are defined in the article back...
                compounds = thearticle.findall(
                    './{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}compound')
                for compound in compounds:
                    compound_id = compound.attrib['id']
                    if compound_id in compoundsInArticle:
                        url = None
                        # If the compound has a CSID, the URL links to the
                        # ChemSpider page...
                        csid = compound.find(
                            "./{http://www.rsc.org/schema/rscart38}link[@type='CSID']")
                        if csid is not None and csid.text is not None:
                            url = 'http://www.chemspider.com/Chemical-Structure.%s.html' % csid.text[5:]
                        else:
                            # ...otherwise, use the RSC landing page.
                            url = 'http://www.rsc.org/publishing/journals/prospect/cheminfo.asp?XMLID=%s&compoundtext=%s&MSID=%s' % (
                                compound_id[4:], compoundText[compound_id], articleID)
                        if url is not None:
                            options = spineapi.WholeWordsOnly + spineapi.IgnoreCase
                            matches = document.search(compoundText[compound_id], options)
                            hyperlink = spineapi.Annotation()
                            hyperlink['concept'] = 'Hyperlink'
                            hyperlink['property:webpageUrl'] = url
                            for match in matches:
                                hyperlink.addExtent(match)
                            document.addAnnotation(hyperlink)

                # Similarly, for all the annotations...
                annotations = thearticle.findall(
                    './{http://www.rsc.org/schema/rscart38}art-back/{http://www.rsc.org/schema/rscart38}annotation')
                for rscannotation in annotations:
                    ann_id = rscannotation.attrib['id']
                    url = None
                    if ann_id in annotationsInArticle:
                        # Get the link id
                        link = rscannotation.findtext(
                            "./{http://www.rsc.org/schema/rscart38}link")
                        # If the annotation has a link, create an RSC
                        # ontology landing page for it.
                        if link is not None:
                            if link[:3] == 'AU:':
                                url = 'http://goldbook.iupac.org/%s.html' % link[3:]
                            else:
                                url = 'http://www.rsc.org/publishing/journals/prospect/ontology.asp?id=%s&MSID=%s' % (
                                    link, articleID)
                        if url is not None:
                            matches = document.search(
                                annotationText[ann_id],
                                spineapi.IgnoreCase + spineapi.WholeWordsOnly)
                            hyperlink = spineapi.Annotation()
                            hyperlink['concept'] = 'Hyperlink'
                            hyperlink['property:webpageUrl'] = url
                            for match in matches:
                                hyperlink.addExtent(match)
                            document.addAnnotation(hyperlink)
            # Only the first RSCMetadataLink annotation is considered.
            break
def on_ready_event(self, document):
    """Fetch Portland Press NLM data for this document and annotate it.

    Documents whose DOI prefix belongs to a Portland registrant are resolved
    (via the CrossRef redirect URL) to a volume/page pair, which is used to
    fetch the article's NLM from the Utopia ext service. Citations are then
    enriched with PubMed identifiers, and metadata / bibliography / table
    annotations are added to the document.
    """
    volume, page = None, None

    # Only send if the DOI has a Portland prefix
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None and doi[:7] in registrants:
        crossref_unixref = utopialib.utils.metadata(document, 'raw_crossref_unixref')
        if crossref_unixref is not None:
            # Parse CrossRef redirect URL to recover volume and page
            dom = etree.fromstring(crossref_unixref.encode('utf8'))
            resource = dom.findtext(
                'doi_record/crossref/journal/journal_article/doi_data/resource')
            if resource is not None:
                match = self.resourceRegExp.match(resource)
                if match is not None:
                    volume, page = match.groups()

    ### FIXME What information should be shown? Portland? BJ?
    # (A PublisherIdentity annotation used to be created here.)

    # If this document was resolved, off we go to fetch the NLM
    if None not in (volume, page):
        # Make a request to the utopia ext web service
        url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
        url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
        try:
            nlm = urllib2.urlopen(url, timeout=8).read()
        except:
            # NOTE(review): the original code also contained an unreachable
            # `return` after this re-raise; intent may have been best-effort.
            raise
        info = utopialib.nlm.parse(nlm)
        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True,
                                     recover=True,
                                     remove_blank_text=True,
                                     encoding='utf8')
            pmids = {citation['pmid']: citation['id']
                     for citation in info['citations']
                     if 'pmid' in citation and 'id' in citation}
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                            retmode='xml',
                                            rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath(
                        'PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy across any identifiers the citation lacks
                        for key_name, id_name in (('doi', 'doi'),
                                                  ('pmcid', 'pmc'),
                                                  ('pii', 'pii')):
                            found_id = idList.findtext(
                                'ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and found_id is not None:
                                citation[key_name] = found_id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                'images/biochemj.png', 'image/png')
            link['property:sourceTitle'] = 'Portland'
            link['property:sourceDescription'] = '''
                <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
            '''

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(
                info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = utopialib.utils.citation_to_annotation(
                            citation, concept='ForwardCitation')
                        # Provide direct PDF links where we know how to
                        # construct them.
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                'info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables: match each table caption in the document and attach
            # its XML as an uploadable annotation.
            for table_id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    matches = document.search(
                        regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print('*********** failed to match table:', table_id)
def on_ready_event(self, document):
    """Fetch publisher-hosted NLM for this document's DOI and annotate it.

    Asks the Utopia ext service for hosted NLM matching the DOI; on success,
    enriches citations with PubMed identifiers, adds publisher-identity,
    metadata and bibliography annotations, converts any OASIS-format tables
    to XHTML, and attaches table annotations where captions can be matched.
    Network failures are silently ignored (best effort).
    """
    # See if we have any publishers' NLM hosted for this DOI
    doi = utopialib.utils.metadata(document, 'identifiers[doi]')
    if doi is not None:
        info = None
        try:
            url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
            url += urllib.urlencode({'doi': doi.encode('utf8')})
            nlm = urllib2.urlopen(url, timeout=8).read()
            info = utopialib.nlm.parse(nlm)
        except (urllib2.URLError, socket.timeout):
            # info will remain None
            pass

        if info is not None and len(info) > 0:
            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True,
                                     recover=True,
                                     remove_blank_text=True,
                                     encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                            retmode='xml',
                                            rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath(
                        'PubmedArticle/PubmedData/ArticleIdList'):
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        # Copy across any identifiers the citation lacks
                        for key_name, id_name in (('doi', 'doi'),
                                                  ('pmcid', 'pmc'),
                                                  ('pii', 'pii')):
                            found_id = idList.findtext(
                                'ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and found_id is not None:
                                citation[key_name] = found_id

            # Generate sensible titles / descriptions / icons?
            journalTitle = info.get('publication-title', '')
            journalTitleSuffix = ''
            publisher = info.get('publisher', 'the publisher')
            if len(journalTitle) > 0:
                journalTitleSuffix = ' ({0})'.format(journalTitle)

            # Create Metadata link annotation
            link = document.newAccList('metadata', 100)
            link['property:sourceIcon'] = ''
            link['property:sourceTitle'] = publisher
            link['property:sourceDescription'] = '''
                <p>This information was provided by {0}{1}.</p>
            '''.format(publisher, journalTitleSuffix)

            # Publisher identity (only for ASPB journals by DOI prefix)
            if doi[:8] in ('10.1104/', '10.1105/'):
                annotation = spineapi.Annotation()
                annotation['concept'] = 'PublisherIdentity'
                logo = utopia.get_plugin_data_as_url('images/aspb_logo.png',
                                                     'image/png')
                webpageUrl = 'http://www.aspb.org/'
                title = publisher
                if doi.startswith('10.1104/'):
                    logo = utopia.get_plugin_data_as_url('images/pp_logo.png',
                                                         'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantphysiol.org/'
                elif doi.startswith('10.1105/'):
                    logo = utopia.get_plugin_data_as_url('images/tpc_logo.png',
                                                         'image/png')
                    title = journalTitle
                    webpageUrl = 'http://www.plantcell.org/'
                annotation['property:logo'] = logo
                annotation['property:title'] = title
                annotation['property:webpageUrl'] = webpageUrl
                document.addAnnotation(annotation, 'PublisherMetadata')
                link['property:sourceIcon'] = logo
                link['property:sourceTitle'] = title

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(
                info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            #######################################################################
            # Apply parsed data to document

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    if len(matches) > 0:
                        annotation = utopialib.utils.citation_to_annotation(
                            citation, concept='ForwardCitation')
                        # Provide direct PDF links where we know how to
                        # construct them.
                        if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                            citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format(
                                'info:doi/{0}'.format(citation['doi']))
                        if 'pmcid' in citation:
                            citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(
                                citation['pmcid'])
                        for match in matches:
                            annotation.addExtent(match)
                        document.addAnnotation(annotation, link['scratch'])

            # Tables
            for table_id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)

                    # Convert OASIS exchange tables into plain XHTML tables
                    ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                    xml = etree.fromstring(table['xml'])
                    if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                            # Map column names onto their column numbers so
                            # that colspans can be computed.
                            columns = {}
                            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                columns[colspec.get('colname')] = int(colspec.get('colnum'))
                            for section in tgroup.xpath('./oasis:thead|./oasis:tbody',
                                                        namespaces=ns):
                                isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                for row in section.xpath('./oasis:row', namespaces=ns):
                                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                                        colst = entry.get('namest')
                                        colend = entry.get('nameend')
                                        if colst is not None and colend is not None:
                                            colspan = columns[colend] - columns[colst] + 1
                                        else:
                                            colspan = 1
                                        if colspan > 1:
                                            entry.set('colspan', unicode(colspan))
                                        morerows = entry.get('morerows')
                                        if morerows is not None:
                                            rowspan = int(morerows) + 1
                                        else:
                                            rowspan = 1
                                        if rowspan > 1:
                                            entry.set('rowspan', unicode(rowspan))
                                        entry.tag = 'td'
                                    row.tag = 'tr'
                                if isHead:
                                    section.tag = 'thead'
                                else:
                                    section.tag = 'tbody'
                                xml.append(section)
                        xml.tag = 'table'
                        table['xml'] = etree.tostring(xml, encoding='utf8')

                    matches = document.search(
                        regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation['session:volatile'] = '1'
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print('*********** failed to match table:', table_id)
def on_ready_event(self, document):
    """Annotate every distinct GEO accession (GSE…) found in the document.

    Each distinct GSE identifier in the text is looked up via self._fetchGEO
    and rendered as a 'GEO' annotation over its occurrences. If the entry's
    PubMed ID matches the document's, an additional global (extent-less)
    annotation marks the data as associated with this article.
    """
    # Find distinguishing ID
    pmid = common.utils.metadata(document, 'pmid')

    # Compile distinct GEO IDs in the text, keeping every match extent
    matches = {}
    for match in document.search(r'GSE\d+', spineapi.IgnoreCase + spineapi.WholeWordsOnly + spineapi.RegExp):
        matches.setdefault(match.text(), []).append(match)

    # Create annotations for each GEO ID
    for gse, extents in matches.iteritems():
        entry = self._fetchGEO(gse)

        dates = u'Submitted {0}'.format(entry['submission_date'])
        if 'last_update_date' in entry:
            dates += u'; last updated {0}'.format(entry['last_update_date'])
        dates += '.'

        dataCitation = u'''<p>{0}. <strong>{1}</strong>.</p><p>{2}<br>({3})</p><p>{4}</p>'''.format(
            entry['contributors'].decode('utf8'),
            entry['title'].decode('utf8'),
            entry['overall_design'].decode('utf8'),
            entry['type'].decode('utf8'),
            dates)
        # The '{{0}}' below survives the outer .format() as a '{0}'
        # placeholder, filled in later with the (optional) summary.
        xhtml = u'<div class="box">{0}{{0}}<p>GEO Accession: <a href="{1}">{2}</a></p></div>'.format(
            dataCitation, entry['GEO_url'].decode('utf8'), gse)
        xhtml += u'<p><a href="{0}">Explore in InSilico DB...</a></p>'.format(entry['InSilicoDB_url'])
        srcdesc = '''<p>The <a href="http://www.ncbi.nlm.nih.gov/geo">Gene Expression Omnibus (GEO)</a> is a public repository that archives and freely distributes microarray, next-generation sequencing, and other forms of high-throughput functional genomic data submitted by the scientific community.</p>'''

        if entry.get('pubmed_id') == pmid:
            # add a global annotation
            annotation = spineapi.Annotation()
            annotation['concept'] = 'GEO'
            annotation['property:name'] = 'Gene Expression Omnibus'
            annotation['property:sourceDatabase'] = 'geo'
            annotation['property:description'] = '{0} (Data associated with this article)'.format(gse)
            annotation['property:sourceDescription'] = srcdesc
            annotation['property:xhtml'] = xhtml.format('')  # Keep summary blank
            document.addAnnotation(annotation)

        # Generate summary (first 32 words shown, rest behind 'read more')
        summary = entry.get('summary', '')
        if len(summary) > 0:
            summary_words = summary.split(' ')
            summary = u'<p><em>Summary:</em> '
            summary += u'{0}'.format(' '.join(summary_words[:32]))
            if len(summary_words) > 32:
                summary += u' <span class="readmore">{0}</span>'.format(' '.join(summary_words[32:]))
            summary += u'</p>'

        # local annotation covering each occurrence of this GSE ID
        annotation = spineapi.Annotation()
        annotation['concept'] = 'GEO'
        annotation['property:name'] = 'Gene Expression Omnibus'
        annotation['property:sourceDatabase'] = 'geo'
        annotation['property:description'] = gse
        annotation['property:sourceDescription'] = srcdesc
        annotation['property:xhtml'] = xhtml.format(summary)
        for extent in extents:
            annotation.addExtent(extent)
        document.addAnnotation(annotation)
def on_load_event(self, document): # Keep track of errors so that we can inform the user def add_error(component, method, category=None, message=None, exception=None): if exception is not None: if isinstance(exception, urllib2.URLError) and isinstance(exception.reason, socket.timeout): exception = exception.reason if isinstance(exception, socket.timeout): category = "timeout" message = "The server did not respond" elif isinstance(exception, urllib2.HTTPError): category = "server" message = unicode(getattr(exception, "reason", "The server did not respond as expected")) elif isinstance(exception, urllib2.URLError): category = "connection" message = unicode(getattr(exception, "reason", "The server could not be found")) error = spineapi.Annotation() error["concept"] = "Error" error["property:component"] = component error["property:method"] = method error["property:category"] = category if message is not None: error["property:message"] = message document.addAnnotation(error, "errors.metadata") def add_success(component, method): error = spineapi.Annotation() error["concept"] = "Success" error["property:component"] = component error["property:method"] = method error["property:category"] = "success" document.addAnnotation(error, "errors.metadata") metadata = {"scraped": {}, "arxiv": {}, "pubmed": {}, "pmc": {}, "crossref": {}, "utopia": {}} authors = [] publication = None volume = None issue = None year = None pages = None ################################################################################# # Scrape DOI and title doi = common.doi.scrape(document) metadata["scraped"]["doi"] = doi print "scraper: doi:", (doi and doi.encode("utf8")) title = common.title.scrape(document) metadata["scraped"]["title"] = title print "scraper: title:", (title and title.encode("utf8")) ################################################################################# # Scrape arXiv ID arxivid = common.arxiv.scrape(document) if arxivid is not None: metadata["scraped"]["arxivid"] = 
arxivid try: arxiv_results = common.arxiv.resolve(arxivid) if arxiv_results is not None: arxiv_results.update({":whence": "arxiv", ":weight": 10}) common.utils.store_metadata(document, **arxiv_results) except Exception as e: add_error("ArXiv", "resolve", exception=e) traceback.print_exc() else: add_success("ArXiv", "resolve") ################################################################################# # Fold in the CrossRef data issn = common.utils.metadata(document, "issn") if title is not None or doi is not None: if doi is None: try: xref_results = common.crossref.search(title) if len(xref_results) == 1: xref_title = xref_results[0].get("title") if xref_title is not None: print "crossref: resolved title:", xref_title.encode("utf8") # Accept the crossref title if present in the document (do magic dash pattern thing) xref_title = re.sub( ur"[^-\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", lambda x: re.escape(x.group(0)), xref_title, ) xref_title = re.sub( ur"[\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D-]+", lambda x: r"\p{{Pd}}{{{0}}}".format(len(x.group(0))), xref_title, ) # print 'crossref: resolved title pattern:', xref_title.encode('utf8') matches = document.search(xref_title, spineapi.RegExp + spineapi.IgnoreCase) if len(matches) > 0: doi = xref_results[0].get("doi") print "crossref: accepting resolved doi" except Exception as e: add_error("CrossRef", "search", exception=e) traceback.print_exc() else: add_success("CrossRef", "search") if doi is not None: # What is this DOI's article's title according to crossref? 
try: xref_results = common.crossref.resolve(doi) xref_results.update({":whence": "crossref", ":weight": 20}) xref_title = xref_results.get("title", "") if len(xref_title) > 0: print "crossref: resolved title:", xref_title.encode("utf8") if ( re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", xref_title).strip() ): # Fuzzy match print "crossref: titles match precisely" common.utils.store_metadata(document, **xref_results) else: # Accept the crossref title over the scraped title, if present in the document matches = document.findInContext("", xref_title, "") # Fuzzy match if len(matches) > 0: common.utils.store_metadata(document, **xref_results) title = xref_title print "crossref: overriding scraped title with crossref title" else: print "crossref: ignoring resolved metadata" # FIXME should we discard the DOI at this point? except Exception as e: add_error("CrossRef", "resolve", exception=e) traceback.print_exc() else: add_success("CrossRef", "resolve") ########################################################################################### # Fold in the PubMed data pii = common.utils.metadata(document, "pii") pmid = common.utils.metadata(document, "pmid") pmcid = common.utils.metadata(document, "pmcid") if pmid is None and doi is not None: # resolve on DOI try: pmid = common.pubmed.resolve(doi, "doi") except Exception as e: add_error("PubMed", "resolve", exception=e) traceback.print_exc() else: add_success("PubMed", "resolve") if pmid is None and title is not None: # resolve on title try: pubmed_results = common.pubmed.search(title) pubmed_title = pubmed_results.get("title", "").strip(" .") if len(pubmed_title) > 0: print "pubmed: resolved title:", pubmed_title.encode("utf8") pubmed_pmid = pubmed_results.get("pmid") print "pubmed: resolved pmid:", pubmed_pmid if ( re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", pubmed_title).strip() ): # Fuzzy match print "pubmed: titles match precisely" title = pubmed_title pmid = pubmed_pmid else: # 
Accept the pubmed title over the scraped title, if present in the document matches = document.findInContext("", pubmed_title, "") # Fuzzy match if len(matches) > 0: title = matches[0].text() pmid = pubmed_pmid print "pubmed: overriding scraped title with pubmed title" else: print "pubmed: ignoring resolved title" except Exception as e: add_error("PubMed", "search", exception=e) traceback.print_exc() else: add_success("PubMed", "search") if pmid is not None: try: nlm = common.pubmed.fetch(pmid) if nlm is not None: xml = etree.fromstring(nlm) pubmed_authors = [] for author in xml.findall("PubmedArticle/MedlineCitation/Article/AuthorList/Author"): name = u"" lastName = author.findtext("LastName") forename = author.findtext("ForeName") if lastName is not None: name = lastName + u", " if forename is not None: name += forename if len(name) > 0: pubmed_authors.append(name) if len(pubmed_authors) == 0: pubmed_authors = None pubmed_pmid = xml.findtext("PubmedArticle/MedlineCitation/PMID") common.utils.store_metadata( document, **{ ":whence": "pubmed", ":weight": 10, "raw_pubmed_nlm": nlm, "authors": pubmed_authors, "pmid": pubmed_pmid, "title": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/ArticleTitle"), "issn": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/ISSN[1]"), "doi": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'), "pmcid": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]'), "pii": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pii"]'), "publication-title": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/Title"), "volume": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume"), "issue": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue"), "year": xml.findtext( "PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate/Year" ), "pages": 
xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Pagination/MedlinePgn"), "abstract": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Abstract/AbstractText"), } ) pmid = pubmed_pmid or pmid # FIXME I'm sure the above should be in common.pubmed except Exception as e: add_error("PubMed", "fetch", exception=e) traceback.print_exc() else: add_success("PubMed", "fetch") ########################################################################################### # Fold in the PubMedCentral data if pmcid is None and doi is not None: # resolve on DOI try: pmcid = common.pmc.resolve(doi, "doi") except Exception as e: add_error("PubMed Central", "resolve", exception=e) traceback.print_exc() else: add_success("PubMed Central", "resolve") if pmcid is None and pmid is not None: # resolve on PubMed ID try: pmcid = common.pmc.resolve(pmid, "pmid") except Exception as e: add_error("PubMed Central", "resolve", exception=e) traceback.print_exc() else: add_success("PubMed Central", "resolve") if pmcid is not None: common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "pmcid": pmcid}) try: nlm = common.pmc.fetch(pmcid) if nlm is not None: common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "raw_pmc_nlm": nlm}) except Exception as e: add_error("PubMed Central", "fetch", exception=e) traceback.print_exc() else: add_success("PubMed Central", "fetch") ########################################################################################### scraped = metadata["scraped"] scraped.update({":whence": "document", ":weight": 5}) common.utils.store_metadata(document, **scraped)