def createAnnotation(self, domain, document, html, mentions):
    """Build a single 'Bio3DMInformation' annotation for a group of mentions.

    The descriptive properties (name, description, html, css, js) are taken
    from the first entry of ``mentions``; every text range of every mention
    is then attached to the annotation as an extent.

    Parameters:
        domain: first value written to 'property:sourceDatabase'.
        document: object providing ``substr(start, length)``.
        html: not used by this method.
        mentions: non-empty sequence of mappings with the keys
            'mentionType', 'formalRepresentation', 'html', 'css', 'js' and
            'textRangeList' (each range having 'start' and 'end').

    Returns:
        The populated Annotation.
    """
    primary = mentions[0]
    kind = primary['mentionType'].title()

    annotation = Annotation()
    annotation['concept'] = 'Bio3DMInformation'
    annotation['property:name'] = '%s: "%s"' % (kind, primary['formalRepresentation'])
    annotation['property:description'] = '3DM %s record' % kind
    # NOTE(review): this key is written twice; the literal below is the
    # second write. Confirm which value is actually intended.
    annotation['property:sourceDatabase'] = domain
    annotation['property:html'] = primary['html']
    annotation['property:css'] = primary['css']
    annotation['property:js'] = primary['js']
    annotation['property:sourceDatabase'] = 'bioprodict'
    annotation['property:sourceDescription'] = (
        '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> '
        '3DM information systems provide protein family-specific '
        'annotations for this article</p>'
    )
    annotation['session:overlay'] = 'hyperlink'
    annotation['session:color'] = '#336611'

    # Attach one extent per text range, across all mentions.
    for mention in mentions:
        for span in mention['textRangeList']:
            begin = int(span['start'])
            length = int(span['end']) - begin
            annotation.addExtent(document.substr(begin, length))

    return annotation
def createAnnotation(self, domain, document, html, mentions):
    """Build a single 'Bio3DMInformation' annotation for a group of mentions.

    The descriptive properties (name, description, html, css, js) are read
    from attributes of the first entry of ``mentions``; every text range of
    every mention is then attached to the annotation as an extent.

    Parameters:
        domain: first value written to 'property:sourceDatabase'.
        document: object providing ``substr(start, length)``.
        html: not used by this method.
        mentions: non-empty sequence of objects exposing ``mentionType``,
            ``formalRepresentation``, ``html``, ``css``, ``js`` and
            ``textRangeList`` (each range exposing ``start`` and ``end``).

    Returns:
        The populated Annotation.
    """
    primary = mentions[0]
    kind = primary.mentionType.title()

    annotation = Annotation()
    annotation["concept"] = "Bio3DMInformation"
    annotation["property:name"] = '%s: "%s"' % (kind, primary.formalRepresentation)
    annotation["property:description"] = "3DM %s record" % kind
    # NOTE(review): this key is written twice; the literal below is the
    # second write. Confirm which value is actually intended.
    annotation["property:sourceDatabase"] = domain
    annotation["property:html"] = primary.html
    annotation["property:css"] = primary.css
    annotation["property:js"] = primary.js
    annotation["property:sourceDatabase"] = "bioprodict"
    annotation["property:sourceDescription"] = (
        '<p><a href="http://www.bio-prodict.nl">Bio-Prodict\'s</a> '
        "3DM information systems provide protein family-specific "
        "annotations for this article</p>"
    )

    # Attach one extent per text range, across all mentions.
    for mention in mentions:
        for span in mention.textRangeList:
            begin = int(span.start)
            length = int(span.end) - begin
            annotation.addExtent(document.substr(begin, length))

    return annotation
def on_activate_event(self, document):
    """Annotate ``document`` with entities reported by the Reflect service.

    The whitespace-normalised page text is grouped into fragments of about
    ``maxTextFragmentSize`` characters. Each fragment is POSTed to
    http://reflect.ws/REST/GetEntities, the XML reply is parsed with
    ``self.parser``, and every item name that has at least one usable
    entity is searched for in the document (passing the fragment's first
    page as ``start``). One 'Reflection' annotation is created per item
    name and receives an extent for each accepted match; the annotations
    of each fragment are then added to the document.

    Parameters:
        document: the document to read page text from, search, and add
            annotations to.

    Errors raised by ``urllib2.urlopen``, the XML parser or ``int()`` are
    not caught here and propagate to the caller.
    """
    ns = {'r': 'Reflect'}
    nameTag = '{%s}name' % ns['r']
    typeTag = '{%s}type' % ns['r']
    identifierTag = '{%s}identifier' % ns['r']

    maxTextFragmentSize = 1000000
    textFragments = []
    seenItemNames = set()
    ignoredEntityTypes = [-11]

    # Retrieve the full text of the document, split into fragments. Each
    # fragment remembers the first page that contributed to it.
    for page in document.pages():
        pageText = re.sub(r'\s+', r' ', page.pageText())
        if not textFragments or len(textFragments[-1][0]) + len(
                pageText) > maxTextFragmentSize:
            textFragments.append([pageText, page])
        else:
            textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText

    for text, page in textFragments:
        # Package it as URL encoded form encoding
        payload = 'document=%s' % urllib.quote(text.encode('utf8'))

        # Send it off to the reflect server; always release the connection,
        # even if reading the body fails.
        response = urllib2.urlopen("http://reflect.ws/REST/GetEntities",
                                   payload, timeout=8)
        try:
            body = response.read()
        finally:
            response.close()

        # Parse response
        root = etree.fromstring(body, self.parser)

        reflections = {}
        annotations = {}

        for item in root.xpath('//r:item', namespaces=ns):
            nameElement = item.find(nameTag)
            if nameElement is None:
                # An item without a name cannot be searched for.
                continue
            itemName = etree.tostring(nameElement,
                                      method="text",
                                      encoding=unicode,
                                      with_tail=False).lower().strip()
            # An empty name would add an empty alternative to the regex.
            if not itemName or itemName in seenItemNames:
                continue
            for entity in item.xpath('.//r:entity', namespaces=ns):
                rawType = entity.findtext(typeTag)
                if rawType is None:
                    # The URL below formats the type with %d, so an entity
                    # without a type cannot be represented.
                    continue
                entityType = int(rawType)
                if entityType in ignoredEntityTypes:
                    continue
                entityIdentifier = entity.findtext(identifierTag)
                reflections.setdefault(itemName, set()).add(
                    (entityType, entityIdentifier))

        if not reflections:
            # Nothing new to look for; avoid searching with the empty
            # pattern '()'.
            continue

        # For each match, create an annotation that the UI will handle later
        regex = '(%s)' % '|'.join([re.escape(key) for key in reflections])
        matches = document.search(regex,
                                  IgnoreCase + WholeWordsOnly + RegExp,
                                  start=page)
        for match in matches:
            # NOTE(review): only matches whose wordArea()[1] is 0 are used;
            # presumably this filters on text orientation - confirm.
            if match.begin().wordArea()[1] != 0:
                continue
            itemName = match.text().lower().strip()
            annotation = annotations.get(itemName, None)
            if annotation is None and itemName in reflections:
                entities = ';'.join(
                    '%d.%s' % (entityType, entityIdentifier)
                    for (entityType, entityIdentifier) in reflections[itemName])
                annotation = Annotation()
                annotation['concept'] = 'Reflection'
                annotation['property:webpageUrl'] = \
                    'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % entities
                annotation['property:name'] = itemName
                annotation['session:overlay'] = 'hyperlink'
                annotation['session:color'] = '#0A0'
                annotations[itemName] = annotation
                # Later fragments skip names that already have an annotation.
                seenItemNames.add(itemName)
            if annotation is not None:
                annotation.addExtent(match)
            else:
                print("ERROR: matched '%s' but could not find in reflections map" % itemName.encode(
                    'utf8'))
                print(reflections.keys())

        document.addAnnotations(annotations.values())
def on_activate_event(self, document):
    """Annotate ``document`` with entities reported by the Reflect service.

    The whitespace-normalised page text is grouped into fragments of about
    ``maxTextFragmentSize`` characters. Each fragment is POSTed to
    http://reflect.ws/REST/GetEntities, the XML reply is parsed with
    ``self.parser``, and every item name that has at least one usable
    entity is searched for in the document (passing the fragment's first
    page as ``start``). One 'Reflection' annotation is created per item
    name and receives an extent for each accepted match; the annotations
    of each fragment are then added to the document.

    Parameters:
        document: the document to read page text from, search, and add
            annotations to.

    Errors raised by ``urllib2.urlopen``, the XML parser or ``int()`` are
    not caught here and propagate to the caller.
    """
    ns = {'r': 'Reflect'}
    nameTag = '{%s}name' % ns['r']
    typeTag = '{%s}type' % ns['r']
    identifierTag = '{%s}identifier' % ns['r']

    maxTextFragmentSize = 1000000
    textFragments = []
    seenItemNames = set()
    ignoredEntityTypes = [-11]

    # Retrieve the full text of the document, split into fragments. Each
    # fragment remembers the first page that contributed to it.
    for page in document.pages():
        pageText = re.sub(r'\s+', r' ', page.pageText())
        if not textFragments or len(textFragments[-1][0]) + len(pageText) > maxTextFragmentSize:
            textFragments.append([pageText, page])
        else:
            textFragments[-1][0] = textFragments[-1][0] + ' ' + pageText

    for text, page in textFragments:
        # Package it as URL encoded form encoding
        payload = 'document=%s' % urllib.quote(text.encode('utf8'))

        # Send it off to the reflect server; always release the connection,
        # even if reading the body fails.
        response = urllib2.urlopen("http://reflect.ws/REST/GetEntities", payload, timeout=8)
        try:
            body = response.read()
        finally:
            response.close()

        # Parse response
        root = etree.fromstring(body, self.parser)

        reflections = {}
        annotations = {}

        for item in root.xpath('//r:item', namespaces=ns):
            rawName = item.findtext(nameTag)
            if rawName is None:
                # An item without a name cannot be searched for.
                continue
            itemName = rawName.lower().strip()
            # An empty name would add an empty alternative to the regex.
            if not itemName or itemName in seenItemNames:
                continue
            for entity in item.xpath('.//r:entity', namespaces=ns):
                rawType = entity.findtext(typeTag)
                if rawType is None:
                    # The URL below formats the type with %d, so an entity
                    # without a type cannot be represented.
                    continue
                entityType = int(rawType)
                if entityType in ignoredEntityTypes:
                    continue
                entityIdentifier = entity.findtext(identifierTag)
                reflections.setdefault(itemName, set()).add((entityType, entityIdentifier))

        if not reflections:
            # Nothing new to look for; avoid searching with the empty
            # pattern '()'.
            continue

        # For each match, create an annotation that the UI will handle later
        regex = '(%s)' % '|'.join([re.escape(key) for key in reflections])
        matches = document.search(regex, IgnoreCase + WholeWordsOnly + RegExp, start=page)
        for match in matches:
            # NOTE(review): only matches whose wordArea()[1] is 0 are used;
            # presumably this filters on text orientation - confirm.
            if match.begin().wordArea()[1] != 0:
                continue
            itemName = match.text().lower().strip()
            annotation = annotations.get(itemName, None)
            if annotation is None and itemName in reflections:
                entities = ';'.join(
                    '%d.%s' % (entityType, entityIdentifier)
                    for (entityType, entityIdentifier) in reflections[itemName])
                annotation = Annotation()
                annotation['concept'] = 'Reflection'
                annotation['property:webpageUrl'] = \
                    'http://reflect.ws/fcgi-bin/solveAmbig.fcgi?entities=%s' % entities
                annotation['property:name'] = itemName
                annotation['session:overlay'] = 'hyperlink'
                annotation['session:color'] = '#0A0'
                annotations[itemName] = annotation
                # Later fragments skip names that already have an annotation.
                seenItemNames.add(itemName)
            if annotation is not None:
                annotation.addExtent(match)
            else:
                print("ERROR: matched '%s' but could not find in reflections map" % itemName.encode('utf8'))
                print(reflections.keys())

        document.addAnnotations(annotations.values())