def run(self):
    """Give every listed element type a unique ``id`` attribute.

    The document is loaded through ``NlmManipulate``. For each element name
    in the list below, every matching element that has no ``id`` attribute
    receives one of the form ``ID<uuid4>``; existing ids are left untouched.
    The tree is then written to ``self.gv.nlm_file_path`` and
    ``self.gv.nlm_temp_file_path``.
    """
    elements = [
        'abbrev', 'abstract', 'ack', 'address', 'aff', 'alt-text', 'app', 'app-group', 'array',
        'article-title', 'attrib', 'author-comment', 'author-notes', 'award-group', 'bio',
        'boxed-text', 'caption', 'chem-struct', 'chem-struct-wrap', 'col', 'colgroup', 'collab',
        'compound-kwd', 'contrib', 'contrib-group', 'corresp', 'custom-meta', 'def', 'def-item',
        'def-list', 'disp-formula', 'disp-formula-group', 'disp-quote', 'element-citation',
        'ext-link', 'fig', 'fig-group', 'fn', 'fn-group', 'funding-source', 'glossary',
        'glyph-data', 'graphic', 'inline-formula', 'inline-graphic',
        'inline-supplementary-material', 'institution', 'kwd', 'kwd-group', 'list', 'list-item',
        'long-desc', 'media', 'milestone-end', 'milestone-start', 'mixed-citation',
        'named-content', 'nlm-citation', 'note', 'notes', 'p', 'person-group', 'preformat',
        'product', 'ref', 'ref-list', 'related-article', 'related-object', 'response', 'sec',
        'sig', 'sig-block', 'source', 'speech', 'statement', 'sub-article',
        'supplementary-material', 'table', 'table-wrap', 'table-wrap-group', 'tbody', 'td',
        'term', 'tex-math', 'tfoot', 'th', 'thead', 'title', 'tr', 'trans-abstract',
        'trans-source', 'trans-title', 'trans-title-group', 'verse-group', 'xref',
    ]

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    for element in elements:
        self.debug.print_debug(self, u'Assigning ID to all {0} elements'.format(element))

        for item in tree.xpath(u'//{0}'.format(element)):
            if 'id' not in item.attrib:
                # str.format() stringifies the UUID itself; the Python 2-only
                # ``unicode`` builtin is unnecessary and fails on Python 3.
                item.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run(self):
    """Stamp an ``ID<uuid4>`` identifier onto listed elements lacking an ``id``.

    Works through the tag names below one at a time, logging each, and leaves
    elements that already carry an ``id`` alone. The result is written to
    both the NLM file path and the temporary NLM file path held on
    ``self.gv``.
    """
    tag_names = (
        'abbrev', 'abstract', 'ack', 'address', 'aff', 'alt-text', 'app', 'app-group', 'array',
        'article-title', 'attrib', 'author-comment', 'author-notes', 'award-group', 'bio',
        'boxed-text', 'caption', 'chem-struct', 'chem-struct-wrap', 'col', 'colgroup', 'collab',
        'compound-kwd', 'contrib', 'contrib-group', 'corresp', 'custom-meta', 'def', 'def-item',
        'def-list', 'disp-formula', 'disp-formula-group', 'disp-quote', 'element-citation',
        'ext-link', 'fig', 'fig-group', 'fn', 'fn-group', 'funding-source', 'glossary',
        'glyph-data', 'graphic', 'inline-formula', 'inline-graphic',
        'inline-supplementary-material', 'institution', 'kwd', 'kwd-group', 'list', 'list-item',
        'long-desc', 'media', 'milestone-end', 'milestone-start', 'mixed-citation',
        'named-content', 'nlm-citation', 'note', 'notes', 'p', 'person-group', 'preformat',
        'product', 'ref', 'ref-list', 'related-article', 'related-object', 'response', 'sec',
        'sig', 'sig-block', 'source', 'speech', 'statement', 'sub-article',
        'supplementary-material', 'table', 'table-wrap', 'table-wrap-group', 'tbody', 'td',
        'term', 'tex-math', 'tfoot', 'th', 'thead', 'title', 'tr', 'trans-abstract',
        'trans-source', 'trans-title', 'trans-title-group', 'verse-group', 'xref',
    )

    document = NlmManipulate(self.gv).load_dom_tree()

    for tag_name in tag_names:
        self.debug.print_debug(self, u'Assigning ID to all {0} elements'.format(tag_name))
        query = u'//{0}'.format(tag_name)

        for node in document.xpath(query):
            if 'id' in node.attrib:
                # never overwrite an identifier that is already present
                continue
            fresh_id = uuid.uuid4()
            node.attrib['id'] = u'ID{0}'.format(fresh_id)

    for destination in (self.gv.nlm_file_path, self.gv.nlm_temp_file_path):
        document.write(destination)
def run_prompt(self):
    """Interactively review every bibliographic ``xref`` in the document.

    Runs the non-interactive linker first (``self.run(False)``), then walks
    each ``//xref[@ref-type="bibr"]`` and prints whether it is still
    unhandled (``rid == 'TO_LINK'``) or which reference it points at. The
    user's choice is passed to ``self.handle_input``. An ``'abort'`` result
    saves and returns immediately; ``'delall'`` makes every later marker
    take the ``'d'`` selection without prompting. The tree is saved at the
    end.
    """
    self.run(False)
    self.debug.print_debug(self, u'Entering interactive mode')

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath('//back/ref-list/ref')

    # note that we don't want to exit even if there are no references to link because the user may want to delete
    # some

    opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id', 'skip Rest',
            'show Context')
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)
        prompt.print_(prompt.colorize('green', ("-" * 80)))

        if 'rid' in p.attrib and p.attrib['rid'] == 'TO_LINK':
            prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
        elif 'rid' in p.attrib:
            remote = next((x for x in ref_items
                           if 'id' in x.attrib and (x.attrib['id'] == p.attrib['rid'])), None)
            # Compare against None explicitly: an lxml element with no child
            # elements is falsy, so ``if remote`` would blank out the text of
            # a matched reference that only contains text.
            remote_text = manipulate.get_stripped_text(remote) if remote is not None else ''
            prompt.print_(u"Found a handled reference marker: \"{0}\" which links to \"{1}\""
                          .format(text, remote_text))

        if delete_all:
            sel = 'd'
        else:
            sel = prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

        if result == 'abort':
            manipulate.save_tree(tree)
            return
        elif result == 'delall':
            delete_all = True

    manipulate.save_tree(tree)
def process_database_references(self, db):
    """Replace reference-list entries with citations found in ``db``.

    For every ``//back/ref-list/ref`` whose text contains a year (19xx/20xx)
    or "n.d.", the text is split on commas and permutations of the parts
    are combined with the year into lookup keys. On the first key present
    in ``db`` the reference is replaced by the element parsed from
    ``obj.get_citation()``, which gets a fresh ``ID<uuid4>`` id, and every
    element whose ``rid`` pointed at the old reference is re-pointed.

    :param db: container supporting ``key in db`` and ``db[key]``; values
        must provide ``object_type()``, ``title`` and ``get_citation()``.
    :return: tuple ``(manipulate, master_tree)``; the tree is not saved here.
    """
    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()
    tree = master_tree.xpath('//back/ref-list/ref')

    # compiled once rather than on every reference
    year_test = re.compile(r'((19|20)\d{2})|(n\.d\.)')

    for element in tree:
        text = manipulate.get_stripped_text(element)
        match = year_test.search(text)

        if not match:
            continue

        # group(0) is the year or the literal "n.d."; the previous
        # ``groups(0)[0]`` produced the int 0 for "n.d." and raised TypeError
        # when concatenated.
        # NOTE(review): assumes "n.d."-prefixed keys are what db stores - confirm.
        year = match.group(0)

        # strip out elements in brackets that might scupper parsing
        text = re.sub(r'(.+?)(\(.+?\))(.*)', r'\1\3', text)
        list_split = [x.strip() for x in text.split(',')]

        if len(list_split) >= 10:
            # too many parts to permute
            continue

        replaced = False

        for length in range(1, len(list_split)):
            if replaced:
                break

            for permute in itertools.permutations(list_split, length):
                key = year + ''.join(permute).strip()

                # Text keys are looked up as UTF-16LE bytes. Testing for
                # ``bytes`` matches the old ``unicode`` check on Python 2
                # and also works on Python 3.
                if not isinstance(key, bytes):
                    key = key.encode("utf-16le")

                if key not in db:
                    continue

                obj = db[key]
                print('Found {0} in database "{1}"'.format(obj.object_type(), obj.title))

                new_element = etree.fromstring(obj.get_citation())
                hex_dig = u'ID{0}'.format(uuid.uuid4())
                new_element.attrib['id'] = hex_dig

                if 'id' in element.attrib:
                    current_id = element.attrib['id']
                    # Pass the id as an XPath variable: interpolated unquoted
                    # it was parsed as a node name and never matched.
                    referrers = master_tree.xpath('//*[@rid=$rid]', rid=current_id)

                    for link in referrers:
                        link.attrib['rid'] = hex_dig

                element.addnext(new_element)
                element.getparent().remove(element)
                replaced = True
                break

    return manipulate, master_tree
def process_database_references(self, db):
    """Replace reference-list entries with citations found in ``db``.

    For every ``//back/ref-list/ref`` whose text contains a year (19xx/20xx)
    or "n.d.", the text is split on commas and permutations of the parts
    are combined with the year into lookup keys. On the first key present
    in ``db`` the reference is replaced by the element parsed from
    ``obj.get_citation()``, whose id becomes the SHA-256 hex digest of the
    key, and every element whose ``rid`` pointed at the old reference is
    re-pointed.

    :param db: container supporting ``key in db`` and ``db[key]``; values
        must provide ``object_type()``, ``title`` and ``get_citation()``.
    :return: tuple ``(manipulate, master_tree)``; the tree is not saved here.
    """
    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()
    tree = master_tree.xpath('//back/ref-list/ref')

    # compiled once rather than on every reference
    year_test = re.compile(r'((19|20)\d{2})|(n\.d\.)')

    for element in tree:
        text = manipulate.get_stripped_text(element)
        match = year_test.search(text)

        if not match:
            continue

        # group(0) is the year or the literal "n.d."; the previous
        # ``groups(0)[0]`` produced the int 0 for "n.d." and raised TypeError
        # when concatenated.
        # NOTE(review): assumes "n.d."-prefixed keys are what db stores - confirm.
        year = match.group(0)

        # strip out elements in brackets that might scupper parsing
        text = re.sub(r'(.+?)(\(.+?\))(.*)', r'\1\3', text)
        list_split = [x.strip() for x in text.split(',')]

        if len(list_split) >= 10:
            # too many parts to permute
            continue

        replaced = False

        for length in range(1, len(list_split)):
            if replaced:
                break

            for permute in itertools.permutations(list_split, length):
                key = year + ''.join(permute).strip()

                # Text keys are looked up (and hashed) as UTF-16LE bytes.
                # Testing for ``bytes`` matches the old ``unicode`` check on
                # Python 2 and also works on Python 3.
                if not isinstance(key, bytes):
                    key = key.encode("utf-16le")

                if key not in db:
                    continue

                obj = db[key]
                print('Found {0} in database "{1}"'.format(obj.object_type(), obj.title))

                new_element = etree.fromstring(obj.get_citation())
                # NOTE(review): a hex digest may begin with a digit, which is
                # not a valid XML ID start character - confirm consumers cope.
                hex_dig = hashlib.sha256(key).hexdigest()
                new_element.attrib['id'] = hex_dig

                if 'id' in element.attrib:
                    current_id = element.attrib['id']
                    # Pass the id as an XPath variable: interpolated unquoted
                    # it was parsed as a node name (or a number) and never
                    # matched the attribute value.
                    referrers = master_tree.xpath('//*[@rid=$rid]', rid=current_id)

                    for link in referrers:
                        link.attrib['rid'] = hex_dig

                element.addnext(new_element)
                element.getparent().remove(element)
                replaced = True
                break

    return manipulate, master_tree
def run_prompt(self):
    """Interactively review every bibliographic ``xref`` in the document.

    Runs the non-interactive linker first (``self.run(False)``), then walks
    each ``//xref[@ref-type="bibr"]`` and prints whether it is still
    unhandled (``rid == "TO_LINK"``) or which reference it points at. The
    user's choice is passed to ``self.handle_input``. An ``"abort"`` result
    saves and returns immediately; ``"delall"`` makes every later marker
    take the ``"d"`` selection without prompting. The tree is saved at the
    end.
    """
    self.run(False)
    self.debug.print_debug(self, u"Entering interactive mode")

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath("//back/ref-list/ref")

    # note that we don't want to exit even if there are no references to link because the user may want to delete
    # some

    opts = (
        "Skip",
        "Delete",
        "deleTe all",
        "Enter search",
        "Ibid",
        "enter Link id",
        "skip Rest",
        "show Context",
    )
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)

        if "rid" in p.attrib and p.attrib["rid"] == "TO_LINK":
            prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
        elif "rid" in p.attrib:
            remote = next((x for x in ref_items if "id" in x.attrib and (x.attrib["id"] == p.attrib["rid"])), None)
            # A dangling rid leaves ``remote`` as None; show an empty target
            # instead of handing None to get_stripped_text. The check is
            # ``is not None`` because childless lxml elements are falsy.
            remote_text = manipulate.get_stripped_text(remote) if remote is not None else ""
            prompt.print_(u'Found a handled reference marker: "{0}" which links to "{1}"'.format(text, remote_text))

        if delete_all:
            sel = "d"
        else:
            sel = prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

        if result == "abort":
            manipulate.save_tree(tree)
            return
        elif result == "delall":
            delete_all = True

    manipulate.save_tree(tree)
def prune(self):
    """Unwrap every still-unlinked bibliographic stub and save the document.

    A stub is an ``xref`` with ``ref-type="bibr"`` whose ``rid`` is still
    the placeholder ``TO_LINK``; each one is passed to
    ``self.extract_contents``.
    """
    self.debug.print_debug(self, u'Deleting all stubs from article')

    editor = NlmManipulate(self.gv)
    document = editor.load_dom_tree()
    stubs = document.xpath('//xref[@ref-type="bibr" and @rid="TO_LINK"]')

    for stub in stubs:
        self.extract_contents(stub)

    editor.save_tree(document)
def link_items(self, source_id, dest_id, manipulate=None, tree=None):
    """Link one ``xref`` to one ``ref`` by id and save the tree.

    :param source_id: ``id`` of the ``xref`` element to link from.
    :param dest_id: ``id`` of the ``ref`` element to link to.
    :param manipulate: optional ``NlmManipulate``; created from ``self.gv``
        when omitted.
    :param tree: optional already-loaded tree; loaded via ``manipulate``
        when omitted.
    :raises IndexError: if either id matches no element.
    """
    message = u'Attempting to link XREF {0} to REF {1}'.format(source_id, dest_id)
    self.debug.print_debug(self, message)

    editor = manipulate if manipulate is not None else NlmManipulate(self.gv)
    document = tree if tree is not None else editor.load_dom_tree()

    xref_matches = document.xpath('//xref[@id="{0}"]'.format(source_id))
    origin = xref_matches[0]
    ref_matches = document.xpath('//ref[@id="{0}"]'.format(dest_id))
    target = ref_matches[0]

    linker = ReplaceObject(self.gv, origin, target)
    linker.link()

    editor.save_tree(document)
def run_ext_link_compliance(self):
    """Move each ``graphic`` nested in an ``ext-link`` out of that link.

    Every ``//ext-link/graphic`` is re-inserted into the link's parent at
    the position just after the link. The tree is then written to both NLM
    file paths held on ``self.gv``.
    """
    self.debug.print_debug(self, u'Attempting to correct any mis-nested graphics elements')

    document = NlmManipulate(self.gv).load_dom_tree()

    for graphic in document.xpath('//ext-link/graphic'):
        enclosing_link = graphic.getparent()
        container = enclosing_link.getparent()
        position_after_link = container.index(enclosing_link) + 1
        container.insert(position_after_link, graphic)

    for destination in (self.gv.nlm_file_path, self.gv.nlm_temp_file_path):
        document.write(destination)
def run_prompt(self):
    """Interactively review every bibliographic ``xref`` in the document.

    Runs the non-interactive linker first (``self.run(False)``), then walks
    each ``//xref[@ref-type="bibr"]`` and prints whether it is still
    unhandled (``rid == 'TO_LINK'``) or which reference it points at. The
    user's choice is passed to ``self.handle_input``. An ``'abort'`` result
    saves and returns immediately; ``'delall'`` makes every later marker
    take the ``'d'`` selection without prompting. The tree is saved at the
    end.
    """
    self.run(False)
    self.debug.print_debug(self, u'Entering interactive mode')

    prompt = Interactive(self.gv)
    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    ref_items = tree.xpath('//back/ref-list/ref')

    # note that we don't want to exit even if there are no references to link because the user may want to delete
    # some

    opts = ('Skip', 'Delete', 'deleTe all', 'Enter search', 'Ibid', 'enter Link id', 'skip Rest',
            'show Context')
    delete_all = False

    for p in tree.xpath('//xref[@ref-type="bibr"]'):
        text = manipulate.get_stripped_text(p)
        prompt.print_(prompt.colorize('green', ("-" * 80)))

        if 'rid' in p.attrib and p.attrib['rid'] == 'TO_LINK':
            prompt.print_(u"Found an unhandled reference marker: {0}".format(text))
        elif 'rid' in p.attrib:
            remote = next((x for x in ref_items
                           if 'id' in x.attrib and (x.attrib['id'] == p.attrib['rid'])), None)
            # A dangling rid leaves ``remote`` as None; show an empty target
            # instead of handing None to get_stripped_text. The check is
            # ``is not None`` because childless lxml elements are falsy.
            remote_text = manipulate.get_stripped_text(remote) if remote is not None else ''
            prompt.print_(u"Found a handled reference marker: \"{0}\" which links to \"{1}\""
                          .format(text, remote_text))

        if delete_all:
            sel = 'd'
        else:
            sel = prompt.input_options(opts)

        result = self.handle_input(manipulate, opts, p, prompt, ref_items, sel, tree=tree)

        if result == 'abort':
            manipulate.save_tree(tree)
            return
        elif result == 'delall':
            delete_all = True

    manipulate.save_tree(tree)
def run(self, interactive):
    """Link in-text citations to entries in the reference list.

    With ``interactive`` true this only delegates to ``self.run_prompt()``.
    Otherwise it:

    1. strips ``ext-link`` elements whose xlink ``href`` is empty;
    2. cleans the reference items and indexes those whose text begins with
       a number;
    3. scans ``//sec//p`` (without MathML children) and ``//td`` for
       ``[1, 2-4]``-style or ``(Author 2000; ...)``-style markers and
       creates ``ReplaceStub`` objects for them;
    4. resolves numeric stubs (``rid="TO_LINK_NUMBER"``) by leading number
       or by position in the reference list;
    5. resolves the remaining ``rid="TO_LINK"`` stubs by word matching
       against reference text;
    6. saves the tree.

    :param interactive: when true, run the interactive prompt instead.
    """
    if interactive:
        self.run_prompt()
        return

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    # Patterns compiled once, not per element, and written as raw strings.
    ref_match = re.compile(r'^(?P<number>\d+)\.*')
    reference_test = re.compile(r'\((?P<text>[^%]+?)\)')
    sub_match = re.compile(r'\[(?P<square>\d*[,\-;\d\s]*)\]')

    # pre-cleanup: remove all empty ext-links as these break the linker
    items_to_clean = tree.xpath('//ext-link')
    xlink_href = '{http://www.w3.org/1999/xlink}href'
    count = 0

    for item in items_to_clean:
        if item.attrib.get(xlink_href) == '':
            count += 1
            item.tag = 'REMOVE'
            etree.strip_tags(item.getparent(), 'REMOVE')

    if count > 0:
        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))

    ref_items = tree.xpath('//back/ref-list/ref')

    self.clean_ref_items(tree, ref_items, manipulate)

    # handle numbered reference items
    references_and_numbers = {}

    for ref in ref_items:
        text = manipulate.get_stripped_text(ref)
        result = ref_match.match(text)

        if result:
            references_and_numbers[result.group('number')] = ref

    parsed = self.process_ibid_authors(ref_items)

    if parsed > 0:
        manipulate.save_tree(tree)
        self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))

    to_link = []
    to_stub = []

    square_bracket_count = {}

    for p in tree.xpath('//sec//p[not(mml:math)] | //td',
                        namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):
        text = manipulate.get_stripped_text(p)

        matches = reference_test.finditer(text)

        # exclude any square brackets with numbers inside
        smatch = sub_match.search(text)

        if smatch:
            for smatch in sub_match.finditer(text):
                self.debug.print_debug(self, u'Handling references in square '
                                             u'brackets: [{0}] '.format(smatch.group('square')))

                for item in re.split(';|,', smatch.group('square')):
                    if '-' in item:
                        parent, tail = manipulate.find_text(p, item)

                        if parent is not None:
                            # Expand "3-5" to "3,4,5" in the document text.
                            # ``item`` never contains a comma (it came from
                            # the split above), so a plain join is enough.
                            try:
                                split_range = item.strip().split('-')
                                new_string = ','.join(
                                    str(no) for no in range(int(split_range[0]), int(split_range[1]) + 1))
                            except Exception:
                                # best effort: give up on this bracket group
                                # but do not swallow KeyboardInterrupt/SystemExit
                                self.debug.print_debug(self, u'Unable to parse reference '
                                                             u'number in range {0}'.format(item))
                                break

                            if tail and new_string != '':
                                parent.tail = parent.tail.replace(item, new_string)
                            elif not tail and new_string != '':
                                parent.text = parent.text.replace(item, new_string)

                            try:
                                split_range = item.strip().split('-')
                                for no in range(int(split_range[0]), int(split_range[1]) + 1):
                                    self.debug.print_debug(self, u'Parsing reference '
                                                                 u'number in range {0}'.format(str(no)))
                                    to_stub.append(ReplaceStub(self.gv, p, str(no), tree, manipulate,
                                                               'TO_LINK_NUMBER', length_ignore=True))
                            except Exception:
                                self.debug.print_debug(self, u'Unable to parse reference '
                                                             u'number in range {0}'.format(item))
                                break
                        else:
                            # just replace the components
                            split_range = item.strip().split('-')
                            for link in split_range:
                                to_stub.append(ReplaceStub(self.gv, p, link, tree, manipulate,
                                                           'TO_LINK_NUMBER', length_ignore=True))
                    else:
                        if len(item.strip()) < 60:
                            to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate,
                                                       'TO_LINK_NUMBER', length_ignore=True))
                            square_bracket_count[item.strip()] = 1
        else:
            for match in matches:
                for item in match.group('text').split(u';'):
                    if len(item.strip()) < 60:
                        to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate))

    for link in to_stub:
        link.link(to_stub)

    etree.strip_elements(tree, 'REMOVE')

    use_index_method = False

    if len(square_bracket_count) != len(references_and_numbers):
        # The number of distinct bracketed markers differs from the number of
        # references that start with a number, so fall back to the /index/
        # of the reference item (-1 for zero-based compensation).
        self.debug.print_debug(self, u'Using indexical method for square bracket correlation')
        use_index_method = True

    if len(ref_items) == 0:
        self.debug.print_debug(self, u'Found no references to link')
        manipulate.save_tree(tree)
        return

    for p in tree.xpath('//xref[@rid="TO_LINK_NUMBER"]'):
        text = manipulate.get_stripped_text(p)

        if not use_index_method:
            if text in references_and_numbers:
                ReplaceObject(self.gv, p, references_and_numbers[text]).link()
            else:
                p.attrib['rid'] = 'TO_LINK'
        else:
            try:
                ReplaceObject(self.gv, p, ref_items[int(text) - 1]).link()
            except Exception:
                # non-numeric text, out-of-range index or a linking failure:
                # leave the marker for the text-matching pass below
                self.debug.print_debug(self, u'Failed to link to reference {0} + 1 using '
                                             u'indexical method'.format(text))
                p.attrib['rid'] = 'TO_LINK'

    # Character classes used both as regex patterns and, for the first one,
    # as the character set given to str.strip(). The values are identical to
    # the former non-raw literals.
    replace_chars = r'''[,\.\<\>\(\)\;\:\@'\#\~\}\{\[\]"]'''
    replace_chars_and_digits = r'''[,\.\<\>\(\)\;\:\@'\#\~\}\{\[\]"\d]'''
    strip_punctuation = re.compile(replace_chars)
    strip_punctuation_and_digits = re.compile(replace_chars_and_digits)

    last_index = len(ref_items) - 1

    for p in tree.xpath('//xref[@rid="TO_LINK"]'):
        text = manipulate.get_stripped_text(p)

        bare_items = text.strip().replace(u',', '').split(u' ')

        for ref_index, ref in enumerate(ref_items):
            found = True

            bare_ref = manipulate.get_stripped_text(ref)
            bare_refs = bare_ref.split(' ')

            # First pass: every word of the marker must equal some word of
            # the reference once punctuation is removed.
            for sub_item in bare_items:
                cleaned_item = strip_punctuation.sub('', sub_item.strip()).strip()
                found_ref = False

                for sub_ref in bare_refs:
                    if cleaned_item == sub_ref.strip(replace_chars):
                        found_ref = True
                        break

                if not found_ref:
                    found = False

            if len(bare_items) > 0 and found:
                to_link.append(ReplaceObject(self.gv, p, ref))

            elif len(bare_items) > 0:
                # Second pass: the same comparison with digits removed too.
                found = True

                for sub_item in bare_items:
                    found_ref = False
                    subbed_text = strip_punctuation_and_digits.sub('', sub_item.strip()).strip()

                    for sub_ref in bare_refs:
                        sub_ref = strip_punctuation_and_digits.sub('', sub_ref.strip()).strip()

                        if subbed_text == '' and len(bare_items) > 1:
                            found_ref = True
                            break

                        if subbed_text == sub_ref and subbed_text != '' and sub_ref != '':
                            found_ref = True
                            break

                    if not found_ref:
                        found = False

                # we don't allow linking to the last item here because it is almost universally wrong
                if len(bare_items) > 0 and found and ref_index != last_index:
                    to_link.append(ReplaceObject(self.gv, p, ref))

    if len(to_link) == 0:
        self.debug.print_debug(self, u'Found no references to link')

    for link in to_link:
        link.link()

    manipulate.save_tree(tree)
def run_tables(self):
    """Give ``table-wrap`` elements a label and caption, then link mentions.

    For each ``//table-wrap`` the method looks for text shaped like
    "<label> <number>: caption" or "<label> <number>. caption" in the next
    sibling ``p``, then the previous sibling ``p``, then the ``title`` of
    the enclosing element. On a match the text before the separator becomes
    the table's ``label`` and the rest becomes a ``caption`` paragraph; the
    source paragraph or title is removed. If a section title was consumed,
    the section's contents are merged into the previous sibling section, or
    else the parent section. Finally ``self.link`` is called with the
    collected ids and titles, and the tree is written to both NLM file
    paths.
    """
    self.debug.print_debug(
        self, u'Attempting to classify captions for table objects')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    tables = tree.xpath('//table-wrap')

    table_titles = []
    table_ids = []

    table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
    table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

    separator = ':'

    for table in tables:
        use_next = False
        use_previous = False
        used_title = False

        # get the next sibling
        p = table.getnext()
        pprev = table.getprevious()
        old_title = None

        if p is not None and p.tag == 'p':
            # a paragraph holding a graphic is not a caption
            if not any(sub.tag == 'graphic' for sub in p):
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'

        # Check for None before iterating: a table that is the first child
        # has no previous sibling, and iterating None raised TypeError.
        if not use_next and pprev is not None and pprev.tag == 'p':
            if not any(sub.tag == 'graphic' for sub in pprev):
                text = manipulate.get_stripped_text(pprev)

                if table_regex_colon.match(text):
                    use_previous = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_previous = True
                    separator = '.'

        # NOTE(review): this parses as ``(not use_next) or use_previous``, so
        # a matching section title overrides a matching previous paragraph.
        # Confirm whether ``not (use_next or use_previous)`` was intended.
        if not use_next or use_previous:
            # see if the title in this section potentially contains text we can match
            parent = table.getparent()
            titles = parent.xpath('title')

            if len(titles) > 0:
                p = titles[0]
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                    used_title = True
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'
                    used_title = True

        if use_next or use_previous:
            if use_next:
                text = manipulate.get_stripped_text(p)
            else:
                text = manipulate.get_stripped_text(pprev)
                p = pprev

            # likely this is a table identifier
            # Split at the first separator only, so later separators (for
            # example decimal points) stay in the caption.
            title, _, caption = text.partition(separator)
            caption = caption.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if p.tag.endswith('title'):
                # leave an empty title behind so it can be stripped later
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                p.addnext(new_title)
                p.getparent().remove(p)
            else:
                p.getparent().remove(p)

            p = new_p

            self.debug.print_debug(
                self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing title element if one exists
            try:
                title_element = table.xpath('label')[0]
            except IndexError:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, p, self)
            table.insert(1, caption_element)

            if 'id' not in table.attrib:
                # str.format() stringifies the UUID; no Python 2 ``unicode``
                table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if used_title:
                # if we took the title out, then we should move the parent into its previous sibling and then
                # strip tags
                old_title.tag = 'REMOVE'
                etree.strip_elements(tree, 'REMOVE')

                section = table.getparent()
                previous = section.getprevious()

                while previous is not None and not previous.tag.endswith('sec'):
                    previous = previous.getprevious()

                if previous is not None:
                    previous.append(section)
                    section.tag = 'REMOVE'
                    etree.strip_tags(tree, 'REMOVE')
                    self.debug.print_debug(
                        self, u'Moved table and siblings to previous section')
                else:
                    previous = section.getparent()

                    if previous is not None and previous.tag.endswith('sec'):
                        previous.append(section)
                        section.tag = 'REMOVE'
                        etree.strip_tags(tree, 'REMOVE')
                        self.debug.print_debug(
                            self, u'Moved table and siblings to parent section')

    paragraphs = tree.xpath('//p')

    self.link(table_ids, table_titles, paragraphs, 'table')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run_tables(self):
    """Give ``table-wrap`` elements a label and caption, then link mentions.

    For each ``//table-wrap`` the method looks for text shaped like
    "<label> <number>: caption" or "<label> <number>. caption" in the next
    sibling ``p``, then the previous sibling ``p``, then the ``title`` of
    the enclosing element. On a match the text before the separator becomes
    the table's ``label`` and the rest becomes a ``caption`` paragraph; the
    source paragraph or title is removed. If a section title was consumed,
    the section's contents are merged into the previous sibling section, or
    else the parent section. Finally ``self.link`` is called with the
    collected ids and titles, and the tree is written to both NLM file
    paths.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for table objects')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    tables = tree.xpath('//table-wrap')

    table_titles = []
    table_ids = []

    table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
    table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

    separator = ':'

    for table in tables:
        use_next = False
        use_previous = False
        used_title = False

        # get the next sibling
        p = table.getnext()
        pprev = table.getprevious()
        old_title = None

        if p is not None and p.tag == 'p':
            # a paragraph holding a graphic is not a caption
            if not any(sub.tag == 'graphic' for sub in p):
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'

        # Check for None before iterating: a table that is the first child
        # has no previous sibling, and iterating None raised TypeError.
        if not use_next and pprev is not None and pprev.tag == 'p':
            if not any(sub.tag == 'graphic' for sub in pprev):
                text = manipulate.get_stripped_text(pprev)

                if table_regex_colon.match(text):
                    use_previous = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_previous = True
                    separator = '.'

        # NOTE(review): this parses as ``(not use_next) or use_previous``, so
        # a matching section title overrides a matching previous paragraph.
        # Confirm whether ``not (use_next or use_previous)`` was intended.
        if not use_next or use_previous:
            # see if the title in this section potentially contains text we can match
            parent = table.getparent()
            titles = parent.xpath('title')

            if len(titles) > 0:
                p = titles[0]
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                    used_title = True
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'
                    used_title = True

        if use_next or use_previous:
            if use_next:
                text = manipulate.get_stripped_text(p)
            else:
                text = manipulate.get_stripped_text(pprev)
                p = pprev

            # likely this is a table identifier
            # Split at the first separator only, so later separators (for
            # example decimal points) stay in the caption.
            title, _, caption = text.partition(separator)
            caption = caption.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if p.tag.endswith('title'):
                # leave an empty title behind so it can be stripped later
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                p.addnext(new_title)
                p.getparent().remove(p)
            else:
                p.getparent().remove(p)

            p = new_p

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing title element if one exists
            try:
                title_element = table.xpath('label')[0]
            except IndexError:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, p, self)
            table.insert(1, caption_element)

            if 'id' not in table.attrib:
                # str.format() stringifies the UUID; no Python 2 ``unicode``
                table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if used_title:
                # if we took the title out, then we should move the parent into its previous sibling and then
                # strip tags
                old_title.tag = 'REMOVE'
                etree.strip_elements(tree, 'REMOVE')

                section = table.getparent()
                previous = section.getprevious()

                while previous is not None and not previous.tag.endswith('sec'):
                    previous = previous.getprevious()

                if previous is not None:
                    previous.append(section)
                    section.tag = 'REMOVE'
                    etree.strip_tags(tree, 'REMOVE')
                    self.debug.print_debug(self, u'Moved table and siblings to previous section')
                else:
                    previous = section.getparent()

                    if previous is not None and previous.tag.endswith('sec'):
                        previous.append(section)
                        section.tag = 'REMOVE'
                        etree.strip_tags(tree, 'REMOVE')
                        self.debug.print_debug(self, u'Moved table and siblings to parent section')

    paragraphs = tree.xpath('//p')

    self.link(table_ids, table_titles, paragraphs, 'table')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run_tables(self):
    """Give ``table-wrap`` elements a label and caption, then link mentions.

    For each ``//table-wrap`` the method looks for text shaped like
    "<label> <number>: caption" or "<label> <number>. caption" in the next
    sibling ``p``, then the previous sibling ``p``, then the ``title`` of
    the enclosing element. On a match the text before the separator becomes
    the table's ``label`` and the rest becomes a ``caption`` paragraph; the
    source paragraph or title is removed. If a section title was consumed,
    the section's contents are merged into the previous sibling section, or
    else the parent section.

    When none of those produce a caption, the first child of the inner
    ``table`` is promoted to a caption if it has fewer children than the
    second and its first cell's text differs more from the second's than
    the second's does from the third's (edit distance).

    Finally ``self.link`` is called with the collected ids and titles, and
    the tree is written to both NLM file paths.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for table objects')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()
    tables = tree.xpath('//table-wrap')

    table_titles = []
    table_ids = []

    table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
    table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

    separator = ':'

    for table in tables:
        caption_element = None
        use_next = False
        use_previous = False
        used_title = False

        # get the next sibling
        p = table.getnext()
        pprev = table.getprevious()
        old_title = None

        if p is not None and p.tag == 'p':
            # a paragraph holding a graphic is not a caption
            if not any(sub.tag == 'graphic' for sub in p):
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'

        # Check for None before iterating: a table that is the first child
        # has no previous sibling, and iterating None raised TypeError.
        if not use_next and pprev is not None and pprev.tag == 'p':
            if not any(sub.tag == 'graphic' for sub in pprev):
                text = manipulate.get_stripped_text(pprev)

                if table_regex_colon.match(text):
                    use_previous = True
                    separator = ':'
                elif table_regex_dot.match(text):
                    use_previous = True
                    separator = '.'

        # NOTE(review): this parses as ``(not use_next) or use_previous``, so
        # a matching section title overrides a matching previous paragraph.
        # Confirm whether ``not (use_next or use_previous)`` was intended.
        if not use_next or use_previous:
            # see if the title in this section potentially contains text we can match
            parent = table.getparent()
            titles = parent.xpath('title')

            if len(titles) > 0:
                p = titles[0]
                text = manipulate.get_stripped_text(p)

                if table_regex_colon.match(text):
                    use_next = True
                    separator = ':'
                    used_title = True
                elif table_regex_dot.match(text):
                    use_next = True
                    separator = '.'
                    used_title = True

        if use_next or use_previous:
            if use_next:
                text = manipulate.get_stripped_text(p)
            else:
                text = manipulate.get_stripped_text(pprev)
                p = pprev

            # likely this is a table identifier
            # Split at the first separator only, so later separators (for
            # example decimal points) stay in the caption.
            title, _, caption = text.partition(separator)
            title = title.strip()
            caption = caption.strip()

            # strip all formatting from caption for ease of parsing
            # TODO: preserve formatting (far harder)
            new_p = etree.Element('p')
            new_p.text = caption

            if p.tag.endswith('title'):
                # leave an empty title behind so it can be stripped later
                new_title = etree.Element('title')
                new_title.text = ''
                old_title = new_title
                p.addnext(new_title)
                p.getparent().remove(p)
            else:
                p.getparent().remove(p)

            p = new_p

            self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

            # use an existing title element if one exists
            try:
                title_element = table.xpath('label')[0]
            except IndexError:
                title_element = etree.Element('label')
                table.insert(0, title_element)

            title_element.text = title

            caption_element = etree.Element('caption')
            NlmManipulate.append_safe(caption_element, p, self)
            table.insert(1, caption_element)

            if 'id' not in table.attrib:
                table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

            table_titles.append(title)
            table_ids.append(table.attrib['id'])

            if used_title:
                # if we took the title out, then we should move the parent into its previous sibling and then
                # strip tags
                old_title.tag = 'REMOVE'
                etree.strip_elements(tree, 'REMOVE')

                section = table.getparent()
                previous = section.getprevious()

                while previous is not None and not previous.tag.endswith('sec'):
                    previous = previous.getprevious()

                if previous is not None:
                    previous.append(section)
                    section.tag = 'REMOVE'
                    etree.strip_tags(tree, 'REMOVE')
                    self.debug.print_debug(self, u'Moved table and siblings to previous section')
                else:
                    previous = section.getparent()

                    if previous is not None and previous.tag.endswith('sec'):
                        previous.append(section)
                        section.tag = 'REMOVE'
                        etree.strip_tags(tree, 'REMOVE')
                        self.debug.print_debug(self, u'Moved table and siblings to parent section')

        # If none of that worked, try to find caption in table rows
        if caption_element is None:
            inner_table = table.find("table")
            # a table-wrap without a direct <table> child has no rows to inspect
            table_rows = list(inner_table) if inner_table is not None else []

            # Check if first row has fewer columns than others
            # Therefore not likely to be data or a header
            columns_count = {}
            first_column = {}

            for row_number, row in enumerate(table_rows, start=1):
                cells = list(row)
                columns_count[row_number] = len(cells)
                # missing cell or missing text both count as empty text so
                # the distance comparison below always receives strings
                first_column[row_number] = (cells[0].text or "") if cells else ""

            # at least three rows are needed for the comparisons below
            if len(columns_count) > 2:
                # Smallest column COUNT. The previous
                # ``min(columns_count, key=columns_count.get)`` returned a row
                # number, which was then compared against counts.
                fewest_columns = min(columns_count.values())

                if columns_count[1] == fewest_columns and columns_count[2] != fewest_columns:
                    # If it has fewest columns, also check Levenshtein distance
                    # To ensure this row is unlike the others
                    if editdistance.eval(first_column[1], first_column[2]) > \
                            editdistance.eval(first_column[2], first_column[3]):
                        # OK, we have something, move it
                        caption_element = etree.Element('caption')
                        caption_element.text = first_column[1]
                        NlmManipulate.append_safe(table, caption_element, self)
                        inner_table.remove(table_rows[0])

    paragraphs = tree.xpath('//p')

    self.link(table_ids, table_titles, paragraphs, 'table')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run(self):
    """Give every listed element type a unique ``id`` attribute.

    The document is loaded through ``NlmManipulate``. For each element name
    in the list below, every matching element that has no ``id`` attribute
    receives one of the form ``ID<uuid4>``; existing ids are left untouched.
    The tree is then written to ``self.gv.nlm_file_path`` and
    ``self.gv.nlm_temp_file_path``.
    """
    elements = [
        "abbrev",
        "abstract",
        "ack",
        "address",
        "aff",
        "alt-text",
        "app",
        "app-group",
        "array",
        "article-title",
        "attrib",
        "author-comment",
        "author-notes",
        "award-group",
        "bio",
        "boxed-text",
        "caption",
        "chem-struct",
        "chem-struct-wrap",
        "col",
        "colgroup",
        "collab",
        "compound-kwd",
        "contrib",
        "contrib-group",
        "corresp",
        "custom-meta",
        "def",
        "def-item",
        "def-list",
        "disp-formula",
        "disp-formula-group",
        "disp-quote",
        "element-citation",
        "ext-link",
        "fig",
        "fig-group",
        "fn",
        "fn-group",
        "funding-source",
        "glossary",
        "glyph-data",
        "graphic",
        "inline-formula",
        "inline-graphic",
        "inline-supplementary-material",
        "institution",
        "kwd",
        "kwd-group",
        "list",
        "list-item",
        "long-desc",
        "media",
        "milestone-end",
        "milestone-start",
        "mixed-citation",
        "named-content",
        "nlm-citation",
        "note",
        "notes",
        "p",
        "person-group",
        "preformat",
        "product",
        "ref",
        "ref-list",
        "related-article",
        "related-object",
        "response",
        "sec",
        "sig",
        "sig-block",
        "source",
        "speech",
        "statement",
        "sub-article",
        "supplementary-material",
        "table",
        "table-wrap",
        "table-wrap-group",
        "tbody",
        "td",
        "term",
        "tex-math",
        "tfoot",
        "th",
        "thead",
        "title",
        "tr",
        "trans-abstract",
        "trans-source",
        "trans-title",
        "trans-title-group",
        "verse-group",
        "xref",
    ]

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    for element in elements:
        self.debug.print_debug(self, u"Assigning ID to all {0} elements".format(element))

        for item in tree.xpath(u"//{0}".format(element)):
            if "id" not in item.attrib:
                # str.format() stringifies the UUID itself; the Python 2-only
                # ``unicode`` builtin is unnecessary and fails on Python 3.
                item.attrib["id"] = u"ID{0}".format(uuid.uuid4())

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def process_zotero(self):
    """Replace free-text reference entries with Zotero search results.

    For every ``//back/ref-list/ref`` element the stripped text is reduced to
    a search term (short numbers, punctuation and a fixed list of filler
    fragments are removed) and passed to ``LibZotero.search``. While the
    search returns nothing and more than two words remain, the last word is
    dropped and the search is retried.

    When exactly one result comes back and its ``JATS_format()`` is not
    ``None``, that markup is parsed and inserted after the original ``ref``
    (copying its ``id`` if present), followed by an XML comment holding the
    original text; the original element is then removed. The tree is saved
    through ``NlmManipulate.save_tree``.
    """
    from zotero import libzotero

    zotero = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Literal fragments removed from the search term. Order matters because
    # each replacement runs on the output of the previous one.
    replacements = (
        (u'“', u''),
        (u'\'s', u''),
        (u'’s', u''),
        (u'’', u''),
        (u' Ed. ', u' '),
        (u' Ed ', u' '),
        (u' Trans. ', u' '),
        (u' Trans ', u' '),
        (u' trans ', u' '),
        (u' trans. ', u' '),
        (u' by. ', u' '),
        (u' by ', u' '),
        (u' ed. ', u' '),
        (u' ed ', u' '),
        (u' In ', u' '),
        (u' in ', u' '),
        (u' print ', u' '),
        (u' Print ', u' '),
        (u' and ', u' '),
        (u'”', u''),
    )

    for element in master_tree.xpath('//back/ref-list/ref'):
        original_term = manipulate.get_stripped_text(element)

        # drop stand-alone numbers of one to three digits
        term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', original_term)
        term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '', term)
        term = re.sub(r'[^\s]+?\s[Ee]dition', u' ', term)
        term = re.sub(r'\s.\s', u' ', term)
        term = re.sub(r'(?<=[A-Z])\.', u' ', term)

        for old, new in replacements:
            term = term.replace(old, new)

        term = re.sub(r'[Aa]ccessed', '', term)
        term = re.sub(r'meTypesetbr', '', term)
        term = re.sub(r'\s+', ' ', term)

        results = zotero.search(term.strip())

        while len(results) == 0 and len(term.strip().split(' ')) > 2:
            # no results found: retry with the last word removed
            term = ' '.join(term.strip().split(' ')[:-1])
            results = zotero.search(term.strip())

        if len(results) != 1:
            continue

        res = results[0].JATS_format()
        if res is None:
            continue

        ref = etree.fromstring(res)
        if 'id' in element.attrib:
            ref.attrib['id'] = element.attrib['id']

        element.addnext(ref)

        # An XML comment may neither contain "--" nor end with "-"
        original_term = re.sub(u'--', u'', original_term).rstrip(u'-')
        ref.addnext(etree.Comment(original_term))

        # mark for removal; all marked elements are stripped after the loop
        element.tag = 'REMOVE'

    etree.strip_elements(master_tree, 'REMOVE')

    manipulate.save_tree(master_tree)
def run_graphics_sibling(self):
    """Build label/caption elements for graphics from neighbouring text.

    Images are hard to handle because Word/OO puts captions in different
    places; the caption can come before or after the image, e.g.
    ``<p>Figure 1: Martin Eve at the pub<graphic .../>``.

    For every ``//graphic`` the caption text is looked for, in this order, in:

    1. the ``p`` element following the graphic's parent,
    2. the ``p`` element preceding the graphic's parent,
    3. the first ``title`` child of the closest ancestor whose tag ends in
       ``sec``.

    A candidate is accepted when its stripped text matches
    ``<anything><number>:<text>`` or ``<anything><number>.<text>``. The text
    before the separator becomes the graphic's ``label``, the remainder its
    ``caption``, and the source element is removed (a title is replaced by an
    empty ``title``). Graphics without an ``id`` get ``ID<uuid4>``.

    Collected ids and titles are passed, with all ``//p`` elements, to
    ``self.link`` and the tree is written to both NLM file paths.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    graphic_titles = []
    graphic_ids = []

    graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
    graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

    def find_separator(element):
        """Return ':' or '.' if the element's text looks like a caption, else None."""
        text = manipulate.get_stripped_text(element)
        if graphic_regex_colon.match(text):
            return ':'
        if graphic_regex_dot.match(text):
            return '.'
        return None

    for graphic in tree.xpath('//graphic'):
        container = graphic.getparent()
        following = container.getnext()
        preceding = container.getprevious()

        source = None  # element whose text supplies label and caption
        separator = None

        if following is not None and following.tag == 'p':
            separator = find_separator(following)
            if separator is not None:
                source = following

        if source is None and preceding is not None and preceding.tag == 'p':
            separator = find_separator(preceding)
            if separator is not None:
                source = preceding

        if source is None:
            # Only fall back to the section title when neither sibling
            # matched. The previous test `not use_next or use_previous`
            # parsed as `(not use_next) or use_previous`, so a matching
            # title could override an already matched preceding paragraph.
            section = container
            while section is not None and not section.tag.endswith('sec'):
                section = section.getparent()

            titles = section.xpath('title') if section is not None else []

            if titles:
                separator = find_separator(titles[0])
                if separator is not None:
                    source = titles[0]

        if source is None:
            continue

        # likely this is a figure identifier
        text = manipulate.get_stripped_text(source)
        split_title = text.split(separator)

        title = split_title[0].strip()
        caption = ''.join(split_title[1:]).strip()

        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        # use an existing label element if one exists
        try:
            title_element = graphic.xpath('label')[0]
        except IndexError:
            title_element = etree.Element('label')
            graphic.insert(0, title_element)

        title_element.text = title

        caption_element = etree.Element('caption')
        new_p = etree.Element('p')
        new_p.text = caption

        NlmManipulate.append_safe(caption_element, new_p, self)
        NlmManipulate.append_safe(graphic, caption_element, self)

        if source.tag.endswith('title'):
            # leave an empty title where the consumed one used to be
            new_title = etree.Element('title')
            new_title.text = ''
            source.addnext(new_title)

        source.getparent().remove(source)

        if graphic.tail:
            graphic.tail = graphic.tail.replace(title + separator, '')
            graphic.tail = graphic.tail.replace(caption + separator, '')
            graphic.tail = graphic.tail.replace(caption, '')

        if 'id' not in graphic.attrib:
            graphic.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        graphic_titles.append(title)
        graphic_ids.append(graphic.attrib['id'])

    paragraphs = tree.xpath('//p')

    self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run_tables(self):
    """Build label/caption elements for ``table-wrap`` objects.

    For every ``//table-wrap`` the caption text is looked for, in this order,
    in the following sibling ``p``, the preceding sibling ``p`` (paragraphs
    containing a ``graphic`` child are ignored) and the first ``title`` child
    of the table's parent. A candidate is accepted when its stripped text
    matches ``<anything><number>:<text>`` or ``<anything><number>.<text>``.

    On a match the text before the separator becomes the table's ``label``,
    the remainder a plain-text ``caption`` inserted at index 1, and the source
    element is removed. If the text came from a title, the parent element is
    merged into the nearest previous sibling whose tag ends in ``sec`` or,
    failing that, into its own parent when that is a ``sec``.

    If no caption was found this way, the first row of the inner ``table`` is
    used as caption when it has the fewest cells of all rows, the second row
    does not, and its first cell's text differs more from row two than row
    two differs from row three (edit distance).

    Collected ids and titles are passed, with all ``//p`` elements, to
    ``self.link`` and the tree is written to both NLM file paths.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for table objects')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    table_titles = []
    table_ids = []

    table_regex_dot = re.compile(r'^.+?[\s\-]*\d+\..+')
    table_regex_colon = re.compile(r'^.+?[\s\-]*\d+\:.+')

    def find_separator(element):
        """Return ':' or '.' if the element's text looks like a caption, else None."""
        text = manipulate.get_stripped_text(element)
        if table_regex_colon.match(text):
            return ':'
        if table_regex_dot.match(text):
            return '.'
        return None

    def is_caption_paragraph(element):
        """True for an existing ``p`` element that has no ``graphic`` child."""
        if element is None or element.tag != 'p':
            return False
        return not any(child.tag == 'graphic' for child in element)

    for table in tree.xpath('//table-wrap'):
        source = None  # element whose text supplies label and caption
        separator = None
        used_title = False
        old_title = None

        following = table.getnext()
        if is_caption_paragraph(following):
            separator = find_separator(following)
            if separator is not None:
                source = following

        if source is None:
            # The None check now happens before the children are inspected;
            # iterating a missing previous sibling used to raise TypeError.
            preceding = table.getprevious()
            if is_caption_paragraph(preceding):
                separator = find_separator(preceding)
                if separator is not None:
                    source = preceding

        if source is None:
            # Only fall back to the title when neither sibling matched. The
            # previous test `not use_next or use_previous` parsed as
            # `(not use_next) or use_previous`.
            parent = table.getparent()
            titles = parent.xpath('title') if parent is not None else []

            if titles:
                separator = find_separator(titles[0])
                if separator is not None:
                    source = titles[0]
                    used_title = True

        if source is None:
            # None of that worked, try to find the caption in the table rows
            inner_table = table.find("table")
            table_rows = list(inner_table) if inner_table is not None else []

            # three rows are needed for the edit-distance comparison below
            if len(table_rows) > 2:
                # Check if the first row has fewer columns than the others,
                # therefore not likely to be data or a header
                columns_count = [len(row) for row in table_rows]
                # smallest column COUNT (the old code took the row NUMBER of
                # the smallest row and compared it with counts)
                fewest_columns = min(columns_count)

                if columns_count[0] == fewest_columns and columns_count[1] != fewest_columns:
                    first_column = [row[0].text if len(row) else "" for row in table_rows[:3]]
                    # cells without text yield None, which cannot be compared
                    compare = [cell_text or "" for cell_text in first_column]

                    # Also check Levenshtein distance to ensure this row is
                    # unlike the others
                    if editdistance.eval(compare[0], compare[1]) > editdistance.eval(compare[1], compare[2]):
                        # OK, we have something, move it
                        caption_element = etree.Element('caption')
                        caption_element.text = first_column[0]
                        NlmManipulate.append_safe(table, caption_element, self)
                        inner_table.remove(table_rows[0])

            continue

        # likely this is a table identifier
        text = manipulate.get_stripped_text(source)
        split_title = text.split(separator)

        title = split_title[0].strip()
        caption = ''.join(split_title[1:]).strip()

        # strip all formatting from caption for ease of parsing
        # TODO: preserve formatting (far harder)
        new_p = etree.Element('p')
        new_p.text = caption

        if source.tag.endswith('title'):
            # leave an empty title in place; it is stripped again below
            new_title = etree.Element('title')
            new_title.text = ''
            old_title = new_title
            source.addnext(new_title)

        source.getparent().remove(source)

        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        # use an existing label element if one exists
        try:
            title_element = table.xpath('label')[0]
        except IndexError:
            title_element = etree.Element('label')
            table.insert(0, title_element)

        title_element.text = title

        caption_element = etree.Element('caption')
        NlmManipulate.append_safe(caption_element, new_p, self)
        table.insert(1, caption_element)

        if 'id' not in table.attrib:
            table.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        table_titles.append(title)
        table_ids.append(table.attrib['id'])

        if used_title:
            # if we took the title out, then we should move the parent into
            # its previous sibling and then strip tags
            old_title.tag = 'REMOVE'
            etree.strip_elements(tree, 'REMOVE')

            section = table.getparent()
            previous = section.getprevious()

            while previous is not None and not previous.tag.endswith('sec'):
                previous = previous.getprevious()

            if previous is not None:
                previous.append(section)
                section.tag = 'REMOVE'
                etree.strip_tags(tree, 'REMOVE')
                self.debug.print_debug(self, u'Moved table and siblings to previous section')
            else:
                previous = section.getparent()

                if previous is not None and previous.tag.endswith('sec'):
                    previous.append(section)
                    section.tag = 'REMOVE'
                    etree.strip_tags(tree, 'REMOVE')
                    self.debug.print_debug(self, u'Moved table and siblings to parent section')

    paragraphs = tree.xpath('//p')

    self.link(table_ids, table_titles, paragraphs, 'table')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def process_zotero(self):
    """Replace free-text reference entries with Zotero search results.

    For every ``//back/ref-list/ref`` element the stripped text is reduced to
    a search term (short numbers, punctuation and a fixed list of filler
    fragments are removed) and passed to ``LibZotero.search``. While the
    search returns nothing and more than two words remain, the last word is
    dropped and the search is retried.

    When exactly one result comes back and its ``JATS_format()`` is not
    ``None``, that markup is parsed and inserted after the original ``ref``
    (copying its ``id`` if present), followed by an XML comment holding the
    original text; the original element is then removed. The tree is saved
    through ``NlmManipulate.save_tree``.
    """
    from zotero import libzotero

    zotero = libzotero.LibZotero(unicode(self.gv.settings.get_setting(u'zotero', self)), self.gv)

    manipulate = NlmManipulate(self.gv)
    master_tree = manipulate.load_dom_tree()

    # Literal fragments removed from the search term. Order matters because
    # each replacement runs on the output of the previous one.
    replacements = (
        (u'“', u''),
        (u'\'s', u''),
        (u'’s', u''),
        (u'’', u''),
        (u' Ed. ', u' '),
        (u' Ed ', u' '),
        (u' Trans. ', u' '),
        (u' Trans ', u' '),
        (u' trans ', u' '),
        (u' trans. ', u' '),
        (u' by. ', u' '),
        (u' by ', u' '),
        (u' ed. ', u' '),
        (u' ed ', u' '),
        (u' In ', u' '),
        (u' in ', u' '),
        (u' print ', u' '),
        (u' Print ', u' '),
        (u' and ', u' '),
        (u'”', u''),
    )

    for element in master_tree.xpath('//back/ref-list/ref'):
        original_term = manipulate.get_stripped_text(element)

        # drop stand-alone numbers of one to three digits
        term = re.sub(r'(?<![0-9])[1-9][0-9]{0,2}(?![0-9])', r'', original_term)
        term = re.sub(r'[\-,\.\<\>\(\)\;\:\@\'\#\~\}\{\[\]\"\!\\/]', '', term)
        term = re.sub(r'[^\s]+?\s[Ee]dition', u' ', term)
        term = re.sub(r'\s.\s', u' ', term)
        term = re.sub(r'(?<=[A-Z])\.', u' ', term)

        for old, new in replacements:
            term = term.replace(old, new)

        term = re.sub(r'[Aa]ccessed', '', term)
        term = re.sub(r'meTypesetbr', '', term)
        term = re.sub(r'\s+', ' ', term)

        results = zotero.search(term.strip())

        while len(results) == 0 and len(term.strip().split(' ')) > 2:
            # no results found: retry with the last word removed
            term = ' '.join(term.strip().split(' ')[:-1])
            results = zotero.search(term.strip())

        if len(results) != 1:
            continue

        res = results[0].JATS_format()
        if res is None:
            continue

        ref = etree.fromstring(res)
        if 'id' in element.attrib:
            ref.attrib['id'] = element.attrib['id']

        element.addnext(ref)

        # An XML comment may neither contain "--" nor end with "-"
        original_term = re.sub(u'--', u'', original_term).rstrip(u'-')
        ref.addnext(etree.Comment(original_term))

        # mark for removal; all marked elements are stripped after the loop
        element.tag = 'REMOVE'

    etree.strip_elements(master_tree, 'REMOVE')

    manipulate.save_tree(master_tree)
def run_graphics_sibling(self):
    """Build label/caption elements for graphics from neighbouring text.

    Images are hard to handle because Word/OO puts captions in different
    places; the caption can come before or after the image, e.g.
    ``<p>Figure 1: Martin Eve at the pub<graphic .../>``.

    For every ``//graphic`` the caption text is looked for, in this order, in:

    1. the ``p`` element following the graphic's parent,
    2. the ``p`` element preceding the graphic's parent,
    3. the first ``title`` child of the closest ancestor whose tag ends in
       ``sec``.

    A candidate is accepted when its stripped text matches
    ``<anything><number>:<text>`` or ``<anything><number>.<text>``. The text
    before the separator becomes the graphic's ``label``, the remainder its
    ``caption``, and the source element is removed (a title is replaced by an
    empty ``title``). Graphics without an ``id`` get ``ID<uuid4>``.

    Collected ids and titles are passed, with all ``//p`` elements, to
    ``self.link`` and the tree is written to both NLM file paths.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [sibling]')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    graphic_titles = []
    graphic_ids = []

    graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
    graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

    def find_separator(element):
        """Return ':' or '.' if the element's text looks like a caption, else None."""
        text = manipulate.get_stripped_text(element)
        if graphic_regex_colon.match(text):
            return ':'
        if graphic_regex_dot.match(text):
            return '.'
        return None

    for graphic in tree.xpath('//graphic'):
        container = graphic.getparent()
        following = container.getnext()
        preceding = container.getprevious()

        source = None  # element whose text supplies label and caption
        separator = None

        if following is not None and following.tag == 'p':
            separator = find_separator(following)
            if separator is not None:
                source = following

        if source is None and preceding is not None and preceding.tag == 'p':
            separator = find_separator(preceding)
            if separator is not None:
                source = preceding

        if source is None:
            # Only fall back to the section title when neither sibling
            # matched. The previous test `not use_next or use_previous`
            # parsed as `(not use_next) or use_previous`, so a matching
            # title could override an already matched preceding paragraph.
            section = container
            while section is not None and not section.tag.endswith('sec'):
                section = section.getparent()

            titles = section.xpath('title') if section is not None else []

            if titles:
                separator = find_separator(titles[0])
                if separator is not None:
                    source = titles[0]

        if source is None:
            continue

        # likely this is a figure identifier
        text = manipulate.get_stripped_text(source)
        split_title = text.split(separator)

        title = split_title[0].strip()
        caption = ''.join(split_title[1:]).strip()

        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        # use an existing label element if one exists
        try:
            title_element = graphic.xpath('label')[0]
        except IndexError:
            title_element = etree.Element('label')
            graphic.insert(0, title_element)

        title_element.text = title

        caption_element = etree.Element('caption')
        new_p = etree.Element('p')
        new_p.text = caption

        NlmManipulate.append_safe(caption_element, new_p, self)
        NlmManipulate.append_safe(graphic, caption_element, self)

        if source.tag.endswith('title'):
            # leave an empty title where the consumed one used to be
            new_title = etree.Element('title')
            new_title.text = ''
            source.addnext(new_title)

        source.getparent().remove(source)

        if graphic.tail:
            graphic.tail = graphic.tail.replace(title + separator, '')
            graphic.tail = graphic.tail.replace(caption + separator, '')
            graphic.tail = graphic.tail.replace(caption, '')

        if 'id' not in graphic.attrib:
            graphic.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        graphic_titles.append(title)
        graphic_ids.append(graphic.attrib['id'])

    paragraphs = tree.xpath('//p')

    self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)
def run_graphics(self):
    """Build label/caption elements for graphics from their own paragraph.

    Images are hard to handle because Word/OO puts captions in different
    places; the caption can come before or after the image, e.g.
    ``<p>Figure 1: Martin Eve at the pub<graphic .../>``.

    For every ``//graphic`` whose parent is a ``p`` element, the parent's
    stripped text is tested against ``<anything><number>:<text>`` and
    ``<anything><number>.<text>``. On a match the text before the separator
    becomes the graphic's ``label`` and the remainder its ``caption``; the
    same strings are removed from the graphic's tail text. Graphics without
    an ``id`` get ``ID<uuid4>``.

    Collected ids and titles are passed, with all ``//p`` elements, to
    ``self.link``; the tree is written to both NLM file paths and
    ``run_graphics_sibling`` is run afterwards.
    """
    self.debug.print_debug(self, u'Attempting to classify captions for graphics objects [plain]')

    manipulate = NlmManipulate(self.gv)
    tree = manipulate.load_dom_tree()

    graphic_titles = []
    graphic_ids = []

    graphic_regex_dot = re.compile(r'^.+?\s*\d+\..+')
    graphic_regex_colon = re.compile(r'^.+?\s*\d+\:.+')

    for graphic in tree.xpath('//graphic'):
        # the caption is expected in the paragraph that contains the graphic
        container = graphic.getparent()

        if container is None or container.tag != 'p':
            continue

        text = manipulate.get_stripped_text(container)

        if graphic_regex_colon.match(text):
            separator = ':'
        elif graphic_regex_dot.match(text):
            separator = '.'
        else:
            continue

        # likely this is a figure identifier
        text = manipulate.get_stripped_text(container)
        split_title = text.split(separator)

        title = split_title[0].strip()
        caption = ''.join(split_title[1:]).strip()

        self.debug.print_debug(self, u'Handling title and caption for "{0}"'.format(title))

        # use an existing label element if one exists
        try:
            title_element = graphic.xpath('label')[0]
        except IndexError:
            title_element = etree.Element('label')
            graphic.insert(0, title_element)

        title_element.text = title

        caption_element = etree.Element('caption')
        new_p = etree.Element('p')
        new_p.text = caption

        NlmManipulate.append_safe(caption_element, new_p, self)
        NlmManipulate.append_safe(graphic, caption_element, self)

        if graphic.tail:
            graphic.tail = graphic.tail.replace(title + separator, '')
            graphic.tail = graphic.tail.replace(caption + separator, '')
            graphic.tail = graphic.tail.replace(caption, '')

        if 'id' not in graphic.attrib:
            graphic.attrib['id'] = u'ID{0}'.format(uuid.uuid4())

        graphic_titles.append(title)
        graphic_ids.append(graphic.attrib['id'])

    paragraphs = tree.xpath('//p')

    self.link(graphic_ids, graphic_titles, paragraphs, 'fig')

    tree.write(self.gv.nlm_file_path)
    tree.write(self.gv.nlm_temp_file_path)

    self.run_graphics_sibling()