Exemplo n.º 1
0
    def run(self, interactive):
        """Link in-text citations in the document to entries in its reference list.

        The method works on the DOM tree loaded through ``NlmManipulate`` and
        proceeds in these stages:

        1. Remove ``ext-link`` elements whose xlink ``href`` is empty.
        2. Collect ``//back/ref-list/ref`` items, clean them and index those
           whose text starts with a number.
        3. Scan ``//sec//p`` (without direct MathML children) and ``//td``
           elements and create ``ReplaceStub`` objects for numeric citations in
           square brackets or, when a paragraph has none, for parenthesised
           text shorter than 60 characters.
        4. Resolve ``TO_LINK_NUMBER`` xrefs either by the leading number of a
           reference or, as a fallback, by the position of the reference.
        5. Resolve the remaining ``TO_LINK`` xrefs by token matching against
           the text of every reference.

        The tree is saved through ``manipulate.save_tree`` before returning.

        :param interactive: when truthy, ``self.run_prompt()`` is called and
            nothing else is done.
        :return: None
        """
        if interactive:
            self.run_prompt()
            return

        xlink_href = '{http://www.w3.org/1999/xlink}href'

        # All patterns are raw strings (no invalid escape sequences) and are
        # compiled once here instead of on every loop iteration.
        numbered_ref_test = re.compile(r'^(?P<number>\d+)\.*')
        reference_test = re.compile(r'\((?P<text>[^%]+?)\)')
        square_test = re.compile(r'\[(?P<square>\d*[,\-;\d\s]*)\]')
        item_separator = re.compile(';|,')

        # The punctuation class is used both as a regex and as the character
        # set handed to str.strip(), so the plain string is kept as well.
        punctuation = r'''[,\.\<\>\(\)\;\:\@'\#\~\}\{\[\]"]'''
        punctuation_and_digits = r'''[,\.\<\>\(\)\;\:\@'\#\~\}\{\[\]"\d]'''
        punctuation_test = re.compile(punctuation)
        punctuation_and_digits_test = re.compile(punctuation_and_digits)

        manipulate = NlmManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # pre-cleanup: remove all empty ext-links as these break the linker
        count = 0

        for item in tree.xpath('//ext-link'):
            if item.attrib.get(xlink_href) == '':
                count += 1
                item.tag = 'REMOVE'
                etree.strip_tags(item.getparent(), 'REMOVE')

        if count > 0:
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Removed {0} blank ext-link tags'.format(count))

        ref_items = tree.xpath('//back/ref-list/ref')

        self.clean_ref_items(tree, ref_items, manipulate)

        # handle numbered reference items: map the leading number of a
        # reference's text to the reference element
        references_and_numbers = {}

        for ref in ref_items:
            result = numbered_ref_test.match(manipulate.get_stripped_text(ref))

            if result:
                references_and_numbers[result.group('number')] = ref

        parsed = self.process_ibid_authors(ref_items)

        if parsed > 0:
            manipulate.save_tree(tree)

            self.debug.print_debug(self, u'Replace {0} instances of "---." at start of references'.format(parsed))

        to_link = []
        to_stub = []

        # distinct tokens seen inside numeric square brackets; only the size
        # of this collection is used later on
        square_bracket_items = set()

        for p in tree.xpath('//sec//p[not(mml:math)] | //td',
                            namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'}):

            text = manipulate.get_stripped_text(p)

            if square_test.search(text):
                # numeric square brackets take precedence over parentheses
                for smatch in square_test.finditer(text):
                    self.debug.print_debug(self, u'Handling references in square '
                                                 u'brackets: [{0}] '.format(smatch.group('square')))

                    for item in item_separator.split(smatch.group('square')):
                        stripped = item.strip()

                        if '-' in item:
                            parent, tail = manipulate.find_text(p, item)

                            if parent is not None:
                                # expand "a-b" into the explicit list a,...,b
                                try:
                                    bounds = stripped.split('-')
                                    numbers = list(range(int(bounds[0]), int(bounds[1]) + 1))
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    # abandon the remaining items of this bracket
                                    break

                                new_string = ','.join(str(no) for no in numbers)

                                # an empty expansion (a > b) leaves the text untouched
                                if new_string != '':
                                    if tail:
                                        parent.tail = parent.tail.replace(item, new_string)
                                    else:
                                        parent.text = parent.text.replace(item, new_string)

                                try:
                                    for no in numbers:
                                        self.debug.print_debug(self, u'Parsing reference '
                                                                     u'number in range {0}'.format(str(no)))

                                        to_stub.append(ReplaceStub(self.gv, p, str(no), tree, manipulate,
                                                                   'TO_LINK_NUMBER', length_ignore=True))
                                except Exception:
                                    self.debug.print_debug(self, u'Unable to parse reference '
                                                                 u'number in range {0}'.format(item))
                                    break

                            else:
                                # just replace the components
                                for link in stripped.split('-'):
                                    to_stub.append(ReplaceStub(self.gv, p, link, tree, manipulate,
                                                               'TO_LINK_NUMBER', length_ignore=True))
                        else:
                            if len(stripped) < 60:
                                to_stub.append(ReplaceStub(self.gv, p, stripped, tree, manipulate, 'TO_LINK_NUMBER',
                                                           length_ignore=True))

                        square_bracket_items.add(stripped)
            else:
                # no numeric square brackets: every ';'-separated chunk of a
                # parenthesised span is a candidate citation
                for match in reference_test.finditer(text):
                    for item in match.group('text').split(u';'):
                        if len(item.strip()) < 60:
                            to_stub.append(ReplaceStub(self.gv, p, item.strip(), tree, manipulate))

        for link in to_stub:
            link.link(to_stub)

        etree.strip_elements(tree, 'REMOVE')

        # When the number of distinct bracketed tokens differs from the number
        # of references that start with a number, the leading numbers cannot
        # be relied on; use the /index/ of the reference item instead
        # (-1 for zero-based compensation).
        use_index_method = len(square_bracket_items) != len(references_and_numbers)

        if use_index_method:
            self.debug.print_debug(self, u'Using indexical method for square bracket correlation')

        if not ref_items:
            self.debug.print_debug(self, u'Found no references to link')

            manipulate.save_tree(tree)

            return

        for p in tree.xpath('//xref[@rid="TO_LINK_NUMBER"]'):
            text = manipulate.get_stripped_text(p)

            if use_index_method:
                try:
                    index = int(text) - 1

                    if index < 0:
                        # a negative index would silently wrap around to the
                        # end of the list (e.g. "[0]" -> last reference)
                        raise IndexError(text)

                    ReplaceObject(self.gv, p, ref_items[index]).link()
                except Exception:
                    self.debug.print_debug(self, u'Failed to link to reference {0} + 1 using '
                                                 u'indexical method'.format(text))
                    # hand the xref over to the text-matching pass below
                    p.attrib['rid'] = 'TO_LINK'
            elif text in references_and_numbers:
                ReplaceObject(self.gv, p, references_and_numbers[text]).link()
            else:
                p.attrib['rid'] = 'TO_LINK'

        for p in tree.xpath('//xref[@rid="TO_LINK"]'):
            text = manipulate.get_stripped_text(p)

            bare_items = text.strip().replace(u',', '').split(u' ')

            # both normalisations depend only on the citation, not on the
            # reference, so compute them once per xref
            exact_tokens = [punctuation_test.sub('', token.strip()).strip() for token in bare_items]
            loose_tokens = [punctuation_and_digits_test.sub('', token.strip()).strip() for token in bare_items]
            many_tokens = len(bare_items) > 1

            for ref in ref_items:
                bare_refs = manipulate.get_stripped_text(ref).split(' ')

                # first attempt: every citation token (punctuation removed)
                # must equal some reference word (punctuation trimmed at ends)
                found = all(
                    any(token == word.strip(punctuation) for word in bare_refs)
                    for token in exact_tokens
                )

                if bare_items and found:
                    to_link.append(ReplaceObject(self.gv, p, ref))

                elif bare_items:
                    # second attempt: additionally ignore digits on both
                    # sides; tokens that become empty are accepted as long as
                    # the citation has more than one token
                    loose_refs = [punctuation_and_digits_test.sub('', word.strip()).strip() for word in bare_refs]

                    found = all(
                        any((token == '' and many_tokens) or (token != '' and token == word)
                            for word in loose_refs)
                        for token in loose_tokens
                    )

                    # we don't allow linking to the last item here because it is almost universally wrong
                    if found and ref_items.index(ref) != len(ref_items) - 1:
                        to_link.append(ReplaceObject(self.gv, p, ref))

        if not to_link:
            self.debug.print_debug(self, u'Found no references to link')

        for link in to_link:
            link.link()

        manipulate.save_tree(tree)