Python TeiManipulate.load_dom_tree 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: teimanipulate

클래스/타입: TeiManipulate

메소드/함수: load_dom_tree

hotexamples.com에서의 예제들: 4

Python TeiManipulate.load_dom_tree - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python teimanipulate.TeiManipulate.load_dom_tree의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가하면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

TeiManipulate(3)

get_object_list(3)

drop_addin(2)

drop_addin_json(2)

load_dom_tree(2)

save_tree(2)

get_stripped_text(1)

예제 #1

파일 보기

파일: teitonlm.py 프로젝트: MartinPaulEve/meTypeset

    def pre_cleanup(self):
        """Restructure the loaded TEI tree in two passes.

        Pass 1: for every ``tei:div`` that has a ``tei:head`` child, each
        ancestor is retagged ``REMOVE`` until a ``div`` or ``body`` ancestor is
        reached. If anything was retagged, the ``REMOVE`` wrappers are stripped
        (their content stays in place), the tree is saved and a debug line is
        printed.

        Pass 2: every ``tei:hi`` child whose ``rend`` contains "Indent",
        "Default Style" or "Text Body" is moved out of its ``tei:p`` into a new
        ``p`` element inserted after it. The tree is saved and a debug line is
        printed once per matching paragraph.
        """
        tei_ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        # Predicate shared by the paragraph query and the per-paragraph query.
        hi_match = ('tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
                    'contains(@rend, "Text Body")]')

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # Ancestors with any other tag stop the head from being transformed
        # correctly by the XSL, so they get unwrapped.
        permitted = ('{http://www.tei-c.org/ns/1.0}div', '{http://www.tei-c.org/ns/1.0}body')

        retagged = 0

        for div in tree.xpath('//tei:div[tei:head]', namespaces=tei_ns):
            ancestor = div.getparent()

            while ancestor is not None:
                if ancestor.tag:
                    if ancestor.tag in permitted:
                        # reached a legitimate container; nothing above it is touched
                        break
                    ancestor.tag = 'REMOVE'
                    retagged += 1
                ancestor = ancestor.getparent()

        if retagged > 0:
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Extracted {0} headings from inside invalid elements'.format(retagged))

        # Split each matching <hi> out of its paragraph into a paragraph of its own.
        for paragraph in tree.xpath('//tei:p[{0}]'.format(hi_match), namespaces=tei_ns):
            inherited_rend = paragraph.attrib.get('rend')
            anchor = paragraph

            for hi in paragraph.xpath(hi_match, namespaces=tei_ns):
                # NOTE(review): the new element is created without a namespace,
                # unlike the tei:p it is split from -- confirm this is intended.
                sibling = etree.Element('p')
                if inherited_rend is not None:
                    sibling.attrib['rend'] = inherited_rend

                # chain the new paragraphs one after another to keep document order
                anchor.addnext(sibling)
                sibling.append(hi)
                anchor = sibling

            manipulate.save_tree(tree)
            self.debug.print_debug(self, u'Separated out p {0}'.format(manipulate.get_stripped_text(paragraph)))

예제 #2

파일 보기

파일: metadata.py 프로젝트: MartinPaulEve/meTypeset

    def pre_clean(self):
        """Remove leading body lines that repeat already-known metadata.

        Calls ``self.extract_metadata_fields()`` first (presumably this fills
        ``self.authors`` and ``self.metadata`` -- confirm against that method),
        then walks every descendant of ``tei:body``. Only ``head``, ``p`` and
        ``cit`` elements are examined. Scanning stops once three examined
        elements have been kept, because ``count`` is decremented again for
        every element that gets removed.

        An element is removed when its stripped text contains every component
        of a not-yet-matched entry of ``self.authors``, or otherwise when it
        contains any entry of ``self.metadata``. The tree is saved at the end.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # get all elements in the body
        section = tree.xpath('//tei:body//*', namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

        items_to_match = ['{http://www.tei-c.org/ns/1.0}head', '{http://www.tei-c.org/ns/1.0}p',
                          '{http://www.tei-c.org/ns/1.0}cit']

        count = 0

        matched_authors = []

        for item in section:
            if count > 2:
                break

            if item.tag not in items_to_match:
                continue

            count += 1
            text = self.get_stripped_text(item)

            processed = False

            for author in self.authors:
                # each author may only account for one removed line
                if author in matched_authors:
                    continue

                if all(component in text for component in author):
                    # found a metadata line
                    matched_authors.append(author)
                    count -= 1
                    item.getparent().remove(item)
                    self.debug.print_debug(self, u'Removed line "{0}" '
                                                 u'because it appears to be author metadata'.format(text))
                    processed = True
                    break

            if not processed:
                for metadata in self.metadata:
                    if metadata in text:
                        # found a metadata line
                        count -= 1
                        item.getparent().remove(item)
                        self.debug.print_debug(self, u'Removed line "{0}" '
                                                     u'because it appears to be duplicated metadata'.format(text))
                        # The item is now detached: a second matching entry would
                        # call .remove() on a None parent and decrement count twice.
                        break

        manipulate.save_tree(tree)

예제 #3

파일 보기

파일: teitonlm.py 프로젝트: rtoi/meTypeset

    def pre_cleanup(self):
        """Unwrap misplaced heading sections, then split styled runs into paragraphs.

        First, each ``tei:div`` with a ``tei:head`` child has its ancestors
        retagged ``REMOVE`` up to (not including) the nearest ``div`` or
        ``body``; when at least one ancestor was retagged those wrappers are
        stripped, the tree is saved and a debug message is printed.

        Second, for each ``tei:p`` holding ``tei:hi`` children whose ``rend``
        contains "Indent", "Default Style" or "Text Body", every such child is
        moved into a new ``p`` placed after the paragraph. The tree is saved
        and a debug message printed after each paragraph is handled.
        """
        ns_map = {'tei': 'http://www.tei-c.org/ns/1.0'}
        styled_hi = (
            'tei:hi[contains(@rend, "Indent") or contains(@rend, "Default Style") or '
            'contains(@rend, "Text Body")]')

        manipulate = TeiManipulate(self.gv)
        tree = manipulate.load_dom_tree()

        # Only these containers may enclose a headed div; anything else would
        # stop the head from being correctly transformed by the XSL.
        allowed = (
            '{http://www.tei-c.org/ns/1.0}div',
            '{http://www.tei-c.org/ns/1.0}body',
        )

        unwrapped = 0

        for headed_div in tree.xpath('//tei:div[tei:head]', namespaces=ns_map):
            node = headed_div

            while True:
                node = node.getparent()
                if node is None:
                    # walked off the top of the tree
                    break
                if not node.tag:
                    continue
                if node.tag in allowed:
                    break

                node.tag = 'REMOVE'
                unwrapped += 1

        if unwrapped > 0:
            etree.strip_tags(tree, 'REMOVE')
            manipulate.save_tree(tree)
            message = u'Extracted {0} headings from inside invalid elements'.format(unwrapped)
            self.debug.print_debug(self, message)

        # split any p tags with matching hi sub-tags into new elements
        paragraphs = tree.xpath('//tei:p[' + styled_hi + ']', namespaces=ns_map)

        for parent in paragraphs:
            has_rend = 'rend' in parent.attrib
            insert_after = parent

            for run in parent.xpath(styled_hi, namespaces=ns_map):
                # NOTE(review): created without a namespace, unlike the source
                # tei:p -- confirm this is intended.
                fresh = etree.Element('p')
                if has_rend:
                    fresh.attrib['rend'] = parent.attrib['rend']

                # each new paragraph goes after the previously inserted one
                insert_after.addnext(fresh)
                fresh.append(run)
                insert_after = fresh

            manipulate.save_tree(tree)
            stripped = manipulate.get_stripped_text(parent)
            self.debug.print_debug(self, u'Separated out p {0}'.format(stripped))

예제 #4

파일 보기

파일: metadata.py 프로젝트: rtoi/meTypeset

    def pre_clean(self):
        """Strip metadata lines from the start of the TEI body.

        After ``self.extract_metadata_fields()`` has run (presumably populating
        ``self.authors`` and ``self.metadata`` -- confirm against that method),
        the descendants of ``tei:body`` are scanned in document order. Only
        ``head``, ``p`` and ``cit`` elements are considered. The scan ends once
        three considered elements have been left in place; removed elements do
        not count towards that limit.

        An element is removed if its stripped text contains all components of
        an author not matched before, or failing that, if it contains any
        string from ``self.metadata``. The modified tree is saved afterwards.
        """
        self.extract_metadata_fields()

        manipulate = TeiManipulate(self.gv)

        tree = manipulate.load_dom_tree()

        # get all elements in the body
        section = tree.xpath('//tei:body//*',
                             namespaces={'tei': 'http://www.tei-c.org/ns/1.0'})

        items_to_match = [
            '{http://www.tei-c.org/ns/1.0}head',
            '{http://www.tei-c.org/ns/1.0}p',
            '{http://www.tei-c.org/ns/1.0}cit'
        ]

        count = 0

        matched_authors = []

        for item in section:
            if count > 2:
                break

            if item.tag not in items_to_match:
                continue

            count += 1
            text = self.get_stripped_text(item)

            processed = False

            for author in self.authors:
                # an author already used to remove a line is not tried again
                if author in matched_authors:
                    continue

                if all(component in text for component in author):
                    # found a metadata line
                    matched_authors.append(author)
                    count -= 1
                    item.getparent().remove(item)
                    self.debug.print_debug(
                        self, u'Removed line "{0}" '
                        u'because it appears to be author metadata'.
                        format(text))
                    processed = True
                    break

            if not processed:
                for metadata in self.metadata:
                    if metadata in text:
                        # found a metadata line
                        count -= 1
                        item.getparent().remove(item)
                        self.debug.print_debug(
                            self, u'Removed line "{0}" '
                            u'because it appears to be duplicated metadata'
                            .format(text))
                        # Stop here: the item no longer has a parent, so another
                        # matching entry would fail on None.remove() and would
                        # decrement count a second time.
                        break

        manipulate.save_tree(tree)