Пример #1
0
def transform(filename):
    """Rewrite a legacy HTML file and return it as pretty-printed HTML text.

    Two passes are made over the parsed tree:

    * every ``<font>`` element is inspected: red (``#ff0000``) ones are handed
      to ``replace_tag`` with a new ``<strong>``, size 6 with ``<h1>``, size 5
      with ``<h2>``, and size 4 has its tag dropped (content kept);
    * every link whose ``href`` is of the form ``name.htm*`` is re-pointed at
      ``name.md``.

    :param filename: path of the ISO-8859-1 encoded HTML file to read.
    :returns: the transformed document serialized as a unicode string.
    """
    # Use a context manager so the file handle is closed even if parsing
    # fails; the tree is completely built once ``html.parse`` returns.
    with open(filename, encoding='ISO-8859-1') as htmlfile:
        tree = html.parse(htmlfile)

    for node in tree.xpath('//font'):
        size = int(node.get('size')) if 'size' in node.attrib else None
        color = node.get('color').lower() if 'color' in node.attrib else ''
        # Colour takes precedence over size.
        if color == '#ff0000':
            replace_tag(node, html.Element('strong'))
        elif size == 6:
            replace_tag(node, html.Element('h1'))
        elif size == 5:
            replace_tag(node, html.Element('h2'))
        elif size == 4:
            node.drop_tag()

    for node in tree.xpath('//a[@href]'):
        href = node.get('href')

        try:
            basename, extension = href.split('.')
        except ValueError:
            # The href contains no dot, or more than one: leave it untouched.
            continue

        if extension.startswith('htm'):
            node.set('href', '{}.{}'.format(basename, 'md'))

    #for node in tree.xpath('//p[re:test(@align, "^center$", "i")]', namespaces={"re": "http://exslt.org/regular-expressions"}):
    #    node.set('align', None)

    return etree.tostring(tree, pretty_print=True, method='html', encoding='unicode')
Пример #2
0
    def _extract_tr(self, tr: html.Element):
        """Extract one table row.

        Builds a fresh ``<tr>`` holding one cell per child of ``tr`` and a
        parallel list of the extracted cell values.  The new row is appended
        to ``self._new_element`` and the value list to ``self.rows``.
        Comments and ``<script>`` children are skipped; any other non-cell
        child is wrapped in a new ``<td>`` (with a warning).
        """
        new_row = html.Element("tr")
        new_row.text = ""
        new_row.tail = ""
        values = []

        for child in tr:
            if child.tag in ("td", "th"):
                cell, value = self._extract_any(child)
                if cell is None:
                    # Nothing was extracted: keep an empty cell of the same kind.
                    cell = html.Element(child.tag)
            else:
                if child.tag == etree.Comment or child.tag == "script":
                    continue

                logger.warning(f"  adding td around {html.tostring(child)}")
                cell = html.Element("td")
                inner, value = self._extract_any(child)
                if inner is None:
                    cell.text = value
                else:
                    cell.append(inner)

            cell.tail = ""
            new_row.append(cell)
            values.append(value)

        self._new_element.append(new_row)
        self.rows.append(values)
Пример #3
0
def clean28(filename, content):
    """Wrap loose text that trails an empty ``<div class="s12">`` in its own div.

    For every childless ``div.s12`` whose tail text matches ``clean28regex1``
    a new ``div.s12`` holding that text is inserted right after it.  While the
    element that follows is a ``<br>`` whose tail also matches, that tail is
    moved into another new ``div.s12`` and the ``<br>`` is removed.

    :param filename: name printed when the document was modified.
    :param content: HTML markup to clean.
    :returns: the re-serialized document if anything changed, otherwise
        ``content`` unchanged.
    """
    html_content = html.fromstring(content)
    has_changed = False
    for s12 in html_content.xpath('//div[@class=\'s12\']'):
        tail = (s12.tail or '').strip()
        # ``len(element)`` is the child count (``getchildren`` is deprecated).
        if len(s12) or not tail or not clean28regex1.match(tail):
            continue

        has_changed = True
        element = html.Element('div', {'class': 's12'})
        element.text = tail
        s12.tail = ''
        s12.addnext(element)

        # Absorb the following "<br>text" lines, one new div per line.
        while True:
            br = element.getnext()
            if br is None or br.tag != 'br':
                break
            br_tail = (br.tail or '').strip()
            if not br_tail or not clean28regex1.match(br_tail):
                break
            new_element = html.Element('div', {'class': 's12'})
            new_element.text = br_tail
            # Clear the tail first so drop_tree() does not re-attach it.
            br.tail = ''
            br.drop_tree()
            element.addnext(new_element)
            element = new_element

    if has_changed:
        # Parenthesised single argument: valid on Python 2 and Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
Пример #4
0
def clean32(filename, content):
    """Turn ``<center>NUM<br>TITLE</center>`` blocks into heading elements.

    A ``<center>`` qualifies when it has leading text, exactly one child that
    is a ``<br>``, and that ``<br>`` has tail text.  If the leading text
    matches ``clean32regex1`` the block is replaced by a heading containing
    ``<span class="num">`` and ``<span class="title">``: an ``<h4>`` when the
    first regex group is ``pasal`` (case-insensitive), otherwise an ``<h2>``
    whose class is the lower-cased group.

    :param filename: name printed when the document was modified.
    :param content: HTML markup to clean.
    :returns: the re-serialized document if anything changed, otherwise
        ``content`` unchanged.
    """
    html_content = html.fromstring(content)
    has_changed = False
    for center in html_content.xpath('//center'):
        # ``len(element)`` is the child count (``getchildren`` is deprecated).
        if len(center) != 1 or not center.text:
            continue
        br = center[0]
        if br.tag != 'br' or not br.tail:
            continue

        text1 = center.text.strip()
        text2 = br.tail.strip()
        match = clean32regex1.match(text1)
        if not match:
            continue

        c = match.group(1).lower()
        if c == 'pasal':
            element = html.Element('h4')
        else:
            element = html.Element('h2', {'class': c})
        num = html.Element('span', {'class': 'num'})
        num.text = text1
        heading = html.Element('span', {'class': 'title'})
        heading.text = text2
        element.append(num)
        element.append(heading)
        center.addnext(element)
        center.drop_tree()
        has_changed = True

    if has_changed:
        # Parenthesised single argument: valid on Python 2 and Python 3.
        print(filename)
        content = etree.tostring(html_content)

    return content
Пример #5
0
def clean10(filename, content):
    """Replace ``<center>A<br>B</center>`` blocks with an ``<h2>`` heading.

    A ``<center>`` qualifies when it has non-blank leading text and exactly
    one child, a ``<br>`` with non-blank tail text.  The two texts are joined
    as ``"A: B"``; if that matches ``clean10regex1`` the block becomes
    ``<h2 class="bagian">``, else if it matches ``clean10regex2`` it becomes
    ``<h2 class="bab">``.  Non-matching blocks are left alone.

    :param filename: name printed when the document was modified.
    :param content: HTML markup to clean.
    :returns: the re-serialized document if anything changed, otherwise
        ``content`` unchanged.
    """
    html_content = html.fromstring(content)
    has_changed = False
    for part in html_content.xpath('//center'):
        child = part.getchildren()
        if not (len(child) == 1 and child[0].tag == 'br' and part.text
                and part.text.strip() and child[0].tail
                and child[0].tail.strip()):
            continue

        text = part.text.strip() + ': ' + child[0].tail.strip()
        # regex1 is tried first, exactly as before; only the class differs.
        if clean10regex1.match(text):
            css_class = 'bagian'
        elif clean10regex2.match(text):
            css_class = 'bab'
        else:
            continue

        element = html.Element('h2', {'class': css_class})
        element.text = text
        part.addprevious(element)
        part.drop_tree()
        has_changed = True

    if has_changed:
        # Parenthesised single argument: valid on Python 2 and Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
Пример #6
0
def fix_orphan_html_list_items(el):
    """Repair ``<li>`` elements whose parent is neither ``<ul>`` nor ``<ol>``.

    Repeatedly takes the first such orphan found by the XPath query.  A run
    of two or more consecutive ``<li>`` siblings is moved into a new ``<ul>``
    placed where the first one was; a lone ``<li>`` is renamed to ``<span>``
    and followed by a ``<br>``.  Returns ``el`` once no orphan is left.
    """
    orphan_query = '//li[not(parent::ul) and not(parent::ol)]'
    while True:
        orphans = el.xpath(orphan_query)
        if not orphans:
            break

        first_li = orphans[0]
        parent = first_li.getparent()
        position = parent.index(first_li)
        # Following siblings up to (not including) the first non-<li>.
        following = list(takewhile(is_li_element, first_li.itersiblings()))

        if not following:
            # Stand-alone item: keep its content inline, followed by a break.
            first_li.addnext(html.Element('br'))
            first_li.tag = 'span'
            continue

        run = [first_li] + following
        wrapper = html.Element('ul')
        for li in run:
            wrapper.append(li)

        # Text after the last item now belongs after the new list.
        wrapper.tail = run[-1].tail
        run[-1].tail = None

        parent.insert(position, wrapper)
    return el
Пример #7
0
def _createNumberedElem(paraElem):
    '''
    Uses the attributes tagged onto a HTML paragraph element to create an
    HTML list element.

    ``CAR_format`` selects the list kind: ``bullet`` yields a ``<ul>``;
    anything else yields an ``<ol>`` whose ``type`` comes from the format
    name (decimal, lowerLetter, upperLetter, lowerRoman, upperRoman),
    defaulting to decimal.  ``CAR_start`` becomes the ``start`` attribute of
    an ``<ol>`` when it is present.
    '''
    numberTypeMap = {
        'decimal': '1',
        'lowerLetter': 'a',
        'upperLetter': 'A',
        'lowerRoman': 'i',
        'upperRoman': 'I'
    }

    formatType = paraElem.get('CAR_format')
    if formatType == 'bullet':
        return HTML.Element('ul')

    numberedElem = HTML.Element('ol')
    # Unknown or missing formats fall back to decimal numbering.
    numberedElem.set('type', numberTypeMap.get(formatType, '1'))

    # Setting an attribute to None raises, so only copy a start value that
    # actually exists on the paragraph.
    start = paraElem.get('CAR_start')
    if start is not None:
        numberedElem.set('start', start)

    return numberedElem
def populate_td(input_value, recid=None):
    '''Populate the <td> elements of the table.

    When ``recid`` is given and ``input_value`` is a string, the result of
    ``make_url(input_value, recid)`` is returned.  Otherwise ``input_value``
    is iterated as a sequence of spokesperson dicts (keys ``name``,
    ``start``, ``curr``, ``end`` when not current, and optionally ``recid``)
    and a ``<ul>`` with one ``<li>`` per spokesperson is returned.
    '''

    if VERBOSE:
        # One pre-formatted argument prints identically with the Python 2
        # print statement and the Python 3 print function.
        print('input_value = %s' % (input_value,))
    if recid and isinstance(input_value, str):
        return make_url(input_value, recid)

    ul_elem = LH.Element("ul")

    for spokesperson in input_value:
        # Drop a trailing parenthesised remark from the name.
        name = re.sub(r' \(.*', '', spokesperson['name'])
        try:
            name = make_url(name, spokesperson['recid'])
        except KeyError:
            # No record id: keep the plain name.
            pass
        try:
            display = LH.Element("li")
            display.append(name)
        except TypeError:
            # ``name`` is not an element, so build the <li> around it.
            display = ELEMENT.li(name)
        dates = ' (' + spokesperson['start'] + ' - '
        if spokesperson['curr'].lower() == 'current':
            dates += 'present)'
            display.append(ELEMENT.b(dates))
        else:
            dates += spokesperson['end'] + ')'
            # NOTE(review): past terms are wrapped in <a> without an href;
            # kept as-is, confirm this is intended.
            display.append(ELEMENT.a(dates))

        ul_elem.append(display)
    return ul_elem
Пример #9
0
    def add_navbar_js(self):
        """Inject the navbar script and the jQuery library into ``<head>``.

        Reads ``js/navbar.js`` located next to this module and inserts it as
        an inline ``<script>`` at position 1 of the first ``<head>`` element
        of ``self.book``.  A second ``<script>`` pointing at the hosted
        jQuery 3.4.1 build is then inserted at the same position, so it ends
        up before the navbar script.

        Raises:
            IOError: if ``navbar.js`` cannot be read.
            IndexError: if ``self.book`` has no ``<head>`` element.
        """
        file_path = os.path.join(os.path.dirname(__file__), "js", "navbar.js")

        with open(file_path, "r") as fi:
            navbar = fi.read()

        head = self.book.xpath("//head")[0]

        new_script = html.Element("script")
        new_script.text = navbar
        head.insert(1, new_script)

        ## Add jquery library
        new_script = html.Element("script")
        new_script.attrib[
            "src"] = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
        head.insert(1, new_script)
Пример #10
0
def clean23(filename, content):
    """Split ``<center>`` blocks holding two heading pairs into two ``<h2>``.

    A ``<center>`` qualifies when it has exactly three children, all ``<br>``
    elements with non-blank tail text, i.e. ``T1<br>T2<br>T3<br>T4``.  If
    both ``T1`` and ``T3`` match ``clean23regex1`` the block is replaced by
    ``<h2>T1: T2</h2><h2>T3: T4</h2>``, each with the lower-cased first regex
    group as its class.

    :param filename: name printed when the document was modified.
    :param content: HTML markup to clean.
    :returns: the re-serialized document if anything changed, otherwise
        ``content`` unchanged.
    """
    html_content = html.fromstring(content)
    has_changed = False
    for center_part in html_content.xpath('//center'):
        children = center_part.getchildren()
        if len(children) != 3 or not all(
                child.tag == 'br' and child.tail and child.tail.strip()
                for child in children):
            continue

        # A <center> that starts directly with <br> has no leading text;
        # treat it as empty instead of crashing on None.strip().
        text1 = (center_part.text or '').strip()
        text2 = children[0].tail.strip()
        text3 = children[1].tail.strip()
        text4 = children[2].tail.strip()

        match1 = clean23regex1.match(text1)
        match2 = clean23regex1.match(text3)
        if match1 and match2:
            has_changed = True
            element1 = html.Element('h2',
                                    {'class': match1.group(1).lower()})
            element1.text = text1 + ': ' + text2
            element2 = html.Element('h2',
                                    {'class': match2.group(1).lower()})
            element2.text = text3 + ': ' + text4
            center_part.addprevious(element1)
            center_part.addprevious(element2)
            center_part.drop_tree()

    if has_changed:
        # Parenthesised single argument: valid on Python 2 and Python 3.
        print(filename)
        content = etree.tostring(html_content)
    return content
Пример #11
0
    def _extract_any(self, x: html.Element) -> [html.Element, str]:
        """Extract/simplify an HTML element (recursive).

        Returns an ``(element, text)`` pair.  Nested tables are replaced by
        an empty ``<table>`` and the marker ``[TABLE]``; ``<ul>`` is
        delegated to ``_extract_list``; comments yield an empty comment; a
        childless element yields itself (or ``None`` when it has no text).
        Otherwise a new element of the same tag is built from the children
        and their texts are joined with single spaces.
        """
        tag = x.tag

        # nested tables are special because we are processing a flattened
        # list, so ignore their content
        if tag == "table":
            return html.Element("table"), "[TABLE]"

        # lists are special because we want to build up a comma separated list
        if tag == "ul":
            return self._extract_list(x)

        if tag == etree.Comment:
            return etree.Comment(), ""

        # no children --> text element
        if len(x) == 0:
            if x.text is None:
                return None, ""
            return x, self._extract_text(x.text)

        ignored_tags = ["script", "noscript", "br", "hr", "input", "button",
                        "svg", "img", "form"]
        expected_tags = ["span", "div", "h3", "h2", "h1", "small", "strong",
                         "em", "sup", "i", "a", "b", "u", "p", "ul", "label",
                         "sub"]

        elem = html.Element(tag)
        items = []
        if x.text is not None:
            elem.text = x.text
            items.append(x.text)

        for y in x:
            # ignore/strip out comments and layout tags
            if y.tag == etree.Comment or y.tag in ignored_tags:
                continue

            if y.tag == "table" or y.tag == "iframe":
                elem.append(html.Element(y.tag))
                items.append(f"[{y.tag.upper()}]")
                continue

            if y.tag not in expected_tags:
                logger.warning(f"unexpected tag {y.tag} ===>{html.tostring(y)}<<====\n")

            elem_ch, s = self._extract_any(y)
            has_text = s is not None and s != ""
            if elem_ch is not None:
                # NOTE: len(x) is re-read on every iteration on purpose; the
                # returned element can be the child itself, and appending it
                # below re-parents it out of x.
                if len(x) == 1:
                    if has_text:
                        elem.text = s
                else:
                    elem.append(elem_ch)
            if has_text:
                items.append(s)

        return elem, " ".join(items)
Пример #12
0
    def _render(self, template, values=None, **options):
        """ render(template, values, **options)

        Render the template specified by the given name, then split tables at
        rows carrying a ``data-pagebreak`` attribute.

        :param template: etree, xml_id, template name (see _get_template)
            * Call the method ``load`` if it is not an etree.
        :param dict values: template values to be used for rendering
        :param options: used to compile the template (the dict available for the rendering is frozen)
            * ``load`` (function) overrides the load method

        :returns: bytes marked as markup-safe (decode to :class:`markupsafe.Markup`
                  instead of `str`)
        :rtype: MarkupSafe
        """
        context = dict(self.env.context,
                       dev_mode='qweb' in tools.config['dev_mode'])
        context.update(options)

        result = super(IrQWeb, self)._render(template,
                                             values=values,
                                             **context)

        if not values or not values.get('__keep_empty_lines'):
            result = markupsafe.Markup(
                IrQWeb._empty_lines.sub('\n', result.strip()))

        # Fast path: nothing to split.
        if 'data-pagebreak=' not in result:
            return result

        fragments = html.fragments_fromstring(result)

        for fragment in fragments:
            for row in fragment.iterfind('.//tr[@data-pagebreak]'):
                table = next(row.iterancestors('table'))
                newtable = html.Element('table', attrib=dict(table.attrib))
                thead = table.find('thead')
                # Compare with None: the truth value of an lxml element
                # reflects its child count, not whether it was found.
                if thead is not None:
                    newtable.append(copy.deepcopy(thead))
                # TODO: copy caption & tfoot as well?
                # TODO: move rows in a tbody if row.getparent() is one?

                pos = row.get('data-pagebreak')
                assert pos in ('before', 'after')
                # Move every row up to the break (and the break row itself
                # for 'after') into the new table.
                for sibling in row.getparent().iterchildren('tr'):
                    if sibling is row:
                        if pos == 'after':
                            newtable.append(sibling)
                        break
                    newtable.append(sibling)

                table.addprevious(newtable)
                table.addprevious(
                    html.Element('div',
                                 attrib={'style': 'page-break-after: always'}))

        return markupsafe.Markup(''.join(
            html.tostring(f).decode() for f in fragments))
Пример #13
0
    def __parse_layout(self, xmlObj):
        '''
        Build the HTML grid for a layout node and attach it to its parent.

        Logger [9]
        - get the parent of xmlObj and look up the HTML element whose id is
          the parent's name; do nothing if either is missing
        - compute the highest row and column used by the <item> children
        - create a div with class xmlObj.get('class'), id and x-data-name
          xmlObj.get('name'), x-data-maxcols and x-data-maxrows
        - for every row 0..max_rows and column 0..max_cols add a div with id
          {name}_row{row}_col{col} and class "item item-empty"
        - append the new div to the parent's HTML element and log it
        '''
        parent = xmlObj.getparent()
        if not etree.iselement(parent):
            return

        matches = self.__html.xpath(
            '//*[@id="{0}"]'.format(parent.get('name')))
        if not len(matches) > 0:
            return
        container = matches[0]

        max_rows = 0
        max_cols = 0
        for item in xmlObj.findall('item'):
            row = int(item.get('row'))
            col = int(item.get('column'))
            max_cols = max(max_cols, col)
            max_rows = max(max_rows, row)

        layout = html.Element('div')
        layout.set('class', xmlObj.get('class'))
        layout.set('id', xmlObj.get('name'))
        layout.set('x-data-name', xmlObj.get('name'))
        layout.set('x-data-maxcols', str(max_cols))
        layout.set('x-data-maxrows', str(max_rows))

        for row in range(max_rows + 1):
            for col in range(max_cols + 1):
                cell = html.Element('div')
                cell_id = "{0}_row{1}_col{2}".format(xmlObj.get('name'), row,
                                                     col)
                cell.set('id', cell_id)
                cell.set('class', 'item item-empty')
                layout.append(cell)
                print(row, 'x', col)

        print(html.tostring(layout))
        container.append(layout)
        self.Logger.info('[9] Added Layout {0}'.format(
            html.tostring(layout)))
Пример #14
0
def transform(filename):
    """Rewrite a legacy (FrontPage-style) HTML file for Markdown conversion.

    * ``<font>`` elements are mapped to semantic markup: red (``#ff0000``)
      ones are handed to ``replace_tag`` with a new ``<strong>``, size 6 with
      ``<h1>``, size 5 with ``<h2>``; size 4 has its tag dropped.
    * Links whose target file is ``name.htm*`` are rewritten to a relative
      ``name.md`` URL, keeping the directory part and the fragment.
    * ``<span class="MsoFootnoteReference">`` wrappers are dropped.
    * ``remove_empty`` is finally run on the tree.

    :param filename: path of the latin-1 encoded HTML file to read.
    :returns: the transformed document serialized as a unicode string.
    """
    # Use a context manager so the file handle is closed even if parsing
    # fails; the tree is completely built once ``html.parse`` returns.
    with open(filename, encoding='latin-1') as htmlfile:
        tree = html.parse(htmlfile, parser=parser)

    # Frontpage seems to use <font> tags to indicate headings
    for node in tree.xpath('//font'):
        size = int(node.get('size')) if 'size' in node.attrib else None
        color = node.get('color').lower() if 'color' in node.attrib else ''
        if color == '#ff0000':
            replace_tag(node, html.Element('strong'))
        elif size == 6:
            replace_tag(node, html.Element('h1'))
        elif size == 5:
            replace_tag(node, html.Element('h2'))
        elif size == 4:
            node.drop_tag()

    # We rewrite all the urls to point to MD files instead of HTM
    for node in tree.xpath('//a[@href]'):
        href = node.get('href')

        try:
            parsed_url = urlparse(href)
            # Local names that do not shadow the ``filename`` parameter.
            directory, leaf = os.path.split(parsed_url.path)
            basename, extension = leaf.split('.')
        except ValueError:
            # Unparsable URL, or a file name without exactly one dot.
            continue

        if not extension.startswith('htm'):
            continue

        # NOTE(review): the previous code computed the link's hostname
        # (treating 'anastasis*' hosts as local) but never used it, so
        # external .htm links are made relative as well -- confirm intent.

        # os.path.split() strips the separator between directory and file
        # name, so it has to be put back when joining them again.
        directory = directory.lstrip('/')
        if directory:
            new_path = '{}/{}.{}'.format(directory, basename, 'md')
        else:
            new_path = '{}.{}'.format(basename, 'md')

        new_url = '', '', new_path, '', '', parsed_url.fragment
        node.set('href', urlunparse(new_url))

    # Pandoc passes this through, cluttering up the final markdown. Must come
    # after footnote rewriting.
    for node in tree.xpath('//span[@class="MsoFootnoteReference"]'):
        node.drop_tag()

    remove_empty(tree)

    return etree.tostring(tree,
                          pretty_print=True,
                          method='html',
                          encoding='unicode')
Пример #15
0
    def render(self, id_or_xml_id, values=None, **options):
        """ render(id_or_xml_id, values, **options)

        Render the template specified by the given name, then split tables at
        rows carrying a ``data-pagebreak`` attribute.

        :param id_or_xml_id: name or etree (see get_template)
        :param dict values: template values to be used for rendering
        :param options: used to compile the template (the dict available for the rendering is frozen)
            * ``load`` (function) overrides the load method
            * ``profile`` (float) profile the rendering (use astor lib) (filter
              profile line with time ms >= profile)
        :returns: the rendered template as bytes
        """
        for method in dir(self):
            if method.startswith('render_'):
                # Lazy %-style arguments: formatted only if the record is
                # actually emitted.
                _logger.warning("Unused method '%s' is found in ir.qweb.",
                                method)

        context = dict(self.env.context,
                       dev_mode='qweb' in tools.config['dev_mode'])
        context.update(options)

        result = super(IrQWeb, self).render(id_or_xml_id,
                                            values=values,
                                            **context)

        # Fast path: nothing to split.
        if b'data-pagebreak=' not in result:
            return result

        fragments = html.fragments_fromstring(result)

        for fragment in fragments:
            for row in fragment.iterfind('.//tr[@data-pagebreak]'):
                table = next(row.iterancestors('table'))
                newtable = html.Element('table', attrib=dict(table.attrib))
                thead = table.find('thead')
                # Compare with None: the truth value of an lxml element
                # reflects its child count, not whether it was found.
                if thead is not None:
                    newtable.append(copy.deepcopy(thead))
                # TODO: copy caption & tfoot as well?
                # TODO: move rows in a tbody if row.getparent() is one?

                pos = row.get('data-pagebreak')
                assert pos in ('before', 'after')
                # Move every row up to the break (and the break row itself
                # for 'after') into the new table.
                for sibling in row.getparent().iterchildren('tr'):
                    if sibling is row:
                        if pos == 'after':
                            newtable.append(sibling)
                        break
                    newtable.append(sibling)

                table.addprevious(newtable)
                table.addprevious(
                    html.Element('div',
                                 attrib={'style': 'page-break-after: always'}))

        return b''.join(html.tostring(f) for f in fragments)
Пример #16
0
    def get_html(url):
        """Fetch ``url`` and return the response parsed as an HTML element.

        Best effort: on any error an empty ``<html>`` element is returned
        instead of raising.  Timeouts (30 s limit) are additionally logged.
        """
        try:
            res = requests.get(url, timeout=30)
            parsed_page = html.fromstring(res.content)
        except requests.exceptions.Timeout:
            log.error(Directory.ERROR_MAP[4] % url)
            return html.Element('html')
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            return html.Element('html')

        return parsed_page
Пример #17
0
 def getMainPage(self, mathOutput='html'):
     """Serialize the main page.

     For ``mathOutput='svg'`` a new html/head/body skeleton is built and
     filled in by ``_prepareHead`` and ``_prepareBody``; for any other value
     the existing ``self._html`` tree is serialized as it is.
     """
     if mathOutput != 'svg':
         return HTML.tostring(self._html)

     page = HTML.Element('html')
     head = HTML.Element('head')
     body = HTML.Element('body')
     for part in (head, body):
         page.append(part)
     self._prepareHead(head, mathOutput='svg')
     self._prepareBody(body)
     return HTML.tostring(page)
Пример #18
0
    def _extract_content(self):
        """
        Pull information from HTML table

        1. Ignore TH/TD distinction
        2. remove content that only changes presentation
        3. assume script/comment tags do not contain data

        creates a new element fragment and List[List[Str]]
        embedded UL are converted into a comma delimited string

        Raises ``Exception`` for unexpected tags when
        ``self.fail_on_unexpected_tags`` is set; otherwise they are logged.
        """
        self.id = self.orig_element.get("id")
        if self.id is not None:
            self._new_element.attrib["id"] = self.id

        # Collects consecutive <td> elements that are not wrapped in a <tr>.
        tr_temp = html.Element("tr")
        for x in self.orig_element:
            # -- handle TD that are missing surrounding TR
            if x.tag == "td":
                logger.warning(f"misplaced TD: {html.tostring(x)}")
                tr_temp.append(x)
                continue
            elif len(tr_temp) > 0:
                # A non-TD child ends the run: emit it as one row.
                self._extract_tr(tr_temp)
                tr_temp = html.Element("tr")

            if x.tag == "tr":
                self._extract_tr(x)
            elif x.tag == "thead" or x.tag == "tbody" or x.tag == "tfoot":
                for y in x:
                    if y.tag == "tr":
                        self._extract_tr(y)
                    elif self.fail_on_unexpected_tags:
                        raise Exception(f"unexpected tag in tr: {y.tag}")
                    else:
                        logger.warning(
                            f"unexpected tag in tr: {html.tostring(y)}")
            elif x.tag == "colgroup":
                # column layout only, carries no data
                pass
            elif x.tag == "caption":
                self._extract_caption(x)
            elif self.fail_on_unexpected_tags:
                logger.warning(f"unexpected tag in table: {html.tostring(x)}")
                raise Exception(f"unexpected tag in table: {x.tag}")
            else:
                logger.warning(f"unexpected tag: {html.tostring(x)}")

        # Misplaced TDs at the very end of the table have no following
        # sibling to trigger the flush above, so emit them here.
        if len(tr_temp) > 0:
            self._extract_tr(tr_temp)
Пример #19
0
def make_source_link(kind: str, stage: str, name: str) -> html.Element:
    """Return a ``<span>`` labelled ``stage``, followed by the text `` < ``.

    The label is a link to ``../{stage}/{name}`` unless ``kind`` equals
    ``stage`` or is ``"source"``, in which case it is plain text.
    """
    wrapper = html.Element("span")
    wrapper.tail = " < "

    if kind == stage or kind == "source":
        wrapper.text = stage
        return wrapper

    # e.g. http://covid19-api.exemplartech.com/github-data/raw/AZ.html
    link = html.Element("a")
    link.attrib["href"] = f"../{stage}/{name}"
    link.text = stage
    wrapper.append(link)
    return wrapper
Пример #20
0
def separate_summary(htmlRoot):
    """Split a document into summary and report parts at ``***`` headings.

    The first two ``<h2>`` elements whose text contains ``***`` act as
    markers.  Everything before the first marker is dropped from the tree.

    * no marker: nothing is changed and ``[None, None]`` is returned;
    * one marker: the marker and all following siblings are moved into a new
      ``<div id="report">``; the summary is ``None``;
    * two markers: siblings from the first marker up to the second go into
      ``<div id="summary">``, the rest into ``<div id="report">``.

    :param htmlRoot: root element to search for ``<h2>`` markers.
    :returns: the list ``[summary, report]``.
    """
    def drop_sibs(element, backwards=False):
        # Remove every sibling after (or, with backwards=True, before) element.
        for sibling in element.itersiblings(preceding=backwards):
            sibling.drop_tree()

    def move_sibs(start, stop, destination):
        # Move start and its following siblings, up to but excluding stop,
        # into destination.  The iterator must be created before start is
        # re-parented.
        siblingIterator = start.itersiblings()
        destination.append(start)
        for sibling in siblingIterator:
            if sibling is stop:
                break
            destination.append(sibling)

    firstIndicator = None
    secondIndicator = None
    for element in htmlRoot.iter('h2'):
        if str(element.text_content()).count('***'):
            if firstIndicator is None:
                firstIndicator = element
            elif secondIndicator is None:
                secondIndicator = element

    # Defaults so the return below is defined on every path; previously the
    # no-marker case raised UnboundLocalError.
    summary = None
    report = None

    if firstIndicator is None:
        # more code needed - probably change html tag to div id="report" tag
        pass

    elif secondIndicator is None:
        report = html.Element('div', attrib={'id': 'report'})
        report.tail = '\n'

        drop_sibs(firstIndicator, backwards=True)
        move_sibs(firstIndicator, None, report)

    else:
        summary = html.Element('div', attrib={'id': 'summary'})
        summary.tail = '\n'
        report = html.Element('div', attrib={'id': 'report'})
        report.tail = '\n'

        drop_sibs(firstIndicator, backwards=True)
        move_sibs(firstIndicator, secondIndicator, summary)
        move_sibs(secondIndicator, None, report)

    return [summary, report]
Пример #21
0
    def write_as_html(self, foutput, name: str, url: str,
                      tables: List[ContentTable], html_doc: html.Element):
        """Write one source as a standalone page and add it to ``html_doc``.

        A ``<div>`` section is built from a heading with ``name``, the string
        from ``self.cache.read_date_time_str``, the ``new_element`` of every
        table, a line break and a link to ``url``.  A copy of the section
        wrapped in html/body is written to ``foutput``; the section itself
        and an ``<hr>`` are appended to ``html_doc``.
        """
        section = html.Element("div")

        title = html.Element("h1")
        title.text = name
        section.append(title)

        timestamp = html.Element("div")
        timestamp.text = self.cache.read_date_time_str(name + ".html")
        section.append(timestamp)

        for table in tables:
            section.append(table.new_element)

        section.append(html.Element("br"))

        link = html.Element("a")
        link.attrib["href"] = url
        link.text = url
        section.append(link)

        # The standalone page gets a copy so the section can still be
        # attached to the combined document below.
        page = html.Element("html")
        body = html.Element("body")
        page.append(body)
        body.append(deepcopy(section))
        foutput.write(html.tostring(page, pretty_print=True))

        html_doc.append(section)
        html_doc.append(html.Element("hr"))
Пример #22
0
 def write_cache(self, style, fonts, svgs):
     """Cache the computed data in an xml file.

     Writes a ``<cache>`` element to ``self.svg_cache`` containing two
     ``<style>`` elements (ids ``pretex-style`` and ``pretex-fonts``)
     followed by every element of ``svgs`` with its tail text cleared.
     """
     root = html.Element('cache')

     style_node = html.Element('style', id='pretex-style')
     style_node.text = style
     root.append(style_node)

     fonts_node = html.Element('style', id='pretex-fonts')
     fonts_node.text = fonts
     root.append(fonts_node)

     for svg in svgs:
         svg.tail = ''
         root.append(svg)

     serialized = html.tostring(root)
     with open(self.svg_cache, 'wb') as fobj:
         fobj.write(serialized)
Пример #23
0
def parse_print_tab_kangxi(dict_root, homo_no, word, content):
    """Append a dictionary entry for ``word`` to ``dict_root``.

    ``content`` is parsed with ``parse_word_kangxi``; if it is empty or the
    parse reports failure nothing is added and ``False`` is returned.
    Otherwise an ``idx:entry`` holding the headword, the ``jianjie`` summary
    and every ``jieshi`` description is appended, followed by an ``hr``, and
    ``True`` is returned.  ``homo_no`` is not used by this function.
    """
    parsed_word = {}
    if not len(content):
        return False
    if not parse_word_kangxi(word, content, parsed_word):
        return False

    entry = html.Element("idx:entry", scriptable='yes')
    dict_root.append(entry)
    entry.append(html.Element("idx:orth", value=word))

    # For each category of the "detailed explanation" page, display in the
    # order 'detailed meanings', 'basic meanings', 'part-of-speech changes'.
    # First show the headword.
    headword = html.Element('b')
    entry.append(headword)
    # Basic explanation: homo_no='1'; detailed explanation: homo_no='2'
    #make_sub_elem(b, 'word', {"homo_no":"1"}, parsed_ziyi['zi'])
    make_sub_elem(headword, 'word', _text=word)
    entry.append(html.Element('br'))

    category = html.Element('category')
    entry.append(category)

    sense = html.Element('sense')
    category.append(sense)

    make_sub_elem(sense, 'description', _text=parsed_word['jianjie'])
    sense.append(html.Element('br'))

    for explanation in parsed_word['jieshi']:
        make_sub_elem(sense, 'description', _text=explanation)
        sense.append(html.Element('br'))

    make_sub_elem(dict_root, 'hr')
    return True
Пример #24
0
    def _add_html_info_row(self, t: html.Element, label: str, val: str, cls: str = None):
        """Append a two-cell row (``label``, ``val``) to the element ``t``.

        When ``cls`` is given it is set as the ``class`` of both cells.
        """
        row = html.Element("tr")

        for content in (label, val):
            cell = html.Element("td")
            cell.text = content
            if cls is not None:
                cell.attrib["class"] = cls
            row.append(cell)

        # Trailing whitespace keeps the serialized rows on separate lines.
        row.tail = "\n      "
        t.append(row)
Пример #25
0
 def get_wrapper_tag(self):
     """Return a new element for wrapping inline content, or ``None``.

     * ``allow_tags`` is ``None``: no wrapper.
     * ``wrap_inline_tags`` is ``None`` or ``True``: ``<p>`` if allowed,
       else ``<div>`` if allowed.
     * ``wrap_inline_tags`` is ``'p'`` or ``'div'``: that tag, provided
       ``p`` or ``div`` is allowed.
     * ``wrap_inline_tags`` is callable: its result, provided the result's
       tag is allowed.
     """
     if self.allow_tags is None:
         return None

     if self.wrap_inline_tags in (None, True):
         for candidate in ('p', 'div'):
             if candidate in self.allow_tags:
                 return html.Element(candidate)
         return None

     if self.wrap_inline_tags in ('p', 'div'):
         if 'p' in self.allow_tags or 'div' in self.allow_tags:
             return html.Element(self.wrap_inline_tags)
         return None

     if callable(self.wrap_inline_tags):
         element = self.wrap_inline_tags()
         if element.tag in self.allow_tags:
             return element

     return None
Пример #26
0
    def load_info(self, item: ChangeItem, body: html.Element):
        """Fill ``body`` with the header block for ``item``.

        Appends an ``<h3>`` with the item name, the element returned by
        ``html_helpers.make_source_links`` and a ``<br>``; text and tails are
        set to newline/indent whitespace so the output stays readable.
        """
        indent = "\n    "
        body.text = indent

        heading = html.Element("h3")
        heading.text = item.name
        heading.tail = "\n\n    "
        body.append(heading)

        links = html_helpers.make_source_links("extract", item.name, item.source)
        body.append(links)

        # Set the tail on whatever is now the last child of body.
        body[-1].tail = indent

        line_break = html.Element("br")
        line_break.tail = indent
        body.append(line_break)
Пример #27
0
def markdown(value, style, math_engine=None, lazy_load=False):
    """Render ``value`` as Markdown and return the result wrapped in ``Markup``.

    The style options are looked up under ``style`` in
    ``settings.MARKDOWN_STYLES``, falling back to
    ``settings.MARKDOWN_DEFAULT_STYLE`` (and to an empty dict when neither
    setting exists).

    Args:
        value: Markdown source passed to the mistune parser.
        style: Key selecting the style options.
        math_engine: Forwarded to ``AwesomeRenderer``; math is only enabled
            when this is not ``None``.
        lazy_load: When true, ``lazy_load_processor`` is run on the output tree.

    If at least one post-processor is active, the rendered HTML is parsed with
    lxml, every post-processor is applied to the tree, and the tree is
    serialized again. A parse failure is logged (except for an empty
    document) and processing continues on an empty ``<div>``.
    """
    configured_styles = getattr(settings, 'MARKDOWN_STYLES', {})
    fallback_style = getattr(settings, 'MARKDOWN_DEFAULT_STYLE', {})
    styles = configured_styles.get(style, fallback_style)

    escape = styles.get('safe_mode', True)
    nofollow = styles.get('nofollow', True)
    texoid = TEXOID_ENABLED and styles.get('texoid', False)
    use_math = hasattr(settings, 'MATHOID_URL') and styles.get('math', False)

    post_processors = []
    if styles.get('use_camo', False) and camo_client is not None:
        post_processors.append(camo_client.update_tree)
    if lazy_load:
        post_processors.append(lazy_load_processor)

    renderer = AwesomeRenderer(
        escape=escape,
        nofollow=nofollow,
        texoid=texoid,
        math=use_math and math_engine is not None,
        math_engine=math_engine,
    )
    md_parser = mistune.Markdown(
        renderer=renderer,
        inline=AwesomeInlineLexer,
        parse_block_html=1,
        parse_inline_html=1,
    )
    rendered = md_parser(value)

    # Nothing to post-process: skip the HTML round trip entirely.
    if not post_processors:
        return Markup(rendered)

    try:
        tree = html.fromstring(rendered, parser=html.HTMLParser(recover=True))
    except (XMLSyntaxError, ParserError) as e:
        # An empty document is expected for empty output and is not logged.
        if rendered and not (isinstance(e, ParserError) and e.args[0] == 'Document is empty'):
            logger.exception('Failed to parse HTML string')
        tree = html.Element('div')

    for processor in post_processors:
        processor(tree)
    return Markup(html.tostring(tree, encoding='unicode'))
Пример #28
0
def markdown(value, style, math_engine=None, lazy_load=False):
    """Render ``value`` as Markdown and return the result wrapped in ``Markup``.

    The style options are looked up under ``style`` in
    ``settings.MARKDOWN_STYLES``, falling back to
    ``settings.MARKDOWN_DEFAULT_STYLE`` (and to an empty dict when neither
    setting exists).

    Args:
        value: Markdown source passed to the mistune parser.
        style: Key selecting the style options.
        math_engine: Accepted for interface compatibility; this
            implementation does not use it.
        lazy_load: When true, the rendered HTML is parsed with lxml,
            ``lazy_load_processor`` is applied to the tree, and the tree is
            serialized again.

    Raises:
        ValueError: The rendered HTML is non-empty and could not be parsed
            (any failure other than lxml's 'Document is empty'). The parser
            error is attached as the cause.
    """
    styles = getattr(settings, 'MARKDOWN_STYLES',
                     {}).get(style,
                             getattr(settings, 'MARKDOWN_DEFAULT_STYLE', {}))
    escape = styles.get('safe_mode', True)
    nofollow = styles.get('nofollow', True)

    post_processors = []
    if lazy_load:
        post_processors.append(lazy_load_processor)

    renderer = AwesomeRenderer(escape=escape, nofollow=nofollow)
    # Named md_parser so the local does not shadow this function's own name.
    md_parser = mistune.Markdown(renderer=renderer,
                                 inline=AwesomeInlineLexer,
                                 parse_block_html=1,
                                 parse_inline_html=1)
    result = md_parser(value)

    if post_processors:
        try:
            tree = html.fromstring(result,
                                   parser=html.HTMLParser(recover=True))
        except (XMLSyntaxError, ParserError) as e:
            if result and (not isinstance(e, ParserError)
                           or e.args[0] != 'Document is empty'):
                # Chain explicitly so the parser error is reported as the cause.
                raise ValueError('Failed to parse HTML string') from e
            # Empty document: post-process an empty placeholder instead.
            tree = html.Element('div')
        for processor in post_processors:
            processor(tree)
        result = html.tostring(tree, encoding='unicode')
    return Markup(result)
Пример #29
0
def convert_html_to_text(html_str):
    """
    Convert an HTML string to plain text.

    If lxml is available, convert to Markdown (but badly)
    otherwise just strip_tags.

    With lxml, every element matching a tag in ``HTML_CONVERTERS`` is replaced
    by a ``<span>`` holding the converter's output, the tree is serialized
    with ``method='text'`` and each line is stripped. Blank input yields an
    empty string.
    """

    try:
        from lxml import html
    except ImportError:
        return strip_tags(html_str)

    # lxml raises ParserError ("Document is empty") for blank input.
    if not html_str or not html_str.strip():
        return ''

    root = html.fromstring(html_str)
    try:
        body = root.xpath('./body')[0]
    except IndexError:
        # No body element
        body = root

    for tag, func in HTML_CONVERTERS.items():
        for el in body.xpath('.//' + tag):
            repl_tag = html.Element("span")
            repl_tag.text = func(el)
            # replace() detaches ``el`` together with its tail text, so the
            # text that followed the element must be carried over explicitly
            # or it disappears from the output.
            repl_tag.tail = el.tail
            el.getparent().replace(el, repl_tag)

    text = html.tostring(body,
                         pretty_print=True,
                         method='text',
                         encoding='utf-8').decode('utf-8')

    return '\n'.join(x.strip() for x in text.splitlines()).strip()
Пример #30
0
def clean17(filename, content):
    """Turn ``<div class="s140">`` blocks of ``<br>``-separated lines into ``<ol>`` lists.

    A div is converted when it has non-blank leading text, has at least one
    child, all of its children are ``<br>``, and every collected line matches
    ``clean17regex1`` or every collected line matches ``clean17regex2``
    (module-level patterns defined outside this function). The collected
    lines are the stripped leading text plus each non-blank stripped ``<br>``
    tail; each becomes one ``<li>`` and the div is retagged as ``<ol>``.

    Args:
        filename: Name printed when at least one div was converted.
        content: HTML source to parse.

    Returns:
        ``etree.tostring(html_content)`` when something changed, otherwise
        ``content`` unchanged.
    """
    html_content = html.fromstring(content)
    has_changed = False

    for s140 in html_content.xpath('//div[@class=\'s140\']'):
        # Snapshot the children once; it is reused for the checks and for
        # removal, so the element is never mutated while being iterated.
        children = list(s140)
        lead = s140.text.strip() if s140.text else ''
        if not lead or not children:
            continue
        if any(child.tag != 'br' for child in children):
            continue

        text = [lead]
        for child in children:
            tail = child.tail.strip() if child.tail else ''
            if tail:
                text.append(tail)

        if not (all(clean17regex1.match(t) for t in text)
                or all(clean17regex2.match(t) for t in text)):
            continue

        has_changed = True
        s140.text = ''
        for child in children:
            s140.remove(child)
        for t in text:
            element = html.Element('li')
            element.text = t
            s140.append(element)
        s140.tag = 'ol'

    if has_changed:
        # Function-call form: valid on Python 3 and prints the same single
        # value on Python 2.
        print(filename)
        content = etree.tostring(html_content)
    return content