def format_head(node, path='', *args, **kwargs):
    """Append the charset ``<meta>`` tag and the stylesheet ``<link>`` to *node*.

    The link points at ``../../style.css`` when a ``style.css`` file exists
    three directory levels above *path*; otherwise it points at
    ``../style.css``.

    :param node: object receiving the parsed tags through ``append``
    :param path: base path for the ``style.css`` lookup
    :param args: accepted for call compatibility, unused
    :param kwargs: accepted for call compatibility, unused
    :return: *node*, after the tags were appended
    """
    # NOTE(review): the probe climbs three levels while the emitted href
    # climbs two -- presumably deliberate; confirm against the callers.
    if isfile(os.path.join(path, '../../../style.css')):
        markup = """
        <meta http-equiv="content-type" content="text/html; charset=UTF-8">
        <link href="../../style.css" rel="stylesheet">
        """
    else:
        markup = """
        <meta http-equiv="content-type" content="text/html; charset=UTF-8">
        <link href="../style.css" rel="stylesheet">
        """
    for head_tag in html.fragments_fromstring(markup):
        node.append(head_tag)
    return node
예제 #2
0
    def process(self, text):
        """Substitute math markup inside every text node of *text*.

        Each text node is run through ``inline_math`` and then
        ``display_math``; when that changes the string, the result is parsed
        into HTML fragments which take the place of the original text
        (either the ``text`` or the ``tail`` of the owning element).

        :param text: markup handed to ``lxml_tree.fromstring``
        :return: the parsed and rewritten document
        """
        doc = lxml_tree.fromstring(text)

        for text_node in doc.xpath('//text()'):
            replaced = inline_math.sub(self._sub_inline, text_node)
            replaced = display_math.sub(self._sub_display, replaced)
            if replaced == text_node:
                continue

            # ``anchor`` is the element the next piece attaches to;
            # ``into_text`` tells whether it goes to ``.text`` or ``.tail``.
            anchor = text_node.getparent()
            into_text = text_node.is_text

            if into_text:
                anchor.text = ''
            else:
                anchor.tail = ''

            for piece in html.fragments_fromstring(replaced):
                if not isinstance(piece, ElementBase):
                    # plain string fragment
                    if into_text:
                        anchor.text += piece
                    else:
                        anchor.tail = piece
                    continue

                if into_text:
                    anchor.insert(0, piece)
                else:
                    following = anchor.getnext()
                    if following is not None:
                        following.addprevious(piece)
                    else:
                        anchor.getparent().append(piece)
                # later pieces are placed after the element just inserted
                anchor = piece
                into_text = False

        return doc
예제 #3
0
def verify_html(str_l):
    """
    Verify that the uploaded HTML fragment has the accepted structure.

    str_l: list; str_l[0] is the uploaded HTML fragment string, str_l[1] is
        an error message (if any). Only str_l[0] is read here.

    Returns nothing; a rejected document is signalled by ValueError.
    The walk is iterative and uses three parallel stacks instead of
    recursion.
    """
    # Parallel stacks: the element being walked, the highest child index to
    # visit in it, and the index of the next child to look at.
    container = list()
    child_cnt = list()
    cur = list()
    html = ht.fragments_fromstring(str_l[0])
    # exactly one top-level fragment is accepted
    if len(html) != 1:
        raise ValueError
    html = html[0]
    container.append(html)
    # NOTE(review): the root's last child index is fixed at 1, so children
    # 0 and 1 of the root are visited regardless of how many it has (fewer
    # raises IndexError, more are never checked) -- confirm this is intended.
    child_cnt.append(1)
    cur.append(0)

    while (container):
        node = container[-1]
        cnt = child_cnt[-1]
        idx = cur[-1]
        if cnt >= idx:
            child = node[idx]
            if child.tag not in gc_verify_container_tag:
                raise ValueError
            c_cnt = len(child)
            if c_cnt > 1:
                # Only a <ul> may hold several children: descend into it and
                # remember to resume the parent at the following sibling.
                if child.tag == 'ul':
                    container.append(child)
                    child_cnt.append(c_cnt - 1)
                    cur[-1] = idx + 1
                    cur.append(0)
                else:
                    raise ValueError
            else:
                # NOTE(review): a <ul> with exactly one child also lands here
                # and is rejected by the final ``else`` -- verify.
                if child.tag == "h5":
                    # <h5>: no attributes, and the text must not match
                    # gc_escape_char
                    if len(child.attrib) or \
                    gc_escape_char.search(child.text):
                        raise ValueError
                elif child.tag == "li":
                    # <li>: no attributes, first child must be an <a>
                    attr = child.attrib
                    child_href = child[0]
                    if len(attr) or child_href.tag != 'a':
                        raise ValueError

                    # <a>: ``href`` is the only attribute, its value passes
                    # URL_CHECK, and the text must not match gc_escape_char
                    attr = child_href.attrib
                    if len(attr)!=1 or \
                        not attr.has_key("href") or \
                        gc_escape_char.search(child_href.text) or \
                        not URL_CHECK(attr["href"]):
                        # not gc_url.match(attr["href"]):
                        raise ValueError
                else:
                    raise ValueError
                cur[-1] = idx + 1
        else:
            # every child of this element was visited: return to its parent
            container.pop()
            child_cnt.pop()
            cur.pop()
예제 #4
0
파일: scraper.py 프로젝트: Shaggs/cfsresd
    def scrape(self):
        """Poll the urgmsg API once and hand new pager messages to the handlers.

        On a connection error the failure is reported, the scraper sleeps for
        ``self.timeout`` seconds, ``scraper.run()`` is invoked and this call
        ends. Otherwise ``self.id_stamp`` is advanced and every returned
        fragment is wrapped in a ``PagerMessage`` and passed to each handler
        in ``self.handlers``, oldest message first.
        """
        try:
            resp = requests.get(API_URL, params={'f': self.id_stamp}).json()
        except requests.exceptions.ConnectionError as e:
            puts("Error encountered when connecting to urgmsg: ",
                 newline=False)
            puts(colored.magenta(e.__class__.__name__), newline=False)
            # parenthesised form behaves the same on Python 2 and 3
            print("")
            time.sleep(self.timeout)
            scraper.run()
            # ``resp`` was never bound on this path; falling through would
            # raise UnboundLocalError on the ``resp['updated']`` lookup.
            return

        if not resp['updated']:
            return

        old_id_stamp = self.id_stamp
        self.id_stamp = resp['IDstamp']
        # if old_id_stamp is 0, this is the first scrape
        # which will return a whole bunch of recent past messages
        if not self.recent_messages and old_id_stamp == 0:
            return

        # Pager messages are returned newest to oldest, we want to
        # process them oldest to newest
        frags = lh.fragments_fromstring(resp['data'])[::-1]
        for frag in frags:
            msg = PagerMessage(frag)
            for handler in self.handlers:
                handler(msg)
예제 #5
0
def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    """Write the HTML signature *sig* to *sigfile* and export its images.

    Every link in the cleaned signature that starts with ``/settings/imgs/``
    is resolved to a ``SignatureImg``; the image is written below *basedir*
    and the link's element gets a ``cid:`` ``src`` pointing at it.

    :param sigfile: path of the signature file to write
    :param sig: signature object (``signature_content``, ``image``, ``save``)
    :param basedir: directory holding the ``domains``/``users`` image folders
    :param is_domain: selects the ``domains`` folder when true, else ``users``
    :param logger: logger receiving one info line per written file
    """
    cleaner = SignatureCleaner(style=True, remove_tags=UNCLEANTAGS,
                                safe_attrs_only=False)
    cleaned = cleaner.clean_html(sig.signature_content)
    root = fragments_fromstring(cleaned)[0]
    owner_dir = 'domains' if is_domain else 'users'
    # Explicit replacement for the former ``'link' in locals()`` probe: true
    # as soon as the signature contains any link at all.
    found_link = False
    for element, _attribute, link, _pos in iterlinks(root):
        found_link = True
        if not link.startswith('/settings/imgs/'):
            continue
        # only the keyword arguments captured from the URL are needed
        _view, _args, kwargs = resolve(link)
        img = SignatureImg.objects.get(pk=kwargs['img_id'])
        imgfile = '%s/%s/imgs/%s' % (basedir, owner_dir, img.name)
        element.attrib['src'] = 'cid:%s' % img.name
        # ``with`` closes the handle even when decoding or writing fails
        with open(imgfile, 'wb') as imghandle:
            imghandle.write(base64.decodestring(img.image))
        logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
        # update the sig with image obtained
        sig.image = img
    # NOTE(review): this saves whenever a link was seen, even if no image
    # was attached -- kept as in the original; confirm the intent.
    if found_link:
        sig.save()
    # the handle used to be left open, so buffered output was only flushed
    # whenever the interpreter happened to collect it
    with open(sigfile, 'w') as sighandle:
        if not sig.signature_content.startswith('--'):
            sighandle.write('<br/>--<br/>')
        sighandle.write(tostring(root))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))
예제 #6
0
파일: tasks.py 프로젝트: heartshare/baruwa
def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    """Write the HTML signature *sig* to *sigfile* and export its images.

    Every link in the cleaned signature that starts with ``/settings/imgs/``
    is resolved to a ``SignatureImg``; the image is written below *basedir*
    and the link's element gets a ``cid:`` ``src`` pointing at it.

    :param sigfile: path of the signature file to write
    :param sig: signature object (``signature_content``, ``image``, ``save``)
    :param basedir: directory holding the ``domains``/``users`` image folders
    :param is_domain: selects the ``domains`` folder when true, else ``users``
    :param logger: logger receiving one info line per written file
    """
    cleaner = SignatureCleaner(style=True, remove_tags=UNCLEANTAGS,
                                safe_attrs_only=False)
    cleaned = cleaner.clean_html(sig.signature_content)
    root = fragments_fromstring(cleaned)[0]
    owner_dir = 'domains' if is_domain else 'users'
    # Explicit replacement for the former ``'link' in locals()`` probe: true
    # as soon as the signature contains any link at all.
    found_link = False
    for element, _attribute, link, _pos in iterlinks(root):
        found_link = True
        if not link.startswith('/settings/imgs/'):
            continue
        # only the keyword arguments captured from the URL are needed
        _view, _args, kwargs = resolve(link)
        img = SignatureImg.objects.get(pk=kwargs['img_id'])
        imgfile = '%s/%s/imgs/%s' % (basedir, owner_dir, img.name)
        element.attrib['src'] = 'cid:%s' % img.name
        # ``with`` closes the handle even when decoding or writing fails
        with open(imgfile, 'wb') as imghandle:
            imghandle.write(base64.decodestring(img.image))
        logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
        # update the sig with image obtained
        sig.image = img
    # NOTE(review): this saves whenever a link was seen, even if no image
    # was attached -- kept as in the original; confirm the intent.
    if found_link:
        sig.save()
    # the handle used to be left open, so buffered output was only flushed
    # whenever the interpreter happened to collect it
    with open(sigfile, 'w') as sighandle:
        if not sig.signature_content.startswith('--'):
            sighandle.write('<br/>--<br/>')
        sighandle.write(tostring(root))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))
예제 #7
0
def parse_fragments(html_string, safe_tags=None, safe_attrs=None):
    """Yield the cleaned top-level fragments of *html_string*.

    Every fragment is passed through ``clean_fragment``; fragments for which
    it returns ``None`` are left out.
    """
    cleaned_fragments = (
        clean_fragment(fragment, safe_tags=safe_tags, safe_attrs=safe_attrs)
        for fragment in html.fragments_fromstring(html_string)
    )
    for cleaned in cleaned_fragments:
        if cleaned is None:
            continue
        yield cleaned
예제 #8
0
    def render(self, context):
        """Render the wrapped nodes as sanitized embed markup.

        The node list is rendered, entity-unescaped, sanitized and parsed
        into top-level fragments; ``iframe`` fragments go through
        ``self._process_iframe`` before everything is serialized again.

        :param context: template context passed to the node list
        :return: the serialized markup, or a ``<span class="plugin embed">``
            carrying an error message when processing fails
        """
        try:
            html = unescape_entities(self.nodelist.render(context))
            safe_html = self.sanitize(html)
            top_level_elements = fragments_fromstring(safe_html)
            # TODO: We need to remember to patch in whatever pre-save
            #       HTML processing we eventually do here, too.  E.g.
            #       a spam URL blacklist.
            out = []
            for elem in top_level_elements:
                if elem.tag == "iframe":
                    elem = self._process_iframe(elem)
                out.append(etree.tostring(elem, method="html", encoding="UTF-8"))
            return "".join(out)

        except IFrameSrcNotApproved:
            return (
                '<span class="plugin embed">'
                + _(
                    "The embedded URL is not on the list of approved providers.  "
                    "Contact the site administrator to add it."
                )
                + "</span>"
            )
        except Exception:
            # Any other failure degrades to a generic message. ``Exception``
            # instead of a bare ``except`` lets KeyboardInterrupt and
            # SystemExit propagate.
            return '<span class="plugin embed">' + _("Invalid embed code") + "</span>"
예제 #9
0
def replace(shape, spec, data):
    """Replace pattern matches in the runs of *shape*'s text frame.

    Each key of *spec* is compiled as a regular expression; each value is
    evaluated with ``expr(new, data)`` and parsed into HTML fragments. For
    every run, each pattern that matches the run's text splits the run into
    an optional prefix, one run per element produced by
    ``get_elements(tree, builder.A)`` and an optional suffix.

    :param shape: shape whose text frame is edited
    :param spec: dict of ``{old: new}`` text
    :param data: passed through to ``expr`` and to the ``run_methods`` handlers
    :raises ValueError: if *spec* is not a dict
    """
    if not isinstance(spec, dict):
        raise ValueError('replace: needs a dict of {old: new} text, not %r' %
                         spec)
    frame = get_text_frame(shape)

    def clone_run_after(run, paragraph, template_r):
        # Deep-copy the template XML run and splice the copy in after ``run``.
        duplicate = copy.deepcopy(template_r)
        run._r.addnext(duplicate)
        return _Run(duplicate, paragraph)

    patterns = {
        re.compile(old): fragments_fromstring(expr(new, data))
        for old, new in spec.items()
    }
    for paragraph in frame.paragraphs:
        for run in paragraph.runs:
            for pattern, tree in patterns.items():
                match = pattern.search(run.text)
                if not match:
                    continue
                template_r = run._r
                prefix = run.text[:match.start()]
                suffix = run.text[match.end():]
                if prefix:
                    run.text = prefix
                    run = clone_run_after(run, paragraph, template_r)
                for position, piece in enumerate(get_elements(tree, builder.A)):
                    # the first piece reuses the current run, later ones
                    # each get a fresh copy
                    if position > 0:
                        run = clone_run_after(run, paragraph, template_r)
                    run.text = piece.text
                    for attr, val in piece.attrib.items():
                        if attr in run_methods:
                            run_methods[attr](run, val, data)
                if suffix:
                    # Ensure suffix has same attrs as original text
                    run = clone_run_after(run, paragraph, template_r)
                    run.text = suffix
예제 #10
0
파일: tools.py 프로젝트: addventa/core-nlg
def handle_string_to_html(html_tag, text, *args, **kwargs):
    """
    Build an HTML tag with 'html_tag' and fill it with the content of 'text'.

    Plain string fragments of 'text' are concatenated into the text content of
    the new tag; HTML elements found in 'text' are appended as its children.

    :param html_tag: builder of the HTML tag
    :type html_tag: lxml.html.builder function

    :param text: the text of the HTML tag
    :type text: string

    :param args: the class of the HTML container
    :type args: None / string / list of strings

    :param kwargs: other attributes of the HTML container
    :type kwargs: None / string / list of strings

    :return: the new element, holding the text and the elements parsed from 'text'
    :return type: HTML element (lxml.html.HtmlElement)
    """

    container = html_tag(*args, **kwargs)
    container.text = ""
    for fragment in html.fragments_fromstring(text):
        if not isinstance(fragment, str):
            container.append(fragment)
            continue
        container.text += fragment
    return container
예제 #11
0
파일: plugins.py 프로젝트: samrose/sapling
def html_to_template_text(unsafe_html, context=None):
    """
    Sanitize *unsafe_html*, run the registered tag handlers over it and
    return the resulting template text.

    Handlers registered in ``tag_handlers`` for an element's tag are called
    in order with ``(element, context)``; a handler returning ``False``
    stops the remaining handlers for that element.
    """
    # TODO: factor out parsing/serializing
    fragments = fragments_fromstring(sanitize_intermediate(unsafe_html))

    # Wrap everything in a <div>; leading text (a plain string without a
    # ``tag`` attribute) becomes the wrapper's text.
    container = etree.Element('div')
    leading_text = bool(fragments) and not hasattr(fragments[0], 'tag')
    if leading_text:
        container.text = fragments.pop(0)
    container.extend(fragments)

    for _event, node in etree.iterwalk(container, events=('end',)):
        if node.tag in tag_handlers:
            for handler in tag_handlers[node.tag]:
                if handler(node, context) is False:
                    break

    template_bits = [etree.tostring(child, encoding='UTF-8')
                     for child in container]
    leading = [escape(container.text or '')]
    return sanitize_final(''.join(tag_imports + leading + template_bits))
예제 #12
0
파일: __init__.py 프로젝트: Fingel/potion
def insert_item(item):
    """Cache the images referenced by ``item.content`` and relink them.

    Each ``<img>`` whose cache file does not exist yet is downloaded into
    the configured cache directory and its ``src`` is pointed at the
    configured cache URL. ``item.content`` is then rebuilt from the
    processed fragments.

    :param item: object with an HTML ``content`` attribute
    :return: the same *item*, with ``content`` rewritten
    """
    frag = fragments_fromstring(item.content)
    for e in frag:
        # isinstance also accepts HtmlElement subclasses (form, input, ...)
        # that an exact type comparison skipped
        if not isinstance(e, HtmlElement):
            continue
        # '//img' is absolute: it returned every image of the whole parsed
        # document once per top-level fragment, so images already rewritten
        # were fetched and rewritten again. Stay inside this fragment.
        for img in e.xpath('descendant-or-self::img'):
            src = img.attrib['src']
            f = '/' + src.replace('http://', '').replace('/', '_').split('?')[0]
            # NOTE(review): an image that is already cached keeps its
            # original src -- confirm whether it should be relinked too.
            if exists(cfg.get('cache', 'dir') + f):
                continue
            try:
                urllib.urlretrieve(src, cfg.get('cache', 'dir') + f)
                img.attrib['src'] = cfg.get('cache', 'url') + f
            except Exception:
                # TODO
                # best effort: a failed download leaves the remote src as is
                pass

    parts = []
    for i in frag:
        if isinstance(i, HtmlElement):
            parts.append(unicode(tostring(i)))
        else:
            parts.append(i)

    item.content = ''.join(parts)
    return item
예제 #13
0
def parse_fragments(html_string, safe_tags=None, safe_attrs=None):
    """Yield the cleaned top-level fragments of the decoded *html_string*.

    The input goes through ``decode_html`` before parsing; fragments for
    which ``clean_fragment`` returns ``None`` are dropped.
    """
    fragments = html.fragments_fromstring(decode_html(html_string))
    for fragment in fragments:
        cleaned = clean_fragment(fragment, safe_tags=safe_tags,
                                 safe_attrs=safe_attrs)
        if cleaned is None:
            continue
        yield cleaned
예제 #14
0
    def process(self, text):
        """Rewrite math markup found in the text nodes of *text*.

        A text node changed by the ``inline_math`` / ``display_math``
        substitutions is replaced by the HTML fragments parsed from the
        substituted string.

        :param text: markup handed to ``lxml_tree.fromstring``
        :return: the parsed document after all replacements
        """
        doc = lxml_tree.fromstring(text)

        for node_text in doc.xpath('//text()'):
            substituted = display_math.sub(
                self._sub_display,
                inline_math.sub(self._sub_inline, node_text))
            if substituted == node_text:
                continue

            current = node_text.getparent()
            as_text = node_text.is_text

            # clear the slot the original string occupied
            if as_text:
                current.text = ''
            else:
                current.tail = ''

            for fragment in html.fragments_fromstring(substituted):
                is_element = isinstance(fragment, ElementBase)
                if is_element and as_text:
                    # first element of a ``.text`` replacement: first child
                    current.insert(0, fragment)
                elif is_element:
                    # otherwise place it right after ``current``
                    successor = current.getnext()
                    if successor is None:
                        current.getparent().append(fragment)
                    else:
                        successor.addprevious(fragment)
                elif as_text:
                    current.text += fragment
                else:
                    current.tail = fragment

                if is_element:
                    current = fragment
                    as_text = False

        return doc
예제 #15
0
def rich_text_to_elems(ar, description):
    """Turn *description* into a list of parsed HTML fragments.

    A description starting with ``<`` is memo-parsed and handed to the HTML
    fragment parser directly. Anything else is memo-parsed, converted with
    ``restify`` and then parsed; a failure of that last parse is re-raised
    as an ``Exception`` naming the offending markup.

    :param ar: object providing ``parse_memo``
    :param description: the raw description text
    :return: the result of ``lxml_html.fragments_fromstring``
    """
    already_markup = description.startswith("<")
    memo = ar.parse_memo(description)
    if already_markup:
        return lxml_html.fragments_fromstring(memo)

    html = restify(memo)
    try:
        return lxml_html.fragments_fromstring(html)
    except Exception as e:
        raise Exception("Could not parse {!r} : {}".format(html, e))
예제 #16
0
    def _generate(self, article, style=None, extra_css=None):
        """Build ``self.root``, an HTML document wrapping the article text.

        The longer of ``article.content`` and ``article.summary`` is parsed
        into fragments which are placed in the ``<div>`` of the new body.

        :param article: object providing ``title``, ``content``, ``summary``
        :param style: optional CSS text for a first ``<style>`` element
        :param extra_css: optional CSS text for a second ``<style>`` element
        """
        content = article.content or ''
        summary = article.summary or ''
        # ties go to the summary
        text = summary if len(content) <= len(summary) else content
        head = HEAD(TITLE(article.title))
        for css in (style, extra_css):
            if css:
                head.append(STYLE(css, type='text/css'))

        if isbytestring(text):
            text = text.decode('utf-8', 'replace')
        fragments = html.fragments_fromstring(text)
        self.root = HTML(head,
                BODY(H2(article.title), DIV()))
        target = self.root.find('body').find('div')
        if fragments and isinstance(fragments[0], unicode_type):
            # leading text becomes the text of the <div> itself
            target.text = fragments[0]
            fragments = list(fragments)[1:]
        for fragment in fragments:
            if hasattr(fragment, 'getparent'):
                # detach from the parse tree before re-homing
                fragment.getparent().remove(fragment)
                target.append(fragment)
            else:
                target.append(SPAN(fragment))
예제 #17
0
    def rows(self):
        """Values for the report table.

        The batch job software violates the principle of applying
        markup to information as far downstream as possible. As a
        result we have to jump through some hoops to extract what
        we need for the Last Message column.

        Returns a list with one row (a list of cell values) per job
        reported by ``getJobStatus``.
        """

        jobs = getJobStatus(self.id, self.name, self.age, self.status)
        rows = []
        Cell = self.Reporter.Cell
        B = self.HTMLPage.B
        for job in jobs:
            row = list(job)
            row[0] = Cell(row[0], classes="center")
            # columns 2 and 4 are cut to their first 19 characters
            if row[2]:
                row[2] = Cell(str(row[2])[:19], classes="nowrap")
            if row[4]:
                row[4] = Cell(str(row[4])[:19], classes="nowrap")
            if row[-1]:
                try:
                    node = html.fromstring(row[-1])
                    if node.tag == "errors":
                        errors = [child.text for child in node.findall("err")]
                        errors = "; ".join(errors)
                        row[-1] = Cell(errors, classes="error")
                    else:
                        row[-1] = Cell(node)
                except Exception:
                    # Fall back to wrapping the raw fragments in a <span>.
                    # ``Exception`` instead of a bare ``except`` keeps
                    # KeyboardInterrupt and SystemExit from being swallowed.
                    row[-1] = Cell(B.SPAN(*html.fragments_fromstring(row[-1])))
            rows.append(row)
        return rows
예제 #18
0
def html_to_template_text(unsafe_html):
    """
    Sanitize *unsafe_html*, apply the registered tag handlers and return the
    serialized template text.

    For each element whose tag is a key of ``tag_handlers``, the handlers are
    called in order with the element; a handler returning ``False`` stops
    the remaining handlers for that element.
    """
    safe_html = sanitize_intermediate(unsafe_html)
    pieces = fragments_fromstring(safe_html)

    # collect the top-level pieces under one <div>; a leading plain string
    # (no ``tag`` attribute) becomes the <div>'s text
    wrapper = etree.Element('div')
    if pieces and not hasattr(pieces[0], 'tag'):
        wrapper.text = pieces.pop(0)
    wrapper.extend(pieces)

    walker = etree.iterwalk(wrapper, events=('end', ))
    for _action, element in walker:
        if element.tag not in tag_handlers:
            continue
        for handler in tag_handlers[element.tag]:
            if handler(element) is False:
                break

    serialized = [
        etree.tostring(child, encoding='utf-8') for child in wrapper
    ]
    leading = [wrapper.text or '']
    return sanitize_final(''.join(tag_imports + leading + serialized))
예제 #19
0
def mark_images(html):
    """ Add the 'has-img' class to every paragraph that directly contains
    an img element and return the resulting markup.

    Falsy input is returned unchanged, and so is input whose first parsed
    fragment is a plain string rather than an element.

    """

    if not html:
        return html

    fragments = fragments_fromstring(html)

    if fragments:
        first = fragments[0]

        # lxml hands back leading text as a string, which has no xpath
        if not hasattr(first, 'xpath'):
            return html

        # the lookup is rooted at the document, so one query from the first
        # fragment reaches the paragraphs of all fragments
        for paragraph in first.xpath('//p[img]'):
            current = paragraph.attrib.get('class')
            if current is None:
                paragraph.attrib['class'] = 'has-img'
            else:
                paragraph.attrib['class'] = current + ' has-img'

    serialized = [etree.tostring(e).decode('utf-8') for e in fragments]
    return ''.join(serialized)
예제 #20
0
    def _generate(self, article, style=None, extra_css=None):
        """Build ``self.root``, an HTML document wrapping the article text.

        The longer of ``article.content`` and ``article.summary`` is parsed
        into fragments which are placed in the ``<div>`` of the new body.

        :param article: object providing ``title``, ``content``, ``summary``
        :param style: optional CSS text for a first ``<style>`` element
        :param extra_css: optional CSS text for a second ``<style>`` element
        """
        content = article.content or ''
        summary = article.summary or ''
        # ties go to the summary
        text = summary if len(content) <= len(summary) else content
        head = HEAD(TITLE(article.title))
        for css in (style, extra_css):
            if css:
                head.append(STYLE(css, type='text/css'))

        if isbytestring(text):
            text = text.decode('utf-8', 'replace')
        fragments = html.fragments_fromstring(text)
        self.root = HTML(head,
                BODY(H2(article.title), DIV()))
        target = self.root.find('body').find('div')
        if fragments and isinstance(fragments[0], unicode_type):
            # leading text becomes the text of the <div> itself
            target.text = fragments[0]
            fragments = list(fragments)[1:]
        for fragment in fragments:
            if hasattr(fragment, 'getparent'):
                # detach from the parse tree before re-homing
                fragment.getparent().remove(fragment)
                target.append(fragment)
            else:
                target.append(SPAN(fragment))
예제 #21
0
def prefix_chapter_title_with_chapter_no(html, chapter_no=None):
    """Return the html string with first header prefixed with
    chapter number.

    :param html: A string of html's chapter
    :param chapter_no: An int of chapter's number based on the order in
        config.yaml
    :return: A string of html in which the title already prefixed with chapter's
        number
    """
    # If there's no chapter_no as a prefix returns original html
    if not chapter_no:
        return html

    # Convert html fragments to node element
    nodes = fragments_fromstring(html)

    # Apply to first important header
    for node in nodes:
        if node.tag in ['h1', 'h2']:
            node.text = "%s %d %s %s" % (CHAPTER_NO_BEFORE, chapter_no,
                                         CHAPTER_NO_AFTER, node.text)
            break

    return ''.join([to_html_string(node) for node in nodes])
예제 #22
0
파일: scraper.py 프로젝트: a1646927/cfsresd
	def scrape(self):
		"""Poll the urgmsg API once and hand new pager messages to the handlers.

		A connection error is reported and ends the call. Otherwise
		``self.id_stamp`` is advanced and every returned fragment is wrapped
		in a ``PagerMessage`` and passed to each handler in
		``self.handlers``, oldest message first.
		"""
		try:
			resp = requests.get(API_URL, params={'f': self.id_stamp}).json()
		except requests.exceptions.ConnectionError as e:
			puts("Error encountered when connecting to urgmsg: ", newline=False)
			puts(colored.red(e.__class__.__name__), newline=False)
			# ``e.message`` exists only on Python 2 and need not be a string,
			# so concatenating it could itself raise; str(e) is always safe.
			puts(" " + str(e))
			return

		if not resp['updated']:
			return

		old_id_stamp = self.id_stamp
		self.id_stamp = resp['IDstamp']
		# if old_id_stamp is 0, this is the first scrape
		# which will return a whole bunch of recent past messages
		if not self.recent_messages and old_id_stamp == 0:
			return

		# Pager messages are returned newest to oldest, we want to
		# process them oldest to newest
		frags = lh.fragments_fromstring(resp['data'])[::-1]
		for frag in frags:
			msg = PagerMessage(frag)
			for handler in self.handlers:
				handler(msg)
예제 #23
0
def insert_item(item):
    """Cache the images referenced by ``item.content`` and relink them.

    Each ``<img>`` whose cache file does not exist yet is downloaded into
    the configured cache directory and its ``src`` is pointed at the
    configured cache URL. ``item.content`` is then rebuilt from the
    processed fragments.

    :param item: object with an HTML ``content`` attribute
    :return: the same *item*, with ``content`` rewritten
    """
    frag = fragments_fromstring(item.content)
    for e in frag:
        # isinstance also accepts HtmlElement subclasses (form, input, ...)
        # that an exact type comparison skipped
        if not isinstance(e, HtmlElement):
            continue
        # '//img' is absolute: it returned every image of the whole parsed
        # document once per top-level fragment, so images already rewritten
        # were fetched and rewritten again. Stay inside this fragment.
        for img in e.xpath('descendant-or-self::img'):
            src = img.attrib['src']
            f = '/' + src.replace('http://', '').replace(
                '/', '_').split('?')[0]
            # NOTE(review): an image that is already cached keeps its
            # original src -- confirm whether it should be relinked too.
            if exists(cfg.get('cache', 'dir') + f):
                continue
            try:
                urllib.urlretrieve(src, cfg.get('cache', 'dir') + f)
                img.attrib['src'] = cfg.get('cache', 'url') + f
            except Exception:
                # TODO
                # best effort: a failed download leaves the remote src as is
                pass

    parts = []
    for i in frag:
        if isinstance(i, HtmlElement):
            parts.append(unicode(tostring(i)))
        else:
            parts.append(i)

    item.content = ''.join(parts)
    return item
예제 #24
0
파일: ir_qweb.py 프로젝트: XiaodiJiang/odoo
    def _get_asset_content(self, xmlid, options):
        """Render the template *xmlid* and sort its top-level nodes.

        :param xmlid: identifier passed to ``ir.qweb``'s ``render``
        :param options: currently not consulted, see the note below
        :return: ``(files, remains)``. ``files`` is a list of dicts with the
            keys ``atype``, ``url``, ``filename``, ``content`` and ``media``
            for stylesheets and scripts; ``remains`` collects everything
            else (plain strings and serialized nodes).
        """
        # NOTE(review): the ``options`` argument is replaced by a copy of the
        # environment context before it is ever read, so caller-supplied
        # values are dropped -- confirm whether that is intended.
        options = dict(
            self.env.context,
            inherit_branding=False, inherit_branding_auto=False,
            edit_translations=False, translatable=False,
            rendering_bundle=True)

        env = self.env(context=options)

        # TODO: This helper can be used by any template that wants to embedd the backend.
        #       It is currently necessary because the ir.ui.view bundle inheritance does not
        #       match the module dependency graph.
        def get_modules_order():
            if not request:
                return '[]'
            from odoo.addons.web.controllers.main import module_boot
            return json.dumps(module_boot())
        template = env['ir.qweb'].render(xmlid, {"get_modules_order": get_modules_order})

        files = []
        remains = []
        for node in html.fragments_fromstring(template):
            if isinstance(node, basestring):
                remains.append(node)
                continue

            if not isinstance(node, html.HtmlElement):
                try:
                    remains.append(html.tostring(node))
                except Exception:
                    # notYETimplementederror
                    raise NotImplementedError
                continue

            href = node.get('href', '')
            src = node.get('src', '')
            atype = node.get('type')
            media = node.get('media')
            tag = node.tag

            # only links without a host part and outside /web/content qualify
            can_aggregate = not urlparse(href).netloc and not href.startswith('/web/content')
            is_stylesheet_link = tag == 'link' and node.get('rel') == 'stylesheet' and can_aggregate
            if tag == 'style' or is_stylesheet_link:
                # the file extension wins over the declared type
                if href.endswith('.sass'):
                    atype = 'text/sass'
                elif href.endswith('.less'):
                    atype = 'text/less'
                elif atype not in ('text/less', 'text/sass'):
                    atype = 'text/css'
                path = filter(None, href.split('/'))
                filename = get_resource_path(*path)
                files.append({'atype': atype, 'url': href, 'filename': filename, 'content': node.text, 'media': media})
            elif tag == 'script':
                atype = 'text/javascript'
                filename = None
                if src:
                    path = filter(None, src.split('/'))
                    filename = get_resource_path(*path)
                files.append({'atype': atype, 'url': src, 'filename': filename, 'content': node.text, 'media': media})
            else:
                remains.append(html.tostring(node))

        return (files, remains)
예제 #25
0
    def _get_asset_content(self, xmlid, options):
        """Render the template *xmlid* and sort its top-level nodes.

        :param xmlid: identifier passed to ``ir.qweb``'s ``render``
        :param options: currently not consulted, see the note below
        :return: ``(files, remains)``. ``files`` is a list of dicts with the
            keys ``atype``, ``url``, ``filename``, ``content`` and ``media``
            for stylesheets and scripts; ``remains`` collects everything
            else (plain strings and serialized nodes).
        """
        # NOTE(review): the ``options`` argument is replaced by a copy of the
        # environment context before it is ever read, so caller-supplied
        # values are dropped -- confirm whether that is intended.
        options = dict(
            self.env.context,
            inherit_branding=False, inherit_branding_auto=False,
            edit_translations=False, translatable=False,
            rendering_bundle=True)

        env = self.env(context=options)

        # TODO: This helper can be used by any template that wants to embedd the backend.
        #       It is currently necessary because the ir.ui.view bundle inheritance does not
        #       match the module dependency graph.
        def get_modules_order():
            if not request:
                return '[]'
            from odoo.addons.web.controllers.main import module_boot
            return json.dumps(module_boot())
        template = env['ir.qweb'].render(xmlid, {"get_modules_order": get_modules_order})

        files = []
        remains = []
        for node in html.fragments_fromstring(template):
            if isinstance(node, basestring):
                remains.append(node)
                continue

            if not isinstance(node, html.HtmlElement):
                try:
                    remains.append(html.tostring(node))
                except Exception:
                    # notYETimplementederror
                    raise NotImplementedError
                continue

            href = node.get('href', '')
            src = node.get('src', '')
            atype = node.get('type')
            media = node.get('media')
            tag = node.tag

            # only links without a host part and outside /web/content qualify
            can_aggregate = not urlparse(href).netloc and not href.startswith('/web/content')
            is_stylesheet_link = tag == 'link' and node.get('rel') == 'stylesheet' and can_aggregate
            if tag == 'style' or is_stylesheet_link:
                # the file extension wins over the declared type
                if href.endswith('.sass'):
                    atype = 'text/sass'
                elif href.endswith('.less'):
                    atype = 'text/less'
                elif atype not in ('text/less', 'text/sass'):
                    atype = 'text/css'
                path = filter(None, href.split('/'))
                filename = get_resource_path(*path)
                files.append({'atype': atype, 'url': href, 'filename': filename, 'content': node.text, 'media': media})
            elif tag == 'script':
                atype = 'text/javascript'
                filename = None
                if src:
                    path = filter(None, src.split('/'))
                    filename = get_resource_path(*path)
                files.append({'atype': atype, 'url': src, 'filename': filename, 'content': node.text, 'media': media})
            else:
                remains.append(html.tostring(node))

        return (files, remains)
예제 #26
0
    def process(self, body):
        """Apply the registered XPath templates to *body*.

        For every ``path -> (method, limit)`` entry of ``self.templates``,
        up to *limit* matching elements are each replaced by the fragments
        parsed from the output that ``method`` writes to a fresh stream.

        :param body: markup to transform
        :return: *body* unchanged when no templates are registered,
            otherwise the serialized, pretty-printed result
        :raises ImportError: if ``html`` (lxml.html) is ``None``
        :raises ValueError: if the root element would be replaced by text
        """
        if not self.templates:
            return body

        if html is None:
            raise ImportError("lxml.html")

        root = html.fromstring(body)
        for path, (method, limit) in self.templates.items():
            for element in root.xpath(path)[:limit]:
                out, write = generation.initialize_stream()
                def select(path):
                    return XPathResult(element.xpath(path))
                method(out, write, select)

                # replace element with fragments
                fragments = html.fragments_fromstring(out.getvalue())
                if element is root:
                    for fragment in fragments:
                        # ignore trivial fragments, if we're replacing
                        # the root node
                        if isinstance(fragment, basestring):
                            if fragment.strip('\n ') == "":
                                continue
                            raise ValueError(
                                "Must replace root with structural element.")

                        prev = root = fragment
                else:
                    # this node does have a parent; replace it with
                    # the fragments
                    tail = element.tail
                    prev = element.getprevious()
                    parent = element.getparent()

                    # Insert each element fragment one slot further right.
                    # Inserting all of them at ``index + 1`` reversed their
                    # order in the output.
                    position = parent.index(element) + 1
                    for fragment in fragments:
                        if isinstance(fragment, basestring):
                            # text attaches to whatever precedes it: the
                            # previous node's tail, or the parent's text
                            if prev is None:
                                parent.text = (parent.text or "") + fragment
                            else:
                                prev.tail = (prev.tail or "") + fragment
                            continue
                        parent.insert(position, fragment)
                        position += 1
                        prev = fragment

                    parent.remove(element)
                    # Re-attach the text that followed the removed element.
                    # It is appended: assigning it overwrote the tail of the
                    # last fragment (or of the previous sibling), and ``+=``
                    # failed when ``parent.text`` or ``tail`` was None.
                    if tail:
                        if prev is None:
                            parent.text = (parent.text or "") + tail
                        else:
                            prev.tail = (prev.tail or "") + tail

        return html.tostring(root, pretty_print=True, encoding=self.encoding)
예제 #27
0
def muddDinner():
    """Return the text of each entry in the Mudd menu list.

    Fetches the ASPC menu page and reads the children of the first ``<ul>``
    in the third ``<td>`` of the element with id ``mudd_menu``.
    """
    # NOTE(review): verify=False turns off TLS certificate verification.
    response = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    fragments = html.fragments_fromstring(response.text)
    menu_lists = fragments[1].xpath('//*[@id="mudd_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu_lists[0]]
예제 #28
0
def scrippsDinner():
    """Return the text of every entry in the Scripps menu list.

    The ASPC menu page is downloaded and parsed into fragments; the entries
    are the children of the first ``<ul>`` found under the third ``<td>`` of
    the element with id ``scripps_menu``.
    """
    menu_page = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    parsed = html.fragments_fromstring(menu_page.text)
    matches = parsed[1].xpath('//*[@id="scripps_menu"]/td[3]/ul')
    first_list = matches[0]
    return [item.text_content() for item in first_list]
예제 #29
0
def _clean_html(html_value, cleaner):
    """Yield the top-level fragments of *html_value* after cleaning.

    Each ``HtmlElement`` fragment is passed to *cleaner* and then yielded
    serialized as text (``encoding="unicode"``). Any other fragment (for
    example leading text) is yielded unchanged.
    """
    for piece in html.fragments_fromstring(html_value):
        if not isinstance(piece, html.HtmlElement):
            yield piece
            continue
        cleaner(piece)
        yield html.tostring(piece, encoding="unicode")
예제 #30
0
def oldenborgDinner():
    """Return the text of every entry in the Oldenborg menu list.

    Reads the children of the first ``<ul>`` located under the third
    ``<td>`` of the element with id ``oldenborg_menu`` on the ASPC menu page.
    """
    response = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    document = html.fragments_fromstring(response.text)[1]
    lists = document.xpath('//*[@id="oldenborg_menu"]/td[3]/ul')
    return [child.text_content() for child in lists[0]]
예제 #31
0
def cmcDinner():
    """Return the text of every entry in the CMC menu list.

    The entries are the children of the first ``<ul>`` matched below the
    third ``<td>`` of the element with id ``cmc_menu`` on the ASPC menu page.
    """
    page = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    fragments = html.fragments_fromstring(page.text)
    found = fragments[1].xpath('//*[@id="cmc_menu"]/td[3]/ul')
    entries = found[0]
    return [entry.text_content() for entry in entries]
예제 #32
0
def _clean_html(html_value, cleaner):
    """Yield the top-level fragments of *html_value* after cleaning.

    ``HtmlElement`` fragments are handed to *cleaner* and yielded as the
    result of ``html.tostring`` with its default arguments; every other
    fragment (such as leading text) is yielded as-is.

    NOTE(review): with the default encoding ``tostring`` presumably returns
    a byte string, so on Python 3 the output may mix bytes and str -- confirm
    this is what callers expect.
    """
    for node in html.fragments_fromstring(html_value):
        if isinstance(node, html.HtmlElement):
            cleaner(node)
            serialized = html.tostring(node)
            yield serialized
        else:
            yield node
예제 #33
0
def frankDinner():
    """Return the text of every entry in the Frank menu list.

    Downloads the ASPC menu page and collects the text of each child of the
    first ``<ul>`` under the third ``<td>`` of the ``frank_menu`` element.
    """
    menu_page = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    root = html.fragments_fromstring(menu_page.text)[1]
    menu_lists = root.xpath('//*[@id="frank_menu"]/td[3]/ul')
    return [node.text_content() for node in menu_lists[0]]
예제 #34
0
def pitzerDinner():
    """Return the text of every entry in the Pitzer menu list.

    Downloads the ASPC menu page and collects the text of each child of the
    first ``<ul>`` under the third ``<td>`` of the ``pitzer_menu`` element.
    """
    response = requests.get('https://aspc.pomona.edu/menu/', verify=False)
    parsed = html.fragments_fromstring(response.text)
    hits = parsed[1].xpath('//*[@id="pitzer_menu"]/td[3]/ul')
    menu = hits[0]
    return [line.text_content() for line in menu]
예제 #35
0
 def render(self, res):
     """Fill the skin with one entry per row of ``res['rows']`` and render it.

     For every row a copy of the template's ``faq/div`` element is filled
     from ``r['doc']`` (title, id, author, date, tags, body text). Rows
     without a ``'parent'`` key are appended to the first ``div`` of the
     template's middle section; rows with one are nested inside the most
     recent parent-less entry. Returns the result of ``self.skin.render()``.
     """
     self.fillTheTop()
     title = self.skin.root().xpath('//title')[0]
     title.text = self.title
     faqs = self.template.middle.find('div')
     # Most recent entry without a 'parent'; answers get nested inside it.
     current_record = None
     i = 0
     # Falls back to the page title unless a row supplies its own title.
     title_from_blog = self.title
     for r in res['rows']:
         i+=1
         # Work on a copy so the template element can be reused for each row.
         faq_viewlet = deepcopy(self.template.root().find('faq').find('div'))
         if 'title' in r['doc']:
             title_from_blog = r['doc']['title']
             ti = faq_viewlet.xpath('//h3')[0]
             ti.text = r['doc']['title']
             ti.set('style','display:block')
         # else:
         faq_viewlet.set('id',r['doc']['_id'])
         author = faq_viewlet.xpath('//span[@class="faqauthor"]')[0]
         # Prefer 'name' over 'author' when both could be present.
         if 'name' in r['doc']:
             author.text = r['doc']['name']
         else:
             author.text = r['doc']['author']
         _date = r['doc']['date']
         # reverse() works in place, so r['doc']['date'] itself is reversed
         # (presumably a list of strings -- it is joined with '.' below).
         _date.reverse()
         faq_viewlet.xpath('//span[@class="faqdate"]')[0].text = u'.'.join(_date)
         tag_container = faq_viewlet.xpath('//div[@class="faqtags"]')[0]
         # One link per tag, pointing at the tag-filtered blog listing.
         for tag in r['doc'].get('tags',[]):
             a = etree.Element('a')
             a.set('href','/blog?tag='+tag)
             a.text = tag
             tag_container.append(a)
         text_field = faq_viewlet.xpath('//div[@class="faqbody well"]')[0]
         text_field.text = ''
         # Only top-level 'blog' documents have their text parsed as HTML;
         # everything else is inserted as plain text.
         if r['doc']['type'] == 'blog' and 'parent' not in r['doc']:
             for el in html.fragments_fromstring(r['doc']['txt']):
                 _t = type(el)
                 # String fragments extend the body text; elements become children.
                 if  _t is unicode or _t is str:
                     text_field.text += el
                 else:
                     text_field.append(el)
         else:
             text_field.text = r['doc']['txt']
         if not 'parent' in r['doc']:
             faqs.append(faq_viewlet)
             current_record = faq_viewlet
         else:
             # NOTE(review): current_record is still None if a row with a
             # 'parent' comes before any parent-less row, and get('class')
             # returns None when the template div has no class -- confirm
             # the query ordering and template guarantee neither happens.
             faq_viewlet.set('class',faq_viewlet.get('class')+' fanswer')
             # this will remove the links from answers
             faq_viewlet.remove(faq_viewlet.xpath('//div[@class="faqlinks"]')[0])
             current_record.append(faq_viewlet)
     # With exactly one row, the page title becomes title_from_blog.
     if i==1:
         title.text = title_from_blog
     for el in self.template.top:
         self.skin.top.append(el)
     for el in self.template.middle:
         self.skin.middle.append(el)        
     return self.skin.render()
예제 #36
0
    def _render(self, template, values=None, **options):
        """ render(template, values, **options)

        Render the template specified by the given name.

        When the rendered markup contains ``data-pagebreak=``, it is parsed
        into fragments and every table holding a ``<tr data-pagebreak>`` row
        is split: the rows up to the marked row are moved into a new table
        inserted before the original one, followed by a ``<div>`` styled
        with ``page-break-after: always``.

        :param template: etree, xml_id, template name (see _get_template)
            * Call the method ``load`` is not an etree.
        :param dict values: template values to be used for rendering
        :param options: used to compile the template (the dict available for the rendering is frozen)
            * ``load`` (function) overrides the load method

        :returns: bytes marked as markup-safe (decode to :class:`markupsafe.Markup`
                  instead of `str`)
        :rtype: MarkupSafe
        """
        context = dict(self.env.context,
                       dev_mode='qweb' in tools.config['dev_mode'])
        context.update(options)

        result = super(IrQWeb, self)._render(template,
                                             values=values,
                                             **context)

        if not values or not values.get('__keep_empty_lines'):
            result = markupsafe.Markup(
                IrQWeb._empty_lines.sub('\n', result.strip()))

        if 'data-pagebreak=' not in result:
            return result

        fragments = html.fragments_fromstring(result)

        # NOTE(review): a non-blank leading text fragment would be a plain
        # string here and has no iterfind(); presumably rendered output always
        # starts with an element -- confirm.
        for fragment in fragments:
            for row in fragment.iterfind('.//tr[@data-pagebreak]'):
                table = next(row.iterancestors('table'))
                newtable = html.Element('table', attrib=dict(table.attrib))
                thead = table.find('thead')
                # Compare against None: truth-testing an lxml element checks
                # whether it has children and is deprecated.
                if thead is not None:
                    newtable.append(copy.deepcopy(thead))
                # TODO: copy caption & tfoot as well?
                # TODO: move rows in a tbody if row.getparent() is one?

                pos = row.get('data-pagebreak')
                assert pos in ('before', 'after')
                # Move every row preceding the marked one (and the marked row
                # itself for 'after') into the new table.
                for sibling in row.getparent().iterchildren('tr'):
                    if sibling is row:
                        if pos == 'after':
                            newtable.append(sibling)
                        break
                    newtable.append(sibling)

                table.addprevious(newtable)
                table.addprevious(
                    html.Element('div',
                                 attrib={'style': 'page-break-after: always'}))

        return markupsafe.Markup(''.join(
            html.tostring(f).decode() for f in fragments))
예제 #37
0
def checkHTML(doc):
    """Check *doc* as HTML fragments and collect the findings.

    Returns a ``ResultNode`` named ``"root"`` holding every child produced
    by ``treeCheck``; if lxml raises a ``ParseError`` the node's ``ok``
    attribute is set to ``False`` instead.
    """
    from lxml import html

    report = ResultNode("root")
    try:
        fragments = html.fragments_fromstring(doc)
        for finding in treeCheck(fragments, "root"):
            report.append_child(finding)
    except html.etree.ParseError:
        report.ok = False
    return report
예제 #38
0
파일: p3.py 프로젝트: sammyt/p3
        def _html(node, d, i):
            """Replace the child elements of *node* with parsed markup.

            The markup is the enclosing ``val``, or ``val(node, d, i)`` when
            ``val`` is callable. Existing children are removed and the parsed
            fragments appended. Leading text in the new markup becomes
            ``node.text``; when the markup has no leading text, ``node.text``
            is left untouched.
            """
            ml = val(node, d, i) if callable(val) else val
            frags = fragments_fromstring(ml)

            # Iterate over a snapshot so removing never disturbs the loop.
            for child in list(node):
                node.remove(child)

            for new in frags:
                if hasattr(new, 'tag'):
                    node.append(new)
                else:
                    # fragments_fromstring() returns leading text as a plain
                    # string, which append() rejects with a TypeError.
                    node.text = new
예제 #39
0
파일: blog.py 프로젝트: ahri/nodeblog
def post_node(title, datetime, content):
    """Build a post element from a copy of the ``POST`` template.

    Fills the first ``.title .text`` match with *title*, the first
    ``.datetime`` match with the formatted *datetime* (the day part comes
    from ``niceday``), and appends the cleaned, parsed *content* fragments to
    the first ``.content`` match. Returns the filled copy.
    """
    post = copy(POST)

    def first_match(selector):
        return CSSSelector(selector)(post)[0]

    first_match('.title .text').text = title
    stamp = datetime.strftime("%H:%M on %A the %%s of %B, %Y") % niceday(datetime)
    first_match('.datetime').text = stamp

    body = first_match('.content')
    cleaned = cleaner_trusted.clean_html(content)
    for piece in fragments_fromstring(cleaned):
        body.append(piece)
    return post
예제 #40
0
def stork_to_django(html):
    """Convert cleaned HTML with pseudo-block markers into a Django template.

    Carriage returns are dropped, each parsed fragment is cleaned and
    serialized, pseudo-blocks are rewritten to a temporary ``&& block``
    marker, every sequence listed in ``TEMPLATE_TAG_ESCAPES`` is replaced by
    numeric character references, and the temporary markers finally become
    ``{% block %}{% endblock %}`` tags.
    """
    without_cr = ''.join(ch for ch in html if ch != '\r')
    nodes = fragments_fromstring(without_cr)
    cleaned = ''.join(tostring(clean_html(node)) for node in nodes)
    templated = pseudoblock_re.sub(r'&& block \1 &&&& endblock &&', cleaned)
    for bit, _tag in TEMPLATE_TAG_ESCAPES:
        entity_form = ''.join('&#%d;' % ord(ch) for ch in bit)
        templated = templated.replace(bit, entity_form)
    return re.sub(r'&& block ([a-z]+) &&&& endblock &&',
                  r'{% block \1 %}{% endblock %}', templated)
def _fragments_from_string(html_string):
    """Parse *html_string* into a list of elements without leading text.

    Non-blank text preceding the first element is wrapped in a ``<p>``
    element that takes its place; blank leading text is dropped.

    :param html_string: markup to parse with ``html.fragments_fromstring``
    :returns: list of parsed fragments, possibly empty
    """
    from xml.sax.saxutils import escape

    fragments = html.fragments_fromstring(html_string)
    if not fragments:
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        leading_text = fragments[0]
        if leading_text.strip():
            if len(fragments) == 1:
                # The parser already decoded entities in the text, so it must
                # be escaped again before being re-parsed; otherwise escaped
                # markup such as "&lt;b&gt;" would turn into real elements.
                return html.fragments_fromstring(
                    '<p>%s</p>' % escape(leading_text))
            paragraph = _create_element('p')
            paragraph.text = leading_text
            fragments[1].addprevious(paragraph)
            fragments.insert(1, paragraph)

        # Drop the text item itself; its replacement (if any) is at index 1.
        fragments.pop(0)
    return fragments
예제 #42
0
    def _get_asset_content(self, xmlid, options):
        """Render the bundle template *xmlid* and sort its elements.

        :returns: ``(files, remains)``. ``files`` holds one dict per local
            stylesheet or script (``atype``, ``url``, ``filename``,
            ``content``, ``media``); ``remains`` holds a
            ``(tag, attributes, text)`` tuple for every other element.
            Fragments that are not ``HtmlElement`` instances are ignored.
        """
        render_options = dict(options)
        render_options.update(
            inherit_branding=False, inherit_branding_auto=False,
            edit_translations=False, translatable=False,
            rendering_bundle=True)
        render_options['website_id'] = self.env.context.get('website_id')
        qweb = self.env['ir.qweb'].with_context(render_options)

        def can_aggregate(url):
            # Only scheme-less, host-less urls outside /web/content qualify.
            parsed = urls.url_parse(url)
            return not (parsed.scheme or parsed.netloc
                        or url.startswith('/web/content'))

        # TODO: This helper can be used by any template that wants to embedd the backend.
        #       It is currently necessary because the ir.ui.view bundle inheritance does not
        #       match the module dependency graph.
        def get_modules_order():
            if not request:
                return '[]'
            from odoo.addons.web.controllers.main import module_boot
            return json.dumps(module_boot())

        def asset_entry(el, asset_type, url, media):
            # Resolve the url to a file on disk when it has any path segment.
            path = list(filter(None, url.split('/')))
            filename = get_resource_path(*path) if path else None
            return {'atype': asset_type, 'url': url, 'filename': filename,
                    'content': el.text, 'media': media}

        template = qweb.render(xmlid, {"get_modules_order": get_modules_order})

        preprocessed = (('.sass', 'text/sass'),
                        ('.scss', 'text/scss'),
                        ('.less', 'text/less'))
        files = []
        remains = []
        for el in html.fragments_fromstring(template):
            if not isinstance(el, html.HtmlElement):
                # the other cases are ignored
                continue

            href = el.get('href', '')
            src = el.get('src', '')
            atype = el.get('type')
            media = el.get('media')

            if can_aggregate(href) and (el.tag == 'style' or (el.tag == 'link' and el.get('rel') == 'stylesheet')):
                # The file extension wins over the declared type attribute.
                for suffix, mime in preprocessed:
                    if href.endswith(suffix):
                        atype = mime
                        break
                if atype not in ('text/less', 'text/scss', 'text/sass'):
                    atype = 'text/css'
                files.append(asset_entry(el, atype, href, media))
            elif can_aggregate(src) and el.tag == 'script':
                files.append(asset_entry(el, 'text/javascript', src, media))
            else:
                remains.append((el.tag, OrderedDict(el.attrib), el.text))

        return (files, remains)
예제 #43
0
    def render(self, id_or_xml_id, values=None, **options):
        """ render(id_or_xml_id, values, **options)

        Render the template specified by the given name.

        When the rendered bytes contain ``data-pagebreak=``, the result is
        parsed into fragments and every table holding a
        ``<tr data-pagebreak>`` row is split: the rows up to the marked row
        are moved into a new table inserted before the original one,
        followed by a ``<div>`` styled with ``page-break-after: always``.

        :param id_or_xml_id: name or etree (see get_template)
        :param dict values: template values to be used for rendering
        :param options: used to compile the template (the dict available for the rendering is frozen)
            * ``load`` (function) overrides the load method
            * ``profile`` (float) profile the rendering (use astor lib) (filter
              profile line with time ms >= profile)
        """
        for method in dir(self):
            if method.startswith('render_'):
                # Pass the argument lazily instead of pre-formatting it.
                _logger.warning("Unused method '%s' is found in ir.qweb.",
                                method)

        context = dict(self.env.context,
                       dev_mode='qweb' in tools.config['dev_mode'])
        context.update(options)

        result = super(IrQWeb, self).render(id_or_xml_id,
                                            values=values,
                                            **context)

        if b'data-pagebreak=' not in result:
            return result

        fragments = html.fragments_fromstring(result)

        for fragment in fragments:
            for row in fragment.iterfind('.//tr[@data-pagebreak]'):
                table = next(row.iterancestors('table'))
                newtable = html.Element('table', attrib=dict(table.attrib))
                thead = table.find('thead')
                # Compare against None: truth-testing an lxml element checks
                # whether it has children and is deprecated.
                if thead is not None:
                    newtable.append(copy.deepcopy(thead))
                # TODO: copy caption & tfoot as well?
                # TODO: move rows in a tbody if row.getparent() is one?

                pos = row.get('data-pagebreak')
                assert pos in ('before', 'after')
                # Move every row preceding the marked one (and the marked row
                # itself for 'after') into the new table.
                for sibling in row.getparent().iterchildren('tr'):
                    if sibling is row:
                        if pos == 'after':
                            newtable.append(sibling)
                        break
                    newtable.append(sibling)

                table.addprevious(newtable)
                table.addprevious(
                    html.Element('div',
                                 attrib={'style': 'page-break-after: always'}))

        return b''.join(html.tostring(f) for f in fragments)
예제 #44
0
def rich_text_to_elems(ar, description):
    """
    Parse the content of a RichTextField into a list of lxml fragments.

    A RichTextField can contain HTML markup or plain text. Text starting
    with "<" is run through ``ar.parse_memo`` and parsed directly; anything
    else additionally goes through ``restify`` first, and a parsing failure
    is re-raised as an ``Exception`` that includes the offending markup.
    """
    if not description.startswith("<"):
        markup = restify(ar.parse_memo(description))
        try:
            return lxml_html.fragments_fromstring(markup)
        except Exception as e:
            raise Exception(
                "Could not parse {!r} : {}".format(markup, e))
    return lxml_html.fragments_fromstring(ar.parse_memo(description))
예제 #45
0
def replace_tag_by_string(tag, string):
    """Replace the element *tag* with the markup parsed from *string*.

    Leading text of the parsed markup is handed to ``prepend_text``; the
    parsed elements are inserted into the parent right before *tag*, and
    finally *tag* is removed with ``drop_tree()``.
    """
    pieces = html.fragments_fromstring(string)
    container = tag.getparent()

    if pieces and isinstance(pieces[0], basestring):
        # append inserted fragment's text to previous element
        leading = pieces.pop(0)
        prepend_text(tag, leading)

    for piece in pieces:
        position = container.index(tag)
        container.insert(position, piece)

    tag.drop_tree()
예제 #46
0
파일: lxml_aloha.py 프로젝트: dmdm/PySite
 def parse_page(self, txt):
     """Parse *txt* into ``self.fragments`` and reset ``self.newfragments``.

     Both attributes are cleared first, so ``self.fragments`` stays ``None``
     if parsing raises.
     """
     self.fragments = self.newfragments = None
     # Deliberately parse into a list of fragments rather than one fragment:
     # parsing into a single fragment makes lxml build valid HTML by
     # - wrapping txt in a 'div' when it has no root element, and
     # - wrapping leading strings (not tags) in a 'p'.
     # The plural form keeps leading text as it is.
     parsed = html.fragments_fromstring(txt)
     self.fragments = parsed
예제 #47
0
def get_daily_task_answer() -> (str, str, str):
	"""Fetch the daily question and look up its answer.

	Returns ``(answer_id, form_hash, sec_hash)``. ``answer_id`` is the value
	of the matching answer option, or ``""`` when the question or its answer
	is unknown. All three are ``""`` when the response says the quiz was
	already taken today.
	"""
	global cookie_jar
	print("get question...")
	request_headers = {"User-Agent": user_agent, "Referer": referer}
	response = session.get(get_question_url, headers=request_headers)
	check_status_code(response, "get daily question")
	if "您今天已经参加过答题,明天再来吧!" in response.text:
		print("已答题")
		return "", "", ""
	cookie_jar.update(response.cookies)

	# The HTML payload is the first child of the first node of the XML reply.
	payload = xml.parseString(response.text).childNodes[0].childNodes[0].data
	form_root = html.fragments_fromstring(payload)[1]

	form_hash = form_root.cssselect('form input[name="formhash"]')[0].get("value")
	sec_hash = form_root.cssselect("form input[name='sechash']")[0].get("value")
	print(f"form hash: {form_hash}")

	question_text = form_root.cssselect("form div span font")[0].text_content()
	# Drop the leading "【问题】 " label, then the surrounding whitespace.
	question = question_text[5:].strip()
	print(f"question: {question}")

	# Map each option value to its label; the first two characters are cut
	# and whitespace stripped
	# (fix https://github.com/harryhare/1point3acres/issues/3).
	answers = {
		option.get("value"): option.getparent().text_content()[2:].strip()
		for option in form_root.cssselect("form div.qs_option input")
	}
	print(f"answers: {answers}")

	answer = ""
	answer_id = ""
	if question not in questions.questions.keys():
		print("question not found")
		return answer_id, form_hash, sec_hash

	answer = questions.questions[question]
	# A known answer is either one string or a list of accepted strings.
	accepts_several = type(answer) == list
	for option_value, option_text in answers.items():
		if accepts_several:
			matched = option_text in answer
		else:
			matched = option_text == answer
		if matched:
			print(f"find answer: {option_text} option value: {option_value} ")
			answer_id = option_value
	if answer_id == "":
		print(f"answer not found: {answer}")
	return answer_id, form_hash, sec_hash
예제 #48
0
파일: html.py 프로젝트: VinceRafale/tiger
def stork_to_django(html):
    """Turn cleaned HTML containing pseudo-blocks into Django template text.

    Steps: strip carriage returns, clean and serialize each parsed fragment,
    rewrite pseudo-blocks to a temporary ``&& block`` marker, replace every
    sequence from ``TEMPLATE_TAG_ESCAPES`` with numeric character references,
    then turn the temporary markers into ``{% block %}{% endblock %}``.
    """
    stripped = ''.join(char for char in html if char != '\r')
    serialized = [
        tostring(clean_html(fragment))
        for fragment in fragments_fromstring(stripped)
    ]
    result = pseudoblock_re.sub(r'&& block \1 &&&& endblock &&',
                                ''.join(serialized))
    for bit, _tag in TEMPLATE_TAG_ESCAPES:
        references = ['&#%d;' % ord(char) for char in bit]
        result = result.replace(bit, ''.join(references))
    result = re.sub(r'&& block ([a-z]+) &&&& endblock &&', r'{% block \1 %}{% endblock %}', result)
    return result
예제 #49
0
def first_paragraph(value):
    """Return the first top-level ``<p>`` of *value*, else its first block.

    The first parsed fragment whose tag is ``p`` has ``drop_tag()`` applied
    and is returned serialized with ``tostring``. Without such a fragment,
    *value* is split on runs of two or more line breaks and the first piece
    is returned.

    NOTE(review): ``drop_tag()`` runs before serialization and presumably
    moves the paragraph's children to its parent, so inline children may be
    missing from the result -- TODO confirm this is intended.
    """
    import re
    from lxml.html import fragments_fromstring, tostring

    for node in fragments_fromstring(value):
        # Text fragments are plain strings without a ``tag`` attribute.
        if getattr(node, 'tag', None) != 'p':
            continue
        node.drop_tag()
        return tostring(node)

    return re.split(r'[\r\n]{2,}', value)[0]
예제 #50
0
파일: ir_qweb.py 프로젝트: Tecnativa/odoo
    def render(self, id_or_xml_id, values=None, **options):
        """ render(id_or_xml_id, values, **options)

        Render the template specified by the given name.

        When the rendered bytes contain ``data-pagebreak=``, the result is
        parsed into fragments and every table holding a
        ``<tr data-pagebreak>`` row is split: the rows up to the marked row
        are moved into a new table inserted before the original one,
        followed by a ``<div>`` styled with ``page-break-after: always``.

        :param id_or_xml_id: name or etree (see get_template)
        :param dict values: template values to be used for rendering
        :param options: used to compile the template (the dict available for the rendering is frozen)
            * ``load`` (function) overrides the load method
            * ``profile`` (float) profile the rendering (use astor lib) (filter
              profile line with time ms >= profile)
        """
        for method in dir(self):
            if method.startswith('render_'):
                # Pass the argument lazily instead of pre-formatting it.
                _logger.warning("Unused method '%s' is found in ir.qweb.", method)

        context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
        context.update(options)

        result = super(IrQWeb, self).render(id_or_xml_id, values=values, **context)

        if b'data-pagebreak=' not in result:
            return result

        fragments = html.fragments_fromstring(result)

        for fragment in fragments:
            for row in fragment.iterfind('.//tr[@data-pagebreak]'):
                table = next(row.iterancestors('table'))
                newtable = html.Element('table', attrib=dict(table.attrib))
                thead = table.find('thead')
                # Compare against None: truth-testing an lxml element checks
                # whether it has children and is deprecated.
                if thead is not None:
                    newtable.append(copy.deepcopy(thead))
                # TODO: copy caption & tfoot as well?
                # TODO: move rows in a tbody if row.getparent() is one?

                pos = row.get('data-pagebreak')
                assert pos in ('before', 'after')
                # Move every row preceding the marked one (and the marked row
                # itself for 'after') into the new table.
                for sibling in row.getparent().iterchildren('tr'):
                    if sibling is row:
                        if pos == 'after':
                            newtable.append(sibling)
                        break
                    newtable.append(sibling)

                table.addprevious(newtable)
                table.addprevious(html.Element('div', attrib={
                    'style': 'page-break-after: always'
                }))

        return b''.join(html.tostring(f) for f in fragments)
예제 #51
0
    def has_iframe(self):
        """Return True when the ``embed`` field holds a top-level iframe.

        The field value is parsed into fragments and only those top-level
        fragments are inspected. An empty value yields ``False``.
        """
        field = self.getField('embed')
        value = field.getAccessor(self)()
        if not value:
            return False

        # A leading text fragment is a plain string without a ``tag``
        # attribute, hence getattr() rather than direct attribute access.
        return any(getattr(fragment, 'tag', None) == 'iframe'
                   for fragment in fragments_fromstring(value))
예제 #52
0
def summarize(string):
    """Summarize a bunch of html.

    What 'summarize' means in this case is to cherrypick the first top-level
    paragraph together with every top-level image tag that precedes it. The
    serialized pieces are joined with newlines; images are serialized with
    ``method='xml'``.
    """
    soup = html.fragments_fromstring(string)
    ret = []
    for tag in soup:
        # Leading text comes back as a plain string without a ``tag``
        # attribute; reading it via getattr() skips it instead of crashing.
        name = getattr(tag, 'tag', None)
        if name == 'img':
            ret.append(html.tostring(tag, method='xml'))
        elif name == 'p':
            ret.append(html.tostring(tag))
            break
    return '\n'.join(ret)