def format_head(node, path='', *args, **kwargs):
    """Append a charset <meta> and a stylesheet <link> to *node*.

    The stylesheet href depends on whether ``style.css`` exists three
    directory levels above *path* (i.e. how deeply the page is nested).
    """
    if isfile(os.path.join(path, '../../../style.css')):
        markup = """ <meta http-equiv="content-type" content="text/html; charset=UTF-8"> <link href="../../style.css" rel="stylesheet"> """
    else:
        markup = """ <meta http-equiv="content-type" content="text/html; charset=UTF-8"> <link href="../style.css" rel="stylesheet"> """
    for fragment in html.fragments_fromstring(markup):
        node.append(fragment)
    return node
def process(self, text):
    """Substitute inline/display math in every text node of *text*.

    Parses *text* into an lxml tree, applies the inline and display math
    regex substitutions to each text node, and splices the resulting
    HTML fragments back into the tree in place of the original text.
    """
    doc = lxml_tree.fromstring(text)
    for block in doc.xpath('//text()'):
        result = inline_math.sub(self._sub_inline, block)
        result = display_math.sub(self._sub_display, result)
        if result == block:
            continue  # no math marker in this text node; leave it alone
        last = block.getparent()
        # True when this node was element.text, False when it was a tail.
        into_text = block.is_text
        if into_text:
            last.text = ''
        else:
            last.tail = ''
        for item in html.fragments_fromstring(result):
            if isinstance(item, ElementBase):
                if into_text:
                    last.insert(0, item)
                else:
                    next = last.getnext()
                    if next is None:
                        last.getparent().append(item)
                    else:
                        next.addprevious(item)
                # Subsequent fragments attach after this element.
                last = item
                into_text = False
            elif into_text:
                last.text += item
            else:
                last.tail = item
    return doc
def verify_html(str_l):
    """Validate an uploaded HTML fragment against the expected structure.

    :param str_l: list; ``str_l[0]`` is the uploaded HTML fragment
        string, ``str_l[1]`` is an error message (if any).
    :raises ValueError: when the fragment is not a single container of
        ``h5`` headers and ``ul`` lists whose ``li`` items each hold one
        ``a`` element with exactly one safe ``href`` attribute.
    """
    container = list()
    child_cnt = list()
    cur = list()
    html = ht.fragments_fromstring(str_l[0])
    if len(html) != 1:
        raise ValueError
    html = html[0]
    container.append(html)
    child_cnt.append(1)
    cur.append(0)
    # Iterative depth-first walk: child_cnt holds the last valid child
    # index for each stacked container, cur the next index to visit.
    while (container):
        node = container[-1]
        cnt = child_cnt[-1]
        idx = cur[-1]
        if cnt >= idx:
            child = node[idx]
            if child.tag not in gc_verify_container_tag:
                raise ValueError
            c_cnt = len(child)
            if c_cnt > 1:
                if child.tag == 'ul':
                    container.append(child)
                    child_cnt.append(c_cnt - 1)
                    cur[-1] = idx + 1
                    cur.append(0)
                else:
                    raise ValueError
            else:
                if child.tag == "h5":
                    if len(child.attrib) or \
                            gc_escape_char.search(child.text):
                        raise ValueError
                elif child.tag == "li":
                    attr = child.attrib
                    child_href = child[0]
                    if len(attr) or child_href.tag != 'a':
                        raise ValueError
                    attr = child_href.attrib
                    # BUGFIX: dict.has_key() exists only on Python 2;
                    # the `in` operator is equivalent and portable.
                    if len(attr) != 1 or \
                            "href" not in attr or \
                            gc_escape_char.search(child_href.text) or \
                            not URL_CHECK(attr["href"]):
                        # not gc_url.match(attr["href"]):
                        raise ValueError
                else:
                    raise ValueError
                cur[-1] = idx + 1
        else:
            container.pop()
            child_cnt.pop()
            cur.pop()
def scrape(self):
    """Poll the urgmsg API for new pager messages and dispatch them.

    On a connection failure, waits ``self.timeout`` seconds and retries.
    """
    try:
        resp = requests.get(API_URL, params={'f': self.id_stamp}).json()
    except requests.exceptions.ConnectionError as e:
        puts("Error encountered when connecting to urgmsg: ", newline=False)
        # puts() terminates the line itself, replacing the Py2-only
        # `print ""` the original used.
        puts(colored.magenta(e.__class__.__name__))
        time.sleep(self.timeout)
        # BUGFIX: the original called the undefined global `scraper.run()`
        # and then fell through to use the unbound `resp`; retry on self
        # and return so `resp` is never read uninitialized.
        return self.scrape()
    if not resp['updated']:
        return
    old_id_stamp = self.id_stamp
    self.id_stamp = resp['IDstamp']
    # if old_id_stamp is 0, this is the first scrape
    # which will return a whole bunch of recent past messages
    if not self.recent_messages and old_id_stamp == 0:
        return
    # Pager messages are returned newest to oldest, we want to
    # process them oldest to newest
    frags = lh.fragments_fromstring(resp['data'])[::-1]
    for frag in frags:
        msg = PagerMessage(frag)
        for handler in self.handlers:
            handler(msg)
def write_html_sig(sigfile, sig, basedir, is_domain, logger):
    """Write an HTML signature to *sigfile*, inlining referenced images.

    Cleans the signature markup, rewrites local ``/settings/imgs/`` links
    to ``cid:`` references while writing the decoded images to disk, then
    serializes the signature (prefixed with ``--`` if not already).
    """
    cleaner = SignatureCleaner(style=True, remove_tags=UNCLEANTAGS,
                               safe_attrs_only=False)
    html = cleaner.clean_html(sig.signature_content)
    html = fragments_fromstring(html)[0]
    for element, attribute, link, pos in iterlinks(html):
        if link.startswith('/settings/imgs/'):
            view, args, kwargs = resolve(link)
            view = None
            args = None
            img = SignatureImg.objects.get(pk=kwargs['img_id'])
            if is_domain:
                imgfile = '%s/domains/imgs/%s' % (basedir, img.name)
            else:
                imgfile = '%s/users/imgs/%s' % (basedir, img.name)
            element.attrib['src'] = 'cid:%s' % img.name
            # BUGFIX: use a context manager so the image file is closed
            # even if decoding or writing fails.
            with open(imgfile, 'wb') as imghandle:
                imghandle.write(base64.decodestring(img.image))
            logger.info(_("Wrote img: %(img)s") % dict(img=imgfile))
            # update the sig with image obtained
            sig.image = img
    # `link` is only bound if the link loop matched at least once.
    if 'link' in locals():
        sig.save()
    # BUGFIX: the signature file handle was never closed; use `with`.
    with open(sigfile, 'w') as sighandle:
        if not sig.signature_content.startswith('--'):
            sighandle.write('<br/>--<br/>')
        sighandle.write(tostring(html))
    logger.info(_("Wrote html signature: %(sig)s") % dict(sig=sigfile))
def parse_fragments(html_string, safe_tags=None, safe_attrs=None):
    """Yield cleaned HTML fragments parsed from *html_string*.

    Fragments that clean to ``None`` are skipped.
    """
    fragments = html.fragments_fromstring(html_string)
    for fragment in fragments:
        cleaned = clean_fragment(fragment, safe_tags=safe_tags,
                                 safe_attrs=safe_attrs)
        if cleaned is not None:
            yield cleaned
def render(self, context):
    """Render the embed plugin: sanitize the markup and vet iframes.

    Returns an error span when the iframe src is not on the approved
    provider list, or when the embed code cannot be processed at all.
    """
    try:
        html = unescape_entities(self.nodelist.render(context))
        safe_html = self.sanitize(html)
        top_level_elements = fragments_fromstring(safe_html)
        # TODO: We need to remember to patch in whatever pre-save
        # HTML processing we eventually do here, too. E.g.
        # a spam URL blacklist.
        out = []
        for elem in top_level_elements:
            if elem.tag == "iframe":
                elem = self._process_iframe(elem)
            out.append(etree.tostring(elem, method="html", encoding="UTF-8"))
        return "".join(out)
    except IFrameSrcNotApproved:
        return (
            '<span class="plugin embed">'
            + _(
                "The embedded URL is not on the list of approved providers. "
                "Contact the site administrator to add it."
            )
            + "</span>"
        )
    # BUGFIX: a bare `except:` also traps SystemExit/KeyboardInterrupt;
    # only genuine processing errors should produce the fallback span.
    except Exception:
        return '<span class="plugin embed">' + _("Invalid embed code") + "</span>"
def replace(shape, spec, data):
    """Replace text runs in *shape* per the {old: new} patterns in *spec*.

    Each ``new`` value is evaluated via ``expr`` against *data* and parsed
    into HTML fragments; a matched run is split into prefix, replacement
    run(s), and suffix, all cloned from the original run so formatting is
    preserved. Run attributes on the replacement fragments are applied
    through ``run_methods``.
    """
    if not isinstance(spec, dict):
        raise ValueError('replace: needs a dict of {old: new} text, not %r' % spec)
    frame = get_text_frame(shape)

    def insert_run_after(r, p, original_r):
        # Clone the original run and attach the copy right after *r*.
        new_r = copy.deepcopy(original_r)
        r._r.addnext(new_r)
        return _Run(new_r, p)

    # Compile patterns once and pre-parse the replacement markup.
    spec = {
        re.compile(old): fragments_fromstring(expr(new, data))
        for old, new in spec.items()
    }
    for p in frame.paragraphs:
        for r in p.runs:
            for old, tree in spec.items():
                match = old.search(r.text)
                if match:
                    original_r, prefix, suffix = r._r, r.text[:match.start(
                    )], r.text[match.end():]
                    if prefix:
                        # Keep the text before the match in the current
                        # run, then continue in a fresh clone.
                        r.text = prefix
                        r = insert_run_after(r, p, original_r)
                    for j, run in enumerate(get_elements(tree, builder.A)):
                        # First fragment reuses the current run; later
                        # ones each get a new cloned run.
                        r = insert_run_after(r, p, original_r) if j > 0 else r
                        r.text = run.text
                        for attr, val in run.attrib.items():
                            if attr in run_methods:
                                run_methods[attr](r, val, data)
                    if suffix:
                        # Ensure suffix has same attrs as original text
                        r = insert_run_after(r, p, original_r)
                        r.text = suffix
def handle_string_to_html(html_tag, text, *args, **kwargs):
    """Build an element from *html_tag* and fill it with *text*.

    Element fragments found in *text* are appended as children; bare
    string fragments are accumulated into the element's text content.

    :param html_tag: builder of the HTML tag (lxml.html.builder function)
    :param text: the text of the HTML tag
    :param args: the class of the HTML container
    :param kwargs: other attributes of the HTML container
    :return: the populated element (lxml.html.HtmlElement)
    """
    element = html_tag(*args, **kwargs)
    element.text = ""
    for fragment in html.fragments_fromstring(text):
        if isinstance(fragment, str):
            element.text += fragment
        else:
            element.append(fragment)
    return element
def html_to_template_text(unsafe_html, context=None): """ Parse html and turn it into template text. """ # TODO: factor out parsing/serializing safe_html = sanitize_intermediate(unsafe_html) top_level_elements = fragments_fromstring(safe_html) # put top level elements in container container = etree.Element('div') if top_level_elements and not hasattr(top_level_elements[0], 'tag'): container.text = top_level_elements.pop(0) container.extend(top_level_elements) tree = etree.iterwalk(container, events=('end',)) # walk over all elements for action, elem in tree: if not elem.tag in tag_handlers: continue for handler in tag_handlers[elem.tag]: can_continue = handler(elem, context) if can_continue is False: break template_bits = [etree.tostring(elem, encoding='UTF-8') for elem in container] return sanitize_final(''.join(tag_imports + [escape(container.text or '')] + template_bits ) )
def insert_item(item):
    """Cache remote images referenced by *item* locally.

    Downloads each <img> in item.content into the cache directory,
    rewrites its src to the cache URL, and re-serializes the content.
    Download failures leave the original src in place (best effort).
    """
    frag = fragments_fromstring(item.content)
    for e in frag:
        # BUGFIX: use isinstance() so lxml HtmlElement subclasses
        # (forms, inputs, ...) are processed too; exact type comparison
        # silently skipped them.
        if not isinstance(e, HtmlElement):
            continue
        for i in e.xpath('//img'):
            # Flatten the image URL into a single cache filename.
            f = '/'+i.attrib['src'].replace('http://', '').replace('/','_').split('?')[0]
            if exists(cfg.get('cache', 'dir')+f):
                continue
            try:
                urllib.urlretrieve(i.attrib['src'], cfg.get('cache', 'dir')+f)
                i.attrib['src'] = cfg.get('cache', 'url')+f
            # Narrowed from a bare except; still best-effort by design.
            except Exception:  # TODO
                pass
    c = ''
    for i in frag:
        if isinstance(i, HtmlElement):
            c += unicode(tostring(i))
        else:
            c += i
    item.content = c
    return item
def parse_fragments(html_string, safe_tags=None, safe_attrs=None):
    """Decode *html_string*, parse it, and yield its cleaned fragments.

    Fragments that clean to ``None`` are dropped.
    """
    decoded = decode_html(html_string)
    for raw in html.fragments_fromstring(decoded):
        fragment = clean_fragment(raw, safe_tags=safe_tags,
                                  safe_attrs=safe_attrs)
        if fragment is not None:
            yield fragment
def rich_text_to_elems(ar, description):
    """Convert a rich-text *description* into a list of lxml elements.

    Input that starts with '<' is treated as HTML and parsed directly;
    anything else is rendered from reStructuredText to HTML first.
    """
    if description.startswith("<"):
        return lxml_html.fragments_fromstring(ar.parse_memo(description))
    html = restify(ar.parse_memo(description))
    try:
        return lxml_html.fragments_fromstring(html)
    except Exception as e:
        raise Exception("Could not parse {!r} : {}".format(html, e))
def _generate(self, article, style=None, extra_css=None):
    """Build an HTML document for *article* into ``self.root``.

    Uses whichever of article.content / article.summary is longer as the
    body text, placed under an H2 title inside a fresh HEAD/BODY tree
    with optional stylesheet blocks.
    """
    content = article.content if article.content else ''
    summary = article.summary if article.summary else ''
    # Prefer the longer of the two text sources.
    text = content if len(content) > len(summary) else summary
    head = HEAD(TITLE(article.title))
    if style:
        head.append(STYLE(style, type='text/css'))
    if extra_css:
        head.append(STYLE(extra_css, type='text/css'))
    if isbytestring(text):
        text = text.decode('utf-8', 'replace')
    elements = html.fragments_fromstring(text)
    self.root = HTML(head, BODY(H2(article.title), DIV()))
    div = self.root.find('body').find('div')
    if elements and isinstance(elements[0], unicode_type):
        # A leading bare-text fragment becomes the div's text.
        div.text = elements[0]
        elements = list(elements)[1:]
    for elem in elements:
        if hasattr(elem, 'getparent'):
            # Detach from the fragment tree before re-parenting.
            elem.getparent().remove(elem)
        else:
            # Bare strings get wrapped so they can be appended.
            elem = SPAN(elem)
        div.append(elem)
def rows(self):
    """Values for the report table.

    The batch job software violates the principle of applying markup to
    information as far downstream as possible. As a result we have to
    jump through some hoops to extract what we need for the Last
    Message column.
    """
    jobs = getJobStatus(self.id, self.name, self.age, self.status)
    rows = []
    Cell = self.Reporter.Cell
    B = self.HTMLPage.B
    for job in jobs:
        row = list(job)
        row[0] = Cell(row[0], classes="center")
        if row[2]:
            # Truncate timestamps to "YYYY-MM-DD HH:MM:SS".
            row[2] = Cell(str(row[2])[:19], classes="nowrap")
        if row[4]:
            row[4] = Cell(str(row[4])[:19], classes="nowrap")
        if row[-1]:
            try:
                node = html.fromstring(row[-1])
                if node.tag == "errors":
                    errors = [child.text for child in node.findall("err")]
                    errors = "; ".join(errors)
                    row[-1] = Cell(errors, classes="error")
                else:
                    row[-1] = Cell(node)
            # BUGFIX: narrowed from a bare `except:`, which would also
            # swallow SystemExit/KeyboardInterrupt; only parse/markup
            # failures should take the fragment fallback.
            except Exception:
                row[-1] = Cell(B.SPAN(*html.fragments_fromstring(row[-1])))
        rows.append(row)
    return rows
def html_to_template_text(unsafe_html):
    """
    Parse html and turn it into template text.

    Sanitizes *unsafe_html*, wraps top-level fragments in a <div>
    container, runs registered tag handlers over matching elements, then
    serializes everything back.
    """
    safe_html = sanitize_intermediate(unsafe_html)
    top_level_elements = fragments_fromstring(safe_html)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        # a leading bare-text fragment becomes the container's text
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)
    context = etree.iterwalk(container, events=('end', ))
    # walk over all elements
    for action, elem in context:
        if not elem.tag in tag_handlers:
            continue
        for handler in tag_handlers[elem.tag]:
            can_continue = handler(elem)
            # a handler returning False stops further handlers for this tag
            if can_continue is False:
                break
    template_bits = [
        etree.tostring(elem, encoding='utf-8') for elem in container
    ]
    # NOTE(review): unlike the variant with a context parameter, this one
    # does not escape container.text before joining — confirm intended.
    return sanitize_final(''.join(tag_imports + [container.text or ''] +
                                  template_bits))
def mark_images(html):
    """
    Takes the given html and marks every paragraph with an 'has-img'
    class, if the paragraph contains an img element.

    Returns the input unchanged when it is empty or when the first
    fragment is not parseable as an element.
    """
    if not html:
        return html
    fragments = fragments_fromstring(html)
    # we perform a root xpath lookup, which will result in all paragraphs
    # being looked at - so we don't need to loop over all elements (yah, it's
    # a bit weird)
    for element in fragments[:1]:
        # instead of failing, lxml will return strings instead of elements if
        # they can't be parsed.. so we have to inspect the objects
        if not hasattr(element, 'xpath'):
            return html
        for paragraph in element.xpath('//p[img]'):
            if 'class' in paragraph.attrib:
                paragraph.attrib['class'] += ' has-img'
            else:
                paragraph.attrib['class'] = 'has-img'
    # Serialize all fragments, including any the loop never touched.
    return ''.join(etree.tostring(e).decode('utf-8') for e in fragments)
def prefix_chapter_title_with_chapter_no(html, chapter_no=None):
    """Prefix the first h1/h2 header in *html* with the chapter number.

    :param html: A string of html's chapter
    :param chapter_no: An int of chapter's number based on the order in
        config.yaml; falsy values leave *html* untouched
    :return: A string of html in which the title is prefixed with the
        chapter's number
    """
    # No chapter number -> nothing to prefix.
    if not chapter_no:
        return html

    nodes = fragments_fromstring(html)
    for node in nodes:
        if node.tag not in ('h1', 'h2'):
            continue
        node.text = "%s %d %s %s" % (CHAPTER_NO_BEFORE, chapter_no,
                                     CHAPTER_NO_AFTER, node.text)
        break

    return ''.join(to_html_string(node) for node in nodes)
def scrape(self):
    """Poll the urgmsg API once and dispatch any new pager messages.

    Connection failures are reported and abort this poll cycle.
    """
    try:
        resp = requests.get(API_URL, params={'f': self.id_stamp}).json()
    except requests.exceptions.ConnectionError as e:
        puts("Error encountered when connecting to urgmsg: ", newline=False)
        puts(colored.red(e.__class__.__name__), newline=False)
        # BUGFIX: BaseException.message was deprecated in Python 2.6 and
        # removed in Python 3; str(e) works on both.
        puts(" " + str(e))
        return
    if not resp['updated']:
        return
    old_id_stamp = self.id_stamp
    self.id_stamp = resp['IDstamp']
    # if old_id_stamp is 0, this is the first scrape
    # which will return a whole bunch of recent past messages
    if not self.recent_messages and old_id_stamp == 0:
        return
    # Pager messages are returned newest to oldest, we want to
    # process them oldest to newest
    frags = lh.fragments_fromstring(resp['data'])[::-1]
    for frag in frags:
        msg = PagerMessage(frag)
        for handler in self.handlers:
            handler(msg)
def insert_item(item):
    """Download and cache the images referenced in *item*'s content.

    Each <img> src is fetched into the cache directory and rewritten to
    the cache URL; failures are ignored so the original src survives.
    The content is then re-serialized from the (possibly modified)
    fragments.
    """
    frag = fragments_fromstring(item.content)
    for e in frag:
        # BUGFIX: isinstance() instead of exact type comparison, so lxml
        # HtmlElement subclasses are not silently skipped.
        if not isinstance(e, HtmlElement):
            continue
        for i in e.xpath('//img'):
            # Flatten the URL into a single cache filename.
            f = '/' + i.attrib['src'].replace('http://', '').replace(
                '/', '_').split('?')[0]
            if exists(cfg.get('cache', 'dir') + f):
                continue
            try:
                urllib.urlretrieve(i.attrib['src'],
                                   cfg.get('cache', 'dir') + f)
                i.attrib['src'] = cfg.get('cache', 'url') + f
            # Narrowed from a bare except; download stays best-effort.
            except Exception:  # TODO
                pass
    c = ''
    for i in frag:
        if isinstance(i, HtmlElement):
            c += unicode(tostring(i))
        else:
            c += i
    item.content = c
    return item
def _get_asset_content(self, xmlid, options):
    """Render the asset bundle template *xmlid* and split its nodes.

    Returns ``(files, remains)``: *files* describes aggregatable
    stylesheet/script nodes (type, url, resolved filename, inline
    content, media); *remains* collects everything else serialized.
    """
    options = dict(self.env.context)
    options.update(options, inherit_branding=False, inherit_branding_auto=False,
                   edit_translations=False, translatable=False,
                   rendering_bundle=True)
    env = self.env(context=options)

    # TODO: This helper can be used by any template that wants to embedd the backend.
    # It is currently necessary because the ir.ui.view bundle inheritance does not
    # match the module dependency graph.
    def get_modules_order():
        if request:
            from odoo.addons.web.controllers.main import module_boot
            return json.dumps(module_boot())
        return '[]'

    template = env['ir.qweb'].render(xmlid, {"get_modules_order": get_modules_order})
    files = []
    remains = []
    for el in html.fragments_fromstring(template):
        if isinstance(el, basestring):
            # bare text between tags is passed through untouched
            remains.append(el)
        elif isinstance(el, html.HtmlElement):
            href = el.get('href', '')
            src = el.get('src', '')
            atype = el.get('type')
            media = el.get('media')
            # only relative, non-controller URLs can be bundled from disk
            can_aggregate = not urlparse(href).netloc and not href.startswith('/web/content')
            if el.tag == 'style' or (el.tag == 'link' and el.get('rel') == 'stylesheet' and can_aggregate):
                # infer the stylesheet dialect from the file extension
                if href.endswith('.sass'):
                    atype = 'text/sass'
                elif href.endswith('.less'):
                    atype = 'text/less'
                if atype not in ('text/less', 'text/sass'):
                    atype = 'text/css'
                path = filter(None, href.split('/'))
                filename = get_resource_path(*path)
                files.append({'atype': atype, 'url': href, 'filename': filename, 'content': el.text, 'media': media})
            elif el.tag == 'script':
                atype = 'text/javascript'
                if src:
                    path = filter(None, src.split('/'))
                    filename = get_resource_path(*path)
                else:
                    filename = None
                files.append({'atype': atype, 'url': src, 'filename': filename, 'content': el.text, 'media': media})
            else:
                remains.append(html.tostring(el))
        else:
            try:
                remains.append(html.tostring(el))
            except Exception:
                # notYETimplementederror
                raise NotImplementedError
    return (files, remains)
def process(self, body):
    """Apply the configured templates to *body* and return the result.

    For each (xpath -> (method, limit)) template, every matching element
    is rendered through *method* and replaced in place by the HTML
    fragments the method produced.
    """
    if not self.templates:
        return body
    if html is None:
        raise ImportError("lxml.html")
    root = html.fromstring(body)
    for path, (method, limit) in self.templates.items():
        for element in root.xpath(path)[:limit]:
            out, write = generation.initialize_stream()

            def select(path):
                return XPathResult(element.xpath(path))

            method(out, write, select)
            # replace element with fragments
            fragments = html.fragments_fromstring(out.getvalue())
            if element is root:
                for fragment in fragments:
                    # ignore trivial fragments, if we're replacing
                    # the root node
                    if isinstance(fragment, basestring):
                        if fragment.strip('\n ') == "":
                            continue
                        raise ValueError(
                            "Must replace root with structural element.")
                    prev = root = fragment
            else:
                # this node does have a parent; replace it with
                # the fragments
                tail = element.tail
                prev = element.getprevious()
                parent = element.getparent()
                index = parent.index(element)
                fragment = fragments[0]
                if isinstance(fragment, basestring):
                    # leading text attaches to the previous sibling's
                    # tail, or to the parent's text if we are first
                    if prev is None:
                        parent.text = (parent.text or "") + fragment
                    else:
                        prev.tail = (prev.tail or "") + fragment
                    fragments.pop(0)
                for fragment in fragments:
                    if isinstance(fragment, basestring):
                        assert prev is not None
                        prev.tail = (prev.tail or "") + fragment
                    else:
                        # NOTE(review): inserting repeatedly at index+1
                        # without advancing index — confirm the intended
                        # fragment ordering.
                        parent.insert(index+1, fragment)
                        prev = fragment
                parent.remove(element)
                # NOTE(review): `tail` and `parent.text` may be None here,
                # which would make `+=` raise TypeError — confirm inputs.
                if prev is None:
                    parent.text += tail
                else:
                    prev.tail = tail
    return html.tostring(root, pretty_print=True, encoding=self.encoding)
def muddDinner():
    """Return the list of dinner items on the Mudd menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="mudd_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def scrippsDinner():
    """Return the list of dinner items on the Scripps menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="scripps_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def _clean_html(html_value, cleaner):
    """Run *cleaner* over each element fragment of *html_value*.

    Yields cleaned fragments serialized as unicode; bare string
    fragments are yielded unchanged.
    """
    for fragment in html.fragments_fromstring(html_value):
        if not isinstance(fragment, html.HtmlElement):
            yield fragment
            continue
        cleaner(fragment)
        yield html.tostring(fragment, encoding="unicode")
def oldenborgDinner():
    """Return the list of dinner items on the Oldenborg menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="oldenborg_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def cmcDinner():
    """Return the list of dinner items on the CMC menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="cmc_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def _clean_html(html_value, cleaner):
    """Run *cleaner* over each element fragment of *html_value*.

    Yields cleaned fragments serialized via tostring(); bare string
    fragments are yielded unchanged.
    """
    for fragment in html.fragments_fromstring(html_value):
        if isinstance(fragment, html.HtmlElement):
            cleaner(fragment)
            yield html.tostring(fragment)
        else:
            yield fragment
def frankDinner():
    """Return the list of dinner items on the Frank menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="frank_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def pitzerDinner():
    """Return the list of dinner items on the Pitzer menu."""
    page = requests.get('https://aspc.pomona.edu/menu/',verify=False)
    tree = html.fragments_fromstring(page.text)
    menu = tree[1].xpath('//*[@id="pitzer_menu"]/td[3]/ul')
    return [entry.text_content() for entry in menu[0]]
def render(self, res):
    """Render the blog/FAQ page from the couch-style result rows *res*.

    Top-level docs become FAQ viewlets appended to the middle column;
    docs with a 'parent' key become answers nested under the most recent
    top-level viewlet. If only one row was rendered, the page title is
    taken from that post.
    """
    self.fillTheTop()
    title = self.skin.root().xpath('//title')[0]
    title.text = self.title
    faqs = self.template.middle.find('div')
    current_record = None
    i = 0
    title_from_blog = self.title
    for r in res['rows']:
        i += 1
        # Fresh copy of the viewlet template for every row.
        faq_viewlet = deepcopy(self.template.root().find('faq').find('div'))
        if 'title' in r['doc']:
            title_from_blog = r['doc']['title']
            ti = faq_viewlet.xpath('//h3')[0]
            ti.text = r['doc']['title']
            ti.set('style', 'display:block')
        # else:
        faq_viewlet.set('id', r['doc']['_id'])
        author = faq_viewlet.xpath('//span[@class="faqauthor"]')[0]
        if 'name' in r['doc']:
            author.text = r['doc']['name']
        else:
            author.text = r['doc']['author']
        # Dates are stored most-significant-last; reverse for display.
        _date = r['doc']['date']
        _date.reverse()
        faq_viewlet.xpath('//span[@class="faqdate"]')[0].text = u'.'.join(_date)
        tag_container = faq_viewlet.xpath('//div[@class="faqtags"]')[0]
        for tag in r['doc'].get('tags', []):
            a = etree.Element('a')
            a.set('href', '/blog?tag='+tag)
            a.text = tag
            tag_container.append(a)
        text_field = faq_viewlet.xpath('//div[@class="faqbody well"]')[0]
        text_field.text = ''
        if r['doc']['type'] == 'blog' and 'parent' not in r['doc']:
            # Blog bodies contain markup: splice parsed fragments in.
            for el in html.fragments_fromstring(r['doc']['txt']):
                _t = type(el)
                if _t is unicode or _t is str:
                    text_field.text += el
                else:
                    text_field.append(el)
        else:
            text_field.text = r['doc']['txt']
        if not 'parent' in r['doc']:
            faqs.append(faq_viewlet)
            current_record = faq_viewlet
        else:
            faq_viewlet.set('class', faq_viewlet.get('class')+' fanswer')
            # this will remove the links from answers
            faq_viewlet.remove(faq_viewlet.xpath('//div[@class="faqlinks"]')[0])
            current_record.append(faq_viewlet)
    if i == 1:
        title.text = title_from_blog
    for el in self.template.top:
        self.skin.top.append(el)
    for el in self.template.middle:
        self.skin.middle.append(el)
    return self.skin.render()
def _render(self, template, values=None, **options):
    """ render(template, values, **options)

    Render the template specified by the given name.

    :param template: etree, xml_id, template name (see _get_template)
        * Call the method ``load`` is not an etree.
    :param dict values: template values to be used for rendering
    :param options: used to compile the template (the dict available
        for the rendering is frozen)
        * ``load`` (function) overrides the load method
    :returns: bytes marked as markup-safe (decode to
        :class:`markupsafe.Markup` instead of `str`)
    :rtype: MarkupSafe
    """
    context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
    context.update(options)
    result = super(IrQWeb, self)._render(template, values=values, **context)
    if not values or not values.get('__keep_empty_lines'):
        result = markupsafe.Markup(
            IrQWeb._empty_lines.sub('\n', result.strip()))
    if 'data-pagebreak=' not in result:
        return result
    # Split tables at rows marked data-pagebreak="before"/"after".
    fragments = html.fragments_fromstring(result)
    for fragment in fragments:
        for row in fragment.iterfind('.//tr[@data-pagebreak]'):
            table = next(row.iterancestors('table'))
            newtable = html.Element('table', attrib=dict(table.attrib))
            thead = table.find('thead')
            # BUGFIX: lxml elements are falsy when they have no children,
            # so `if thead:` skipped an empty <thead>; test presence
            # explicitly with `is not None`.
            if thead is not None:
                newtable.append(copy.deepcopy(thead))
            # TODO: copy caption & tfoot as well?
            # TODO: move rows in a tbody if row.getparent() is one?
            pos = row.get('data-pagebreak')
            assert pos in ('before', 'after')
            # Move all rows up to (and for 'after', including) the marked
            # row into the new leading table.
            for sibling in row.getparent().iterchildren('tr'):
                if sibling is row:
                    if pos == 'after':
                        newtable.append(sibling)
                    break
                newtable.append(sibling)
            table.addprevious(newtable)
            table.addprevious(
                html.Element('div', attrib={'style': 'page-break-after: always'}))
    return markupsafe.Markup(''.join(
        html.tostring(f).decode() for f in fragments))
def checkHTML(doc):
    """Parse *doc* as HTML fragments and run treeCheck over the result.

    Returns a ResultNode tree; parse errors mark the root as not ok.
    """
    from lxml import html
    result = ResultNode("root")
    try:
        for child in treeCheck(html.fragments_fromstring(doc), "root"):
            result.append_child(child)
    except html.etree.ParseError:
        result.ok = False
    return result
def _html(node, d, i):
    """Replace *node*'s children with fragments parsed from the markup.

    The enclosing `val` may be a string or a callable of (node, d, i)
    that returns the markup string.
    """
    ml = val if not callable(val) else val(node, d, i)
    frags = fragments_fromstring(ml)
    # BUGFIX: removing children while iterating the live element skips
    # every other child; iterate a snapshot instead.
    for child in list(node):
        node.remove(child)
    for new in frags:
        node.append(new)
def post_node(title, datetime, content):
    """Build a blog-post element from the POST template.

    Fills in the title and timestamp, then appends the cleaned content
    fragments into the post's content area.
    """
    post = copy(POST)
    CSSSelector('.title .text')(post)[0].text = title
    stamp = datetime.strftime("%H:%M on %A the %%s of %B, %Y") % niceday(datetime)
    CSSSelector('.datetime')(post)[0].text = stamp
    body = CSSSelector('.content')(post)[0]
    for fragment in fragments_fromstring(cleaner_trusted.clean_html(content)):
        body.append(fragment)
    return post
def stork_to_django(html):
    """Convert stork markup into a Django template string.

    Strips carriage returns, cleans each fragment, turns pseudo-block
    markers into real {% block %} tags, and entity-escapes template-tag
    characters in between.
    """
    stripped = html.replace('\r', '')
    elements = fragments_fromstring(stripped)
    html = ''.join(tostring(clean_html(element)) for element in elements)
    with_blocks = pseudoblock_re.sub(r'&& block \1 &&&& endblock &&', html)
    for bit, tag in TEMPLATE_TAG_ESCAPES:
        escaped = ''.join('&#%d;' % ord(c) for c in bit)
        with_blocks = with_blocks.replace(bit, escaped)
    with_blocks = re.sub(r'&& block ([a-z]+) &&&& endblock &&',
                         r'{% block \1 %}{% endblock %}', with_blocks)
    return with_blocks
def _fragments_from_string(html_string):
    """Parse *html_string* into a list of element fragments only.

    A non-empty leading text node is converted to a <p> element (or the
    whole string re-parsed wrapped in <p> when it is the only fragment),
    so the result never starts with a bare string.
    """
    fragments = html.fragments_fromstring(html_string)
    if not len(fragments):
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        if len(fragments[0].strip()) > 0:
            if len(fragments) == 1:
                return html.fragments_fromstring('<p>%s</p>' % fragments[0])
            else:
                paragraph = _create_element('p')
                paragraph.text = fragments[0]
                fragments[1].addprevious(paragraph)
                fragments.insert(1, paragraph)
        # Drop the raw leading string (whitespace-only or now wrapped).
        fragments.pop(0)
        if not len(fragments):
            return []
    return fragments
def _get_asset_content(self, xmlid, options):
    """Render the asset bundle template *xmlid* and split its nodes.

    Returns ``(files, remains)``: *files* describes aggregatable
    stylesheet/script nodes (type, url, resolved filename, inline
    content, media); *remains* holds the other element nodes as
    (tag, attrib, text) tuples.
    """
    options = dict(options, inherit_branding=False, inherit_branding_auto=False,
                   edit_translations=False, translatable=False,
                   rendering_bundle=True)
    options['website_id'] = self.env.context.get('website_id')
    IrQweb = self.env['ir.qweb'].with_context(options)

    def can_aggregate(url):
        # only relative, non-controller URLs can be bundled from disk
        return not urls.url_parse(url).scheme and not urls.url_parse(url).netloc and not url.startswith('/web/content')

    # TODO: This helper can be used by any template that wants to embedd the backend.
    # It is currently necessary because the ir.ui.view bundle inheritance does not
    # match the module dependency graph.
    def get_modules_order():
        if request:
            from odoo.addons.web.controllers.main import module_boot
            return json.dumps(module_boot())
        return '[]'

    template = IrQweb.render(xmlid, {"get_modules_order": get_modules_order})
    files = []
    remains = []
    for el in html.fragments_fromstring(template):
        if isinstance(el, html.HtmlElement):
            href = el.get('href', '')
            src = el.get('src', '')
            atype = el.get('type')
            media = el.get('media')
            if can_aggregate(href) and (el.tag == 'style' or (el.tag == 'link' and el.get('rel') == 'stylesheet')):
                # infer the stylesheet dialect from the extension
                if href.endswith('.sass'):
                    atype = 'text/sass'
                elif href.endswith('.scss'):
                    atype = 'text/scss'
                elif href.endswith('.less'):
                    atype = 'text/less'
                if atype not in ('text/less', 'text/scss', 'text/sass'):
                    atype = 'text/css'
                path = [segment for segment in href.split('/') if segment]
                filename = get_resource_path(*path) if path else None
                files.append({'atype': atype, 'url': href, 'filename': filename, 'content': el.text, 'media': media})
            elif can_aggregate(src) and el.tag == 'script':
                atype = 'text/javascript'
                path = [segment for segment in src.split('/') if segment]
                filename = get_resource_path(*path) if path else None
                files.append({'atype': atype, 'url': src, 'filename': filename, 'content': el.text, 'media': media})
            else:
                remains.append((el.tag, OrderedDict(el.attrib), el.text))
        else:
            # the other cases are ignored
            pass
    return (files, remains)
def render(self, id_or_xml_id, values=None, **options):
    """ render(id_or_xml_id, values, **options)

    Render the template specified by the given name.

    :param id_or_xml_id: name or etree (see get_template)
    :param dict values: template values to be used for rendering
    :param options: used to compile the template (the dict available
        for the rendering is frozen)
        * ``load`` (function) overrides the load method
        * ``profile`` (float) profile the rendering (use astor lib)
          (filter profile line with time ms >= profile)
    """
    for method in dir(self):
        if method.startswith('render_'):
            _logger.warning("Unused method '%s' is found in ir.qweb." % method)
    context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
    context.update(options)
    result = super(IrQWeb, self).render(id_or_xml_id, values=values, **context)
    if b'data-pagebreak=' not in result:
        return result
    # Split tables at rows marked data-pagebreak="before"/"after".
    fragments = html.fragments_fromstring(result)
    for fragment in fragments:
        for row in fragment.iterfind('.//tr[@data-pagebreak]'):
            table = next(row.iterancestors('table'))
            newtable = html.Element('table', attrib=dict(table.attrib))
            thead = table.find('thead')
            # BUGFIX: lxml elements are falsy when childless, so
            # `if thead:` skipped an empty <thead>; test presence with
            # `is not None` instead.
            if thead is not None:
                newtable.append(copy.deepcopy(thead))
            # TODO: copy caption & tfoot as well?
            # TODO: move rows in a tbody if row.getparent() is one?
            pos = row.get('data-pagebreak')
            assert pos in ('before', 'after')
            # Move rows up to (and for 'after', including) the marked
            # row into the new leading table.
            for sibling in row.getparent().iterchildren('tr'):
                if sibling is row:
                    if pos == 'after':
                        newtable.append(sibling)
                    break
                newtable.append(sibling)
            table.addprevious(newtable)
            table.addprevious(
                html.Element('div', attrib={'style': 'page-break-after: always'}))
    return b''.join(html.tostring(f) for f in fragments)
def rich_text_to_elems(ar, description):
    """
    A RichTextField can contain HTML markup or plain text.

    Markup input (anything starting with '<') is parsed directly;
    plain text is rendered from reStructuredText to HTML first.
    Returns a list of lxml elements.
    """
    memo = ar.parse_memo(description)
    if description.startswith("<"):
        return lxml_html.fragments_fromstring(memo)
    html = restify(memo)
    try:
        return lxml_html.fragments_fromstring(html)
    except Exception as e:
        raise Exception(
            "Could not parse {!r} : {}".format(html, e))
def replace_tag_by_string(tag, string):
    """Replace *tag* in its tree with the fragments parsed from *string*.

    A leading bare-text fragment is attached as text before *tag*; the
    remaining element fragments are inserted in its place, and the tag
    itself is dropped (keeping its tail).
    """
    fragments = html.fragments_fromstring(string)
    parent = tag.getparent()
    if fragments and isinstance(fragments[0], basestring):
        # append inserted fragment's text to previous element
        prepend_text(tag, fragments.pop(0))
    for child in fragments:
        parent.insert(parent.index(tag), child)
    tag.drop_tree()
def parse_page(self, txt):
    """Parse *txt* into self.fragments, preserving leading text.

    fragments_fromstring() is used rather than a single-document parse
    because the latter coerces the input into valid HTML: content
    without a root element gets wrapped in a 'div', and leading text
    (not tags) gets wrapped in a 'p'. Parsing into fragments (plural)
    preserves the leading text as-is.
    """
    self.fragments = None
    self.newfragments = None
    self.fragments = html.fragments_fromstring(txt)
def get_daily_task_answer() -> (str, str, str):
    """Fetch today's daily question and look up its answer.

    :return: (answer option id, form hash, sec hash); all empty strings
        when today's question was already answered, and an empty answer
        id when the answer is unknown.
    """
    global cookie_jar
    print("get question...")
    header = {
        "User-Agent": user_agent,
        "Referer": referer
    }
    # response = requests.get(get_question_url, headers=header, cookies=cookie_jar)
    response = session.get(get_question_url, headers=header)
    check_status_code(response, "get daily question")
    if "您今天已经参加过答题,明天再来吧!" in response.text:
        print("已答题")
        return "", "", ""
    cookie_jar.update(response.cookies)
    dom = xml.parseString(response.text)
    data = dom.childNodes[0].childNodes[0].data
    nodes = html.fragments_fromstring(data)
    form_hash_node = nodes[1].cssselect('form input[name="formhash"]')[0]
    form_hash = form_hash_node.get("value")
    sec_hash_node = nodes[1].cssselect("form input[name='sechash']")[0]
    sec_hash = sec_hash_node.get("value")
    print(f"form hash: {form_hash}")
    question_node = nodes[1].cssselect("form div span font")[0]
    question = question_node.text_content()
    question = question[5:]  # drop the leading "【问题】 " marker
    question = question.strip()  # drop trailing whitespace
    print(f"question: {question}")
    answer_nodes = nodes[1].cssselect("form div.qs_option input")
    answers = {}
    for node in answer_nodes:
        id = node.get("value")
        text = node.getparent().text_content()
        # strip surrounding whitespace; fix https://github.com/harryhare/1point3acres/issues/3
        answers[id] = text[2:].strip()
    print(f"answers: {answers}")
    answer = ""
    answer_id = ""
    if question in questions.questions.keys():
        answer = questions.questions[question]
        # BUGFIX(idiom): isinstance() instead of exact type comparison.
        if isinstance(answer, list):
            for k in answers:
                if answers[k] in answer:
                    print(f"find answer: {answers[k]} option value: {k} ")
                    answer_id = k
        else:
            for k in answers:
                if answers[k] == answer:
                    print(f"find answer: {answers[k]} option value: {k} ")
                    answer_id = k
        if answer_id == "":
            print(f"answer not found: {answer}")
    else:
        print("question not found")
    return answer_id, form_hash, sec_hash
def stork_to_django(html):
    """Turn stork markup into a Django template string.

    Removes carriage returns, cleans each parsed fragment, converts
    pseudo-block markers into {% block %} tags, and entity-escapes any
    literal template-tag characters in between.
    """
    no_cr = ''.join(c for c in html if c != '\r')
    html = ''.join(
        tostring(clean_html(el)) for el in fragments_fromstring(no_cr))
    with_blocks = pseudoblock_re.sub(r'&& block \1 &&&& endblock &&', html)
    for bit, tag in TEMPLATE_TAG_ESCAPES:
        replacement = ''.join('&#%d;' % ord(c) for c in bit)
        with_blocks = with_blocks.replace(bit, replacement)
    return re.sub(r'&& block ([a-z]+) &&&& endblock &&',
                  r'{% block \1 %}{% endblock %}', with_blocks)
def first_paragraph(value):
    """Return the first <p> of *value* (tag dropped), or — when no
    paragraph element is present — the first blank-line-separated chunk
    of the raw text."""
    import re
    from lxml.html import fragments_fromstring, tostring
    for fragment in fragments_fromstring(value):
        if getattr(fragment, 'tag', None) != 'p':
            continue
        fragment.drop_tag()
        return tostring(fragment)
    return re.split(r'[\r\n]{2,}', value)[0]
def render(self, id_or_xml_id, values=None, **options):
    """ render(id_or_xml_id, values, **options)

    Render the template specified by the given name.

    :param id_or_xml_id: name or etree (see get_template)
    :param dict values: template values to be used for rendering
    :param options: used to compile the template (the dict available
        for the rendering is frozen)
        * ``load`` (function) overrides the load method
        * ``profile`` (float) profile the rendering (use astor lib)
          (filter profile line with time ms >= profile)
    """
    for method in dir(self):
        if method.startswith('render_'):
            _logger.warning("Unused method '%s' is found in ir.qweb." % method)
    context = dict(self.env.context, dev_mode='qweb' in tools.config['dev_mode'])
    context.update(options)
    result = super(IrQWeb, self).render(id_or_xml_id, values=values, **context)
    if b'data-pagebreak=' not in result:
        return result
    # Split tables at rows carrying a data-pagebreak marker.
    fragments = html.fragments_fromstring(result)
    for fragment in fragments:
        for row in fragment.iterfind('.//tr[@data-pagebreak]'):
            table = next(row.iterancestors('table'))
            newtable = html.Element('table', attrib=dict(table.attrib))
            thead = table.find('thead')
            # BUGFIX: `if thead:` relies on lxml element truthiness,
            # which is False for a childless element — an empty <thead>
            # was silently dropped; use an explicit presence test.
            if thead is not None:
                newtable.append(copy.deepcopy(thead))
            # TODO: copy caption & tfoot as well?
            # TODO: move rows in a tbody if row.getparent() is one?
            pos = row.get('data-pagebreak')
            assert pos in ('before', 'after')
            for sibling in row.getparent().iterchildren('tr'):
                if sibling is row:
                    if pos == 'after':
                        newtable.append(sibling)
                    break
                newtable.append(sibling)
            table.addprevious(newtable)
            table.addprevious(html.Element('div', attrib={
                'style': 'page-break-after: always'
            }))
    return b''.join(html.tostring(f) for f in fragments)
def has_iframe(self):
    """Return True when the 'embed' field value contains an <iframe>."""
    field = self.getField('embed')
    value = field.getAccessor(self)()
    if not value:
        return False
    fragments = fragments_fromstring(value)
    matches = [frag for frag in fragments if frag.tag == 'iframe']
    return bool(matches)
def summarize(string):
    """Summarize a bunch of html.

    What 'summarize' means in this case is to cherrypick the first
    paragraph as well as, if it precedes that tag, the first image tag.
    """
    parts = []
    for tag in html.fragments_fromstring(string):
        if tag.tag == 'img':
            parts.append(html.tostring(tag, method='xml'))
        elif tag.tag == 'p':
            parts.append(html.tostring(tag))
            break
    return '\n'.join(parts)