def test_clean_html(self):
    """Check which tags and attributes survive ``sd_etree.clean_html``.

    The fixture exercises four cases: a ``<header>`` and an unknown tag
    whose wrappers must disappear while their content stays, a ``class``
    attribute that must be stripped, and a ``<script>`` that must vanish
    together with its content.
    """

    def squash(markup):
        # Removed elements leave their surrounding indentation behind as
        # text/tail whitespace. That layout is incidental to what is being
        # verified, so every whitespace run is collapsed before comparing.
        return " ".join(markup.split())

    html_raw = dedent("""\
    <div>
        <header>this header must be removed</header>
        <p class="class_to_remove">
            <unknown_tag>bla
                <strong>keep it strong</strong>
            </unknown_tag>
            <script>no script here !</script>
        </p>
    </div>
    """)
    elem = html.fromstring(html_raw)
    elem = sd_etree.clean_html(elem)
    expected = """
    <div>
        this header must be removed
        <p>
            bla
            <strong>keep it strong</strong>
        </p>
    </div>
    """
    self.assertEqual(
        squash(etree.tostring(elem, encoding="unicode")),
        squash(expected),
    )
def parse_inline_content(self, tree, item, ns=NS['xhtml']):
    """Extract the inline content carried by ``tree``.

    NITF content: every child of ``nitf:body.content/nitf:block`` is cleaned
    with ``sd_etree.clean_html``, serialised as HTML and the pieces are
    joined with newlines.

    Any other content: each child of ``html/body`` that has leading text is
    rebuilt as ``<tag>text</tag>`` (only ``elem.text`` is used; nested markup
    is not copied). When nothing is produced that way, the body's own text
    is wrapped in ``<pre>`` and flagged as preformatted.

    :param tree: element holding the inline content; its ``contenttype``
        attribute selects the branch.
    :param item: not used by this implementation.
    :param ns: namespace used to look up the ``html`` and ``body`` elements.
    :return: dict with ``contenttype`` and, when something was found,
        ``content``; ``format`` is added only for the ``<pre>`` fallback.
    """
    if tree.get('contenttype') == NITF:
        try:
            body_content = tree.xpath('.//nitf:body.content/nitf:block/*', namespaces=NS)
        except AttributeError:
            # The lookup is best-effort: report empty content instead of failing.
            return {'contenttype': NITF, 'content': ''}
        elements = [
            etree.tostring(sd_etree.clean_html(e), encoding='unicode', method='html')
            for e in body_content
        ]
        return {'contenttype': NITF, 'content': '\n'.join(elements)}

    html = tree.find(self.qname('html', ns))
    body = html.find(self.qname('body', ns))

    # (tag, markup) for every child that contributes output, so that the
    # single-paragraph rule below inspects the element that was actually
    # emitted rather than whatever happens to be first in the body.
    fragments = []
    for elem in body:
        # Comments and processing instructions expose a non-string ``tag``
        # but do have ``text``; they are not content and must be skipped.
        if not isinstance(elem.tag, str) or not elem.text:
            continue
        # Strip the ``{namespace}`` prefix; un-namespaced tags are kept as is.
        tag = elem.tag.rsplit('}', 1)[-1]
        # NOTE(review): elem.text is inserted without escaping - confirm
        # that raw ``&``/``<`` in feed text is acceptable downstream.
        fragments.append((tag, '<%s>%s</%s>' % (tag, elem.text, tag)))
    elements = [markup for _, markup in fragments]

    # If there is a single p tag then replace the line feeds with breaks
    if len(fragments) == 1 and fragments[0][0] == 'p':
        elements[0] = elements[0].replace('\n ', '</p><p>').replace('\n', '<br/>')

    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if elements:
        content['content'] = "\n".join(elements)
    elif body.text:
        content['content'] = '<pre>' + body.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item):
    """Build the content dict from the ``html/body`` element of ``tree``.

    The body is passed through ``sd_etree.clean_html``; afterwards every
    ``<pre>`` is renamed to ``<p>`` and every ``<a>`` receives
    ``target="_blank"``. If the body has child elements they are serialised
    as HTML and joined with newlines; otherwise non-empty body text is
    wrapped in ``<p>`` and the format is set to ``xhtml/xml``. Finally
    ``<endash>-</endash>`` sequences in the content are replaced by ``-``.

    :param tree: element holding the inline content and the ``contenttype``
        attribute.
    :param item: not used by this implementation.
    :return: dict with ``contenttype`` and optionally ``content``/``format``.
    """
    html_node = tree.find(self.qname('html'))
    body = sd_etree.clean_html(html_node.find(self.qname('body')))

    # replace <pre> with <p>
    for preformatted in body.findall('.//pre'):
        preformatted.tag = 'p'

    # add target blank for all links
    for link in body.findall('.//a'):
        link.set('target', '_blank')

    result = {'contenttype': tree.attrib['contenttype']}
    if len(body):
        serialised = [
            sd_etree.to_string(child, encoding='unicode', method="html")
            for child in body
        ]
        result['content'] = '\n'.join(serialised)
    elif body.text:
        # NOTE(review): body.text is inserted without escaping - confirm
        # that this is what consumers of the content expect.
        result['content'] = '<p>' + body.text + '</p>'
        result['format'] = 'xhtml/xml'

    markup = result.get('content')
    if markup:
        result['content'] = markup.replace('<endash>-</endash>', '-')
    return result
def parse_inline_content(self, tree, item, ns=NS['xhtml']):
    """Extract the inline content carried by ``tree``.

    NITF content: every child of ``nitf:body.content/nitf:block`` is cleaned
    with ``sd_etree.clean_html``, serialised as HTML and the pieces are
    joined with newlines.

    Any other content: each child of ``html/body`` that has leading text is
    rebuilt as ``<tag>text</tag>`` (only ``elem.text`` is used; nested markup
    is not copied). When nothing is produced that way, the body's own text
    is wrapped in ``<pre>`` and flagged as preformatted.

    :param tree: element holding the inline content; its ``contenttype``
        attribute selects the branch.
    :param item: not used by this implementation.
    :param ns: namespace used to look up the ``html`` and ``body`` elements.
    :return: dict with ``contenttype`` and, when something was found,
        ``content``; ``format`` is added only for the ``<pre>`` fallback.
    """
    if tree.get('contenttype') == NITF:
        body_content = tree.xpath('.//nitf:body.content/nitf:block/*', namespaces=NS)
        elements = [
            etree.tostring(sd_etree.clean_html(e), encoding='unicode', method='html')
            for e in body_content
        ]
        content = {'contenttype': NITF, 'content': '\n'.join(elements)}
    else:
        html = tree.find(self.qname('html', ns))
        body = html.find(self.qname('body', ns))

        # (tag, markup) for every child that contributes output, so that the
        # single-paragraph rule below inspects the element that was actually
        # emitted rather than whatever happens to be first in the body.
        fragments = []
        for elem in body:
            # Comments and processing instructions expose a non-string
            # ``tag`` but do have ``text``; they are not content.
            if not isinstance(elem.tag, str) or not elem.text:
                continue
            # Strip the ``{namespace}`` prefix; un-namespaced tags are kept.
            tag = elem.tag.rsplit('}', 1)[-1]
            # NOTE(review): elem.text is inserted without escaping - confirm
            # that raw ``&``/``<`` in feed text is acceptable downstream.
            fragments.append((tag, '<%s>%s</%s>' % (tag, elem.text, tag)))
        elements = [markup for _, markup in fragments]

        # If there is a single p tag then replace the line feeds with breaks
        if len(fragments) == 1 and fragments[0][0] == 'p':
            elements[0] = elements[0].replace('\n ', '</p><p>').replace('\n', '<br/>')

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        if elements:
            content['content'] = "\n".join(elements)
        elif body.text:
            content['content'] = '<pre>' + body.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item):
    """Build the content dict from the ``html/body`` element of ``tree``.

    The body is passed through ``sd_etree.clean_html``. If it has child
    elements, each one is serialised as HTML and the results are joined with
    newlines. Otherwise non-empty body text is wrapped in ``<pre>`` and the
    format is set to ``CONTENT_TYPE.PREFORMATTED``.

    :param tree: element holding the inline content and the ``contenttype``
        attribute.
    :param item: not used by this implementation.
    :return: dict with ``contenttype`` and optionally ``content``/``format``.
    """
    html_node = tree.find(self.qname('html'))
    body = sd_etree.clean_html(html_node.find(self.qname('body')))

    result = {'contenttype': tree.attrib['contenttype']}

    if len(body):
        serialised = [
            sd_etree.to_string(child, encoding='unicode', method="html")
            for child in body
        ]
        result['content'] = '\n'.join(serialised)
        return result

    if body.text:
        # NOTE(review): body.text is inserted without escaping - confirm
        # that this is what consumers of the content expect.
        result['content'] = '<pre>' + body.text + '</pre>'
        result['format'] = CONTENT_TYPE.PREFORMATTED
    return result
def parse_inline_content(self, tree, item):
    """Build the content dict from the XHTML body found in ``tree``.

    The first ``section`` below ``body`` whose ``class`` contains ``main`` is
    preferred; when there is none the ``body`` element itself is used. The
    chosen element is passed through ``sd_etree.clean_html``. If it has child
    elements the cleaned element is serialised as HTML; otherwise non-empty
    text is wrapped in ``<pre>`` and the format is set to
    ``CONTENT_TYPE.PREFORMATTED``.

    :param tree: element holding the inline content and the ``contenttype``
        attribute.
    :param item: not used by this implementation.
    :return: dict with ``contenttype`` and optionally ``content``/``format``.
    :raises IndexError: when no ``xhtml:body`` element exists either.
    """
    main_sections = tree.xpath(
        '//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)
    if main_sections:
        source = main_sections[0]
    else:
        source = tree.xpath('//xhtml:body', namespaces=NS)[0]
    body = sd_etree.clean_html(source)

    result = {'contenttype': tree.attrib['contenttype']}
    if len(body):
        result['content'] = sd_etree.to_string(body, method="html")
    elif body.text:
        # NOTE(review): body.text is inserted without escaping - confirm
        # that this is what consumers of the content expect.
        result['content'] = '<pre>' + body.text + '</pre>'
        result['format'] = CONTENT_TYPE.PREFORMATTED
    return result
def parse_inline_content(self, tree, item, ns=NS["xhtml"]):
    """Extract the inline content carried by ``tree``.

    NITF content: every child of ``nitf:body.content/nitf:block`` is cleaned
    with ``sd_etree.clean_html``, serialised as HTML and the pieces are
    joined with newlines.

    Any other content: the ``html`` element is looked up in ``ns`` and, if
    absent, in the tree's default namespace. Each child of its ``body`` that
    has leading text is rebuilt as ``<tag>text</tag>`` (only ``elem.text`` is
    used; nested markup is not copied). When nothing is produced that way,
    the body's own text is wrapped in ``<pre>`` and flagged as preformatted.

    :param tree: element holding the inline content; its ``contenttype``
        attribute selects the branch.
    :param item: not used by this implementation.
    :param ns: namespace used to look up the ``html`` and ``body`` elements.
    :return: dict with ``contenttype`` and, when something was found,
        ``content``; ``format`` is added only for the ``<pre>`` fallback.
    """
    if tree.get("contenttype") == NITF:
        try:
            body_content = tree.xpath(".//nitf:body.content/nitf:block/*", namespaces=NS)
        except AttributeError:
            # The lookup is best-effort: report empty content instead of failing.
            return {"contenttype": NITF, "content": ""}
        elements = [
            etree.tostring(sd_etree.clean_html(e), encoding="unicode", method="html")
            for e in body_content
        ]
        return {"contenttype": NITF, "content": "\n".join(elements)}

    html = tree.find(self.qname("html", ns))
    if html is None:
        try:
            ns = tree.nsmap.get(None)  # fallback for missing xmlns
        except AttributeError:
            ns = None
        html = tree.find(self.qname("html", ns))
    body = html.find(self.qname("body", ns))

    # (tag, markup) for every child that contributes output, so that the
    # single-paragraph rule below inspects the element that was actually
    # emitted rather than whatever happens to be first in the body.
    fragments = []
    for elem in body:
        # Comments and processing instructions expose a non-string ``tag``
        # but do have ``text``; they are not content and must be skipped.
        if not isinstance(elem.tag, str) or not elem.text:
            continue
        tag = get_content_tag(elem)
        # NOTE(review): elem.text is inserted without escaping - confirm
        # that raw ``&``/``<`` in feed text is acceptable downstream.
        fragments.append((tag, "<%s>%s</%s>" % (tag, elem.text, tag)))
    elements = [markup for _, markup in fragments]

    # If there is a single p tag then replace the line feeds with breaks
    if len(fragments) == 1 and fragments[0][0] == "p":
        elements[0] = elements[0].replace("\n ", "</p><p>").replace("\n", "<br/>")

    content = dict()
    content["contenttype"] = tree.attrib["contenttype"]
    if elements:
        content["content"] = "\n".join(elements)
    elif body.text:
        content["content"] = "<pre>" + body.text + "</pre>"
        content["format"] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item, ns=NS['xhtml']):
    """Extract the inline content carried by ``tree``.

    NITF content: every child of ``nitf:body.content/nitf:block`` is cleaned
    with ``sd_etree.clean_html``, serialised as HTML and the pieces are
    joined with newlines.

    Any other content: the ``html`` element is looked up in ``ns`` and, if
    absent, in the tree's default namespace. Each child of its ``body`` that
    has leading text is rebuilt as ``<tag>text</tag>`` (only ``elem.text`` is
    used; nested markup is not copied). When nothing is produced that way,
    the body's own text is wrapped in ``<pre>`` and flagged as preformatted.

    :param tree: element holding the inline content; its ``contenttype``
        attribute selects the branch.
    :param item: not used by this implementation.
    :param ns: namespace used to look up the ``html`` and ``body`` elements.
    :return: dict with ``contenttype`` and, when something was found,
        ``content``; ``format`` is added only for the ``<pre>`` fallback.
    """
    if tree.get('contenttype') == NITF:
        try:
            body_content = tree.xpath('.//nitf:body.content/nitf:block/*', namespaces=NS)
        except AttributeError:
            # The lookup is best-effort: report empty content instead of failing.
            return {'contenttype': NITF, 'content': ''}
        elements = [
            etree.tostring(sd_etree.clean_html(e), encoding='unicode', method='html')
            for e in body_content
        ]
        return {'contenttype': NITF, 'content': '\n'.join(elements)}

    html = tree.find(self.qname('html', ns))
    if html is None:
        try:
            ns = tree.nsmap.get(None)  # fallback for missing xmlns
        except AttributeError:
            ns = None
        html = tree.find(self.qname('html', ns))
    body = html.find(self.qname('body', ns))

    # (tag, markup) for every child that contributes output, so that the
    # single-paragraph rule below inspects the element that was actually
    # emitted rather than whatever happens to be first in the body.
    fragments = []
    for elem in body:
        # Comments and processing instructions expose a non-string ``tag``
        # but do have ``text``; they are not content and must be skipped.
        if not isinstance(elem.tag, str) or not elem.text:
            continue
        tag = get_content_tag(elem)
        # NOTE(review): elem.text is inserted without escaping - confirm
        # that raw ``&``/``<`` in feed text is acceptable downstream.
        fragments.append((tag, '<%s>%s</%s>' % (tag, elem.text, tag)))
    elements = [markup for _, markup in fragments]

    # If there is a single p tag then replace the line feeds with breaks
    if len(fragments) == 1 and fragments[0][0] == 'p':
        elements[0] = elements[0].replace('\n ', '</p><p>').replace('\n', '<br/>')

    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if elements:
        content['content'] = "\n".join(elements)
    elif body.text:
        content['content'] = '<pre>' + body.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item):
    """Build the content dict from the ``html/body`` element of ``tree``.

    The body is passed through ``sd_etree.clean_html``. If it has child
    elements, each one is serialised as HTML and the results are joined with
    newlines. Otherwise non-empty body text is wrapped in ``<pre>`` and the
    format is set to ``CONTENT_TYPE.PREFORMATTED``. Finally
    ``<endash>-</endash>`` sequences in the content are replaced by ``-``.

    :param tree: element holding the inline content and the ``contenttype``
        attribute.
    :param item: not used by this implementation.
    :return: dict with ``contenttype`` and optionally ``content``/``format``.
    """
    html_node = tree.find(self.qname("html"))
    body = sd_etree.clean_html(html_node.find(self.qname("body")))

    result = {"contenttype": tree.attrib["contenttype"]}
    if len(body):
        serialised = [
            sd_etree.to_string(child, encoding="unicode", method="html")
            for child in body
        ]
        result["content"] = "\n".join(serialised)
    elif body.text:
        # NOTE(review): body.text is inserted without escaping - confirm
        # that this is what consumers of the content expect.
        result["content"] = "<pre>" + body.text + "</pre>"
        result["format"] = CONTENT_TYPE.PREFORMATTED

    markup = result.get("content")
    if markup:
        result["content"] = markup.replace("<endash>-</endash>", "-")
    return result
def parse_inline_content(self, tree, item):
    """Build the content dict from the XHTML body found in ``tree``.

    The first ``section`` below ``body`` whose ``class`` contains ``main`` is
    preferred; when there is none the ``body`` element itself is used. If
    ``self.item_tree`` holds an ``edNote`` with the notepad role, the
    children of its first ``section`` are appended to the chosen element.
    The result is passed through ``sd_etree.clean_html``. If it has child
    elements the cleaned element is serialised as HTML; otherwise non-empty
    text is wrapped in ``<pre>`` and the format is set to
    ``CONTENT_TYPE.PREFORMATTED``.

    :param tree: element holding the inline content and the ``contenttype``
        attribute.
    :param item: not used by this implementation.
    :return: dict with ``contenttype`` and optionally ``content``/``format``.
    :raises IndexError: when no ``xhtml:body`` element exists either.
    """
    main_sections = tree.xpath(
        '//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)
    if main_sections:
        source = main_sections[0]
    else:
        source = tree.xpath('//xhtml:body', namespaces=NS)[0]

    # A missing notepad is not an error: the body is simply used as is.
    notepad_sections = self.item_tree.xpath(
        './/iptc:edNote[@role="dpaednoterole:notepad"]//xhtml:section', namespaces=NS)
    if notepad_sections:
        for note_child in notepad_sections[0]:
            source.append(note_child)

    body = sd_etree.clean_html(source)

    result = {'contenttype': tree.attrib['contenttype']}
    if len(body):
        result['content'] = sd_etree.to_string(body, method="html")
    elif body.text:
        # NOTE(review): body.text is inserted without escaping - confirm
        # that this is what consumers of the content expect.
        result['content'] = '<pre>' + body.text + '</pre>'
        result['format'] = CONTENT_TYPE.PREFORMATTED
    return result