def format(self, article, subscriber, codes=None):
    """Build the NewsML G2 ``newsMessage`` document for an article.

    Packages are rendered as a ``packageItem`` with a group set, picture,
    audio and video items as a ``newsItem`` with a content set, and any
    other item as a ``newsItem`` whose content comes from the NITF formatter.

    :param dict article: article to format
    :param dict subscriber: subscriber the article is formatted for
    :param list codes: selector codes (not referenced in this method body)
    :return [(int, str)]: a list holding one tuple made of the publish
        sequence number and the formatted article string
    :raises FormatterError: if the formatter fails to format the article
    """
    try:
        self.subscriber = subscriber
        subscribers_service = superdesk.get_resource_service('subscribers')
        pub_seq_num = subscribers_service.generate_sequence_number(subscriber)
        package = self._is_package(article)

        news_message = etree.Element(
            'newsMessage',
            attrib=self._debug_message_extra,
            nsmap=self._message_nsmap,
        )
        self._format_header(article, news_message, pub_seq_num)
        item_set = self._format_item(news_message)

        if package:
            package_item = self._format_item_set(article, item_set, 'packageItem')
            self._format_groupset(article, package_item)
        elif article[ITEM_TYPE] in {CONTENT_TYPE.PICTURE, CONTENT_TYPE.AUDIO, CONTENT_TYPE.VIDEO}:
            media_item = self._format_item_set(article, item_set, 'newsItem')
            self._format_contentset(article, media_item)
        else:
            # Every other item type: the body is produced by the NITF formatter.
            nitf_formatter = NITFFormatter()
            nitf = nitf_formatter.get_nitf(article, subscriber, pub_seq_num)
            news_item = self._format_item_set(article, item_set, 'newsItem')
            self._format_content(article, news_item, nitf)

        # Applied to the whole message tree just before serialisation.
        sd_etree.fix_html_void_elements(news_message)
        document = self.XML_ROOT + etree.tostring(
            news_message, pretty_print=True
        ).decode('utf-8')
        return [(pub_seq_num, document)]
    except Exception as ex:
        raise FormatterError.newmsmlG2FormatterError(ex, subscriber)
def body_hook(self, item, html):
    """Copy content to body_html

    Line breaks are converted to paragraphs first (cf. SDTS-22). If img are
    found in the content, they are uploaded through ``self._add_image``:
    the image keyed ``featuremedia`` only gets its ``src`` rewritten, every
    other image is additionally wrapped in embed START/END comments.
    Shortcodes are then removed and the result is stored in
    ``item["body_html"]``.

    :param dict item: item being built, ``body_html`` is set on it
    :param str html: raw HTML content
    """
    # we need to convert CRLF to <p>
    # cf. SDTS-22
    # Carriage returns can arrive as the XML character reference "&#13;";
    # turn them back into real CRs so the CRLF split below sees them.
    # (Only the reference is replaced: ordinary spaces must stay untouched.)
    html = html.replace("&#13;", "\r")
    chunks = html.split("\r\n")
    if len(chunks) == 1 and "<p>" not in html:
        # No CRLF and no paragraph markup: fall back to bare LF separators.
        chunks = html.split("\n")
    if len(chunks) > 1:
        # Blank chunks are dropped; block elements are kept as they are,
        # everything else is wrapped in a paragraph.
        html = "".join([
            "<p>{}</p>".format(chunk) if not is_block_elem(chunk) else chunk
            for chunk in chunks
            if chunk.strip()
        ])

    if "img" in html:
        content = sd_etree.parse_html(html, "html")
        for img in content.xpath("//img"):
            try:
                src = self.check_url(img.get("src"))
            except ValueError:
                # Unusable URL: the image is left in place as it is.
                logger.warning("Can't fetch image: {elt}".format(
                    elt=sd_etree.to_string(img)))
                continue
            try:
                key, media_data = self._add_image(item, src)
            except Exception as e:
                # Best effort: an image that can't be added is dropped from
                # the content instead of failing the whole item.
                logger.error(e)
                img.getparent().remove(img)
                continue
            url = media_data["renditions"]["original"]["href"]
            img.set("src", url)
            if key == "featuremedia":
                # no need to embed the image for featuremedia
                continue
            embed_start = etree.Comment(embed_TPL.format("START", key))
            embed_end = etree.Comment(embed_TPL.format("END", key))
            img.addprevious(embed_start)
            img.addnext(embed_end)

        content = sd_etree.fix_html_void_elements(content)
        html = sd_etree.to_string(content, encoding="unicode", method="xml")

    html = remove_shortcodes(html)
    item["body_html"] = html
def body_hook(self, item, html):
    """Copy content to body_html

    Line breaks are converted to paragraphs first (cf. SDTS-22). If img are
    found in the content, they are uploaded through ``self._add_image``:
    the image keyed ``featuremedia`` only gets its ``src`` rewritten, every
    other image is additionally wrapped in embed START/END comments.
    The result is stored in ``item['body_html']``.

    :param dict item: item being built, ``body_html`` is set on it
    :param str html: raw HTML content
    """
    # we need to convert CRLF to <p>
    # cf. SDTS-22
    # Carriage returns can arrive as the XML character reference '&#13;';
    # turn them back into real CRs so the CRLF split below sees them.
    # (Only the reference is replaced: ordinary spaces must stay untouched.)
    html = html.replace('&#13;', '\r')
    chunks = html.split('\r\n')
    if len(chunks) == 1 and '<p>' not in html:
        # No CRLF and no paragraph markup: fall back to bare LF separators.
        chunks = html.split('\n')
    if len(chunks) > 1:
        # Blank chunks are dropped; block elements are kept as they are,
        # everything else is wrapped in a paragraph.
        html = ''.join([
            '<p>{}</p>'.format(chunk) if not is_block_elem(chunk) else chunk
            for chunk in chunks
            if chunk.strip()
        ])

    if "img" in html:
        content = sd_etree.parse_html(html, 'html')
        for img in content.xpath('//img'):
            try:
                src = self.check_url(img.get('src'))
            except ValueError:
                # Unusable URL: the image is left in place as it is.
                logger.warning("Can't fetch image: {elt}".format(
                    elt=sd_etree.to_string(img)))
                continue
            try:
                key, media_data = self._add_image(item, src)
            except Exception as e:
                # Best effort: an image that can't be added is dropped from
                # the content instead of failing the whole item.
                logger.error(e)
                img.getparent().remove(img)
                continue
            url = media_data['renditions']['original']['href']
            img.set("src", url)
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            embed_start = etree.Comment(embed_TPL.format('START', key))
            embed_end = etree.Comment(embed_TPL.format('END', key))
            img.addprevious(embed_start)
            img.addnext(embed_end)

        content = sd_etree.fix_html_void_elements(content)
        html = sd_etree.to_string(content, encoding="unicode", method='xml')

    item['body_html'] = html
def test_void_elements_fix(self):
    """Empty non-void elements get start/end tags, void ones stay self-closed.

    ``<h3/>`` and ``<em/>`` are expected to be serialised as start/end
    pairs after the fix, while ``<br/>`` keeps its self-closing form.
    """
    source = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
    tree = sd_etree.parse_html(source)
    # The return value is not used here; the parsed tree itself is serialised.
    sd_etree.fix_html_void_elements(tree)
    self.assertEqual(
        sd_etree.to_string(tree),
        '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>',
    )
def test_void_elements_fix(self):
    """Check the serialised form of empty elements after the void-element fix.

    Empty ``<h3/>`` and ``<em/>`` must come out as ``<h3></h3>`` and
    ``<em></em>``; the void ``<br/>`` must be left self-closing.
    """
    markup = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
    wanted = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'

    root = sd_etree.parse_html(markup)
    # The same tree object is serialised afterwards; the return value is ignored.
    sd_etree.fix_html_void_elements(root)
    serialised = sd_etree.to_string(root)

    self.assertEqual(serialised, wanted)