Example #1
    def test_encode_carriage_return(self):
        text = 'This is first line.\r\nThis is second line.\r\n'
        parsed = sd_etree.parse_html(text)
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))

        text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        parsed = sd_etree.parse_html(text, content='html')
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))
Example #2
    def test_encode_carriage_return(self):
        text = 'This is first line.\r\nThis is second line.\r\n'
        parsed = sd_etree.parse_html(text)
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))

        text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        parsed = sd_etree.parse_html(text, content='html')
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))
Example #3
    def test_encode_carriage_return(self):
        text = "This is first line.\r\nThis is second line.\r\n"
        parsed = sd_etree.parse_html(text)
        self.assertEqual(text.replace("\r", "&#13;"),
                         sd_etree.to_string(parsed))

        text = "<pre>This is first line.\r\nThis is second line.\r\n</pre>"
        parsed = sd_etree.parse_html(text, content="html")
        self.assertEqual(text.replace("\r", "&#13;"),
                         sd_etree.to_string(parsed))
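
All three variants above assert the same behaviour: a carriage return in the
parsed markup is serialized back as the &#13; character reference. A minimal
sketch, assuming superdesk.etree is importable as sd_etree:

from superdesk import etree as sd_etree

text = 'line one.\r\nline two.\r\n'
parsed = sd_etree.parse_html(text)  # content defaults to 'xml'
# each '\r' comes back as the '&#13;' character reference
assert sd_etree.to_string(parsed) == text.replace('\r', '&#13;')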
Example #4
    def test_encode_carriage_return(self):
        text = 'This is first line.\r\nThis is second line.\r\n'
        parsed = parse_html(text)
        self.assertEqual(text.replace('\r', '&#13;'), to_string(parsed))

        text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        parsed = parse_html(text, content='html')
        self.assertEqual(
            '<html><body>{}</body></html>'.format(text.replace('\r', '&#13;')),
            to_string(parsed))
Example #5
    def _format_body(self, formatted_article, main_news_component):
        """
        Create a body text NewsComponent element

        :param dict formatted_article:
        :param Element main_news_component:
        """
        content_item = SubElement(main_news_component, "ContentItem", attrib={'Duid': 'CI00001'})
        SubElement(content_item, 'MediaType', {'FormalName': 'Text'})
        SubElement(content_item, 'Format', {'FormalName': 'XHTML'})
        data_content = SubElement(content_item, 'DataContent')
        html = SubElement(data_content, 'html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'lang': 'en',
                                                        XML_LANG: formatted_article.get('language', 'en')})
        head = SubElement(html, 'head')
        SubElement(head, 'title')
        # Title has been removed to match the existing feed
        # title.text = formatted_article.get('headline', '')
        body = SubElement(html, 'body')

        if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
            body.append(etree.fromstring(formatted_article.get('body_html')))
        else:
            if formatted_article.get('byline'):
                body.append(etree.fromstring('<p>' + formatted_article.get('byline', '') + '</p>'))

            root = sd_etree.parse_html(self.append_body_footer(formatted_article), content='html')
            if formatted_article.get('dateline', {}).get('text') and not formatted_article.get('auto_publish', False):
                ptag = root.find('.//p')
                if ptag is not None:
                    ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')

            body_html = etree.tostring(root, encoding="unicode")
            body_html = body_html.replace('<p>', '__##br##__')
            body_html = body_html.replace('</p>', '__##br##__')
            body_html = body_html.replace('<br/>', '__##br##__')

            root = sd_etree.parse_html(body_html, content='html')
            body_html = etree.tostring(root, encoding="unicode", method="text")

            body_html = body_html.replace('\n', '__##br##__')
            list_paragraph = body_html.split('__##br##__')
            for p in list_paragraph:
                if p and p.strip():
                    body.append(etree.fromstring('<p>' + escape(p) + '</p>'))

            if SIGN_OFF in formatted_article:
                body.append(etree.fromstring(
                    '<p>' + formatted_article.get('source', '') + ' ' + formatted_article.get(SIGN_OFF, '') + '</p>'))
Example #6
    def map_html_to_xml(self, element, html):
        """
        Map the html text tags to xml

        :param etree.Element element: The xml element to populate
        :param str html: the html to parse the text from
        :return:
        """
        root = parse_html(html, content="html")
        # if there are no p tags, just br tags
        if not len(root.xpath("//p")) and len(root.xpath("//br")):
            para = etree.SubElement(element, "p")
            for br in root.xpath("//br"):
                etree.SubElement(para, "br").text = br.text

        for p in root.xpath("//p"):
            para = etree.SubElement(element, "p")
            if len(p.xpath(".//br")) > 0:
                for br in p.xpath(".//br"):
                    etree.SubElement(para, "br").text = br.text
            para.text = etree.tostring(p, encoding="unicode", method="text")

        # there are neither p tags nor br tags
        if len(list(element)) == 0:
            etree.SubElement(element,
                             "p").text = etree.tostring(root,
                                                        encoding="unicode",
                                                        method="text")
Example #7
def _yonhap_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    set_dateline(item, city, 'Yonhap')
                    break

        return item
    except:
        logging.exception('Yonhap dateline macro exception')
Example #8
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return

                    # there is already a dateline that is not Bangalore/BENGALURU; don't do anything, just return
                    if 'located' in (item.get('dateline') or {}) and \
                            item['dateline']['located'].get('city').upper() not in ['BANGALORE', 'BENGALURU']:
                        return

                    set_dateline(item, city, 'Reuters')
                    break

        return item
    except:
        logging.exception('Reuters dateline macro exception')
Example #9
def racing_reformat_macro(item, **kwargs):
    """Given a pre tagged content convert it to HTML

    :param item:
    :param kwargs:
    :return:
    """

    # If not preserved in the first place then don't do anything
    if item[FORMAT] != FORMATS.PRESERVED:
        return

    # Nothing to do!
    if 'body_html' not in item:
        return None

    root = sd_etree.parse_html(item['body_html'], content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")

    # Paragraphs created on new lines
    body_html = body_html.replace('\n', '__##br##__')
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph
                                if p and p.strip())

    # Ensure that the format is HTML
    item[FORMAT] = FORMATS.HTML
    return item
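
Roughly, the macro turns each non-empty line of a preserved-format item into
its own paragraph. An illustrative sketch (FORMAT and FORMATS come from
superdesk.metadata.item in the surrounding code; the item dict is made up):

item = {FORMAT: FORMATS.PRESERVED, 'body_html': 'Race 1 results\nRace 2 results\n'}
racing_reformat_macro(item)
# item['body_html'] == '<p>Race 1 results</p><p>Race 2 results</p>'
# item[FORMAT] == FORMATS.HTML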
Example #10
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
Example #11
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        if "img" in html:
            content = sd_etree.parse_html(html, 'html')
            for img in content.xpath('//img'):
                src = img.get('src')
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    img.getparent().remove(img)
                    continue
                _id = media_data['_id']
                url = url_for_media(_id)
                img.set("src", url)
                if key == 'featuremedia':
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format('START', key))
                embed_end = etree.Comment(embed_TPL.format('END', key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            html = etree.tostring(content, encoding="unicode")

        item['body_html'] = html
Example #12
    def _format_body_content(self, article, body_content):
        nitf_body = []

        if article.get('ednote'):
            nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))

        if article.get(BYLINE):
            nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))

        if article.get(FORMAT) == FORMATS.PRESERVED:
            nitf_body.append(to_ascii(get_text(self.append_body_footer(article), content='html')))
        else:
            body = article.get('body_html', '')
            # we need to inject the dateline
            if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                body_html_elem = parse_html(article.get('body_html'))
                ptag = body_html_elem.find('.//p')
                if ptag is not None:
                    ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                    body = to_string(body_html_elem)

            nitf_body.append(self.get_text_content(body))
            if article.get('body_footer'):
                nitf_body.append(self.get_text_content(article.get('body_footer', '')))

        sign_off = '{} {}'.format(article.get('source') or '', (article.get('sign_off') or '')).strip()
        if sign_off:
            nitf_body.append(to_ascii(self._format_line(sign_off)))

        SubElement(body_content, 'pre').text = ''.join(nitf_body)
Example #13
def get_text(markup,
             content="xml",
             lf_on_block=False,
             space_on_elements=False,
             space=" "):
    """Get plain text version of (X)HTML or other XML element

    if the markup can't be parsed, it will be returned unchanged
    :param str markup: string to convert to plain text
    :param str content: 'xml' or 'html', as in parse_html
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail
        mainly used to count words with non HTML markup
    :param str space: space string which is used when `space_on_elements` is enabled
    :return str: plain text version of markup
    """
    try:
        root = sd_etree.parse_html(markup,
                                   content=content,
                                   lf_on_block=lf_on_block,
                                   space_on_elements=space_on_elements,
                                   space=space)
        text = etree.tostring(root, encoding="unicode", method="text")
        return text
    except etree.ParseError:
        return markup
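
Illustrative calls (a sketch; exact whitespace in the output depends on the
parser and the options used):

get_text('<p>Hello <b>world</b></p>', content='html')
# -> 'Hello world'
get_text('<p>One</p><p>Two</p>', content='html', lf_on_block=True)
# -> 'One\nTwo\n', with a line feed appended after each block element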
Example #14
    def get_wrapped_text_content(self, content):
        """Get a version of the body text that is wrapped
        :param content:
        :return:
        """
        text = ''
        content = content.replace('<br>', '<br/>').replace('</br>', '')
        # remove control chars except \r and \n
        content = re.sub('[\x00-\x09\x0b\x0c\x0f-\x1f]', '', content)
        # Special case x0e denotes a line break
        content = re.sub('\x0e', '\r\n', content)
        # remove runs of spaces and stray line feeds
        content = re.sub(r' +', ' ', re.sub(r'(?<!\r)\n+', ' ', content).strip())

        parsed = parse_html(content, content='html')

        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)

        for tag in parsed.xpath('/html/div/child::*'):
            ptext = ''
            for x in tag.itertext():
                ptext += x
            text += self.format_wrapped_text_content(ptext)

        return text
Example #15
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
Example #16
    def map_html_to_xml(self, element, html):
        """
        Map the html text tags to xml

        :param etree.Element element: The xml element to populate
        :param str html: the html to parse the text from
        :return:
        """
        root = parse_html(html, content='html')
        # if there are no p tags, just br tags
        if not len(root.xpath('//p')) and len(root.xpath('//br')):
            para = etree.SubElement(element, 'p')
            for br in root.xpath('//br'):
                etree.SubElement(para, 'br').text = br.text

        for p in root.xpath('//p'):
            para = etree.SubElement(element, 'p')
            if len(p.xpath('.//br')) > 0:
                for br in p.xpath('.//br'):
                    etree.SubElement(para, 'br').text = br.text
            para.text = etree.tostring(p, encoding="unicode", method="text")

        # there are neither p tags nor br tags
        if len(list(element)) == 0:
            etree.SubElement(element, 'p').text = etree.tostring(root, encoding="unicode", method="text")
Example #17
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article["body_html"] = article["body_html"].replace("<br>", "<br/>")
        except KeyError:
            pass

        body = ""
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get("body_html", "")
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get("description", "")

        if body and article.get(FORMAT, "") == FORMATS.PRESERVED:
            body = body.replace("\n", "\r\n").replace("\r\r", "\r")
            parsed = parse_html(body, content="html")

            for br in parsed.xpath("//br"):
                br.tail = "\r\n" + br.tail if br.tail else "\r\n"

            etree.strip_elements(parsed, "br", with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get("body_footer"):
            footer = article.get("body_footer")
            if article.get(FORMAT, "") == FORMATS.PRESERVED:
                body = "{}\r\n{}".format(body, get_text(footer))
            else:
                body = "{}{}".format(body, footer)
        return body
Example #18
 def _inject_dateline(self, formatted_article):
     """Inject dateline in article's body_html"""
     body_html_elem = sd_etree.parse_html(formatted_article.get('body_html', '<p> </p>'))
     ptag = body_html_elem.find('.//p')
     if ptag is not None:
         ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
         formatted_article['body_html'] = sd_etree.to_string(body_html_elem)
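
A sketch of the expected effect (formatter is a hypothetical instance and the
dateline text is made up):

article = {
    'body_html': '<p>The market rallied.</p>',
    'dateline': {'text': 'SYDNEY, May 1 AAP -'},
}
formatter._inject_dateline(article)
# article['body_html'] == '<p>SYDNEY, May 1 AAP - The market rallied.</p>'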
Example #19
def racing_reformat_macro(item, **kwargs):
    """Given a pre tagged content convert it to HTML

    :param item:
    :param kwargs:
    :return:
    """

    # If not preserved in the first place then don't do anything
    if item[FORMAT] != FORMATS.PRESERVED:
        return

    # Nothing to do!
    if 'body_html' not in item:
        return None

    root = sd_etree.parse_html(item['body_html'], content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")

    # Paragraphs created on new lines
    body_html = body_html.replace('\n', '__##br##__')
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph if p and p.strip())

    # Ensure that the format is HTML
    item[FORMAT] = FORMATS.HTML
    return item
Example #20
def get_first_paragraph_text(input_string):
    try:
        elem = parse_html(input_string, content='html')
    except ValueError as e:
        logger.warning(e)
    else:
        # all non-empty paragraphs: ignores <p><br></p> sections
        return get_text_from_elem(elem) or get_text_from_elem(elem, tag=None)
Example #21
def extract_kill_reason_from_html(html, is_kill):
    """Extract the reason from html for a kill/takedown

    Iterates over the xml nodes and finds the node that contains the reason prefix.
    Once the reason prefix has been found, the following nodes are added to our reason tree,
    until the kill/takedown suffix has been found.

    :param html:
    :param is_kill:
    :return:
    """
    try:
        # Create a new tree that we will use to construct the reason nodes
        root = etree.Element('div')

        # A flag to indicate if we're to add the current child node to our reason tree
        adding_nodes = False
        for child in parse_html(html, content='html'):
            # Obtain the text from our child nodes (including sub-child nodes)
            child_text = ''.join(child.itertext())

            if not adding_nodes and REASON_PREFIX in child_text:
                # This child node contains the reason prefix (and we haven't found it already)
                # Therefore set the flag to True, indicating that the following child nodes
                # are to be added to our reason tree
                adding_nodes = True
                continue
            elif adding_nodes:
                # If the kill/takedown suffix has been found, then our reason tree is complete
                if is_kill and KILL_SUFFIX in child_text:
                    break
                elif not is_kill and TAKEDOWN_SUFFIX in child_text:
                    break

                # Otherwise continue adding the child nodes to our reason tree

                # Remove the last sub-child if it only contains a line break
                if len(child) > 0:
                    last_child = child[-1]
                    if etree.tostring(last_child) == b'<p><br/></p>':
                        child.remove(last_child)

                # Then add this child node to our reason tree
                root.append(child)

        num_children = len(list(root))

        # If the reason tree was not populated, then return the original html provided
        if num_children == 0:
            return html

        # Our reason tree was populated, convert the tree to a string and return it
        return to_string(root,
                         method='html',
                         remove_root_div=num_children == 1)
    except Exception as e:
        logger.exception(e)
        return html
Example #22
 def _inject_dateline(self, formatted_article):
     """Inject dateline in article's body_html"""
     body_html_elem = sd_etree.parse_html(
         formatted_article.get("body_html", "<p> </p>"))
     ptag = body_html_elem.find(".//p")
     if ptag is not None:
         ptag.text = formatted_article["dateline"]["text"] + " " + (
             ptag.text or "")
         formatted_article["body_html"] = sd_etree.to_string(body_html_elem)
Example #23
def clean_html(body_html):
    '''
    Make sure the html will parse, and inject \r\n in an attempt to avoid issues with lines being too long for SMTP
    :param body_html:
    :return: parsed and re-written html
    '''
    root = sd_etree.parse_html(body_html, content='html', lf_on_block=True)
    return sd_etree.to_string(root, method='html',
                              pretty_print=True).replace('>\n', '>\r\n')
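
Illustrative call (the exact output depends on lxml's pretty printer):

clean_html('<div><p>first</p><p>second</p></div>')
# returns the markup pretty-printed with '\r\n' after each closing tag,
# keeping individual lines short enough for SMTP transport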
Example #24
def extract_html_macro(item, **kwargs):
    """
        Delete from body_html all html tags except links
    """
    if 'body_html' not in item:
        return None

    root = sd_etree.parse_html(item['body_html'], content='html')

    links = {}
    count = 0
    # extract all links and add them to a dictionary with a unique
    # generated key for every link
    for a in root.findall('.//a'):
        links['__##link' + str(count) + '##__'] = etree.tostring(
            a, encoding="unicode")
        count = count + 1

    # replace all text links with the generated keys
    # regenerate the html from root to avoid replacement issues, since the
    # text links being replaced were themselves generated from root
    body_html = etree.tostring(root, encoding="unicode")
    for link in links:
        body_html = body_html.replace(links[link], link)
    body_html = body_html.replace('<p>', '__##br##__')
    body_html = body_html.replace('</p>', '__##br##__')
    body_html = body_html.replace('<br/>', '__##br##__')

    # extract the text from the html, which no longer contains any links;
    # it only contains the link keys, which are unaffected by text
    # extraction because they are already plain text
    root = sd_etree.parse_html(body_html, content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")

    # in extracted text replace the link keys with links
    for link in links:
        body_html = body_html.replace(link, links[link])

    body_html = body_html.replace('\n', '__##br##__')
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph
                                if p and p.strip())
    return item
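
A sketch of the macro's net effect (illustrative input):

item = {'body_html': '<p>See <a href="https://example.com">this</a> story</p>'}
extract_html_macro(item)
# the <p> structure and the <a> tags survive; any other inline markup in
# the input would have been reduced to its plain text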
Example #25
    def get_text_content(self, content):
        # It's only a one line ticker so new line and carriage return become spaces
        content = re.sub('[\n]', ' ', content)
        content = re.sub('[\r]', ' ', content)
        # remove control chars as these will upset the ticker
        content = re.sub(r'[\x00-\x1f]', '', content)
        if content == '':
            return ''

        parsed = parse_html(content, content='html')
        text = etree.tostring(parsed, encoding="unicode", method="text")
        return text
Example #26
def get_par_count(html):
    try:
        elem = sd_etree.parse_html(html, content='html')
        return len([
            p for p in elem.iterfind('.//p')
            if p.text and len(p.text.strip()) > 0
        ])
    except ValueError as e:
        logger.warning(e)

    logger.warning('Failed to determine paragraph count from html: {}.'.format(html))
    return 0
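
For instance (a sketch):

get_par_count('<p>one</p><p>  </p><p>two</p>')  # -> 2, the blank par is ignored
# on a parse failure, a warning is logged and 0 is returned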
Example #27
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace("&#13;", "\r")
        splitted = html.split("\r\n")
        if len(splitted) == 1 and "<p>" not in html:
            splitted = html.split("\n")
        if len(splitted) > 1:
            html = "".join([
                "<p>{}</p>".format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, "html")
            for img in content.xpath("//img"):
                try:
                    src = self.check_url(img.get("src"))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    img.getparent().remove(img)
                    continue
                url = media_data["renditions"]["original"]["href"]
                img.set("src", url)
                if key == "featuremedia":
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format("START", key))
                embed_end = etree.Comment(embed_TPL.format("END", key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)

            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method="xml")

        html = remove_shortcodes(html)

        item["body_html"] = html
Example #28
def first_paragraph_filter(input_string):
    try:
        elem = parse_html(input_string, content='html')
    except ValueError as e:
        logger.warning(e)
    else:
        # all non-empty paragraphs: ignores <p><br></p> sections
        for p in elem.iterfind('.//p'):
            if p.text:
                return etree.tostring(p, encoding="unicode")

    logger.warning('Failed to locate the first paragraph from input_string: {}.'.format(input_string))
    return ''
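
A quick sketch:

first_paragraph_filter('<p><br/></p><p>Lead paragraph</p>')
# -> '<p>Lead paragraph</p>'; the <p><br/></p> section has no text and is skipped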
Example #29
def extract_html_macro(item, **kwargs):
    """Delete from body_html all html tags except links"""
    if 'body_html' not in item:
        return None

    root = sd_etree.parse_html(item['body_html'], content='html')

    links = {}
    count = 0
    # extract all links and add them to a dictionary with a unique
    # generated key for every link
    for a in root.findall('.//a'):
        links['__##link' + str(count) + '##__'] = etree.tostring(a, encoding="unicode")
        count = count + 1

    # replace all text links with the generated keys
    # regenerate the html from root to avoid replacement issues, since the
    # text links being replaced were themselves generated from root
    body_html = etree.tostring(root, encoding="unicode")
    for link in links:
        body_html = body_html.replace(links[link], link)
    body_html = body_html.replace('<p>', '__##br##__')
    body_html = body_html.replace('</p>', '__##br##__')
    body_html = body_html.replace('<br/>', '__##br##__')

    # extract the text from the html, which no longer contains any links;
    # it only contains the link keys, which are unaffected by text
    # extraction because they are already plain text
    root = sd_etree.parse_html(body_html, content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")

    # in extracted text replace the link keys with links
    for link in links:
        body_html = body_html.replace(link, links[link])

    body_html = body_html.replace('\n', '__##br##__')
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph if p and p.strip())
    return item
Example #30
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [
                        c for c in cities if c['city'].lower() == city.lower()
                    ]
                    # if there is no dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore; don't do anything, just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item[
                            'dateline']['located'].get('city').upper():
                        return

                    item['dateline']['located'] = located[0] if len(
                        located) == 1 else {
                            'city_code': city,
                            'city': city,
                            'tz': 'UTC',
                            'dateline': 'city'
                        }
                    item['dateline']['source'] = item.get(
                        'original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source=item.get('original_source', 'Reuters'))
                    break

        return item
    except:
        logging.exception('Reuters dateline macro exception')
Example #31
    def _format_abstract(self, article, main_news_component):
        """
        Create an abstract NewsComponent element

        :param dict article:
        :param Element main_news_component:
        """
        abstract_news_component = SubElement(main_news_component, "NewsComponent")
        SubElement(abstract_news_component, 'Role', {'FormalName': 'Abstract'})
        content_item = SubElement(abstract_news_component, "ContentItem")
        SubElement(content_item, 'MediaType', {'FormalName': 'Text'})
        SubElement(content_item, 'Format', {'FormalName': 'Text'})
        abstract = parse_html(article.get('abstract', ''))
        SubElement(content_item, 'DataContent').text = etree.tostring(abstract, encoding="unicode", method="text")
Example #32
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace('&#13;', '\r')
        splitted = html.split('\r\n')
        if len(splitted) == 1 and '<p>' not in html:
            splitted = html.split('\n')
        if len(splitted) > 1:
            html = ''.join([
                '<p>{}</p>'.format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, 'html')
            for img in content.xpath('//img'):
                try:
                    src = self.check_url(img.get('src'))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    img.getparent().remove(img)
                    continue
                url = media_data['renditions']['original']['href']
                img.set("src", url)
                if key == 'featuremedia':
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format('START', key))
                embed_end = etree.Comment(embed_TPL.format('END', key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)
            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method='xml')

        item['body_html'] = html
Example #33
def first_paragraph_filter(input_string):
    try:
        elem = parse_html(input_string, content='html')
    except ValueError as e:
        logger.warning(e)
    else:
        # all non-empty paragraphs: ignores <p><br></p> sections
        for p in elem.iterfind('.//p'):
            if p.text:
                return etree.tostring(p, encoding="unicode")

    logger.warning(
        'Failed to locate the first paragraph from input_string: {}.'.format(
            input_string))
    return ''
Example #34
def remove_breaks(item, **kwargs):
    try:
        html = item.get('body_html')
        if html:
            html = html.replace('<br>', '<br/>').replace('</br>', ' ')
            parsed = parse_html(html, content='xml')
            for br in parsed.xpath('//br'):
                br.tail = ' ' + br.tail if br.tail else ' '
            etree.strip_elements(parsed, 'br', with_tail=False)
            item['body_html'] = to_string(parsed)
            return item

    except Exception as ex:
        logging.exception('Exception in preserve format macro: %s', ex)
        raise ex
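
Roughly, the macro flattens line breaks into spaces (an illustrative sketch):

item = {'body_html': '<p>first half<br>second half</p>'}
remove_breaks(item)
# item['body_html'] == '<p>first half second half</p>'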
Example #35
    def parse(self, xml, provider=None):
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' in item_tree.attrib:
                        item = self.parse_item(item_tree)
                        item['priority'] = 6
                        item['anpa_category'] = [{'qcode': 'f'}]
                        item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                        item.setdefault('word_count', get_word_count(item['body_html']))
                        # Hard code the urgency
                        item['urgency'] = 3
                        # Dateline is always Wellington in NZ
                        located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2') if
                                   c.get('city', '').lower() == 'wellington']
                        if len(located) == 1:
                            item['dateline'] = dict()
                            item['dateline']['located'] = located[0]

                        if item.get('body_html') and item['dateline']:
                            parsed = parse_html(item.get('body_html'), content='xml')
                            pars = parsed.xpath('//p')
                            for par in pars:
                                if not par.text:
                                    continue
                                # check the first par for a byline
                                if pars.index(par) == 0 and par.text.startswith('By '):
                                    item['byline'] = par.text.replace('By ', '')
                                    par.getparent().remove(par)
                                date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                                if source:
                                    item['dateline']['date'] = date_parser(date, fuzzy=True)
                                    par.text = the_rest
                                # remove the signoff if in the last par
                                if par.text == '(BusinessDesk)' and pars.index(par) + 1 == len(pars):
                                    par.getparent().remove(par)
                            item['body_html'] = to_string(parsed, remove_root_div=True)
                        locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                        if locator_map:
                            item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                        items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Example #36
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml
     :param element: The xml element to populate
     :param html: the html to parse the text from
     :return:
     """
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     for tag in parsed.xpath('/html/div/child::*'):
         p = etree.Element('p')
         p.text = to_ascii(
             get_text(to_string(tag, method='html'), content='html'))
         element.append(p)
Example #37
def reuters_derive_dateline(item, **kwargs):
    """
    It seems that most locations injected into the item by the parser are Bangalore.
    This function looks for a dateline in the article body and uses that.
    :param item:
    :return:
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                if BYLINE in item and item.get(BYLINE) in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if there is no dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore; don't do anything, just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get(
                            'city').upper():
                        return

                    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city,
                                                                                       'city': city,
                                                                                       'tz': 'UTC',
                                                                                       'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'Reuters'))

        return item
    except:
        logging.exception('Reuters dateline macro exception')
Example #38
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml
     :param element: The xml element to populate
     :param html: the html to parse the text from
     :return:
     """
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     for tag in parsed.xpath('//*'):
         if tag.getparent() is not None and tag.getparent().tag == 'body':
             p = etree.Element('p')
             p.text = to_ascii(get_text(to_string(tag, method='html'), content='html'))
             element.append(p)
Example #39
    def _format_abstract(self, article, main_news_component):
        """
        Create an abstract NewsComponent element

        :param dict article:
        :param Element main_news_component:
        """
        abstract_news_component = SubElement(main_news_component,
                                             "NewsComponent")
        SubElement(abstract_news_component, "Role", {"FormalName": "Abstract"})
        content_item = SubElement(abstract_news_component, "ContentItem")
        SubElement(content_item, "MediaType", {"FormalName": "Text"})
        SubElement(content_item, "Format", {"FormalName": "Text"})
        abstract = parse_html(article.get("abstract", ""))
        SubElement(content_item,
                   "DataContent").text = etree.tostring(abstract,
                                                        encoding="unicode",
                                                        method="text")
Example #40
def sanitize_tags(item):
    content = item.get('body_html', '')
    content = content.replace('<br>', '<br/>').replace('</br>', '')
    content = content.replace('&nbsp;', ' ')

    parsed = parse_html(content, content='html')

    # breaks are replaced with line feeds
    for br in parsed.xpath('//br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(parsed, 'br', with_tail=False)

    for tag in parsed.xpath('/html/div/child::*'):
        format_text_content(tag)

    item['body_html'] = '<pre>{}</pre>'.format(html.escape(''.join(parsed.itertext())))
    item[FORMAT] = FORMATS.PRESERVED
    return item
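
A sketch of the intended transformation (the exact whitespace depends on
format_text_content, which is defined elsewhere in the module):

item = {'body_html': '<p>one&nbsp;two</p><p>three<br>four</p>'}
sanitize_tags(item)
# item['body_html'] is now a <pre> block of escaped plain text, roughly
# '<pre>one two\nthree\nfour\n</pre>', and item[FORMAT] == FORMATS.PRESERVED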
Example #41
    def get_text_content(self, content):
        content = content.replace('<br>', '<br/>').replace('</br>', '')
        content = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', content)
        content = content.replace('\xA0', ' ')

        parsed = parse_html(content, content='html')

        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)

        for tag in parsed.xpath('/html/div/child::*'):
            if tag.tag != 'br' and tag.text is not None and tag.text.strip() != '':
                tag.text = '   ' + re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.text))
                tag.tail = '\r\n' + tag.tail if tag.tail else '\r\n'

        para_text = "".join(x for x in parsed.itertext())
        para_text = para_text.replace('\xA0', ' ')
        return para_text.encode('ascii', 'replace')
Example #42
 def _fix_headline(self, item):
     """
      AP Alerts do not get a headline parsed out, so pick up the first par of the content and put it in the headline
     :param item:
     :return:
     """
     try:
         html = item.get('body_html')
         if html:
             parsed = parse_html(html, content='html')
             pars = parsed.xpath('/html/div/child::*')
             if pars and len(pars) > 0:
                 city, source, the_rest = pars[0].text.partition(' (AP) _ ')
                 if the_rest:
                     item['headline'] = the_rest
                 else:
                     item['headline'] = pars[0].text
     except:
         pass
Example #43
    def get_text_content(self, content):
        content = content.replace('<br>', '<br/>').replace('</br>', '')
        content = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', content)
        content = content.replace('\xA0', ' ')

        parsed = parse_html(content, content='html')

        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)

        for tag in parsed.xpath('/html/div/child::*'):
            if tag.tag != 'br' and tag.text is not None and tag.text.strip() != '':
                tag.text = self.line_prefix + re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.text))
                tag.tail = '\r\n' + tag.tail if tag.tail else '\r\n'

        para_text = "".join(x for x in parsed.itertext())
        # multiple line breaks to one line break
        para_text = re.sub('[{}]+'.format(self.line_feed), self.line_feed, para_text)
        return to_ascii(para_text)
Example #44
    def get_text_content(self, content):
        text = ''
        content = content.replace('<br>', '<br/>').replace('</br>', '')
        content = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', content).strip())
        content = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', content)

        parsed = parse_html(content, content='html')

        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)

        for tag in parsed.xpath('//*'):
            if tag.getparent() is not None and tag.getparent().tag == 'body':
                ptext = ''
                for x in tag.itertext():
                    ptext += x
                text += self.format_text_content(ptext)

        return text
Example #45
def get_text(markup, content='xml', lf_on_block=False, space_on_elements=False):
    """Get plain text version of (X)HTML or other XML element

    if the markup can't be parsed, it will be returned unchanged
    :param str markup: string to convert to plain text
    :param str content: 'xml' or 'html', as in parse_html
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail
        mainly used to count words with non HTML markup
    :return str: plain text version of markup
    """
    try:
        root = sd_etree.parse_html(
            markup,
            content=content,
            lf_on_block=lf_on_block,
            space_on_elements=space_on_elements)
        text = etree.tostring(root, encoding='unicode', method='text')
        return text
    except etree.ParseError:
        return markup
Example #46
    def get_text_content(self, content):
        content = content.replace('<br>', '<br/>').replace('</br>', '')
        # remove control chars except \n
        content = re.sub('[\x00-\x09\x0b-\x1f]', '', content)
        # new lines are spaces
        content = re.sub('[\n]', ' ', content)
        if content == '':
            return ''

        parsed = parse_html(content, content='html', space_on_elements=True)

        # breaks are replaced with spaces
        for br in parsed.xpath('//br'):
            br.tail = ' ' + br.tail if br.tail else ' '
        etree.strip_elements(parsed, 'br', with_tail=False)

        text = ''
        for top_level_tag in parsed.xpath('/html/div/child::*'):
            text += self.format_text_content(top_level_tag)

        return re.sub(' +', ' ', text)
Example #47
def yonhap_format(item, **kwargs):
    try:
        html = item.get('body_html')
        # Article must be from Yonhap
        if '(Yonhap)' not in html:
            return item
        item['source'] = 'Yonhap'

        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//body')
            if len(pars) == 1:
                pars[0].tag = 'p'
                content = etree.tostring(pars[0], encoding="unicode")
                item['body_html'] = content.replace('&#13;\n   ', '</p><p>').replace('&#13;\n', '').replace('<br/>',
                                                                                                            ' ')
                _yonhap_derive_dateline(item)

    except Exception as ex:
        logging.exception('Exception in yonhap format macro: %s', ex)
        raise ex

    return item
Example #48
def remove_dateline(item):
    """Remove the dateline from item"""
    html = item.get('body_html')
    if not html:
        return

    match = re.search(DATELINE_REGEX, html, re.IGNORECASE)
    if not match:
        return

    # get the matched string
    matched_string = match.group(0)
    parsed = parse_html(html, content='xml')
    pars = parsed.xpath('//p')

    for par in pars:
        if not par.text:
            continue
        if matched_string in par.text:
            city, source, the_rest = par.text.partition(matched_string)
            search_string = ''.join([s for s in [city, source]])
            item['body_html'] = html.replace(search_string, '')
            break
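
Assuming DATELINE_REGEX matches source markers such as ' (Reuters) - ', the
macro strips the city and source prefix from the matching paragraph:

item = {'body_html': '<p>SYDNEY (Reuters) - Stocks rose.</p>'}
remove_dateline(item)
# item['body_html'] == '<p>Stocks rose.</p>'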
Example #49
    def ap_derive_dateline(self, item):
        """This function looks for a dateline in the article body an uses that.

        :param item:
        :return: item populated with a dateline
        """
        try:
            html = item.get('body_html')
            if html:
                parsed = parse_html(html, content='html')
                for par in parsed.xpath('/html/div/child::*'):
                    if not par.text:
                        continue
                    city, source, the_rest = par.text.partition(' (AP) _ ')
                    if source:
                        # sometimes the city is followed by a comma and either a date or a state
                        city = city.split(',')[0]
                        if any(char.isdigit() for char in city):
                            return
                        cities = app.locators.find_cities()
                        located = [c for c in cities if c['city'].lower() == city.lower()]
                        item.setdefault('dateline', {})
                        item['dateline']['located'] = located[0] if len(located) == 1 else {'city_code': city,
                                                                                            'city': city,
                                                                                            'tz': 'UTC',
                                                                                            'dateline': 'city'}
                        item['dateline']['source'] = item.get('original_source', 'AP')
                        item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                                  get_date(item['firstcreated']),
                                                                                  source=item.get('original_source',
                                                                                                  'AP'))
                        break

            return item
        except:
            logging.exception('AP dateline extraction exception')
Example #50
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace('&#13;', '\r')
        splitted = html.split('\r\n')
        if len(splitted) > 1:
            html = ''.join(['<p>{}</p>'.format(s) if not s.startswith('<hr') else s for s in splitted if s])

        if "img" in html:
            content = sd_etree.parse_html(html, 'html')
            for img in content.xpath('//img'):
                src = img.get('src')
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    img.getparent().remove(img)
                    continue
                url = media_data['renditions']['original']['href']
                img.set("src", url)
                if key == 'featuremedia':
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format('START', key))
                embed_end = etree.Comment(embed_TPL.format('END', key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            html = etree.tostring(content, encoding="unicode")

        item['body_html'] = html
Example #51
    def format(self, article, subscriber, codes=None):
        try:
            docs = []
            formatted_article = deepcopy(article)
            for category in self._get_category_list(formatted_article.get('anpa_category')):
                mapped_source = self._get_mapped_source(formatted_article)
                formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                           formatted_article.get(config.ID_FIELD))
                pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
                anpa = []

                if codes:
                    anpa.append(b'\x05')
                    anpa.append(' '.join(codes).encode('ascii'))
                    anpa.append(b'\x0D\x0A')

                # start of message header (syn syn soh)
                anpa.append(b'\x16\x16\x01')
                anpa.append(get_service_level(category, formatted_article).encode('ascii'))

                # story number
                anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

                # field separator
                anpa.append(b'\x0A')  # LF (line feed)
                anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
                anpa.append(b'\x20')

                anpa.append(category['qcode'].lower().encode('ascii'))

                anpa.append(b'\x13')
                # format identifier
                if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                    anpa.append(b'\x12')
                else:
                    anpa.append(b'\x11')
                anpa.append(b'\x20')

                # keyword
                keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
                keyword = keyword[:24]
                anpa.append(keyword.encode('ascii'))
                anpa.append(b'\x20')

                # version field
                anpa.append(b'\x20')

                # reference field
                anpa.append(b'\x20')

                # filing date
                anpa.append(formatted_article['_updated'].strftime('%m-%d').encode('ascii'))
                anpa.append(b'\x20')

                # add the word count
                anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x02')  # STX

                self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

                keyword = SluglineMapper().map(article=formatted_article, category=category['qcode'].upper(),
                                               truncate=True).encode('ascii', 'ignore')
                anpa.append(keyword)
                take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
                anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                if formatted_article.get('ednote', '') != '':
                    ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                    anpa.append(ednote.encode('ascii', 'replace'))

                if formatted_article.get(BYLINE):
                    anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                    anpa.append(b'\x0D\x0A')

                if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                    anpa.append(get_text(self.append_body_footer(formatted_article),
                                         content='html').encode('ascii', 'replace'))
                else:
                    body = to_ascii(formatted_article.get('body_html', ''))
                    # we need to inject the dateline
                    if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                        body_html_elem = parse_html(formatted_article.get('body_html'))
                        ptag = body_html_elem.find('.//p')
                        if ptag is not None:
                            ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                            body = to_string(body_html_elem)
                    anpa.append(self.get_text_content(body))
                    if formatted_article.get('body_footer'):
                        anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

                anpa.append(b'\x0D\x0A')
                anpa.append(mapped_source.encode('ascii'))
                sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
                anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x03')  # ETX

                # time and date
                anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

                anpa.append(b'\x04')  # EOT
                anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

                docs.append({'published_seq_num': pub_seq_num, 'encoded_item': b''.join(anpa),
                             'formatted_item': b''.join(anpa).decode('ascii')})

            return docs
        except Exception as ex:
            raise FormatterError.AnpaFormatterError(ex, subscriber)
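A minimal sketch of the ANPA control-character framing used above, reduced to the bare message skeleton (header, STX, body, ETX, EOT); the real formatter also emits service level, category, priority, keyword and date fields:

def frame_anpa(story_number, body):
    # Skeleton only; an assumption-level reduction of the full format above.
    parts = [
        b'\x16\x16\x01',                              # SYN SYN SOH: header start
        str(story_number).zfill(4).encode('ascii'),   # story number
        b'\x02',                                      # STX: start of text
        body.encode('ascii', 'replace'),
        b'\x03',                                      # ETX: end of text
        b'\x04',                                      # EOT: end of transmission
    ]
    return b''.join(parts)

print(frame_anpa(7, 'Hello world'))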
Example No. 52
    def test_void_elements_fix(self):
        html_raw = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
        expected = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
        parsed = sd_etree.parse_html(html_raw)
        sd_etree.fix_html_void_elements(parsed)
        self.assertEqual(sd_etree.to_string(parsed), expected)
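A rough sketch of what a void-element fixer like sd_etree.fix_html_void_elements must do (not the superdesk implementation): empty non-void elements would otherwise serialize as invalid self-closed tags such as <em/>, so they are given an empty text node, while true void elements stay self-closed:

from lxml import etree

VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
                 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'}

def fix_void_elements(root):
    for elem in root.iter():
        if (isinstance(elem.tag, str) and elem.tag not in VOID_ELEMENTS
                and elem.text is None and len(elem) == 0):
            elem.text = ''  # makes lxml serialize <em></em> instead of <em/>

root = etree.fromstring('<p>empty <em/> and a <br/></p>')
fix_void_elements(root)
print(etree.tostring(root, encoding='unicode'))
# <p>empty <em></em> and a <br/></p>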
    def _parse_content(self, article):
        """Parse body_html and map it to the fields required for the Apple News format

        :param article: the article being formatted
        """
        statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
        analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
        verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
        references_regex = re.compile(r'^The References$', re.IGNORECASE)
        url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+', re.IGNORECASE)
        abstract = get_text(article.get('abstract'), content='html').strip()

        article['_title'] = abstract
        body_html = article.get('body_html')
        article['_analysis_first_line'] = ''
        article['_analysis'] = ''
        article['_statement'] = ''
        article['_statement_attribution'] = ''
        article['_verdict1'] = ''
        article['_verdict2'] = ''
        article['_references'] = ''
        article['_revision_history'] = ''

        if article.get(ITEM_STATE) in (CONTENT_STATE.KILLED, CONTENT_STATE.RECALLED):
            removed = 'This article has been removed.'
            for field in ('_title', '_analysis_first_line', '_analysis', '_statement',
                          '_statement_attribution', '_verdict1', '_verdict2', '_references'):
                article[field] = removed
            self._set_revision_history(article)
            return

        parsed_content = parse_html(body_html, content='html')
        statement_found = False
        analysis_found = False
        analysis_first_line = False
        verdict1_found = False
        verdict2_found = False
        references_found = False
        statement_elements = []

        for top_level_tag in parsed_content.xpath('/html/div/child::*'):
            tag_text = format_text_content(top_level_tag).strip()
            if not tag_text:
                continue

            if not verdict1_found:
                if not statement_found:
                    match = statement_regex.search(tag_text)
                    if match:
                        statement_found = True
                    continue
                else:
                    # statement found
                    match = verdict_regex.search(tag_text)
                    if match:
                        verdict1_found = True
                        if len(statement_elements) > 1:
                            # all elements but the last make up the statement;
                            # the last element is the attribution
                            statement_length = len(statement_elements) - 1
                            for i in range(statement_length):
                                article['_statement'] += get_text(
                                    to_string(statement_elements[i], remove_root_div=False),
                                    content='html'
                                ).strip()
                                if statement_length > 1 and i != statement_length - 1:
                                    article['_statement'] += '\r\n'

                            article['_statement_attribution'] = get_text(
                                to_string(statement_elements[-1], remove_root_div=False),
                                content='html'
                            ).strip()
                        elif len(statement_elements) == 1:
                            article['_statement'] = to_string(
                                statement_elements[0],
                                remove_root_div=False
                            )
                        continue

                    statement_elements.append(top_level_tag)
                    continue

            if verdict1_found and not analysis_found:
                match = analysis_regex.search(tag_text)
                if match:
                    analysis_found = True
                else:
                    article['_verdict1'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if analysis_found and not verdict2_found:
                if not analysis_first_line:
                    article['_analysis_first_line'] = tag_text
                    analysis_first_line = True

                match = verdict_regex.search(tag_text)
                if match:
                    verdict2_found = True
                else:
                    article['_analysis'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if verdict2_found and not references_found:
                match = references_regex.search(tag_text)
                if match:
                    references_found = True
                else:
                    article['_verdict2'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if references_found:
                def replacement(match_object):
                    # wrap each matched URL in an anchor tag
                    value = match_object.group(0)
                    if value:
                        return '<a href="{0}">{0}</a>'.format(value)
                    return ''

                # strip any leading list numbering such as "1." or "2)"
                tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()

                article['_references'] += '<li>{}</li>'.format(
                    re.sub(url_regex, replacement, tag_text)
                )

        if article['_references']:
            article['_references'] = '<ol>{}</ol>'.format(article['_references'])

        if not article.get('_statement') and article.get('_statement_attribution'):
            # if statement is not as per the format
            article['_statement'] = article.get('_statement_attribution')
            article['_statement_attribution'] = ''

        self._set_revision_history(article)
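An illustrative fact-check body showing the marker paragraphs _parse_content scans for; the exact input shape here is an assumption for demonstration, not taken from the repo:

body_html = (
    '<p>The Statement</p>'
    '<p>"Quote of the claim being checked."</p>'
    '<p>Attribution of the claim. May 1, 2020.</p>'
    '<p>The Verdict</p>'
    '<p>Short verdict text.</p>'
    '<p>The Analysis</p>'
    '<p>Analysis paragraphs...</p>'
    '<p>The Verdict</p>'
    '<p>Expanded verdict.</p>'
    '<p>The References</p>'
    '<p>1. https://example.com/source</p>'
)
# After _parse_content runs, the pieces land in _statement,
# _statement_attribution, _verdict1, _analysis, _verdict2 and
# _references (rendered as an <ol> list).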