Example #1
    def format(self, article, subscriber, codes=None):
        """
        Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        """
        try:
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            sms_message = article.get('sms_message', article.get('abstract', ''))

            # category = 1 is used to indicate a test message
            category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
                else article.get('anpa_category', [{}])[0].get('qcode').upper()

            odbc_item = {'Sequence': pub_seq_num, 'Category': category,
                         'Headline': to_ascii(get_text(sms_message, content='html')).replace('\'', '\'\''),
                         'Priority': map_priority(article.get('priority'))}

            body = self.append_body_footer(article)

            if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                body = get_text(body, content='html')

            odbc_item['StoryText'] = to_ascii(body).replace('\'', '\'\'')  # @article_text
            odbc_item['ident'] = '0'

            return [(pub_seq_num, json.dumps(odbc_item))]
        except Exception as ex:
            raise FormatterError.AAPSMSFormatterError(ex, subscriber)
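The pattern above repeats through most of these examples: strip markup with get_text, then double any single quotes before the value is handed to the stored procedure. A minimal, hypothetical sketch of that step in isolation, assuming the same superdesk.text_utils import these formatters use; the sample markup is invented:

from superdesk.text_utils import get_text

sample_sms = "<p>Storm warning for the <b>coast</b> tonight</p>"  # invented sample value
plain = get_text(sample_sms, content='html')   # markup stripped, plain text only
escaped = plain.replace('\'', '\'\'')          # double single quotes for the SQL parameter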
Example #2
    def format(self, article, subscriber, codes=None):
        try:
            formatted_doc = {}
            formatted_doc['headline'] = get_text(article.get('headline', ''),
                                                 content='html')
            formatted_doc['headline'] = formatted_doc['headline'].replace(
                '\'', '\'\'').replace('\xA0', ' ')
            formatted_doc['keyword'] = article.get('slugline',
                                                   '').replace('\'', '\'\'')

            # body formatting
            if article.get(FORMAT) == FORMATS.PRESERVED:
                body = get_text(self.append_body_footer(article),
                                content='html')
                formatted_doc['article_text'] = body.replace('\'', '\'\'')
            elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                body = self.get_wrapped_text_content(
                    to_ascii(self.append_body_footer(article))).replace(
                        '\'', '\'\'')
                formatted_doc['article_text'] = body

            self.refine_article_body(formatted_doc, article)

            # Frame the text output according to AAP requirement
            formatted_output = 'KEYWORD: ' + formatted_doc.get('keyword',
                                                               '') + '\r\n'
            formatted_output += 'HEADLINE: ' + formatted_doc.get(
                'headline', '') + '\r\n'
            formatted_output += '   ' + formatted_doc.get('article_text', '')

            return [(0, json.dumps({'article_text': formatted_output}))]
        except Exception as ex:
            raise FormatterError.AAPTextFormatterError(ex, subscriber)
Example #3
 def _transform_to_ninjs(self, article, subscriber, recursive=True):
     return {
         "uuid": article["guid"],
         "createdTimestamp": format_datetime(article["firstcreated"]),
         "latestVersionTimestamp": format_datetime(article["versioncreated"]),
         "publicationTimestamp": format_datetime(article["firstpublished"]),
         "authors": [author["sub_label"] for author in article.get("authors") or []],
         "language": article["language"],
         "pubStatus": True,
         "concepts": self._format_concepts(article),
         "headline": get_text(article["headline"]),
         "preamble": get_text(article["abstract"], lf_on_block=True).strip() if article.get("abstract") else "",
         "dateline": article["dateline"]["text"] if article.get("dateline") and article["dateline"].get("text") else "",
         "body": [line.strip() for line in get_text(article["body_html"], lf_on_block=True).split("\n") if line],
     }
Example #4
    def _format_body_content(self, article, body_content):
        nitf_body = []

        if article.get('ednote'):
            nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))

        if article.get(BYLINE):
            nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))

        if article.get(FORMAT) == FORMATS.PRESERVED:
            nitf_body.append(to_ascii(get_text(self.append_body_footer(article), content='html')))
        else:
            body = article.get('body_html', '')
            # we need to inject the dateline
            if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                body_html_elem = parse_html(article.get('body_html'))
                ptag = body_html_elem.find('.//p')
                if ptag is not None:
                    ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                    body = to_string(body_html_elem)

            nitf_body.append(self.get_text_content(body))
            if article.get('body_footer'):
                nitf_body.append(self.get_text_content(article.get('body_footer', '')))

        sign_off = '{} {}'.format(article.get('source') or '', (article.get('sign_off') or '')).strip()
        if sign_off:
            nitf_body.append(to_ascii(self._format_line(sign_off)))

        SubElement(body_content, 'pre').text = ''.join(nitf_body)
Example #5
    def get_odbc_item(self,
                      article,
                      subscriber,
                      category,
                      codes,
                      pass_through=False):
        """
        Construct an odbc_item with the common key/value pairs populated. If pass_through is true then the
        original headline is maintained.
        :param article:
        :param subscriber:
        :param category:
        :param codes:
        :param pass_through:
        :return:
        """
        article['headline'] = get_text(article.get('headline', ''),
                                       content='html')
        pub_seq_num = superdesk.get_resource_service(
            'subscribers').generate_sequence_number(subscriber)
        odbc_item = dict(
            originator=article.get('source', None),
            sequence=pub_seq_num,
            category=category.get('qcode').lower(),
            author=get_text(article.get('byline', '') or '',
                            content='html').replace('\'', '\'\''),
            keyword=SluglineMapper().map(
                article=article,
                category=category.get('qcode').upper(),
                truncate=True).replace('\'', '\'\'') if not pass_through else
            (article.get('slugline', '') or '').replace('\'', '\'\''),
            subject_reference=set_subject(category, article),
            take_key=(article.get('anpa_take_key', '')
                      or '').replace('\'', '\'\''))
        if 'genre' in article and len(article['genre']) >= 1:
            odbc_item['genre'] = article['genre'][0].get('name', None)
        else:
            odbc_item['genre'] = 'Current'  # @genre
        odbc_item['news_item_type'] = 'News'
        odbc_item['fullStory'] = 1
        odbc_item['ident'] = '0'  # @ident
        odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '

        headline = to_ascii(LocatorMapper().get_formatted_headline(
            article,
            category.get('qcode').upper()))
        odbc_item['headline'] = headline.replace('\'',
                                                 '\'\'').replace('\xA0', ' ')

        self.expand_subject_codes(odbc_item)
        self.set_usn(odbc_item, article)

        return pub_seq_num, odbc_item
Example #6
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and, if available, appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
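A small sketch of the final footer join above, taken out of the formatter class; the body and footer values are invented, and get_text is assumed importable from superdesk.text_utils as elsewhere in these examples:

from superdesk.text_utils import get_text

body_footer = "<p>Public service announcement</p>"   # invented sample footer
preserved_body = "line one\r\nline two"              # preserved format keeps plain text
html_body = "<p>line one</p><p>line two</p>"

preserved_result = '{}\r\n{}'.format(preserved_body, get_text(body_footer))  # footer as plain text on a new line
html_result = '{}{}'.format(html_body, body_footer)                          # footer markup appended unchanged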
Example #7
 def _format_body_content(self, article, body_content):
     if article.get(FORMAT) == FORMATS.PRESERVED:
         pre = get_text(self.append_body_footer(article))
         SubElement(body_content, 'pre').text = pre
     else:
         self.map_html_to_xml(body_content,
                              self.append_body_footer(article))
Example #8
 def _set_headline(self, item, value):
     if not value:
         # if there is no headline, we use first 100 chars of body
         # cf. SDNTB-481
         value = text_utils.get_text(item.get('body_html', ''),
                                     'html')[:100]
     item['headline'] = value
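A hedged usage sketch of the fallback above with an invented item; the first 100 characters of the stripped body become the headline when none is set:

from superdesk import text_utils

item = {'body_html': '<p>' + 'A long opening paragraph. ' * 10 + '</p>'}  # invented item
item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]  # at most 100 chars of plain text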
Example #9
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and, if available, appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article["body_html"] = article["body_html"].replace("<br>", "<br/>")
        except KeyError:
            pass

        body = ""
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get("body_html", "")
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get("description", "")

        if body and article.get(FORMAT, "") == FORMATS.PRESERVED:
            body = body.replace("\n", "\r\n").replace("\r\r", "\r")
            parsed = parse_html(body, content="html")

            for br in parsed.xpath("//br"):
                br.tail = "\r\n" + br.tail if br.tail else "\r\n"

            etree.strip_elements(parsed, "br", with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get("body_footer"):
            footer = article.get("body_footer")
            if article.get(FORMAT, "") == FORMATS.PRESERVED:
                body = "{}\r\n{}".format(body, get_text(footer))
            else:
                body = "{}{}".format(body, footer)
        return body
Example #10
    def append_body_footer(self, article):
        """
        Checks if the article has any Public Service Announcements and, if available, appends each of them to the body.

        :return: body with public service announcements.
        """
        try:
            article['body_html'] = article['body_html'].replace('<br>', '<br/>')
        except KeyError:
            pass

        body = ''
        if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
            else:
                body = '{}{}'.format(body, footer)
        return body
Example #11
 def _set_headline(self, item, value):
     if not value:
         # if there is no headline, we use first 100 chars of body
         # cf. SDNTB-481
         value = text_utils.get_text(item.get("body_html", ""),
                                     "html")[:100]
     item["headline"] = value
Example #12
    def _set_revision_history(self, article):
        """Get revision history of published article

        :param dict article:
        """
        query = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': {
                                'term': {
                                    'item_id': article.get('item_id')
                                }
                            }
                        }
                    }
                }
            },
            'sort': [{
                'versioncreated': {
                    'order': 'asc'
                }
            }]
        }

        req = ParsedRequest()
        repos = 'published,archived'
        req.args = {
            'source': json.dumps(query),
            'repo': repos,
            'aggregations': 0
        }
        revisions = list(
            get_resource_service('search').get(req=req, lookup=None))
        revisions_tag = []

        for rev in revisions:
            local_date = utc_to_local(
                config.DEFAULT_TIMEZONE,
                rev.get('firstpublished') if rev.get(ITEM_STATE)
                == CONTENT_STATE.PUBLISHED else rev.get('versioncreated'))
            date_string = datetime.strftime(local_date,
                                            '%b XXX, %Y %H:%M %Z').replace(
                                                'XXX', str(local_date.day))
            if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED:
                revisions_tag.append('<li>{} {}</li>'.format(
                    'First published', date_string))
            else:
                revision_markup = '{} {}'.format('Revision published',
                                                 date_string)
                ednote = get_text(rev.get('ednote') or '',
                                  content='html').strip()
                if rev.get(ITEM_STATE) == CONTENT_STATE.CORRECTED and ednote:
                    revision_markup += '<br><i>{}</i>'.format(ednote)
                revisions_tag.append('<li>{}</li>'.format(revision_markup))

        article['_revision_history'] = '<ul>{}</ul>'.format(
            ''.join(revisions_tag)) if revisions_tag else ''
Example #13
 def _fill_definition_short(self, document, item):
     content = document.find('content')
     if content is not None:
         item['definition_short'] = text_utils.get_text(
             content.text,
             content='html',
             lf_on_block=True,
             space_on_elements=True).strip()
Example #14
def get_item_body(item):
    body = []
    for field in ("body_html", "abstract"):
        try:
            body.extend([p.strip() for p in get_text(item[field], "html", True).split("\n") if p.strip()])
        except KeyError:
            pass
    return body
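A hedged usage example for get_item_body; the item below is invented, and the exact splitting depends on how lf_on_block breaks block tags into lines:

item = {
    "body_html": "<p>First paragraph.</p><p>Second paragraph.</p>",
    "abstract": "<p>Short summary.</p>",
}
paragraphs = get_item_body(item)
# expected along the lines of: ['First paragraph.', 'Second paragraph.', 'Short summary.']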
Example #15
    def parse_item(self, tree):
        item = super().parse_item(tree)
        meta = tree.find(self.qname('contentMeta'))

        organisation = meta.xpath('./iptc:subject[@type="cpnat:organisation"][@literal]', namespaces=NS)
        if organisation:
            item['abstract'] = format_maxlength('FOR: {}. {}'.format(
                organisation[0].get('literal').upper().rstrip('.'),
                get_text(item['body_html']).replace('  ', ' '),
            ), 200)

        return item
Example #16
    def format(self, article, subscriber, codes=None):
        """
        Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        """
        try:
            pub_seq_num = superdesk.get_resource_service(
                'subscribers').generate_sequence_number(subscriber)
            sms_message = article.get('sms_message',
                                      article.get('abstract', ''))

            # category = 1 is used to indicate a test message
            category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
                else article.get('anpa_category', [{}])[0].get('qcode').upper()

            odbc_item = {
                'Sequence': pub_seq_num,
                'Category': category,
                'Headline': to_ascii(get_text(sms_message, content='html')).replace('\'', '\'\''),
                'Priority': map_priority(article.get('priority'))
            }

            body = self.append_body_footer(article)

            if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                body = get_text(body, content='html')

            odbc_item['StoryText'] = to_ascii(body).replace(
                '\'', '\'\'')  # @article_text
            odbc_item['ident'] = '0'

            return [(pub_seq_num, json.dumps(odbc_item))]
        except Exception as ex:
            raise FormatterError.AAPSMSFormatterError(ex, subscriber)
Example #17
def get_permalink(item):
    code = item["_id"][-6:]
    try:
        title = item["extra"][PERMALINK] or ""
        slug = slugify(get_text(title, 'html'))
    except (KeyError, AttributeError):
        slug = ""
    return urljoin(
        BASE_URL,
        "/{lang}/{code}/".format(
            lang=item.get("language", "en"), code="-".join(filter(bool, [slug, code])),
        ),
    )
Example #18
 def add_byline(self, odbc_item, article):
     """
     Add the byline to the article text
     :param odbc_item:
     :param article:
     :return:
     """
     if article.get('byline') and article.get('byline') != '':
         byline = get_text(article.get('byline', ''), content='html')
         if len(byline) >= 3 and byline[:2].upper() != 'BY':
             byline = 'By ' + byline
         byline = '   {}\r\n\r\n'.format(byline).replace('\'', '\'\'')
         odbc_item['article_text'] = byline + odbc_item['article_text']
Example #19
 def add_byline(self, odbc_item, article):
     """
     Add the byline to the article text
     :param odbc_item:
     :param article:
     :return:
     """
     if article.get('byline') and article.get('byline') != '':
         byline = get_text(article.get('byline', ''), content='html')
         if len(byline) >= 3 and byline[:2].upper() != 'BY':
             byline = 'By ' + byline
         byline = '\x19   {}\x19\r\n'.format(byline).replace('\'', '\'\'')
         odbc_item['article_text'] = byline + odbc_item['article_text']
Example #20
    def get_odbc_item(self, article, subscriber, category, codes, pass_through=False):
        """
        Construct an odbc_item with the common key/value pairs populated. If pass_through is true then the
        original headline is maintained.
        :param article:
        :param subscriber:
        :param category:
        :param codes:
        :param pass_through:
        :return:
        """
        article['headline'] = get_text(article.get('headline', ''), content='html')
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        odbc_item = dict(originator=article.get('source', None), sequence=pub_seq_num,
                         category=category.get('qcode').lower(),
                         author=get_text(article.get('byline', '') or '', content='html').replace('\'', '\'\''),
                         keyword=SluglineMapper().map(article=article,
                                                      category=category.get('qcode').upper(),
                                                      truncate=True).replace('\'', '\'\'') if not pass_through else
                         (article.get('slugline', '') or '').replace('\'', '\'\''),
                         subject_reference=set_subject(category, article),
                         take_key=(article.get('anpa_take_key', '') or '').replace('\'', '\'\''))
        if 'genre' in article and len(article['genre']) >= 1:
            odbc_item['genre'] = article['genre'][0].get('name', None)
        else:
            odbc_item['genre'] = 'Current'  # @genre
        odbc_item['news_item_type'] = 'News'
        odbc_item['fullStory'] = 1
        odbc_item['ident'] = '0'  # @ident
        odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '

        headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.get('qcode').upper()))
        odbc_item['headline'] = headline.replace('\'', '\'\'').replace('\xA0', ' ')

        self.expand_subject_codes(odbc_item)
        self.set_usn(odbc_item, article)

        return pub_seq_num, odbc_item
Example #21
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body"""

    # get the list of sentences of the body
    if not item.get("body_html", None):
        item["abstract"] = "No body found to use for abstract..."
    else:
        sentences = p.split(item["body_html"])

        # chop the first sentence to size for abstract (64)
        if sentences and len(sentences) > 0:
            item["abstract"] = get_text(sentences[0][:64]).strip()

    return item
Example #22
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body"""

    # get the list of sentences of the body
    if not item.get('body_html', None):
        item['abstract'] = 'No body found to use for abstract...'
    else:
        sentences = p.split(item['body_html'])

        # chop the first sentence to size for abstract (64)
        if sentences and len(sentences) > 0:
            item['abstract'] = get_text(sentences[0][:64]).strip()

    return item
Example #23
    def _process_headline(self, anpa, article, category):
        # prepend the locator to the headline if required
        article['headline'] = get_text(article.get('headline', ''))
        headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.decode('UTF-8').upper()))

        # Set the maximum size to 64 including the sequence number if any
        if len(headline) > 64:
            if article.get('sequence'):
                digits = len(str(article['sequence'])) + 1
                shortened_headline = '{}={}'.format(headline[:-digits][:(64 - digits)], article['sequence'])
                anpa.append(shortened_headline.encode('ascii', 'replace'))
            else:
                anpa.append(headline[:64].encode('ascii', 'replace'))
        else:
            anpa.append(headline.encode('ascii', 'replace'))
        anpa.append(b'\x0D\x0A')
Example #24
    def _format_content(self, article, news_item, nitf):
        """Adds the content set to the xml

        :param dict article:
        :param Element newsItem:
        :param Element nitf:
        """
        content_set = SubElement(news_item, 'contentSet')
        if article.get(FORMAT) == FORMATS.PRESERVED:
            inline_data = text_utils.get_text(self.append_body_footer(article))
            SubElement(content_set, 'inlineData',
                       attrib={'contenttype': 'text/plain'}).text = inline_data
        elif article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.COMPOSITE]:
            inline = SubElement(content_set, 'inlineXML',
                                attrib={'contenttype': 'application/nitf+xml'})
            inline.append(nitf)
Example #25
    def _process_headline(self, anpa, article, category):
        # prepend the locator to the headline if required
        article['headline'] = get_text(article.get('headline', ''))
        headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.decode('UTF-8').upper()))

        # Set the maximum size to 64 including the sequence number if any
        if len(headline) > 64:
            if article.get('sequence'):
                digits = len(str(article['sequence'])) + 1
                shortened_headline = '{}={}'.format(headline[:-digits][:(64 - digits)], article['sequence'])
                anpa.append(shortened_headline.encode('ascii', 'replace'))
            else:
                anpa.append(headline[:64].encode('ascii', 'replace'))
        else:
            anpa.append(headline.encode('ascii', 'replace'))
        anpa.append(b'\x0D\x0A')
Example #26
def truncate_article_body(items, monitoring_profile, full_text=False):
    # To make sure the PDF and RTF creators truncate for linked_text settings,
    # truncate manually here
    for i in items:
        i['body_str'] = get_text(i.get('body_html', ''), content='html', lf_on_block=True)
        if monitoring_profile['alert_type'] == 'linked_text':
            if not full_text and len(i['body_str']) > 160:
                i['body_str'] = i['body_str'][:159] + '...'

        if monitoring_profile.get('format_type') == 'monitoring_pdf':
            body_lines = i.get('body_str', '').split('\n')
            altered_html = ''
            for line in body_lines:
                altered_html = '{}<div class="line">{}</div>'.format(altered_html, line)

            i['body_str'] = altered_html
Example #27
    def _format_content(self, article, news_item, nitf):
        """Adds the content set to the xml

        :param dict article:
        :param Element newsItem:
        :param Element nitf:
        """
        content_set = SubElement(news_item, 'contentSet')
        if article.get(FORMAT) == FORMATS.PRESERVED:
            inline_data = text_utils.get_text(self.append_body_footer(article))
            SubElement(content_set, 'inlineData',
                       attrib={'contenttype': 'text/plain'}).text = inline_data
        elif article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.COMPOSITE]:
            inline = SubElement(content_set, 'inlineXML',
                                attrib={'contenttype': 'application/nitf+xml'})
            inline.append(nitf)
Example #28
    def format_for_source(self, article, subscriber, source, codes=None):
        """Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure
        :type article: object
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        """
        pass_through = article.get('auto_publish', False)
        try:
            docs = []
            for category in self._get_category_list(article.get('anpa_category')):
                # All NZN sourced content is AAP content for the AAP output formatter
                article['source'] = source
                pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)

                if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                    body = get_text(self.append_body_footer(article))
                    odbc_item['article_text'] = body.replace('\'', '\'\'')
                    odbc_item['texttab'] = 't'
                elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                    body = self.get_wrapped_text_content(
                        to_ascii(self.append_body_footer(article))).replace('\'', '\'\'')
                    # if we have a dateline inject it
                    if 'dateline' in article and 'text' in article.get('dateline', {}) and not pass_through:
                        if body.startswith('   '):
                            body = '   {} {}'.format(article.get('dateline')
                                                     .get('text').replace('\'', '\'\''),
                                                     body[3:])

                    odbc_item['article_text'] = body
                    odbc_item['texttab'] = 'x'

                if not pass_through:
                    self.add_ednote(odbc_item, article)
                    self.add_byline(odbc_item, article)

                odbc_item['article_text'] += '\r\n' + article.get('source', '')
                sign_off = article.get('sign_off', '') or ''
                if len(sign_off) > 0:
                    odbc_item['article_text'] += ' ' + sign_off

                odbc_item['service_level'] = get_service_level(category, article)  # @service_level
                odbc_item['wordcount'] = article.get('word_count') or 0   # @wordcount
                odbc_item['priority'] = map_priority(article.get('priority'))  # @priority

                docs.append((pub_seq_num, json.dumps(odbc_item)))
            return docs
        except Exception as ex:
            raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)
Example #29
    def parse(self, xml, provider=None):
        self.root = xml
        try:
            item = self.parse_item(xml)
            if not item.get('headline'):
                item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

            try:
                abstract = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if abstract:
                    item['abstract'] = abstract
            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Example #30
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml
     :param element: The xml element to populate
     :param html: the html to parse the text from
     :return:
     """
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     for tag in parsed.xpath('/html/div/child::*'):
         p = etree.Element('p')
         p.text = to_ascii(
             get_text(to_string(tag, method='html'), content='html'))
         element.append(p)
Example #31
    def _set_revision_history(self, article):
        """Get revision history of published article

        :param dict article:
        """
        query = {
            'query': {
                'filtered': {
                    'filter': {
                        'bool': {
                            'must': {
                                'term': {'item_id': article.get('item_id')}
                            }
                        }
                    }
                }
            },
            'sort': [
                {'versioncreated': {'order': 'asc'}}
            ]
        }

        req = ParsedRequest()
        repos = 'published,archived'
        req.args = {'source': json.dumps(query), 'repo': repos, 'aggregations': 0}
        revisions = list(get_resource_service('search').get(req=req, lookup=None))
        revisions_tag = []

        for rev in revisions:
            local_date = utc_to_local(
                config.DEFAULT_TIMEZONE,
                rev.get('firstpublished') if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED
                else rev.get('versioncreated')
            )
            date_string = datetime.strftime(local_date, '%b XXX, %Y %H:%M %Z').replace('XXX', str(local_date.day))
            if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED:
                revisions_tag.append('<li>{} {}</li>'.format('First published', date_string))
            else:
                revision_markup = '{} {}'.format('Revision published', date_string)
                ednote = get_text(rev.get('ednote') or '', content='html').strip()
                if rev.get(ITEM_STATE) == CONTENT_STATE.CORRECTED and ednote:
                    revision_markup += '<br><i>{}</i>'.format(ednote)
                revisions_tag.append('<li>{}</li>'.format(revision_markup))

        article['_revision_history'] = '<ul>{}</ul>'.format(''.join(revisions_tag)) if revisions_tag else ''
Example #32
def callback(item, **kwargs):
    diff = {}
    if not item.get('body_html'):
        return diff
    rate = get_rate()
    text = get_text(item['body_html'], 'html', True)

    def repl(m, is_fr=False):
        if m.group('currency') and m.group('currency') != 'US':
            return
        if is_fr:
            num = m.group('num').replace(',', '.').replace(' ', '')
        else:
            num = m.group('num').replace(',', '')
        converted = decimal.Decimal(num) * rate
        if m.group('decimal'):
            _format = '{:.3f}'  # convert 55.21 to 73.73 - round to 3 decimals and strip last one
            if is_fr and ' ' in m.group('num') or not is_fr and ',' in m.group(
                    'num'):
                _format = '{:,.3f}'
            fixed = _format.format(converted)[:-1]
        else:
            _format = '{:.1f}0'  # convert 55 to 73.70 - round to 1 decimal and add 0
            if is_fr and ' ' in m.group('num') or not is_fr and ',' in m.group(
                    'num'):
                _format = '{:,.1f}0'
            fixed = _format.format(converted).replace('.00', '')
        # keep the leading whitespace so the client won't
        # replace $500 inside C$500
        diff[m.group(0).rstrip()] = '{whitespace} ({en_currency}{value}{mil}{fr_currency})'.format(
            whitespace=m.group(0).rstrip(),
            en_currency='' if is_fr else 'C$',
            value=fixed if not is_fr else fixed.replace(',', ' ').replace('.', ','),
            mil=m.group('mil') or '',
            fr_currency=' $ CAN' if is_fr else '',
        ).rstrip()

    re.sub(CURRENCY_REGEX, repl, text)
    re.sub(CURRENCY_REGEX_FR, functools.partial(repl, is_fr=True), text)

    return (item, diff)
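The two re.sub calls above are used only for their side effect: repl records every match in diff and the substituted string is thrown away. A self-contained sketch of that pattern with an invented regex and sample text:

import re

diff = {}

def record(match):
    original = match.group(0)
    diff[original] = original + ' (converted)'  # record the replacement we want the client to apply
    return original                             # return the match unchanged; the sub result is discarded

re.sub(r'\$\d+', record, 'prices: $5 and $10')
# diff == {'$5': '$5 (converted)', '$10': '$10 (converted)'}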
Example #33
def callback(item, **kwargs):
    diff = {}
    if not item.get("body_html"):
        return diff
    rate = get_rate()
    text = get_text(item["body_html"], "html", True)

    def repl(m, is_fr=False):
        if m.group("currency") and m.group("currency") != "US":
            return
        if is_fr:
            num = m.group("num").replace(",", ".").replace(" ", "")
        else:
            num = m.group("num").replace(",", "")
        converted = decimal.Decimal(num) * rate
        if m.group("decimal"):
            _format = "{:.3f}"  # convert 55.21 to 73.73 - round to 3 decimals and strip last one
            if is_fr and " " in m.group("num") or not is_fr and "," in m.group(
                    "num"):
                _format = "{:,.3f}"
            fixed = _format.format(converted)[:-1]
        else:
            _format = "{:.1f}0"  # convert 55 to 73.70 - round to 1 decimal and add 0
            if is_fr and " " in m.group("num") or not is_fr and "," in m.group(
                    "num"):
                _format = "{:,.1f}0"
            fixed = _format.format(converted).replace(".00", "")
        # keep the leading whitespace so the client won't
        # replace $500 inside C$500
        diff[m.group(0).rstrip()] = "{whitespace} ({en_currency}{value}{mil}{fr_currency})".format(
            whitespace=m.group(0).rstrip(),
            en_currency="" if is_fr else "C$",
            value=fixed if not is_fr else fixed.replace(",", " ").replace(".", ","),
            mil=m.group("mil") or "",
            fr_currency=" $ CAN" if is_fr else "",
        ).rstrip()

    re.sub(CURRENCY_REGEX, repl, text)
    re.sub(CURRENCY_REGEX_FR, functools.partial(repl, is_fr=True), text)

    return (item, diff)
Example #34
    def format_for_source(self, article, subscriber, source, codes=None):
        try:
            pass_through = article.get('auto_publish', False)
            docs = []
            for category in self._get_category_list(
                    article.get('anpa_category')):
                article['source'] = source
                pub_seq_num, odbc_item = self.get_odbc_item(
                    article, subscriber, category, codes, pass_through)
                if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                    body = get_text(self.append_body_footer(article),
                                    content='html')
                    odbc_item['article_text'] = body.replace('\'', '\'\'')
                else:
                    body = self.get_text_content(
                        to_ascii(self.append_body_footer(article)))

                    if 'dateline' in article \
                            and 'text' in article.get('dateline', {}) and not pass_through:
                        if body.startswith('   '):
                            body = '   {} {}'.format(
                                article.get('dateline').get('text'), body[3:])
                    odbc_item['article_text'] = body.replace('\'', '\'\'')

                if not pass_through:
                    self.add_ednote(odbc_item, article)
                    self.add_byline(odbc_item, article)

                odbc_item['article_text'] += '\r\n' + source
                sign_off = article.get('sign_off', '') or ''
                if len(sign_off) > 0:
                    odbc_item['article_text'] += ' ' + sign_off

                odbc_item['category'] = odbc_item.get('category', '').upper()
                odbc_item['selector_codes'] = odbc_item.get(
                    'selector_codes', '').upper()

                docs.append((pub_seq_num, json.dumps(odbc_item)))

            return docs
        except Exception as ex:
            raise FormatterError.AAPNewscentreFormatterError(ex, subscriber)
Example #35
    def parse(self, xml, provider=None):
        self.root = xml
        try:
            item = self.parse_item(xml)
            if not item.get('headline'):
                item['headline'] = text_utils.get_text(
                    item.get('body_html', ''), 'html')[:100]

            try:
                abstract = xml.xpath(
                    "//iptc:description[@role='drol:summary']",
                    namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if abstract:
                    item['abstract'] = abstract
            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Example #36
    def _sanitize_fields(self, doc, validator):
        """If maxlength or minlength is specified in the validator then remove any markups from that field

        :param doc: Article to be validated
        :param validator: Validation rule
        :return: updated article
        """
        fields_to_check = ['minlength', 'maxlength']
        item_schema = validator.get('schema', {})
        extra_schema = item_schema.get('extra', {}).get('schema', {})
        schemes_docs = [(item_schema, doc), (extra_schema, doc.get('extra', {}))]
        for schema, content in schemes_docs:
            for field in schema:
                if content.get(field) and schema.get(field) and type(content[field]) is str and \
                        any(k in schema[field] for k in fields_to_check):
                    try:
                        content[field] = get_text(content[field])
                    except (ValueError, TypeError):
                        # fails for json fields like subject, genre
                        pass
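A short, hypothetical illustration of why markup is stripped before length validation; the field value is invented and get_text is assumed importable from superdesk.text_utils as elsewhere in these examples:

from superdesk.text_utils import get_text

headline = '<p>Breaking: <b>short</b> headline</p>'  # invented field value
visible = get_text(headline)                         # markup removed
# len(visible) now reflects the characters an editor actually sees,
# so minlength/maxlength rules are checked against the visible text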
Example #37
    def parse_newsitem(self, item, newsitem_el):
        super().parse_newsitem(item, newsitem_el)
        # map services-products from the category, and keep only one product
        matching = False
        for category in item.get('anpa_category', []):
            qcode = self.MAPPING_CATEGORY.get(category.get('qcode'),
                                              'NEWS/GENERAL')
            item.setdefault('subject', []).append({
                'name': qcode,
                'qcode': qcode,
                'parent': 'NEWS',
                'scheme': 'services-products'
            })
            matching = True
        if not matching:
            item.setdefault('subject', []).append({
                'name': 'NEWS/GENERAL',
                'qcode': 'NEWS/GENERAL',
                'parent': 'NEWS',
                'scheme': 'services-products'
            })

        # add content for headline when it is empty
        if item.get('urgency') in (1, 2) and not item.get('headline'):
            for line in get_text(item.get('body_html', ''),
                                 lf_on_block=True).split('\n'):
                if line.strip():
                    item['headline'] = 'URGENT: ' + line.strip()
                    break
        # Label must be empty
        item['subject'] = [
            i for i in item['subject'] if i.get('scheme') != 'label'
        ]
        # Source is AFP
        credit = {"name": 'AFP', "qcode": 'AFP', "scheme": "sources"}
        item.setdefault('subject', []).append(credit)

        if item.get('urgency') == 4:
            item['urgency'] = 3

        return item
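A hedged sketch of the urgent-headline fallback above with an invented body; the first non-empty line of the stripped text becomes the headline:

from superdesk.text_utils import get_text

body_html = '<p>Quake hits the coast.</p><p>More details follow.</p>'  # invented body
headline = ''
for line in get_text(body_html, lf_on_block=True).split('\n'):
    if line.strip():
        headline = 'URGENT: ' + line.strip()
        break
# headline would be something like 'URGENT: Quake hits the coast.'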
Example #38
    def _sanitize_fields(self, doc, validator):
        """If maxlength or minlength is specified in the validator then remove any markups from that field

        :param doc: Article to be validated
        :param validator: Validation rule
        :return: updated article
        """
        fields_to_check = ['minlength', 'maxlength']
        item_schema = validator.get('schema', {})
        extra_schema = item_schema.get('extra', {}).get('schema', {})
        schemes_docs = [(item_schema, doc),
                        (extra_schema, doc.get('extra', {}))]
        for schema, content in schemes_docs:
            for field in schema:
                if content.get(field) and schema.get(field) and type(content[field]) is str and \
                        any(k in schema[field] for k in fields_to_check):
                    try:
                        content[field] = get_text(content[field])
                    except (ValueError, TypeError):
                        # fails for json fields like subject, genre
                        pass
Example #39
    def parse_item(self, tree):
        item = super().parse_item(tree)
        meta = tree.find(self.qname("contentMeta"))

        organisation = meta.xpath(
            './iptc:subject[@type="cpnat:organisation"][@literal]',
            namespaces=NS)
        if organisation:
            org_name = organisation[0].get("literal")
            item["abstract"] = format_maxlength(
                "FOR: {}. {}".format(
                    org_name.upper().rstrip("."),
                    get_text(item["body_html"]).replace("  ", " "),
                ),
                200,
            )
            item.setdefault("subject", []).append({
                "name": org_name,
                "qcode": org_name,
                "scheme": cp.ORGANISATION,
            })

        return item
Example #40
def upload_document(item):
    item_name = item.get("headline") or item.get("slugline")
    if not item_name or not item.get("body_html"):
        return

    payload = {
        "lang": {
            "fromLang": "en",
            "toLang": "fr",
        },
        "name": item_name,
        "state": "new",
        "text": {
            "original": get_text(item["body_html"]),
        },
    }

    resp = sess.post(
        ULTRAD_URL, json=payload, headers=get_headers(), timeout=ULTRAD_TIMEOUT
    )
    raise_for_resp_error(resp)
    data = get_json(resp)
    return data["_id"]
Example #41
 def get_value(self, article):
     try:
         return get_text(article[self.field.name]).replace('\n', ' ')
     except (etree.XMLSyntaxError, ValueError):
         return article[self.field.name]
Example #42
    def format(self, article, subscriber, codes=None):
        try:
            docs = []
            formatted_article = deepcopy(article)
            for category in self._get_category_list(formatted_article.get('anpa_category')):
                mapped_source = self._get_mapped_source(formatted_article)
                formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                           formatted_article.get(config.ID_FIELD))
                pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
                anpa = []

                if codes:
                    anpa.append(b'\x05')
                    anpa.append(' '.join(codes).encode('ascii'))
                    anpa.append(b'\x0D\x0A')

                # start of message header (syn syn soh)
                anpa.append(b'\x16\x16\x01')
                anpa.append(get_service_level(category, formatted_article).encode('ascii'))

                # story number
                anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

                # field separator
                anpa.append(b'\x0A')  # -LF
                anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
                anpa.append(b'\x20')

                anpa.append(category['qcode'].lower().encode('ascii'))

                anpa.append(b'\x13')
                # format identifier
                if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                    anpa.append(b'\x12')
                else:
                    anpa.append(b'\x11')
                anpa.append(b'\x20')

                # keyword
                keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
                keyword = keyword[:24] if len(keyword) > 24 else keyword
                anpa.append(keyword.encode('ascii'))
                anpa.append(b'\x20')

                # version field
                anpa.append(b'\x20')

                # reference field
                anpa.append(b'\x20')

                # filing date
                anpa.append('{}-{}'.format(formatted_article['_updated'].strftime('%m'),
                                           formatted_article['_updated'].strftime('%d')).encode('ascii'))
                anpa.append(b'\x20')

                # add the word count
                anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x02')  # STX

                self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

                keyword = SluglineMapper().map(article=formatted_article, category=category['qcode'].upper(),
                                               truncate=True).encode('ascii', 'ignore')
                anpa.append(keyword)
                take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
                anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                if formatted_article.get('ednote', '') != '':
                    ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                    anpa.append(ednote.encode('ascii', 'replace'))

                if formatted_article.get(BYLINE):
                    anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                    anpa.append(b'\x0D\x0A')

                if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                    anpa.append(get_text(self.append_body_footer(formatted_article),
                                         content='html').encode('ascii', 'replace'))
                else:
                    body = to_ascii(formatted_article.get('body_html', ''))
                    # we need to inject the dateline
                    if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                        body_html_elem = parse_html(formatted_article.get('body_html'))
                        ptag = body_html_elem.find('.//p')
                        if ptag is not None:
                            ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                            body = to_string(body_html_elem)
                    anpa.append(self.get_text_content(body))
                    if formatted_article.get('body_footer'):
                        anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

                anpa.append(b'\x0D\x0A')
                anpa.append(mapped_source.encode('ascii'))
                sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
                anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x03')  # ETX

                # time and date
                anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

                anpa.append(b'\x04')  # EOT
                anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

                docs.append({'published_seq_num': pub_seq_num, 'encoded_item': b''.join(anpa),
                             'formatted_item': b''.join(anpa).decode('ascii')})

            return docs
        except Exception as ex:
            raise FormatterError.AnpaFormatterError(ex, subscriber)
Example #43
    def _parse_content(self, article):
        """Parse body_html and mapping to fields required for apple news format

        :param article:
        """
        statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
        analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
        verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
        references_regex = re.compile(r'^The References$', re.IGNORECASE)
        url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+', re.IGNORECASE)
        abstract = get_text(article.get('abstract'), content='html').strip()

        article['_title'] = abstract
        body_html = article.get('body_html')
        article['_analysis_first_line'] = ''
        article['_analysis'] = ''
        article['_statement'] = ''
        article['_statement_attribution'] = ''
        article['_verdict1'] = ''
        article['_verdict2'] = ''
        article['_references'] = ''
        article['_revision_history'] = ''

        if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED:
            article['_title'] = 'This article has been removed.'
            article['_analysis_first_line'] = 'This article has been removed.'
            article['_analysis'] = 'This article has been removed.'
            article['_statement'] = 'This article has been removed.'
            article['_statement_attribution'] = 'This article has been removed.'
            article['_verdict1'] = 'This article has been removed.'
            article['_verdict2'] = 'This article has been removed.'
            article['_references'] = 'This article has been removed.'
            self._set_revision_history(article)
            return

        parsed_content = parse_html(body_html, content='html')
        statement_found = False
        analysis_found = False
        analysis_first_line = False
        verdict1_found = False
        verdict2_found = False
        references_found = False
        statement_elements = []

        for top_level_tag in parsed_content.xpath('/html/div/child::*'):
            tag_text = format_text_content(top_level_tag).strip()
            if not tag_text:
                continue

            if not verdict1_found:
                if not statement_found:
                    match = statement_regex.search(tag_text)
                    if match:
                        statement_found = True
                    continue
                else:
                    # statement found
                    match = verdict_regex.search(tag_text)
                    if match:
                        verdict1_found = True
                        if len(statement_elements) > 1:
                            statement_length = len(statement_elements) - 1
                            for i in range(statement_length):
                                article['_statement'] += get_text(
                                    to_string(statement_elements[i], remove_root_div=False),
                                    content='html'
                                ).strip()
                                if statement_length > 1 and i != statement_length - 1:
                                    article['_statement'] += '\r\n'

                            article['_statement_attribution'] = get_text(
                                to_string(statement_elements[-1:][0], remove_root_div=False),
                                content='html'
                            ).strip()
                        elif len(statement_elements) == 1:
                            article['_statement'] = to_string(
                                statement_elements[0],
                                remove_root_div=False
                            )
                        continue

                    statement_elements.append(top_level_tag)
                    continue

            if verdict1_found and not analysis_found:
                match = analysis_regex.search(tag_text)
                if match:
                    analysis_found = True
                else:
                    article['_verdict1'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if analysis_found and not verdict2_found:
                if not analysis_first_line:
                    article['_analysis_first_line'] = tag_text
                    analysis_first_line = True

                match = verdict_regex.search(tag_text)
                if match:
                    verdict2_found = True
                else:
                    article['_analysis'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if verdict2_found and not references_found:
                match = references_regex.search(tag_text)
                if match:
                    references_found = True
                else:
                    article['_verdict2'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if references_found:
                def replacement(match_object):
                    value = match_object.group(0)
                    if value:
                        return '<a href="{0}">{0}</a>'.format(value)
                    return ''

                tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()

                article['_references'] += '<li>{}</li>'.format(
                    re.sub(url_regex, replacement, tag_text)
                )

        if len(article['_references']):
            article['_references'] = '<ol>{}</ol>'.format(article['_references'])

        if not article.get('_statement') and article.get('_statement_attribution'):
            # if statement is not as per the format
            article['_statement'] = article.get('_statement_attribution')
            article['_statement_attribution'] = ''

        self._set_revision_history(article)
Example #44
def plaintext_filter(value):
    """Filter out html from value."""
    return get_text(value).replace('\n', ' ').strip()
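A quick usage example for the filter defined above; the sample values are invented and the exact output depends on get_text's whitespace handling:

plaintext_filter('<p>Hello <b>world</b></p>')  # returns roughly 'Hello world'
plaintext_filter('line one\nline two')         # newlines collapsed to spaces: 'line one line two'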
Example #45
    def parse(self, xml, provider=None):
        self.root = xml
        try:
            item = self.parse_item(xml)
            if not item.get('headline'):
                item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

            # abstract
            try:
                abstract = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if abstract:
                    item['abstract'] = abstract

            # genre
            for genre_elt in xml.xpath("//iptc:genre", namespaces={'iptc': IPTC_NS}):
                qcode = genre_elt.get('qcode')
                if qcode is None:
                    continue
                elif qcode.startswith('sttgenre:'):
                    qcode = qcode[9:]
                    genre_data = {'qcode': qcode}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        name = self.getVocabulary("genre", qcode, name)
                    except ValueError:
                        continue
                    else:
                        genre_data['name'] = name
                        item.setdefault('genre', []).append(genre_data)
                elif qcode.startswith('sttversion:'):
                    qcode = qcode[11:]
                    version_data = {'qcode': qcode, 'scheme': 'sttversion'}
                    name_elt = genre_elt.find(self.qname('name'))
                    name = name_elt.text if name_elt is not None and name_elt.text else ""
                    try:
                        name = self.getVocabulary("sttgenre", qcode, name)
                    except ValueError:
                        continue
                    else:
                        version_data['name'] = name
                        item.setdefault('subject', []).append(version_data)

            # location
            for location_elt in xml.xpath("//iptc:assert", namespaces={'iptc': IPTC_NS}):
                qcode = location_elt.get("qcode")
                if not qcode or not qcode.startswith("sttlocmeta:default:"):
                    continue
                qcode = qcode[19:]
                location_data = {"scheme": "sttlocmeta:default", "qcode": qcode}
                for broader_elt in location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']",
                                                      namespaces={'iptc': IPTC_NS}):
                    qcode = broader_elt.get('qcode')
                    if not qcode:
                        continue
                    for key, mapping in STT_LOCATION_MAP.items():
                        if qcode.startswith(key + ":"):
                            if "qcode" in mapping:
                                qcode = qcode[len(key) + 1:]
                            try:
                                name = broader_elt.find(self.qname('name')).text
                            except AttributeError:
                                name = ""
                            try:
                                name = self.getVocabulary(key, qcode, name)
                            except ValueError:
                                continue
                            else:
                                location_data[mapping["qcode"]] = qcode
                                if "name" in mapping:
                                    location_data[mapping["name"]] = name
                item.setdefault('place', []).append(location_data)

            # public editorial note
            if 'ednote' in item:
                # stt has specific roles for public and private editorial notes
                # so we remove ednote found by parent parser, as it takes first one
                # as a public note
                del item['ednote']
            try:
                ednote = xml.xpath("//iptc:edNote[@role='sttnote:public']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if ednote:
                    item['ednote'] = ednote

            # private editorial note
            try:
                private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces={'iptc': IPTC_NS})[0].text
            except IndexError:
                pass
            else:
                if private_note:
                    item.setdefault('extra', {})['sttnote_private'] = private_note

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
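
The parser above pulls the abstract and the public/private editorial notes out of NewsML-G2 with namespaced XPath queries. The self-contained sketch below shows that query pattern with lxml; the XML fragment is illustrative only, and NS stands in for the NAR namespace the parser resolves through its own IPTC_NS constant.

from lxml import etree

# Stand-in for the parser's IPTC_NS constant, which sits outside this excerpt.
NS = 'http://iptc.org/std/nar/2006-10-01/'

SAMPLE = '''<newsItem xmlns="{ns}">
  <contentMeta>
    <edNote role="sttnote:public">Public note</edNote>
    <edNote role="sttnote:private">Private note</edNote>
    <description role="drol:summary">Short summary</description>
  </contentMeta>
</newsItem>'''.format(ns=NS)

root = etree.fromstring(SAMPLE.encode('utf-8'))
nsmap = {'iptc': NS}

# Same pattern as the parser: take the first match if present, otherwise skip.
try:
    abstract = root.xpath("//iptc:description[@role='drol:summary']", namespaces=nsmap)[0].text
except IndexError:
    abstract = None

ednotes = root.xpath("//iptc:edNote[@role='sttnote:public']", namespaces=nsmap)

print(abstract)          # Short summary
print(ednotes[0].text)   # Public note
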
Example #46
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        ninjs = {
            'guid': article.get(GUID_FIELD, article.get('uri')),
            'version': str(article.get(config.VERSION, 1)),
            'type': self._get_type(article)
        }

        if article.get('byline'):
            ninjs['byline'] = article['byline']

        located = article.get('dateline', {}).get('located', {})
        if located:
            ninjs['located'] = located.get('city', '')

        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if 'body_text' not in article and 'alt_text' in article:
            ninjs['body_text'] = article['alt_text']

        if 'title' in article:
            ninjs['headline'] = article['title']

        if article.get('body_html'):
            ninjs['body_html'] = self.append_body_footer(article)

        if article.get('description'):
            ninjs['description_html'] = self.append_body_footer(article)

        if article.get('place'):
            ninjs['place'] = self._format_place(article)

        if article.get('profile'):
            ninjs['profile'] = self._format_profile(article['profile'])

        extra_items = None
        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
                if article.get(ASSOCIATIONS):
                    associations, extra_items = self._format_related(article, subscriber)
                    ninjs[ASSOCIATIONS].update(associations)
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
        if extra_items:
            ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

        if article.get(EMBARGO):
            ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

        if article.get('priority'):
            ninjs['priority'] = article['priority']
        else:
            ninjs['priority'] = 5

        if article.get('subject'):
            ninjs['subject'] = self._get_subject(article)

        if article.get('anpa_category'):
            ninjs['service'] = self._get_service(article)
        if article.get('renditions'):
            ninjs['renditions'] = self._get_renditions(article)
        elif 'url' in article:
            ninjs['renditions'] = self._generate_renditions(article)

        # SDPA-317
        if 'abstract' in article:
            abstract = article.get('abstract', '')
            ninjs['description_html'] = abstract
            ninjs['description_text'] = text_utils.get_text(abstract)
        elif article.get('description_text'):
            ninjs['description_text'] = article.get('description_text')

        if article.get('company_codes'):
            ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                      'symbols': [{'ticker': c.get('qcode', ''),
                                                   'exchange': c.get('security_exchange', '')}]}
                                     for c in article['company_codes']]
        elif 'company' in article:
            ninjs['organisation'] = [{'name': article['company']}]

        if article.get('rewrite_of'):
            ninjs['evolvedfrom'] = article['rewrite_of']

        if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
            ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

        if 'genre' in article:
            ninjs['genre'] = self._get_genre(article)

        if article.get('flags', {}).get('marked_for_legal'):
            ninjs['signal'] = self._format_signal_cwarn()

        if article.get('attachments'):
            ninjs['attachments'] = self._format_attachments(article)

        if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
            if 'body_html' in ninjs:
                body_html = ninjs['body_html']
                word_count = text_utils.get_word_count(body_html)
                char_count = text_utils.get_char_count(body_html)
                readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
            else:
                body_text = ninjs['body_text']
                word_count = text_utils.get_text_word_count(body_text)
                char_count = len(body_text)
                readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
            ninjs['charcount'] = char_count
            ninjs['wordcount'] = word_count
            ninjs['readtime'] = readtime

        if article.get('authors'):
            ninjs['authors'] = self._format_authors(article)

        return ninjs
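
The last block of the formatter above derives charcount, wordcount and readtime through superdesk's text_utils helpers. A rough plain-text stand-in is sketched below; the 250 words-per-minute rate and the one-minute floor are assumptions for illustration, not text_utils' actual rules.

import re


def text_metrics_sketch(body_text, words_per_minute=250):
    """Rough stand-in for the wordcount/charcount/readtime step on plain text."""
    words = re.findall(r'\S+', body_text)
    word_count = len(words)
    char_count = len(body_text)
    # Assumed reading speed; at least one minute for any non-empty text.
    read_minutes = max(1, round(word_count / words_per_minute)) if word_count else 0
    return {'wordcount': word_count, 'charcount': char_count, 'readtime': read_minutes}


print(text_metrics_sketch('Markets rallied on Tuesday after the announcement.'))
# {'wordcount': 7, 'charcount': 50, 'readtime': 1}
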
Example #47
    def _format_body_content(self, article, body_content):
        if article.get(FORMAT) == FORMATS.PRESERVED:
            pre = get_text(self.append_body_footer(article))
            SubElement(body_content, 'pre').text = pre
        else:
            self.map_html_to_xml(body_content, self.append_body_footer(article))
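
The preserved-format branch above places the untouched text inside a <pre> element of the XML body. A minimal standard-library sketch of that branch follows; the 'body.content' tag name is an assumption for illustration (the real element is created by the caller), and the superdesk-specific map_html_to_xml branch is omitted.

from xml.etree.ElementTree import Element, SubElement, tostring

# 'body.content' is only a plausible tag name; the formatter receives the
# real element from its caller.
body_content = Element('body.content')

# Preserved-format branch: keep the text verbatim inside a <pre> element.
SubElement(body_content, 'pre').text = 'COPY AS FILED\nDo not reformat.'

print(tostring(body_content, encoding='unicode'))
# <body.content><pre>COPY AS FILED
# Do not reformat.</pre></body.content>
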
Example #48
    def _set_headline(self, item, value):
        if not value:
            # if there is no headline, we use the first 100 chars of the body
            # cf. SDNTB-481
            value = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
        item['headline'] = value
Example #49
    def _ednote_filter(self, ednote):
        return text_utils.get_text(ednote, lf_on_block=True).strip()
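
The filter above relies on get_text(..., lf_on_block=True), which, judging by its use across these examples, turns block-level elements into separate lines before stripping markup. The sketch below approximates that behaviour for simple <p>-based notes using lxml; it is not superdesk's implementation.

from lxml import html


def ednote_filter_sketch(ednote):
    """Approximate get_text(..., lf_on_block=True) for simple <p>-based notes."""
    if not ednote:
        return ''
    tree = html.fromstring(ednote)
    paragraphs = tree.findall('.//p')
    if not paragraphs:
        return tree.text_content().strip()
    return '\n'.join(p.text_content().strip() for p in paragraphs).strip()


print(ednote_filter_sketch('<p>Corrects spelling of name.</p><p>Updates headline.</p>'))
# Corrects spelling of name.
# Updates headline.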