def format(self, article, subscriber, codes=None):
    """
    Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        # prefer the dedicated SMS message, fall back to the abstract
        sms_message = article.get('sms_message', article.get('abstract', ''))
        # category = 1 is used to indicate a test message
        category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
            else article.get('anpa_category', [{}])[0].get('qcode').upper()
        # single quotes are doubled throughout for SQL escaping
        odbc_item = {'Sequence': pub_seq_num, 'Category': category,
                     'Headline': to_ascii(get_text(sms_message, content='html')).replace('\'', '\'\''),
                     'Priority': map_priority(article.get('priority'))}
        body = self.append_body_footer(article)
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = get_text(body, content='html')
        odbc_item['StoryText'] = to_ascii(body).replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'
        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def format(self, article, subscriber, codes=None):
    """Format the article as plain text output.

    :param dict article: article to format
    :param dict subscriber: destination subscriber
    :param codes: selector codes (unused here)
    :return: list with a single (sequence number, json string) tuple
    """
    try:
        formatted_doc = {}
        formatted_doc['headline'] = get_text(article.get('headline', ''), content='html')
        # escape single quotes (SQL style) and replace non-breaking spaces
        formatted_doc['headline'] = formatted_doc['headline'].replace(
            '\'', '\'\'').replace('\xA0', ' ')
        formatted_doc['keyword'] = article.get('slugline', '').replace('\'', '\'\'')

        # body formatting
        if article.get(FORMAT) == FORMATS.PRESERVED:
            body = get_text(self.append_body_footer(article), content='html')
            formatted_doc['article_text'] = body.replace('\'', '\'\'')
        elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
            body = self.get_wrapped_text_content(
                to_ascii(self.append_body_footer(article))).replace(
                '\'', '\'\'')
            formatted_doc['article_text'] = body
            self.refine_article_body(formatted_doc, article)

        # Frame the text output according to AAP requirement
        formatted_output = 'KEYWORD: ' + formatted_doc.get('keyword', '') + '\r\n'
        formatted_output += 'HEADLINE: ' + formatted_doc.get(
            'headline', '') + '\r\n'
        formatted_output += ' ' + formatted_doc.get('article_text', '')

        return [(0, json.dumps({'article_text': formatted_output}))]
    except Exception as ex:
        raise FormatterError.AAPTextFormatterError(ex, subscriber)
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Map a superdesk article onto the custom ninjs-like output structure."""
    abstract = article.get("abstract")
    dateline = article.get("dateline") or {}
    body_lines = get_text(article["body_html"], lf_on_block=True).split("\n")

    ninjs = {
        "uuid": article["guid"],
        "createdTimestamp": format_datetime(article["firstcreated"]),
        "latestVersionTimestamp": format_datetime(article["versioncreated"]),
        "publicationTimestamp": format_datetime(article["firstpublished"]),
        "authors": [author["sub_label"] for author in article.get("authors") or []],
        "language": article["language"],
        "pubStatus": True,
        "concepts": self._format_concepts(article),
        "headline": get_text(article["headline"]),
        # preamble/dateline fall back to empty strings when absent
        "preamble": get_text(abstract, lf_on_block=True).strip() if abstract else "",
        "dateline": dateline.get("text") or "",
        "body": [segment.strip() for segment in body_lines if segment],
    }
    return ninjs
def _format_body_content(self, article, body_content):
    """Assemble the NITF body text (ednote, byline, body, footer, sign off) into one <pre> element.

    :param dict article: article to format
    :param Element body_content: the nitf body.content element to append to
    """
    nitf_body = []
    if article.get('ednote'):
        nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))
    if article.get(BYLINE):
        nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))
    if article.get(FORMAT) == FORMATS.PRESERVED:
        nitf_body.append(to_ascii(get_text(self.append_body_footer(article), content='html')))
    else:
        body = article.get('body_html', '')
        # we need to inject the dateline
        if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
            body_html_elem = parse_html(article.get('body_html'))
            ptag = body_html_elem.find('.//p')
            if ptag is not None:
                ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                body = to_string(body_html_elem)
        nitf_body.append(self.get_text_content(body))
        if article.get('body_footer'):
            nitf_body.append(self.get_text_content(article.get('body_footer', '')))
    # sign off is "<source> <sign_off>", skipped entirely when both are empty
    sign_off = '{} {}'.format(article.get('source') or '', (article.get('sign_off') or '')).strip()
    if sign_off:
        nitf_body.append(to_ascii(self._format_line(sign_off)))
    SubElement(body_content, 'pre').text = ''.join(nitf_body)
def get_odbc_item(self, article, subscriber, category, codes, pass_through=False):
    """
    Construct an odbc_item with the common key value pairs populated, if pass_through is true then the
    original headline is maintained.

    :param dict article: article to format
    :param dict subscriber: destination subscriber
    :param dict category: anpa category entry (provides qcode)
    :param codes: selector codes
    :param bool pass_through: keep the original slugline/headline unmapped
    :return: (sequence number, odbc_item dict)
    """
    article['headline'] = get_text(article.get('headline', ''), content='html')
    pub_seq_num = superdesk.get_resource_service(
        'subscribers').generate_sequence_number(subscriber)
    # single quotes are doubled for SQL escaping in all string values
    odbc_item = dict(
        originator=article.get('source', None),
        sequence=pub_seq_num,
        category=category.get('qcode').lower(),
        author=get_text(article.get('byline', '') or '', content='html').replace('\'', '\'\''),
        keyword=SluglineMapper().map(
            article=article,
            category=category.get('qcode').upper(),
            truncate=True).replace('\'', '\'\'') if not pass_through
        else (article.get('slugline', '') or '').replace('\'', '\'\''),
        subject_reference=set_subject(category, article),
        take_key=(article.get('anpa_take_key', '') or '').replace('\'', '\'\''))
    if 'genre' in article and len(article['genre']) >= 1:
        odbc_item['genre'] = article['genre'][0].get('name', None)
    else:
        odbc_item['genre'] = 'Current'  # @genre
    odbc_item['news_item_type'] = 'News'
    odbc_item['fullStory'] = 1
    odbc_item['ident'] = '0'  # @ident
    odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '
    headline = to_ascii(LocatorMapper().get_formatted_headline(
        article, category.get('qcode').upper()))
    # escape quotes and replace non-breaking spaces
    odbc_item['headline'] = headline.replace('\'', '\'\'').replace('\xA0', ' ')
    self.expand_subject_codes(odbc_item)
    self.set_usn(odbc_item, article)
    return pub_seq_num, odbc_item
def append_body_footer(self, article):
    """
    Checks if the article has any Public Service Announcements and if available appends each of them to the body.

    :return: body with public service announcements.
    """
    try:
        article['body_html'] = article['body_html'].replace('<br>', '<br/>')
    except KeyError:
        pass

    body = ''
    if article[ITEM_TYPE] in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
        body = article.get('body_html', '')
    elif article[ITEM_TYPE] in [CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO]:
        body = article.get('description', '')

    if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
        # normalise newlines to CRLF, then turn <br> elements into CRLF as well
        body = body.replace('\n', '\r\n').replace('\r\r', '\r')
        parsed = parse_html(body, content='html')
        for br in parsed.xpath('//br'):
            # conditional binds looser than '+': ('\r\n' + tail) if tail else '\r\n'
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")

    if body and article.get('body_footer'):
        footer = article.get('body_footer')
        if article.get(FORMAT, '') == FORMATS.PRESERVED:
            # preserved output is plain text, so strip markup from the footer
            body = '{}\r\n{}'.format(body, get_text(footer))
        else:
            body = '{}{}'.format(body, footer)
    return body
def _format_body_content(self, article, body_content):
    """Fill body_content: preserved items go into a <pre>, everything else is mapped to xml."""
    body_with_footer = self.append_body_footer(article)
    if article.get(FORMAT) != FORMATS.PRESERVED:
        self.map_html_to_xml(body_content, body_with_footer)
    else:
        SubElement(body_content, 'pre').text = get_text(body_with_footer)
def _set_headline(self, item, value):
    """Set the headline, falling back to the first 100 chars of the body (cf. SDNTB-481)."""
    headline = value
    if not headline:
        # no headline given: derive one from the plain-text body
        headline = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
    item['headline'] = headline
def append_body_footer(self, article):
    """
    Checks if the article has any Public Service Announcements and if available appends each of them to the body.

    :return: body with public service announcements.
    """
    if "body_html" in article:
        article["body_html"] = article["body_html"].replace("<br>", "<br/>")

    item_type = article[ITEM_TYPE]
    if item_type in (CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED):
        body = article.get("body_html", "")
    elif item_type in (CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO):
        body = article.get("description", "")
    else:
        body = ""

    preserved = article.get(FORMAT, "") == FORMATS.PRESERVED
    if body and preserved:
        # normalise newlines to CRLF and convert <br> elements into CRLF
        body = body.replace("\n", "\r\n").replace("\r\r", "\r")
        parsed = parse_html(body, content="html")
        for br in parsed.xpath("//br"):
            br.tail = "\r\n" + br.tail if br.tail else "\r\n"
        etree.strip_elements(parsed, "br", with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")

    footer = article.get("body_footer")
    if body and footer:
        if preserved:
            # preserved output is plain text, so strip markup from the footer
            body = "{}\r\n{}".format(body, get_text(footer))
        else:
            body = "{}{}".format(body, footer)
    return body
def _set_headline(self, item, value):
    """Set headline; when empty, use the first 100 chars of the body instead (cf. SDNTB-481)."""
    if value:
        item["headline"] = value
    else:
        item["headline"] = text_utils.get_text(item.get("body_html", ""), "html")[:100]
def _set_revision_history(self, article):
    """Get revision history of published article

    :param dict article:
    """
    # all published/archived revisions of this item, oldest first
    query = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': {
                            'term': {
                                'item_id': article.get('item_id')
                            }
                        }
                    }
                }
            }
        },
        'sort': [{
            'versioncreated': {
                'order': 'asc'
            }
        }]
    }
    req = ParsedRequest()
    repos = 'published,archived'
    req.args = {
        'source': json.dumps(query),
        'repo': repos,
        'aggregations': 0
    }
    revisions = list(
        get_resource_service('search').get(req=req, lookup=None))
    revisions_tag = []
    for rev in revisions:
        # first publish uses firstpublished, later revisions use versioncreated
        local_date = utc_to_local(
            config.DEFAULT_TIMEZONE,
            rev.get('firstpublished') if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED
            else rev.get('versioncreated'))
        # 'XXX' placeholder avoids the zero-padded %d; replaced with the bare day number
        date_string = datetime.strftime(local_date,
                                        '%b XXX, %Y %H:%M %Z').replace(
            'XXX', str(local_date.day))
        if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED:
            revisions_tag.append('<li>{} {}</li>'.format(
                'First published', date_string))
        else:
            revision_markup = '{} {}'.format('Revision published', date_string)
            ednote = get_text(rev.get('ednote') or '', content='html').strip()
            # corrections show their editor note alongside the revision line
            if rev.get(ITEM_STATE) == CONTENT_STATE.CORRECTED and ednote:
                revision_markup += '<br><i>{}</i>'.format(ednote)
            revisions_tag.append('<li>{}</li>'.format(revision_markup))
    article['_revision_history'] = '<ul>{}</ul>'.format(
        ''.join(revisions_tag)) if revisions_tag else ''
def _fill_definition_short(self, document, item):
    """Copy the <content> element's text into definition_short as stripped plain text."""
    content = document.find('content')
    if content is None:
        return
    plain = text_utils.get_text(
        content.text,
        content='html',
        lf_on_block=True,
        space_on_elements=True,
    )
    item['definition_short'] = plain.strip()
def get_item_body(item):
    """Return the non-empty, stripped text lines of body_html and abstract, in that order."""
    lines = []
    for field in ("body_html", "abstract"):
        if field not in item:
            continue
        text = get_text(item[field], "html", True)
        lines.extend(stripped for stripped in (raw.strip() for raw in text.split("\n")) if stripped)
    return lines
def parse_item(self, tree):
    """Parse the item, prefixing the abstract with the organisation the release is for.

    :param tree: newsml xml tree
    :return dict: parsed item
    """
    item = super().parse_item(tree)
    meta = tree.find(self.qname('contentMeta'))
    organisation = meta.xpath('./iptc:subject[@type="cpnat:organisation"][@literal]', namespaces=NS)
    if organisation:
        # "FOR: <ORG>. <body text>" capped at 200 chars; non-breaking spaces normalised
        item['abstract'] = format_maxlength('FOR: {}. {}'.format(
            organisation[0].get('literal').upper().rstrip('.'),
            get_text(item['body_html']).replace('\xa0', ' '),
        ), 200)
    return item
def format(self, article, subscriber, codes=None):
    """
    Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure

    :param dict article: article to format
    :param dict subscriber: destination subscriber
    :param codes: selector codes (unused here)
    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    :raises FormatterError.AAPSMSFormatterError: on any failure while formatting
    """
    try:
        pub_seq_num = superdesk.get_resource_service(
            'subscribers').generate_sequence_number(subscriber)
        # prefer the dedicated SMS message, fall back to the abstract
        sms_message = article.get('sms_message', article.get('abstract', ''))
        # category = 1 is used to indicate a test message
        if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True:
            category = '1'
        else:
            # default qcode to '' so a category entry without a qcode does not
            # raise AttributeError on None (the [{}] default shows it can be absent)
            category = article.get('anpa_category', [{}])[0].get('qcode', '').upper()
        odbc_item = {
            'Sequence': pub_seq_num,
            'Category': category,
            # single quotes are doubled for SQL escaping
            'Headline': to_ascii(get_text(sms_message, content='html')).replace('\'', '\'\''),
            'Priority': map_priority(article.get('priority'))
        }
        body = self.append_body_footer(article)
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = get_text(body, content='html')

        odbc_item['StoryText'] = to_ascii(body).replace(
            '\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'

        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def get_permalink(item):
    """Build the public permalink for an item from its language, slug and id suffix."""
    code = item["_id"][-6:]
    try:
        # slug comes from the permalink extra field; missing/None values yield an empty slug
        slug = slugify(get_text(item["extra"][PERMALINK] or "", 'html'))
    except (KeyError, AttributeError):
        slug = ""
    path = "/{lang}/{code}/".format(
        lang=item.get("language", "en"),
        code="-".join(part for part in [slug, code] if part),
    )
    return urljoin(BASE_URL, path)
def add_byline(self, odbc_item, article):
    """
    Prefix the article text with a formatted byline, when the article has one.

    :param odbc_item: dict being prepared for the stored procedure
    :param article: source article
    :return:
    """
    raw_byline = article.get('byline')
    if not raw_byline:
        return
    byline = get_text(raw_byline, content='html')
    # ensure the byline starts with "By " unless it already does
    if len(byline) >= 3 and byline[:2].upper() != 'BY':
        byline = 'By ' + byline
    formatted = ' {}\r\n\r\n'.format(byline).replace('\'', '\'\'')
    odbc_item['article_text'] = formatted + odbc_item['article_text']
def add_byline(self, odbc_item, article):
    """
    Prepend the byline (wrapped in 0x19 markers) to the article text.

    :param odbc_item: dict being prepared for the stored procedure
    :param article: source article
    :return:
    """
    raw_byline = article.get('byline')
    if not raw_byline:
        return
    byline = get_text(raw_byline, content='html')
    # ensure the byline starts with "By " unless it already does
    if len(byline) >= 3 and byline[:2].upper() != 'BY':
        byline = 'By ' + byline
    marked = '\x19 {}\x19\r\n'.format(byline).replace('\'', '\'\'')
    odbc_item['article_text'] = marked + odbc_item['article_text']
def get_odbc_item(self, article, subscriber, category, codes, pass_through=False):
    """
    Construct an odbc_item with the common key value pairs populated, if pass_through is true then the
    original headline is maintained.

    :param article:
    :param subscriber:
    :param category:
    :param codes:
    :param pass_through:
    :return:
    """
    article['headline'] = get_text(article.get('headline', ''), content='html')
    pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
    qcode = category.get('qcode')

    # keyword: pass-through keeps the raw slugline, otherwise it is mapped
    if pass_through:
        keyword = (article.get('slugline', '') or '').replace('\'', '\'\'')
    else:
        keyword = SluglineMapper().map(
            article=article, category=qcode.upper(), truncate=True).replace('\'', '\'\'')

    # single quotes are doubled throughout for SQL escaping
    odbc_item = {
        'originator': article.get('source', None),
        'sequence': pub_seq_num,
        'category': qcode.lower(),
        'author': get_text(article.get('byline', '') or '', content='html').replace('\'', '\'\''),
        'keyword': keyword,
        'subject_reference': set_subject(category, article),
        'take_key': (article.get('anpa_take_key', '') or '').replace('\'', '\'\''),
    }

    if 'genre' in article and len(article['genre']) >= 1:
        odbc_item['genre'] = article['genre'][0].get('name', None)
    else:
        odbc_item['genre'] = 'Current'  # @genre
    odbc_item['news_item_type'] = 'News'
    odbc_item['fullStory'] = 1
    odbc_item['ident'] = '0'  # @ident
    odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '

    headline = to_ascii(LocatorMapper().get_formatted_headline(article, qcode.upper()))
    # escape quotes and normalise non-breaking spaces
    odbc_item['headline'] = headline.replace('\'', '\'\'').replace('\xA0', ' ')

    self.expand_subject_codes(odbc_item)
    self.set_usn(odbc_item, article)
    return pub_seq_num, odbc_item
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body.

    The sentence is stripped of markup first and truncated afterwards, so an
    HTML tag is never cut in half and the abstract keeps up to 64 visible
    characters (the original order truncated the raw HTML, which could split
    a tag and produce fewer visible characters).

    :param dict item: item whose abstract is populated
    :return dict: the same item, updated in place
    """
    if not item.get("body_html", None):
        item["abstract"] = "No body found to use for abstract..."
    else:
        # get the list of sentences of the body
        sentences = p.split(item["body_html"])
        if sentences and len(sentences) > 0:
            # chop the first sentence to size for abstract (64)
            item["abstract"] = get_text(sentences[0]).strip()[:64]
    return item
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body"""
    body = item.get('body_html', None)
    if not body:
        item['abstract'] = 'No body found to use for abstract...'
        return item
    # split the body into sentences; keep the first, chopped to 64 chars
    sentences = p.split(body)
    if sentences:
        item['abstract'] = get_text(sentences[0][:64]).strip()
    return item
def _process_headline(self, anpa, article, category):
    """Append the formatted headline bytes to the anpa output list.

    :param list anpa: list of byte strings being assembled into the ANPA output
    :param dict article: the article being formatted
    :param bytes category: category qcode as an ascii byte string
    """
    # prepend the locator to the headline if required
    article['headline'] = get_text(article.get('headline', ''))
    headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.decode('UTF-8').upper()))
    # Set the maximum size to 64 including the sequence number if any
    if len(headline) > 64:
        if article.get('sequence'):
            # reserve room for the '=<sequence>' suffix
            digits = len(str(article['sequence'])) + 1
            shortened_headline = '{}={}'.format(headline[:-digits][:(64 - digits)], article['sequence'])
            anpa.append(shortened_headline.encode('ascii', 'replace'))
        else:
            anpa.append(headline[:64].encode('ascii', 'replace'))
    else:
        anpa.append(headline.encode('ascii', 'replace'))
    anpa.append(b'\x0D\x0A')
def _format_content(self, article, news_item, nitf):
    """Adds the content set to the xml

    :param dict article:
    :param Element news_item:
    :param Element nitf:
    """
    content_set = SubElement(news_item, 'contentSet')
    if article.get(FORMAT) == FORMATS.PRESERVED:
        # preserved content is carried as plain text inline data
        plain_text = text_utils.get_text(self.append_body_footer(article))
        data_el = SubElement(content_set, 'inlineData', attrib={'contenttype': 'text/plain'})
        data_el.text = plain_text
    elif article[ITEM_TYPE] in (CONTENT_TYPE.TEXT, CONTENT_TYPE.COMPOSITE):
        # text/composite items embed the nitf document directly
        inline = SubElement(content_set, 'inlineXML', attrib={'contenttype': 'application/nitf+xml'})
        inline.append(nitf)
def truncate_article_body(items, monitoring_profile, full_text=False):
    """Manually truncate each item's body text so PDF/RTF creators honour linked_text settings."""
    for item in items:
        item['body_str'] = get_text(item.get('body_html', ''), content='html', lf_on_block=True)
        if monitoring_profile['alert_type'] == 'linked_text':
            if not full_text and len(item['body_str']) > 160:
                # keep 159 chars plus an ellipsis
                item['body_str'] = item['body_str'][:159] + '...'
        if monitoring_profile.get('format_type') == 'monitoring_pdf':
            # wrap every line in a div so the PDF renderer keeps line breaks
            item['body_str'] = ''.join(
                '<div class="line">{}</div>'.format(line)
                for line in item.get('body_str', '').split('\n')
            )
def format_for_source(self, article, subscriber, source, codes=None):
    """Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure

    :type article: object
    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    pass_through = article.get('auto_publish', False)
    try:
        docs = []
        for category in self._get_category_list(article.get('anpa_category')):
            # All NZN sourced content is AAP content for the AAP output formatted
            article['source'] = source
            pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)
            if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                body = get_text(self.append_body_footer(article))
                odbc_item['article_text'] = body.replace('\'', '\'\'')
                odbc_item['texttab'] = 't'
            elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                body = self.get_wrapped_text_content(
                    to_ascii(self.append_body_footer(article))).replace('\'', '\'\'')
                # if we have a dateline inject it
                if 'dateline' in article and 'text' in article.get('dateline', {}) and not pass_through:
                    # the wrapped body starts with a 3-space indent; keep it and
                    # splice the dateline in front of the remaining text
                    if body.startswith('   '):
                        body = '   {} {}'.format(article.get('dateline')
                                                 .get('text').replace('\'', '\'\''),
                                                 body[3:])
                odbc_item['article_text'] = body
                odbc_item['texttab'] = 'x'

            if not pass_through:
                self.add_ednote(odbc_item, article)
                self.add_byline(odbc_item, article)

            odbc_item['article_text'] += '\r\n' + article.get('source', '')
            sign_off = article.get('sign_off', '') or ''
            if len(sign_off) > 0:
                odbc_item['article_text'] += ' ' + sign_off

            odbc_item['service_level'] = get_service_level(category, article)  # @service_level
            odbc_item['wordcount'] = article.get('word_count') or 0  # @wordcount
            odbc_item['priority'] = map_priority(article.get('priority'))  # @priority
            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)
def parse(self, xml, provider=None):
    """Parse a newsml document, defaulting headline from the body and abstract from the summary description."""
    self.root = xml
    try:
        item = self.parse_item(xml)
        if not item.get('headline'):
            # no headline: derive one from the first 100 chars of the body text
            item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]
        summaries = xml.xpath("//iptc:description[@role='drol:summary']",
                              namespaces={'iptc': IPTC_NS})
        if summaries and summaries[0].text:
            item['abstract'] = summaries[0].text
        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def map_html_to_xml(self, element, html):
    """
    Map the html text tags to xml

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return:
    """
    cleaned = html.replace('<br>', '<br/>').replace('</br>', '')
    # strip control characters (LF and CR are kept), then collapse whitespace runs
    cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
    cleaned = re.sub(r'\s\s+', ' ', cleaned.replace('\n', ' '))
    parsed = parse_html(cleaned, content='html')
    for tag in parsed.xpath('/html/div/child::*'):
        # each top-level child becomes a plain-text <p>
        paragraph = etree.Element('p')
        paragraph.text = to_ascii(
            get_text(to_string(tag, method='html'), content='html'))
        element.append(paragraph)
def _set_revision_history(self, article):
    """Get revision history of published article

    :param dict article:
    """
    # all published/archived revisions of this item, oldest first
    query = {
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': {
                            'term': {'item_id': article.get('item_id')}
                        }
                    }
                }
            }
        },
        'sort': [
            {'versioncreated': {'order': 'asc'}}
        ]
    }
    req = ParsedRequest()
    repos = 'published,archived'
    req.args = {'source': json.dumps(query), 'repo': repos, 'aggregations': 0}
    revisions = list(get_resource_service('search').get(req=req, lookup=None))
    revisions_tag = []
    for rev in revisions:
        # first publish uses firstpublished, later revisions use versioncreated
        local_date = utc_to_local(
            config.DEFAULT_TIMEZONE,
            rev.get('firstpublished') if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED
            else rev.get('versioncreated')
        )
        # 'XXX' placeholder avoids the zero-padded %d; replaced with the bare day number
        date_string = datetime.strftime(local_date, '%b XXX, %Y %H:%M %Z').replace('XXX', str(local_date.day))
        if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED:
            revisions_tag.append('<li>{} {}</li>'.format('First published', date_string))
        else:
            revision_markup = '{} {}'.format('Revision published', date_string)
            ednote = get_text(rev.get('ednote') or '', content='html').strip()
            # corrections show their editor note alongside the revision line
            if rev.get(ITEM_STATE) == CONTENT_STATE.CORRECTED and ednote:
                revision_markup += '<br><i>{}</i>'.format(ednote)
            revisions_tag.append('<li>{}</li>'.format(revision_markup))
    article['_revision_history'] = '<ul>{}</ul>'.format(''.join(revisions_tag)) if revisions_tag else ''
def callback(item, **kwargs):
    """Find USD amounts in the body text and compute their CAD equivalents.

    Returns ``(item, diff)`` where ``diff`` maps each matched amount string to
    the same string with the converted value appended in parentheses.
    NOTE(review): the empty-body early exit returns a bare ``diff`` dict rather
    than ``(item, diff)`` — confirm callers handle both shapes.
    """
    diff = {}
    if not item.get('body_html'):
        return diff
    rate = get_rate()
    text = get_text(item['body_html'], 'html', True)

    def repl(m, is_fr=False):
        # only convert US dollars (or bare $ amounts with no currency prefix)
        if m.group('currency') and m.group('currency') != 'US':
            return
        # normalise thousand/decimal separators: FR uses "1 234,56", EN "1,234.56"
        if is_fr:
            num = m.group('num').replace(',', '.').replace(' ', '')
        else:
            num = m.group('num').replace(',', '')
        converted = decimal.Decimal(num) * rate
        if m.group('decimal'):
            _format = '{:.3f}'
            # convert 55.21 to 73.73 - round to 3 decimals and strip last one
            if is_fr and ' ' in m.group('num') or not is_fr and ',' in m.group(
                    'num'):
                _format = '{:,.3f}'
            fixed = _format.format(converted)[:-1]
        else:
            _format = '{:.1f}0'
            # convert 55 to 73.70 - round to 1 decimal and add 0
            if is_fr and ' ' in m.group('num') or not is_fr and ',' in m.group(
                    'num'):
                _format = '{:,.1f}0'
            fixed = _format.format(converted).replace('.00', '')
        # keep leeding whitespace so on client it won't
        # replace $500 in C$500
        diff[m.group(0).rstrip(
        )] = '{whitespace} ({en_currency}{value}{mil}{fr_currency})'.format(
            whitespace=m.group(0).rstrip(),
            en_currency='' if is_fr else 'C$',
            value=fixed if not is_fr else fixed.replace(',', ' ').replace(
                '.', ','),
            mil=m.group('mil') or '',
            fr_currency=' $ CAN' if is_fr else '',
        ).rstrip()

    # re.sub is used only to drive repl over every match; the substitution
    # results are deliberately discarded — repl fills diff as a side effect
    re.sub(CURRENCY_REGEX, repl, text)
    re.sub(CURRENCY_REGEX_FR, functools.partial(repl, is_fr=True), text)
    return (item, diff)
def callback(item, **kwargs):
    """Find USD amounts in the body text and compute their CAD equivalents.

    Returns ``(item, diff)`` where ``diff`` maps each matched amount string to
    the same string with the converted value appended in parentheses.
    NOTE(review): the empty-body early exit returns a bare ``diff`` dict rather
    than ``(item, diff)`` — confirm callers handle both shapes.
    """
    diff = {}
    if not item.get("body_html"):
        return diff
    rate = get_rate()
    text = get_text(item["body_html"], "html", True)

    def repl(m, is_fr=False):
        # only convert US dollars (or bare $ amounts with no currency prefix)
        if m.group("currency") and m.group("currency") != "US":
            return
        # normalise thousand/decimal separators: FR uses "1 234,56", EN "1,234.56"
        if is_fr:
            num = m.group("num").replace(",", ".").replace(" ", "")
        else:
            num = m.group("num").replace(",", "")
        converted = decimal.Decimal(num) * rate
        if m.group("decimal"):
            _format = "{:.3f}"
            # convert 55.21 to 73.73 - round to 3 decimals and strip last one
            if is_fr and " " in m.group("num") or not is_fr and "," in m.group(
                    "num"):
                _format = "{:,.3f}"
            fixed = _format.format(converted)[:-1]
        else:
            _format = "{:.1f}0"
            # convert 55 to 73.70 - round to 1 decimal and add 0
            if is_fr and " " in m.group("num") or not is_fr and "," in m.group(
                    "num"):
                _format = "{:,.1f}0"
            fixed = _format.format(converted).replace(".00", "")
        # keep leeding whitespace so on client it won't
        # replace $500 in C$500
        diff[m.group(0).rstrip(
        )] = "{whitespace} ({en_currency}{value}{mil}{fr_currency})".format(
            whitespace=m.group(0).rstrip(),
            en_currency="" if is_fr else "C$",
            value=fixed if not is_fr else fixed.replace(",", " ").replace(
                ".", ","),
            mil=m.group("mil") or "",
            fr_currency=" $ CAN" if is_fr else "",
        ).rstrip()

    # re.sub is used only to drive repl over every match; the substitution
    # results are deliberately discarded — repl fills diff as a side effect
    re.sub(CURRENCY_REGEX, repl, text)
    re.sub(CURRENCY_REGEX_FR, functools.partial(repl, is_fr=True), text)
    return (item, diff)
def format_for_source(self, article, subscriber, source, codes=None):
    """Format the article for the Newscentre output with the given source applied.

    :param dict article: article to format
    :param dict subscriber: destination subscriber
    :param str source: source string to stamp on the article
    :param codes: selector codes
    :return: list of (sequence number, json-encoded odbc_item) tuples, one per category
    """
    try:
        pass_through = article.get('auto_publish', False)
        docs = []
        for category in self._get_category_list(
                article.get('anpa_category')):
            article['source'] = source
            pub_seq_num, odbc_item = self.get_odbc_item(
                article, subscriber, category, codes, pass_through)
            if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                body = get_text(self.append_body_footer(article),
                                content='html')
                odbc_item['article_text'] = body.replace('\'', '\'\'')
            else:
                body = self.get_text_content(
                    to_ascii(self.append_body_footer(article)))
                # inject the dateline in front of the indented body text
                if 'dateline' in article \
                        and 'text' in article.get('dateline', {}) and not pass_through:
                    if body.startswith('   '):
                        body = '   {} {}'.format(
                            article.get('dateline').get('text'),
                            body[3:])
                odbc_item['article_text'] = body.replace('\'', '\'\'')

            if not pass_through:
                self.add_ednote(odbc_item, article)
                self.add_byline(odbc_item, article)

            odbc_item['article_text'] += '\r\n' + source
            sign_off = article.get('sign_off', '') or ''
            if len(sign_off) > 0:
                odbc_item['article_text'] += ' ' + sign_off
            odbc_item['category'] = odbc_item.get('category', '').upper()
            odbc_item['selector_codes'] = odbc_item.get(
                'selector_codes', '').upper()

            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPNewscentreFormatterError(ex, subscriber)
def parse(self, xml, provider=None):
    """Parse the newsml document into a single-item list, filling missing headline/abstract."""
    self.root = xml
    try:
        item = self.parse_item(xml)
        if not item.get('headline'):
            # derive a headline from the first 100 chars of the body text
            body_text = text_utils.get_text(item.get('body_html', ''), 'html')
            item['headline'] = body_text[:100]
        descriptions = xml.xpath(
            "//iptc:description[@role='drol:summary']",
            namespaces={'iptc': IPTC_NS})
        abstract = descriptions[0].text if descriptions else None
        if abstract:
            item['abstract'] = abstract
        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def _sanitize_fields(self, doc, validator):
    """If maxlength or minlength is specified in the validator then remove any markups from that field

    :param doc: Article to be validated
    :param validator: Validation rule
    :return: updated article
    """
    fields_to_check = ['minlength', 'maxlength']
    item_schema = validator.get('schema', {})
    # custom (extra) fields live under doc['extra'] with their own nested schema
    extra_schema = item_schema.get('extra', {}).get('schema', {})
    schemes_docs = [(item_schema, doc), (extra_schema, doc.get('extra', {}))]
    for schema, content in schemes_docs:
        for field in schema:
            # only sanitize string fields that carry a length constraint
            if content.get(field) and schema.get(field) and type(content[field]) is str and \
                    any(k in schema[field] for k in fields_to_check):
                try:
                    content[field] = get_text(content[field])
                except (ValueError, TypeError):
                    # fails for json fields like subject, genre
                    pass
def parse_newsitem(self, item, newsitem_el): super().parse_newsitem(item, newsitem_el) # mapping services-products from category, and have only one product matching = False for category in item.get('anpa_category', []): qcode = self.MAPPING_CATEGORY.get(category.get('qcode'), 'NEWS/GENERAL') item.setdefault('subject', []).append({ 'name': qcode, 'qcode': qcode, 'parent': 'NEWS', 'scheme': 'services-products' }) matching = True if not matching: item.setdefault('subject', []).append({ 'name': 'NEWS/GENERAL', 'qcode': 'NEWS/GENERAL', 'parent': 'NEWS', 'scheme': 'services-products' }) # add content for headline when it is empty if item.get('urgency') in (1, 2) and not item.get('headline'): for line in get_text(item.get('body_html', ''), lf_on_block=True).split('\n'): if line.strip(): item['headline'] = 'URGENT: ' + line.strip() break # Label must be empty item['subject'] = [ i for i in item['subject'] if i.get('scheme') != 'label' ] # Source is AFP credit = {"name": 'AFP', "qcode": 'AFP', "scheme": "sources"} item.setdefault('subject', []).append(credit) if item.get('urgency') == 4: item['urgency'] = 3 return item
def parse_item(self, tree):
    """Parse the item, prefixing the abstract with the organisation the release is for
    and recording the organisation as a subject.

    :param tree: newsml xml tree
    :return dict: parsed item
    """
    item = super().parse_item(tree)
    meta = tree.find(self.qname("contentMeta"))
    organisation = meta.xpath(
        './iptc:subject[@type="cpnat:organisation"][@literal]',
        namespaces=NS)
    if organisation:
        org_name = organisation[0].get("literal")
        # "FOR: <ORG>. <body text>" capped at 200 chars; non-breaking spaces normalised
        item["abstract"] = format_maxlength(
            "FOR: {}. {}".format(
                org_name.upper().rstrip("."),
                get_text(item["body_html"]).replace("\xa0", " "),
            ),
            200,
        )
        item.setdefault("subject", []).append({
            "name": org_name,
            "qcode": org_name,
            "scheme": cp.ORGANISATION,
        })
    return item
def upload_document(item):
    """Post the item's plain-text body to the ultrad service for en->fr handling; return the new doc id."""
    item_name = item.get("headline") or item.get("slugline")
    # nothing to upload without a name and a body
    if not item_name or not item.get("body_html"):
        return
    payload = {
        "lang": {"fromLang": "en", "toLang": "fr"},
        "name": item_name,
        "state": "new",
        "text": {"original": get_text(item["body_html"])},
    }
    resp = sess.post(
        ULTRAD_URL, json=payload, headers=get_headers(), timeout=ULTRAD_TIMEOUT
    )
    raise_for_resp_error(resp)
    return get_json(resp)["_id"]
def get_value(self, article):
    """Return the field's plain-text value with newlines flattened; fall back to the raw value on parse errors."""
    raw = article[self.field.name]
    try:
        return get_text(raw).replace('\n', ' ')
    except (etree.XMLSyntaxError, ValueError):
        return raw
def format(self, article, subscriber, codes=None):
    """Format the article as an ANPA-1312 byte stream, one output document per category.

    :param dict article: article to format
    :param dict subscriber: destination subscriber
    :param codes: selector codes, emitted before the message header when present
    :return: list of dicts with published_seq_num, encoded_item (bytes) and formatted_item (ascii str)
    :raises FormatterError.AnpaFormatterError: on any failure while formatting
    """
    try:
        docs = []
        formatted_article = deepcopy(article)
        for category in self._get_category_list(formatted_article.get('anpa_category')):
            mapped_source = self._get_mapped_source(formatted_article)
            formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                       formatted_article.get(config.ID_FIELD))
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            anpa = []

            if codes:
                anpa.append(b'\x05')
                anpa.append(' '.join(codes).encode('ascii'))
                anpa.append(b'\x0D\x0A')

            # start of message header (syn syn soh)
            anpa.append(b'\x16\x16\x01')
            anpa.append(get_service_level(category, formatted_article).encode('ascii'))

            # story number
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

            # field seperator
            anpa.append(b'\x0A')  # -LF
            anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')

            anpa.append(category['qcode'].lower().encode('ascii'))

            anpa.append(b'\x13')
            # format identifier
            if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')

            # keyword
            keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
            keyword = keyword[:24] if len(keyword) > 24 else keyword
            anpa.append(keyword.encode('ascii'))
            anpa.append(b'\x20')

            # version field
            anpa.append(b'\x20')

            # reference field
            anpa.append(b'\x20')

            # filing date
            anpa.append('{}-{}'.format(formatted_article['_updated'].strftime('%m'),
                                       formatted_article['_updated'].strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')

            # add the word count
            anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x02')  # STX

            self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

            keyword = SluglineMapper().map(article=formatted_article,
                                           category=category['qcode'].upper(),
                                           truncate=True).encode('ascii', 'ignore')
            anpa.append(keyword)
            take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
            anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            if formatted_article.get('ednote', '') != '':
                ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                anpa.append(ednote.encode('ascii', 'replace'))

            if formatted_article.get(BYLINE):
                anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                anpa.append(b'\x0D\x0A')

            if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                anpa.append(get_text(self.append_body_footer(formatted_article),
                                     content='html').encode('ascii', 'replace'))
            else:
                body = to_ascii(formatted_article.get('body_html', ''))
                # we need to inject the dateline
                if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                    body_html_elem = parse_html(formatted_article.get('body_html'))
                    ptag = body_html_elem.find('.//p')
                    if ptag is not None:
                        ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                        body = to_string(body_html_elem)
                anpa.append(self.get_text_content(body))
                if formatted_article.get('body_footer'):
                    anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

            anpa.append(b'\x0D\x0A')
            anpa.append(mapped_source.encode('ascii'))
            sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
            anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x03')  # ETX

            # time and date
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

            docs.append({'published_seq_num': pub_seq_num,
                         'encoded_item': b''.join(anpa),
                         'formatted_item': b''.join(anpa).decode('ascii')})

        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def _parse_content(self, article):
    """Parse body_html and mapping to fields required for apple news format

    Scans the body's top-level tags in order, splitting the fact-check
    article into its sections ("The Statement", verdict, "The Analysis",
    second verdict, "The References") and storing them in underscore-
    prefixed keys on ``article``.

    :param article: article dict; mutated in place
    """
    # section heading markers (matched against a tag's full text)
    statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
    analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
    verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
    references_regex = re.compile(r'^The References$', re.IGNORECASE)
    url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+', re.IGNORECASE)

    abstract = get_text(article.get('abstract'), content='html').strip()
    article['_title'] = abstract
    body_html = article.get('body_html')

    # initialise every derived field so downstream templating always
    # finds the keys present
    article['_analysis_first_line'] = ''
    article['_analysis'] = ''
    article['_statement'] = ''
    article['_statement_attribution'] = ''
    article['_verdict1'] = ''
    article['_verdict2'] = ''
    article['_references'] = ''
    article['_revision_history'] = ''

    # killed/recalled items replace every field with a takedown notice
    if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED:
        article['_title'] = 'This article has been removed.'
        article['_analysis_first_line'] = 'This article has been removed.'
        article['_analysis'] = 'This article has been removed.'
        article['_statement'] = 'This article has been removed.'
        article['_statement_attribution'] = 'This article has been removed.'
        article['_verdict1'] = 'This article has been removed.'
        article['_verdict2'] = 'This article has been removed.'
        article['_references'] = 'This article has been removed.'
        self._set_revision_history(article)
        return

    parsed_content = parse_html(body_html, content='html')

    # state flags: each section heading flips its flag once found
    statement_found = False
    analysis_found = False
    analysis_first_line = False
    verdict1_found = False
    verdict2_found = False
    references_found = False
    statement_elements = []

    for top_level_tag in parsed_content.xpath('/html/div/child::*'):
        tag_text = format_text_content(top_level_tag).strip()
        if not tag_text:
            continue

        if not verdict1_found:
            if not statement_found:
                # still looking for "The Statement" heading
                match = statement_regex.search(tag_text)
                if match:
                    statement_found = True
                continue
            else:
                # statement found
                match = verdict_regex.search(tag_text)
                if match:
                    # first verdict heading closes the statement section:
                    # all but the last collected element form the statement,
                    # the last element is the attribution
                    verdict1_found = True
                    if len(statement_elements) > 1:
                        statement_length = len(statement_elements) - 1
                        for i in range(statement_length):
                            article['_statement'] += get_text(
                                to_string(statement_elements[i], remove_root_div=False),
                                content='html'
                            ).strip()
                            if statement_length > 1 and i != statement_length - 1:
                                article['_statement'] += '\r\n'
                        article['_statement_attribution'] = get_text(
                            to_string(statement_elements[-1:][0], remove_root_div=False),
                            content='html'
                        ).strip()
                    elif len(statement_elements) == 1:
                        article['_statement'] = to_string(
                            statement_elements[0],
                            remove_root_div=False
                        )
                    continue
                statement_elements.append(top_level_tag)
                continue

        if verdict1_found and not analysis_found:
            # between first verdict and "The Analysis": verdict 1 body
            match = analysis_regex.search(tag_text)
            if match:
                analysis_found = True
            else:
                article['_verdict1'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if analysis_found and not verdict2_found:
            # analysis section; its first line is also kept separately
            if not analysis_first_line:
                article['_analysis_first_line'] = tag_text
                analysis_first_line = True
            match = verdict_regex.search(tag_text)
            if match:
                verdict2_found = True
            else:
                article['_analysis'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if verdict2_found and not references_found:
            # between second verdict heading and "The References": verdict 2 body
            match = references_regex.search(tag_text)
            if match:
                references_found = True
            else:
                article['_verdict2'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if references_found:
            def replacement(match_object):
                # wrap each matched URL in an anchor tag
                value = match_object.group(0)
                if value:
                    return '<a href="{0}">{0}</a>'.format(value)
                return ''

            # strip any leading list numbering (e.g. "1.", "2)") before linking
            tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()
            article['_references'] += '<li>{}</li>'.format(
                re.sub(url_regex, replacement, tag_text)
            )

    if len(article['_references']):
        article['_references'] = '<ol>{}</ol>'.format(article['_references'])

    if not article.get('_statement') and article.get('_statement_attribution'):
        # if statement is not as per the format
        article['_statement'] = article.get('_statement_attribution')
        article['_statement_attribution'] = ''

    self._set_revision_history(article)
def plaintext_filter(value):
    """Filter out html from value.

    Newlines in the extracted text are flattened to single spaces and
    surrounding whitespace is removed.
    """
    stripped = get_text(value)
    flattened = stripped.replace('\n', ' ')
    return flattened.strip()
def parse(self, xml, provider=None):
    """Parse an STT NewsML-G2 document into a superdesk item.

    Delegates the common NewsML parsing to ``parse_item`` and then adds
    STT-specific fields: abstract, genre/version vocabularies, location
    metadata and public/private editorial notes.

    :param xml: root element of the NewsML document
    :param provider: ingest provider (only used for error reporting)
    :return: single-element list containing the parsed item dict
    :raises ParserError.newsmlTwoParserError: wraps any parsing failure
    """
    self.root = xml
    try:
        item = self.parse_item(xml)
        # fall back to the first 100 chars of the body when no headline
        if not item.get('headline'):
            item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

        # abstract
        try:
            abstract = xml.xpath("//iptc:description[@role='drol:summary']",
                                 namespaces={'iptc': IPTC_NS})[0].text
        except IndexError:
            pass
        else:
            if abstract:
                item['abstract'] = abstract

        # genre: sttgenre entries map to 'genre', sttversion entries to 'subject'
        for genre_elt in xml.xpath("//iptc:genre", namespaces={'iptc': IPTC_NS}):
            qcode = genre_elt.get('qcode')
            if qcode is None:
                continue
            elif qcode.startswith('sttgenre:'):
                # strip the 'sttgenre:' scheme prefix
                qcode = qcode[9:]
                genre_data = {'qcode': qcode}
                name_elt = genre_elt.find(self.qname('name'))
                name = name_elt.text if name_elt is not None and name_elt.text else ""
                try:
                    name = self.getVocabulary("genre", qcode, name)
                except ValueError:
                    # unknown vocabulary value: skip this genre entirely
                    continue
                else:
                    genre_data['name'] = name
                    item.setdefault('genre', []).append(genre_data)
            elif qcode.startswith('sttversion:'):
                # strip the 'sttversion:' scheme prefix
                qcode = qcode[11:]
                version_data = {'qcode': qcode, 'scheme': 'sttversion'}
                name_elt = genre_elt.find(self.qname('name'))
                name = name_elt.text if name_elt is not None and name_elt.text else ""
                try:
                    name = self.getVocabulary("sttgenre", qcode, name)
                except ValueError:
                    continue
                else:
                    version_data['name'] = name
                    item.setdefault('subject', []).append(version_data)

        # location: only sttlocmeta:default assertions are used
        for location_elt in xml.xpath("//iptc:assert", namespaces={'iptc': IPTC_NS}):
            qcode = location_elt.get("qcode")
            if not qcode or not qcode.startswith("sttlocmeta:default:"):
                continue
            qcode = qcode[19:]
            location_data = {"scheme": "sttlocmeta:default", "qcode": qcode}
            # broader geo areas enrich location_data via STT_LOCATION_MAP
            # NOTE: `qcode` is reused here, shadowing the assert qcode above
            for broader_elt in location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']",
                                                  namespaces={'iptc': IPTC_NS}):
                qcode = broader_elt.get('qcode')
                if not qcode:
                    continue
                for key, mapping in STT_LOCATION_MAP.items():
                    if qcode.startswith(key + ":"):
                        if "qcode" in mapping:
                            # strip the '<key>:' scheme prefix
                            qcode = qcode[len(key) + 1:]
                            try:
                                name = broader_elt.find(self.qname('name')).text
                            except AttributeError:
                                # no <name> child element
                                name = ""
                            try:
                                name = self.getVocabulary(key, qcode, name)
                            except ValueError:
                                continue
                            else:
                                location_data[mapping["qcode"]] = qcode
                                if "name" in mapping:
                                    location_data[mapping["name"]] = name
            item.setdefault('place', []).append(location_data)

        # public editorial note
        if 'ednote' in item:
            # stt has specific roles for public and private editorial notes
            # so we remove ednote found by parent parser, as it takes first one
            # as a public note
            del item['ednote']
        try:
            ednote = xml.xpath("//iptc:edNote[@role='sttnote:public']",
                               namespaces={'iptc': IPTC_NS})[0].text
        except IndexError:
            pass
        else:
            if ednote:
                item['ednote'] = ednote

        # private editorial note
        try:
            private_note = xml.xpath("//iptc:edNote[@role='sttnote:private']",
                                     namespaces={'iptc': IPTC_NS})[0].text
        except IndexError:
            pass
        else:
            if private_note:
                item.setdefault('extra', {})['sttnote_private'] = private_note

        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Map a superdesk article to a NINJS dictionary.

    :param article: article dict to transform
    :param subscriber: subscriber the item is being formatted for
    :param recursive: when True, associations of composite items are also
        transformed; pass False when transforming an association itself
    :return: dict of NINJS properties
    """
    # mandatory NINJS skeleton
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # fields copied through verbatim when present (None excluded)
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # alt_text substitutes for a missing body_text
    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    # associations: composites get their package associations plus any
    # related items; non-composites and the non-recursive case only get
    # related items
    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # default priority is 5 (also applies when priority is falsy, e.g. 0)
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    # company codes become NINJS organisation entries with ticker symbols
    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # fall back to the vocabulary-defined rights info when none was copied
    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    # word/char counts and reading time for text items, computed from
    # body_html when present, otherwise body_text
    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def _ednote_filter(self, ednote):
    """Return the editorial note as plain text.

    HTML is stripped via ``text_utils.get_text`` with block-level tags
    converted to line feeds, then surrounding whitespace is trimmed.
    """
    plain = text_utils.get_text(ednote, lf_on_block=True)
    return plain.strip()