def test_get_word_count(self):
    """Check get_word_count against plain text, a simple paragraph and an XML document."""
    plain_total = get_word_count('plain text')
    self.assertEqual(2, plain_total, 'plain text')

    paragraph_total = get_word_count('<p> html text </p>')
    self.assertEqual(2, paragraph_total, 'paragraph')

    # Two adjacent literals: the first ends with an explicit newline escape.
    xml_document = (
        '<doc><p xml:lang="en-US">The weather was superb today in Norfolk, Virginia. Made me want to take\n'
        'out my boat, manufactured by the <org value="acm" idsrc="iptc.org">Acme Boat Company</org>.</p></doc>'
    )
    self.assertEqual(22, get_word_count(xml_document))
def parse(self, file_path, provider=None):
    """Read a windows-1252 encoded text file and build an HTML text item from it.

    The file must contain at least two blank-line separators (the first chunk is
    handed to ``_process_header``) and the marker ``'MEDIA RELEASE '``; the body is
    everything from that marker onwards, HTML-escaped and wrapped in paragraphs.

    :param file_path: path of the file to read; also used as the guid prefix.
    :param provider: unused by this method.
    :return: the populated item dict.
    :raises AAPParserError.AsiaNetParserError: wrapping any exception raised while parsing.
    """
    body_marker = 'MEDIA RELEASE '
    try:
        item = {
            'guid': '{}-{}'.format(file_path, uuid.uuid4()),
            'pubstatus': 'usable',
            'versioncreated': utcnow(),
            ITEM_TYPE: CONTENT_TYPE.TEXT,
            FORMAT: FORMATS.HTML,
        }

        with open(file_path, 'r', encoding='windows-1252') as handle:
            raw = handle.read()
        text = raw.replace('\r', '')

        # Unpacking into three names enforces that both separators are present.
        header, _dateline_part, _body_part = text.split('\n\n', 2)
        self._process_header(item, header)

        # Drop whatever precedes the marker but keep the marker itself.
        _preamble, remainder = text.split(body_marker, 1)
        release_text = body_marker + remainder

        item['anpa_category'] = [{'qcode': 'j'}]
        item['original_source'] = 'AsiaNet'

        escaped = to_ascii(html.escape(release_text))
        paragraphs = escaped.replace('\n\n', '</p><p>')
        single_line = paragraphs.replace('\n', ' ')
        item['body_html'] = '<p>' + single_line + '</p>'
        item['word_count'] = get_word_count(item['body_html'])

        return item
    except Exception as e:
        raise AAPParserError.AsiaNetParserError(file_path, e)
def test_word_count_html(self):
    """Word count of markup containing numbers, a compound word and an abbreviation."""
    # If you change the following text, please change it in client too at
    # superdesk-client-core/scripts/apps/authoring/authoring/tests/WordCount.spec.js
    markup = (
        " <p>This is a test text with numbers (1 000 000 and 1,000,000 and 1.000.000) "
        "and <strong>compound word (two-done)</strong> and <em>abbreviation (Washington D.C.)</p> "
        "<p>it should be the same word count as in client and backend</p>"
    )
    counted = text_utils.get_word_count(markup)
    self.assertEqual(32, counted)
def test_word_count_ul(self):
    """A list with three non-empty items and one empty item counts three words."""
    list_markup = " <ul> <li>foo</li> <li>bar</li> <li>baz</li> <li></li> </ul> "
    counted = text_utils.get_word_count(list_markup)
    self.assertEqual(3, counted)
def parse(self, xml, provider=None):
    """Map an XML tree onto a text item, adding dateline city and word count.

    :param xml: element tree to read; searched for ``body/body.head/dateline/location/city``.
    :param provider: passed to the parser error when parsing fails.
    :return: the populated item dict.
    :raises ParserError.nitfParserError: wrapping any exception raised while parsing.
    """
    # set the default type.
    item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
    try:
        self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)

        city_elem = xml.find('body/body.head/dateline/location/city')
        if city_elem is not None:
            self.set_dateline(item, city=city_elem.text)

        # The count is computed unconditionally; it is only stored when the
        # mapping has not already provided a 'word_count'.
        counted = get_word_count(item['body_html'], no_html=True)
        item.setdefault('word_count', counted)
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
    return item
def parse(self, xml, provider=None):
    """Parse the ``itemSet`` children of *xml* into a list of item dicts.

    Each child that carries a ``guid`` attribute is parsed with ``parse_item`` and
    then given fixed metadata: priority 6, category ``f``, subject ``04000000``,
    urgency 3 and, when exactly one matching city is found, a Wellington dateline.
    When the item has a body and a dateline, the paragraphs are scanned to pull
    out a leading ``By ...`` byline, the ``(BusinessDesk)`` dateline prefix and a
    trailing ``(BusinessDesk)`` sign-off.

    :param xml: root element of the document; also stored on ``self.root``.
    :param provider: passed to the parser error when parsing fails.
    :return: list of parsed items.
    :raises ParserError.newsmlTwoParserError: wrapping any exception raised while parsing.
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = 6
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                item.setdefault('word_count', get_word_count(item['body_html']))
                # Hard code the urgency
                item['urgency'] = 3
                # Dateline is always Wellington in NZ
                located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2')
                           if c.get('city', '').lower() == 'wellington']
                if len(located) == 1:
                    item['dateline'] = dict()
                    item['dateline']['located'] = located[0]

                # .get() so that an item without any dateline is simply left
                # untouched instead of failing the whole feed with a KeyError.
                if item.get('body_html') and item.get('dateline'):
                    parsed = parse_html(item.get('body_html'), content='xml')
                    pars = parsed.xpath('//p')
                    last_index = len(pars) - 1
                    for index, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if index == 0 and par.text.startswith('By '):
                            # Strip only the leading marker; a later "By " inside
                            # the name must be kept.
                            item['byline'] = par.text[len('By '):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                        if source:
                            item['dateline']['date'] = date_parser(date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == '(BusinessDesk)' and index == last_index:
                            par.getparent().remove(par)
                    item['body_html'] = to_string(parsed, remove_root_div=True)

                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                if locator_map:
                    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_word_count_nitf_2(self):
    """Word count of a long multi-paragraph article."""
    # Adjacent literals are concatenated by the compiler; the text contains a
    # single newline, written here as an escape in the last segment.
    article_html = (
        """ <p>Rio Tinto has kept intact its target for iron ore shipments in 2017 after hitting the mid-point of its revised guidance range for 2016. </p>"""
        """<p>The world's second largest iron ore exporter shipped 327.6 million tonnes of iron ore from its Pilbara operations in 2016, in line with the slightly lowered full-year guidance of between 325 and 330 million tonnes.</p>"""
        """<p>It expects to ship between 330 to 340 million tonnes in 2017 from its main mining hub in WA.</p>"""
        """<p>"We have delivered a strong operational performance in 2016, underpinned by our drive for efficiency and maximising cash flow," chief executive Jean Sebastien Jacques said in a statement.</p>"""
        """<p>"Our disciplined approach remains in place in 2017, with the continued focus on productivity, cost reduction and commercial excellence."</p>"""
        """<p>Rio shipped 87.7 million tonnes of iron ore in the December quarter - up eight per cent from the preceding three months - mainly helped by minimal weather disruption.</p>"""
        """<p>Fourth-quarter production was also up four per cent from a year ago to 85.5 million tonnes.</p>"""
        """<p>Sales in the quarter exceeded production by 2.2 million tonnes, primarily through a drawdown on inventories built at the ports in the third quarter, the company said.</p>"""
        """<p>The miner also looks to have capitalised on a strong rebound in iron ore prices in 2016, saying 80 per cent of its sales were either on the spot market or on current quarter or current month average.</p>"""
        """<p>Rio’s copper production rose four per cent from a year ago to 523,000 tonnes, but still came in below its guidance range of 535,000 to 565,000 tonnes due to lower-than-expected production at its Kennecott mine in the US and no supplies from the Grasberg joint venture in Indonesia.</p>"""
        """<p>It has forecast a wide guidance range of 525,000 to 665,000 tonnes for 2017.</p>"""
        """<p>The miner topped production forecasts for bauxite and coking coal, while aluminium output \njumped 10 per cent in 2016.</p>"""
    )
    counted = text_utils.get_word_count(article_html)
    self.assertEqual(316, counted)
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build the ninjs dictionary for *article*.

    :param article: source item; read with ``.get``/``in``/indexing throughout.
    :param subscriber: passed through to the association helpers.
    :param recursive: when true, composite items get ``_get_associations`` output
        merged with the formatted related items; in every other case existing
        associations are formatted with ``_format_related`` alone.
    :return: the ninjs dict; always contains ``guid``, ``version``, ``type`` and
        ``priority``.
    """
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Copy over every configured property that is present and not None.
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # alt_text is used as body_text only when the article has no body_text key.
    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    # NOTE(review): description_html is filled from append_body_footer here and
    # is overwritten further down whenever the article has an 'abstract' key.
    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # A missing or falsy priority falls back to 5.
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    # The branch is taken on key presence, so an empty or None abstract is
    # still passed to get_text.
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # Rights info is looked up only when none of the three rights fields was copied.
    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    # Text statistics are derived from body_html when present, else from body_text.
    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def test_word_count_p_tags(self):
    """Each paragraph contributes its own word, including when repeated 500 times."""
    two_paragraphs = "<p>foo<strong>s</strong></p><p>bar</p>"
    self.assertEqual(2, text_utils.get_word_count(two_paragraphs))

    many_paragraphs = "<p>word</p>" * 500
    self.assertEqual(500, text_utils.get_word_count(many_paragraphs))
def test_word_count_hrs(self):
    """Words separated by <br> and <hr> count as two, in both tag spellings."""
    unclosed = '<p>foo<br><hr>bar</p>'
    self.assertEqual(2, text_utils.get_word_count(unclosed))

    self_closed = '<p>foo<br /><hr />bar</p>'
    self.assertEqual(2, text_utils.get_word_count(self_closed))
def test_word_count_hrs(self):
    """A <br> followed by an <hr> separates two words, with or without a closing slash."""
    html_form = "<p>foo<br><hr>bar</p>"
    html_form_count = text_utils.get_word_count(html_form)
    self.assertEqual(2, html_form_count)

    xhtml_form = "<p>foo<br /><hr />bar</p>"
    xhtml_form_count = text_utils.get_word_count(xhtml_form)
    self.assertEqual(2, xhtml_form_count)
def test_word_count_whitespace_string(self):
    """A whitespace-only string contains no words."""
    blank = " "
    counted = text_utils.get_word_count(blank)
    self.assertEqual(0, counted)
def brief_internal_routing(item: dict, **kwargs):
    """Turn a short text item into a scheduled brief and auto-publish it.

    The item is modified in place: profile, urgency, subject, status, operation,
    schedule settings, publish schedule and headline are rewritten before
    ``internal_destination_auto_publish`` is called.

    :param item: the item to convert; must have ``profile`` and ``body_html`` keys.
    :param kwargs: accepted but not used in this function.
    :raises StopDuplication: always - either early, when a precondition fails or a
        required key is missing, or at the end after the publish attempt.
    """
    guid = item.get('guid', 'unknown')
    logger.info('macro started item=%s', guid)

    # Preconditions: text profile, body under 301 words, headline not a correction.
    # NOTE(review): these are assert statements, which Python removes when run
    # with -O; in that mode none of the checks below would stop the macro.
    try:
        assert str(item['profile']) == str(
            _get_profile_id(TEXT_PROFILE)), 'profile is not text'
        assert get_word_count(item['body_html']) < 301, 'body is too long'
        # The title should not start with the word "CORRECTION"
        if item.get('headline'):
            title_start_with_correction = item['headline'].lstrip().startswith(
                'CORRECTION')
            assert not title_start_with_correction, 'The headline/title should not start with word CORRECTION'
    except AssertionError as err:
        logger.info('macro stop on assert item=%s error=%s', guid, err)
        raise StopDuplication()
    except KeyError as err:
        # 'profile' or 'body_html' missing from the item.
        logger.error(err)
        raise StopDuplication()

    item.setdefault('subject', [])
    item['urgency'] = 2
    item['profile'] = _get_profile_id(BRIEF_PROFILE)
    item['subject'] = _get_product_subject(
        _get_brief_subject(item.get('subject', [])))
    item['status'] = CONTENT_STATE.SCHEDULED
    item['operation'] = 'publish'

    _fix_headline(item)
    _fix_body_html(item)

    # Use the existing UTC publish schedule as the base time when there is one,
    # otherwise the current time.
    UTC_FIELD = 'utc_{}'.format(PUBLISH_SCHEDULE)
    try:
        published_at = item[SCHEDULE_SETTINGS][UTC_FIELD]
    except KeyError:
        published_at = utcnow()
    # Any previous schedule settings are replaced with just the time zone.
    item[SCHEDULE_SETTINGS] = {
        'time_zone': 'Europe/Brussels',
    }

    # Set item publish schedule to 7:30 am for autopublish between 4 to 7 am
    is_press_headline = item.get(
        'headline') and 'press' in item['headline'].lower()
    current_datetime = utc_to_local(superdesk.app.config['DEFAULT_TIMEZONE'], utcnow())
    if is_press_headline and time(4, 00) <= current_datetime.time() <= time(
            7, 00):
        item[PUBLISH_SCHEDULE] = current_datetime.replace(hour=7, minute=30, second=00)
        logger.info(
            'Set publish schedule to 7:30 am for autopublish between 4 to 7 am item=%s',
            item.get('guid', 'unknown'))
    else:
        # schedule +30m
        item[PUBLISH_SCHEDULE] = utc_to_local(
            item[SCHEDULE_SETTINGS]['time_zone'],
            published_at + timedelta(minutes=30))

    update_schedule_settings(item, PUBLISH_SCHEDULE, item[PUBLISH_SCHEDULE])
    # The stored publish schedule is made naive after the settings are updated.
    item[PUBLISH_SCHEDULE] = item[PUBLISH_SCHEDULE].replace(tzinfo=None)

    # remove text in () brackets along with brackets
    if item.get("headline"):
        title = re.sub(r"\([^()]*\)", "", item['headline'])
        # Collapse the whitespace left behind by the removed brackets.
        item['headline'] = " ".join(title.split())

    # publish
    # Every failure here is logged and swallowed, so the final raise below
    # is always reached.
    try:
        internal_destination_auto_publish(item)
    except StopDuplication:
        logger.info('macro done item=%s', guid)
    except DocumentError as err:
        logger.error('validation error when creating brief item=%s error=%s',
                     guid, err)
    except Exception as err:
        logger.exception(err)

    # avoid another item to be created
    raise StopDuplication()
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build the ninjs dictionary for *article*.

    :param article: source item; read with ``.get``/``in``/indexing throughout.
    :param subscriber: passed through to the association helpers.
    :param recursive: when true, composite items get ``_get_associations`` output
        merged with the formatted related items; in every other case existing
        associations are formatted with ``_format_related`` alone.
    :return: the ninjs dict; always contains ``guid``, ``version``, ``type`` and
        ``priority``.
    """
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    # The editor state hook runs first, before any other field is set.
    if article.get('editor_state'):
        self._parse_editor_state(article, ninjs)

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Copy over every configured property that is present and not None.
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # alt_text is used as body_text only when the article has no body_text key.
    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    # NOTE(review): description_html is filled from append_body_footer here and
    # is overwritten further down when the article has a non-empty abstract.
    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article['place'])

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(
                article, subscriber)
            if article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS].update(
                    self._format_related(article, subscriber))
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # A missing or falsy priority falls back to 5.
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if article.get('abstract'):
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{
            'name': c.get('name', ''),
            'rel': 'Securities Identifier',
            'symbols': [{
                'ticker': c.get('qcode', ''),
                'exchange': c.get('security_exchange', '')
            }]
        } for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # Rights info is looked up only when none of the three rights fields was copied.
    if not ninjs.get('copyrightholder') and not ninjs.get(
            'copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(
            superdesk.get_resource_service('vocabularies').get_rightsinfo(
                article))

    if article.get('genre'):
        ninjs['genre'] = self._format_qcodes(article['genre'])

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    # Reading time is derived from the word count of body_html when present,
    # else of body_text; the word count itself is not stored.
    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs
                                               or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            word_count = text_utils.get_word_count(ninjs.get('body_html'))
        else:
            word_count = text_utils.get_text_word_count(ninjs['body_text'])
        ninjs['readtime'] = text_utils.get_reading_time(word_count)

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build the ninjs dictionary for *article*.

    :param article: source item; read with ``.get``/``in``/indexing throughout.
    :param subscriber: passed through to the association helpers.
    :param recursive: when true, composite items get ``_get_associations`` output
        merged with the formatted related items and other items get their
        associations formatted with ``_format_related``.
    :return: the ninjs dict; always contains ``guid``, ``version``, ``type`` and
        ``priority``.
    """
    ninjs = {
        "guid": article.get(GUID_FIELD, article.get("uri")),
        "version": str(article.get(config.VERSION, 1)),
        "type": self._get_type(article),
    }

    if article.get("byline"):
        ninjs["byline"] = article["byline"]

    located = article.get("dateline", {}).get("located", {})
    if located:
        ninjs["located"] = located.get("city", "")

    # Copy over every configured property that is present and not None.
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # alt_text is used as body_text only when the article has no body_text key.
    if "body_text" not in article and "alt_text" in article:
        ninjs["body_text"] = article["alt_text"]

    if "title" in article:
        ninjs["headline"] = article["title"]

    if article.get("body_html"):
        ninjs["body_html"] = self.append_body_footer(article)

    # NOTE(review): description_html is filled from append_body_footer here and
    # is overwritten further down whenever the article has an "abstract" key.
    if article.get("description"):
        ninjs["description_html"] = self.append_body_footer(article)

    if article.get("place"):
        ninjs["place"] = self._format_place(article)

    if article.get("profile"):
        ninjs["profile"] = self._format_profile(article["profile"])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(
                article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(
                    article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(
                article, subscriber)
    # NOTE(review): as the alternative to `if recursive`, this condition also
    # requires `recursive`, so the branch looks unreachable - confirm whether
    # skipping associations for non-recursive calls is intended.
    elif article.get(ASSOCIATIONS) and recursive:
        ninjs[ASSOCIATIONS], extra_items = self._format_related(
            article, subscriber)
    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get("embargoed"):
        ninjs["embargoed"] = article["embargoed"].isoformat()

    if article.get(
            EMBARGO):  # embargo set in superdesk overrides ingested one
        ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

    # A missing or falsy priority falls back to 5.
    if article.get("priority"):
        ninjs["priority"] = article["priority"]
    else:
        ninjs["priority"] = 5

    if article.get("subject"):
        ninjs["subject"] = self._get_subject(article)

    if article.get("anpa_category"):
        ninjs["service"] = self._get_service(article)

    if article.get("renditions"):
        ninjs["renditions"] = self._get_renditions(article)
    elif "url" in article:
        ninjs["renditions"] = self._generate_renditions(article)

    if "order" in article:
        ninjs["order"] = article["order"]

    # SDPA-317
    # The branch is taken on key presence, so an empty or None abstract is
    # still passed to get_text.
    if "abstract" in article:
        abstract = article.get("abstract", "")
        ninjs["description_html"] = abstract
        ninjs["description_text"] = text_utils.get_text(abstract)
    elif article.get("description_text"):
        ninjs["description_text"] = article.get("description_text")

    if article.get("company_codes"):
        ninjs["organisation"] = [{
            "name": c.get("name", ""),
            "rel": "Securities Identifier",
            "symbols": [{
                "ticker": c.get("qcode", ""),
                "exchange": c.get("security_exchange", "")
            }],
        } for c in article["company_codes"]]
    elif "company" in article:
        ninjs["organisation"] = [{"name": article["company"]}]

    if article.get("rewrite_of"):
        ninjs["evolvedfrom"] = article["rewrite_of"]

    # Rights info is looked up only when none of the three rights fields was copied.
    if not ninjs.get("copyrightholder") and not ninjs.get(
            "copyrightnotice") and not ninjs.get("usageterms"):
        ninjs.update(
            superdesk.get_resource_service("vocabularies").get_rightsinfo(
                article))

    if article.get("genre"):
        ninjs["genre"] = self._get_genre(article)

    if article.get("flags", {}).get("marked_for_legal"):
        ninjs["signal"] = self._format_signal_cwarn()

    if article.get("attachments"):
        ninjs["attachments"] = self._format_attachments(article)

    # Text statistics are derived from body_html when present, else from body_text.
    if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs
                                               or "body_text" in ninjs):
        if "body_html" in ninjs:
            body_html = ninjs["body_html"]
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count,
                                                   article.get("language"))
        else:
            body_text = ninjs["body_text"]
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count,
                                                   article.get("language"))
        ninjs["charcount"] = char_count
        ninjs["wordcount"] = word_count
        ninjs["readtime"] = readtime

    if article.get("authors"):
        ninjs["authors"] = self._format_authors(article)

    # schedule_settings may be present but None, hence the `or {}`.
    if (article.get("schedule_settings") or {}).get("utc_publish_schedule"):
        ninjs["publish_schedule"] = article["schedule_settings"][
            "utc_publish_schedule"]

    return ninjs
def test_word_count_p_tags(self):
    """Two adjacent paragraphs yield two words, even with an inline tag inside one."""
    markup = '<p>foo<strong>s</strong></p><p>bar</p>'
    counted = text_utils.get_word_count(markup)
    self.assertEqual(2, counted)
def parse(self, xml, provider=None):
    """Parse the ``itemSet`` children of *xml* into a list of item dicts.

    Each child that carries a ``guid`` attribute is parsed with ``parse_item`` and
    then given fixed metadata: priority 6, category ``f``, subject ``04000000``,
    urgency 3 and, when exactly one matching city is found, a Wellington dateline.
    When the item has a body and a dateline, the paragraphs are scanned to pull
    out a leading ``By ...`` byline, the ``(BusinessDesk)`` dateline prefix and a
    trailing ``(BusinessDesk)`` sign-off.

    :param xml: root element of the document; also stored on ``self.root``.
    :param provider: passed to the parser error when parsing fails.
    :return: list of parsed items.
    :raises ParserError.newsmlTwoParserError: wrapping any exception raised while parsing.
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = 6
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [{
                    'qcode': '04000000',
                    'name': subject_codes['04000000']
                }]
                item.setdefault('word_count',
                                get_word_count(item['body_html']))
                # Hard code the urgency
                item['urgency'] = 3
                # Dateline is always Wellington in NZ
                located = [
                    c for c in app.locators.find_cities(
                        country_code='NZ', state_code='NZ.G2')
                    if c.get('city', '').lower() == 'wellington'
                ]
                if len(located) == 1:
                    item['dateline'] = dict()
                    item['dateline']['located'] = located[0]

                # .get() so that an item without any dateline is simply left
                # untouched instead of failing the whole feed with a KeyError.
                if item.get('body_html') and item.get('dateline'):
                    parsed = parse_html(item.get('body_html'),
                                        content='xml')
                    pars = parsed.xpath('//p')
                    last_index = len(pars) - 1
                    for index, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if index == 0 and par.text.startswith('By '):
                            # Strip only the leading marker; a later "By "
                            # inside the name must be kept.
                            item['byline'] = par.text[len('By '):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(
                            ' (BusinessDesk) - ')
                        if source:
                            item['dateline']['date'] = date_parser(
                                date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == '(BusinessDesk)' and index == last_index:
                            par.getparent().remove(par)
                    item['body_html'] = to_string(parsed,
                                                  remove_root_div=True)

                locator_map = superdesk.get_resource_service(
                    'vocabularies').find_one(req=None, _id='locators')
                if locator_map:
                    item['place'] = [
                        x for x in locator_map.get('items', [])
                        if x['qcode'].upper() == 'NZ'
                    ]

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_word_count_nitf(self):
    """Word count of a paragraph containing inline location, person, money and chron tags."""
    nitf_markup = (
        " <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location> "
        "with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> "
        "to boost their hopes of qualifying for <money>Euro 2016</money>. "
        "<person>Michael O'Neill's</person> side sealed their place at the finals "
        "in <chron>October 2015</chron>.</p>"
    )
    counted = text_utils.get_word_count(nitf_markup)
    self.assertEqual(40, counted)
def word_count(html):
    """Return ``get_word_count`` of *html*, substituting an empty string for any falsy value."""
    markup = html if html else ''
    return get_word_count(markup)
def _format_item(self, root, item, pub_seq_num, service, services) -> None:
    """Append the XML elements describing *item* to *root*.

    A ``ContentItem`` child is created first (typed ``PhotoContentItem`` for
    pictures); routing-level fields are then added to *root* and content-level
    fields to the ``ContentItem``.

    :param root: parent element that receives the new children.
    :param item: the item being formatted; must have ``versioncreated``,
        ``type`` and ``language`` keys.
    :param pub_seq_num: integer written as ``PublishID`` and, zero-padded to
        eight digits modulo 100000000, as ``ContentItemID``.
    :param service: when truthy (and the item is not a picture) written as
        ``PscCodes``.
    :param services: iterable of strings joined with commas into ``Note`` when
        *service* is truthy.
    """
    if is_picture(item):
        D2P1 = "http://www.w3.org/2001/XMLSchema-instance"
        content = etree.SubElement(
            root,
            "ContentItem",
            {"{%s}type" % D2P1: "PhotoContentItem"},
            nsmap={
                "d2p1": D2P1,
            },
        )
    else:
        content = etree.SubElement(root, "ContentItem")

    # `extra` may be missing or None on the item.
    extra = item.get("extra") or {}

    # root system fields
    etree.SubElement(root, "Reschedule").text = "false"
    etree.SubElement(root, "IsRegional").text = "false"
    etree.SubElement(root, "CanAutoRoute").text = "true"
    etree.SubElement(root, "PublishID").text = str(pub_seq_num)
    etree.SubElement(root, "Username")
    etree.SubElement(root, "UseLocalsOut").text = "false"
    etree.SubElement(root, "UserProfileID").text = "0"
    etree.SubElement(root, "PublishOrder").text = "0"
    etree.SubElement(root, "NewCycle").text = "false"
    etree.SubElement(root, "OnlineResend").text = "false"

    # item system fields
    etree.SubElement(content, "AutoSaveID").text = "0"
    etree.SubElement(content, "Type").text = "0"
    etree.SubElement(content, "MediaType").text = "0"
    etree.SubElement(content, "Status").text = "0"

    if is_picture(item):
        etree.SubElement(root, "Services").text = "Pictures"
        self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
        # Fall back to "Online" when the subject codes produced no PscCodes.
        if root.find("PscCodes") is None:
            etree.SubElement(root, "PscCodes").text = "Online"
    elif service:
        etree.SubElement(root, "Services").text = "Écrit" if is_french(item) else "Print"
        etree.SubElement(root, "PscCodes").text = service
    else:
        self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
        self._format_services(root, item)

    is_broadcast = cp.is_broadcast(item)

    # content system fields
    orig = self._get_original_item(item)
    # Both ids are zero-padded to eight digits; the modulo keeps them in range.
    seq_id = "{:08d}".format(pub_seq_num % 100000000)
    item_id = "{:08d}".format(self.get_item_id(orig) % 100000000)
    etree.SubElement(content, "Name")
    etree.SubElement(content, "Cachable").text = "false"
    etree.SubElement(content, "FileName").text = filename(orig)
    etree.SubElement(content, "NewsCompID").text = item_id
    etree.SubElement(content, "SystemSlug").text = slug(orig)
    etree.SubElement(content, "ContentItemID").text = seq_id
    etree.SubElement(content, "ProfileID").text = "204"
    etree.SubElement(content, "SysContentType").text = "0"

    if is_picture(item):
        etree.SubElement(content, "PhotoContentItemID").text = item_id

    if extra.get(cp.FILENAME):
        etree.SubElement(content, "OrigTransRef").text = extra[cp.FILENAME]

    if service:
        etree.SubElement(content, "Note").text = ",".join(services)

    # timestamps
    firstpublished = item.get("firstpublished") or item["versioncreated"]
    etree.SubElement(root, "PublishDateTime").text = self._format_datetime(
        firstpublished
    )
    # Prefer the UTC embargo from the schedule settings; fall back to the
    # item's own "embargoed" value when either key is missing.
    try:
        etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
            item[SCHEDULE_SETTINGS]["utc_embargo"],
            local=True,
        )
    except KeyError:
        etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
            item.get("embargoed"), local=True
        )
    etree.SubElement(content, "CreatedDateTime").text = self._format_datetime(
        firstpublished
    )  # SDCP-380
    etree.SubElement(content, "UpdatedDateTime").text = self._format_datetime(
        item["versioncreated"], rel=True
    )

    # obvious
    etree.SubElement(content, "ContentType").text = (
        "Photo" if is_picture(item) else item["type"].capitalize()
    )

    # SDCP-309
    etree.SubElement(content, "Headline").text = format_maxlength(
        extra.get(cp.HEADLINE2) or item.get("headline"), OUTPUT_LENGTH_LIMIT
    )
    if not is_picture(item):
        etree.SubElement(content, "Headline2").text = format_maxlength(
            item.get("headline"), OUTPUT_LENGTH_LIMIT
        )

    etree.SubElement(content, "SlugProper").text = item.get("slugline")
    etree.SubElement(content, "Credit").text = self._format_credit(item)
    etree.SubElement(content, "Source").text = item.get("source")

    content_html = self._format_content(item, is_broadcast)
    etree.SubElement(content, "DirectoryText").text = self._format_text(
        item.get("abstract")
    )
    etree.SubElement(content, "ContentText").text = self._format_html(content_html)
    etree.SubElement(content, "Language").text = (
        "2" if is_french(item) else "1"
    )

    if item["type"] == "text" and content_html:
        # For text items the abstract-based DirectoryText is replaced by the
        # first 200 characters of the content, with newlines turned into spaces.
        content.find("DirectoryText").text = format_maxlength(
            get_text(content_html, "html", lf_on_block=False).replace("\n", " "),
            200,
        )
        # The same word count is written to all three length fields.
        word_count = str(get_word_count(content_html))
        etree.SubElement(content, "Length").text = word_count
        etree.SubElement(content, "WordCount").text = word_count
        etree.SubElement(content, "BreakWordCount").text = word_count

    if item.get("keywords") and item.get("source") == globenewswire.SOURCE:
        etree.SubElement(content, "Stocks").text = ",".join(item["keywords"])

    self._format_category_index(content, item)
    self._format_genre(content, item)
    self._format_urgency(content, item.get("urgency"), item["language"])
    # Pictures use ", " as the keyword separator, everything else ",".
    self._format_keyword(
        content,
        item.get("keywords"),
        ", " if item.get("type") == "picture" else ",",
    )
    self._format_dateline(content, item.get("dateline"))
    self._format_writethru(content, item)

    if item.get("byline"):
        etree.SubElement(content, "Byline").text = item["byline"]

    if is_picture(item):
        self._format_picture_metadata(content, item)
    else:
        etree.SubElement(content, "EditorNote").text = item.get("ednote")
        if extra.get(cp.UPDATE):
            etree.SubElement(content, "UpdateNote").text = extra[cp.UPDATE]
        if extra.get(cp.CORRECTION):
            etree.SubElement(content, "Corrections").text = extra[cp.CORRECTION]

    if item.get("associations"):
        self._format_associations(content, item)
def test_word_count_nitf(self):
    """Inline NITF tags inside a paragraph do not change the expected count of 40 words."""
    segments = [
        " <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location> ",
        "with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> ",
        "to boost their hopes of qualifying for <money>Euro 2016</money>. ",
        "<person>Michael O'Neill's</person> side sealed their place at the finals ",
        "in <chron>October 2015</chron>.</p>",
    ]
    paragraph = "".join(segments)
    self.assertEqual(40, text_utils.get_word_count(paragraph))
def _format_item(self, root, item, pub_seq_num, service, services) -> None:
    """Append the XML elements describing *item* to *root*.

    A ``ContentItem`` child is created first (typed ``PhotoContentItem`` for
    pictures); routing-level fields are then added to *root* and content-level
    fields to the ``ContentItem``.

    :param root: parent element that receives the new children.
    :param item: the item being formatted; must have ``versioncreated``,
        ``type`` and ``language`` keys.
    :param pub_seq_num: integer written as ``PublishID`` and, zero-padded to
        eight digits modulo 100000000, as ``ContentItemID``.
    :param service: when truthy (and the item is not a picture) written as
        ``PscCodes``.
    :param services: iterable of strings joined with commas into ``Note`` when
        *service* is truthy.
    """
    if is_picture(item):
        D2P1 = 'http://www.w3.org/2001/XMLSchema-instance'
        content = etree.SubElement(root,
                                   'ContentItem',
                                   {'{%s}type' % D2P1: 'PhotoContentItem'},
                                   nsmap={
                                       'd2p1': D2P1,
                                   })
    else:
        content = etree.SubElement(root, 'ContentItem')

    # `extra` may be missing or None on the item.
    extra = item.get('extra') or {}

    # root system fields
    etree.SubElement(root, 'Reschedule').text = 'false'
    etree.SubElement(root, 'IsRegional').text = 'false'
    etree.SubElement(root, 'CanAutoRoute').text = 'true'
    etree.SubElement(root, 'PublishID').text = str(pub_seq_num)
    etree.SubElement(root, 'Username')
    etree.SubElement(root, 'UseLocalsOut').text = 'false'
    etree.SubElement(root, 'UserProfileID').text = '0'
    etree.SubElement(root, 'PublishOrder').text = '0'
    etree.SubElement(root, 'NewCycle').text = 'false'
    etree.SubElement(root, 'OnlineResend').text = 'false'

    # item system fields
    etree.SubElement(content, 'AutoSaveID').text = '0'
    etree.SubElement(content, 'Type').text = '0'
    etree.SubElement(content, 'MediaType').text = '0'
    etree.SubElement(content, 'Status').text = '0'

    if is_picture(item):
        etree.SubElement(root, 'Services').text = 'Pictures'
        self._format_subject_code(root, item, 'PscCodes', cp.DESTINATIONS)
        # Fall back to 'Online' when the subject codes produced no PscCodes.
        if root.find('PscCodes') is None:
            etree.SubElement(root, 'PscCodes').text = 'Online'
    elif service:
        etree.SubElement(root, 'Services').text = 'Print'
        etree.SubElement(root, 'PscCodes').text = service
    else:
        self._format_subject_code(root, item, 'PscCodes', cp.DESTINATIONS)
        self._format_services(root, item)

    is_broadcast = cp.is_broadcast(item)

    # content system fields
    orig = self._get_original_item(item)
    # Both ids are zero-padded to eight digits; the modulo keeps them in range.
    seq_id = '{:08d}'.format(pub_seq_num % 100000000)
    item_id = '{:08d}'.format(orig['unique_id'] % 100000000)
    etree.SubElement(content, 'Name')
    etree.SubElement(content, 'Cachable').text = 'false'
    etree.SubElement(content, 'FileName').text = filename(orig)
    etree.SubElement(content, 'NewsCompID').text = item_id
    etree.SubElement(content, 'SystemSlug').text = slug(orig)
    etree.SubElement(content, 'ContentItemID').text = seq_id
    etree.SubElement(content, 'ProfileID').text = '204'
    etree.SubElement(content, 'SysContentType').text = '0'

    if is_picture(item):
        etree.SubElement(content, 'PhotoContentItemID').text = item_id

    if extra.get(cp.FILENAME):
        etree.SubElement(content, 'OrigTransRef').text = extra[cp.FILENAME]

    if service:
        etree.SubElement(content, 'Note').text = ','.join(services)

    # timestamps
    firstpublished = item.get('firstpublished') or item['versioncreated']
    etree.SubElement(
        root, 'PublishDateTime').text = self._format_datetime(firstpublished)
    # Prefer the UTC embargo from the schedule settings; fall back to the
    # item's own 'embargoed' value when either key is missing.
    try:
        etree.SubElement(content, 'EmbargoTime').text = self._format_datetime(
            item[SCHEDULE_SETTINGS]['utc_embargo'],
            local=True,
        )
    except KeyError:
        etree.SubElement(content, 'EmbargoTime').text = self._format_datetime(
            item.get('embargoed'), local=True)
    etree.SubElement(content,
                     'CreatedDateTime').text = self._format_datetime(
                         firstpublished)  # SDCP-380
    etree.SubElement(content,
                     'UpdatedDateTime').text = self._format_datetime(
                         item['versioncreated'], rel=True)

    # obvious
    etree.SubElement(content, 'ContentType').text = 'Photo' if is_picture(
        item) else item['type'].capitalize()

    # SDCP-309
    etree.SubElement(content, 'Headline').text = format_maxlength(
        extra.get(cp.HEADLINE2) or item.get('headline'), OUTPUT_LENGTH_LIMIT)
    if not is_picture(item):
        etree.SubElement(content, 'Headline2').text = format_maxlength(
            item.get('headline'), OUTPUT_LENGTH_LIMIT)

    etree.SubElement(content, 'SlugProper').text = item.get('slugline')
    etree.SubElement(content, 'Credit').text = self._format_credit(item)
    etree.SubElement(content, 'Source').text = item.get('source')

    content_html = self._format_content(item, is_broadcast)
    etree.SubElement(content, 'DirectoryText').text = self._format_text(
        item.get('abstract'))
    etree.SubElement(content,
                     'ContentText').text = self._format_html(content_html)
    # Language code '2' is written when 'fr' occurs anywhere in the language
    # value, '1' otherwise.
    etree.SubElement(
        content,
        'Language').text = '2' if 'fr' in item.get('language', '') else '1'

    if item['type'] == 'text' and content_html:
        # For text items the abstract-based DirectoryText is replaced by the
        # first 200 characters of the content, with newlines turned into spaces.
        content.find('DirectoryText').text = format_maxlength(
            get_text(content_html, 'html',
                     lf_on_block=False).replace('\n', ' '), 200)
        # The same word count is written to all three length fields.
        word_count = str(get_word_count(content_html))
        etree.SubElement(content, 'Length').text = word_count
        etree.SubElement(content, 'WordCount').text = word_count
        etree.SubElement(content, 'BreakWordCount').text = word_count

    if item.get('keywords') and item.get('source') == globenewswire.SOURCE:
        etree.SubElement(content, 'Stocks').text = ','.join(item['keywords'])

    self._format_category_index(content, item)
    self._format_genre(content, item)
    self._format_urgency(content, item.get('urgency'), item['language'])
    # Pictures use ', ' as the keyword separator, everything else ','.
    self._format_keyword(content, item.get('keywords'),
                         ', ' if item.get('type') == 'picture' else ',')
    self._format_dateline(content, item.get('dateline'))
    self._format_writethru(content, item)

    if item.get('byline'):
        etree.SubElement(content, 'Byline').text = item['byline']

    if is_picture(item):
        self._format_picture_metadata(content, item)
    else:
        etree.SubElement(content, 'EditorNote').text = item.get('ednote')
        if extra.get(cp.UPDATE):
            etree.SubElement(content, 'UpdateNote').text = extra[cp.UPDATE]
        if extra.get(cp.CORRECTION):
            etree.SubElement(content,
                             'Corrections').text = extra[cp.CORRECTION]

    if item.get('associations'):
        self._format_associations(content, item)
def test_word_count_brs(self):
    """Two words separated by a double line break are expected to count as 2."""
    # Exercise both the HTML (<br>) and the self-closing (<br />) spelling.
    br_variants = (
        '<p>foo<br><br>bar</p>',
        '<p>foo<br /><br />bar</p>',
    )
    for markup in br_variants:
        self.assertEqual(2, text_utils.get_word_count(markup))
def test_word_count_p_tags(self):
    """Two adjacent paragraphs, the first with inline ``<strong>``, are expected to count as 2."""
    markup = '<p>foo<strong>s</strong></p><p>bar</p>'
    expected_words = 2
    actual_words = text_utils.get_word_count(markup)
    self.assertEqual(expected_words, actual_words)
def parse(self, xml, provider=None):
    """Build the list of items found in every ``itemSet`` element of ``xml``.

    Each ``itemSet`` child that carries a ``guid`` attribute is handed to
    ``self.parse_item`` and then given fixed metadata: priority 6, urgency 3,
    category qcode ``f``, subject ``04000000`` and, when the locators lookup
    yields exactly one Wellington entry, a dateline located there.

    When the item has both a body and a dateline, the body paragraphs are
    cleaned up:

    * a first paragraph starting with ``By `` becomes the byline and is
      removed from the body;
    * the text in front of `` (BusinessDesk) - `` is parsed as the dateline
      date and stripped from the paragraph;
    * a final paragraph consisting only of ``(BusinessDesk)`` is removed.

    :param xml: root element of the feed document; also stored on ``self.root``.
    :param provider: passed through to the parser error on failure.
    :return: list of item dicts in document order.
    :raises: the error built by ``ParserError.newsmlTwoParserError`` for any
        exception raised while parsing.
    """
    byline_prefix = "By "
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname("itemSet")):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if "guid" not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item["priority"] = 6
                item["anpa_category"] = [{"qcode": "f"}]
                item["subject"] = [{
                    "qcode": "04000000",
                    "name": subject_codes["04000000"],
                }]
                # setdefault keeps a word_count already present on the item.
                item.setdefault("word_count", get_word_count(item["body_html"]))
                # Hard code the urgency
                item["urgency"] = 3

                # Dateline is always Wellington in NZ
                located = [
                    city
                    for city in app.locators.find_cities(
                        country_code="NZ", state_code="NZ.G2")
                    if city.get("city", "").lower() == "wellington"
                ]
                if len(located) == 1:
                    item["dateline"] = {"located": located[0]}

                # .get(): the dateline may be absent when the city lookup did
                # not yield exactly one match; bracket access would raise
                # KeyError and fail the whole feed.
                if item.get("body_html") and item.get("dateline"):
                    parsed = parse_html(item.get("body_html"), content="xml")
                    pars = parsed.xpath("//p")
                    last_position = len(pars) - 1
                    # enumerate() supplies the position directly instead of
                    # re-scanning the list with pars.index() per paragraph.
                    for position, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if position == 0 and par.text.startswith(byline_prefix):
                            # Strip only the leading prefix; str.replace would
                            # also delete any later "By " inside the name.
                            item["byline"] = par.text[len(byline_prefix):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(
                            " (BusinessDesk) - ")
                        if source:
                            item["dateline"]["date"] = date_parser(
                                date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == "(BusinessDesk)" and position == last_position:
                            par.getparent().remove(par)
                    item["body_html"] = to_string(parsed, remove_root_div=True)

                locator_map = superdesk.get_resource_service(
                    "vocabularies").find_one(req=None, _id="locators")
                if locator_map:
                    item["place"] = [
                        entry
                        for entry in locator_map.get("items", [])
                        if entry["qcode"].upper() == "NZ"
                    ]
                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider) from ex