def test_fetch_to_jimi(self, update_renditions_mock):
    """End-to-end: fetch a picture via Orangelogic search provider and format it as Jimi XML."""
    service = OrangelogicSearchProvider(self.provider)
    # BUG FIX: was `side_effects` (plural). MagicMock silently accepts unknown
    # attribute assignments, so the misspelled name meant set_rendition was
    # never actually installed as the mock's side effect.
    update_renditions_mock.side_effect = set_rendition
    self.app.media.get.return_value = io.BytesIO(
        read_fixture('9e627f74b97841b3b8562b6547ada9c7-d1538139479c43e88021152.jpg', 'rb')
    )

    with HTTMock(auth_ok, fetch_ok):
        with patch.dict(superdesk.resources, resources):
            fetched = service.fetch({})

    update_renditions_mock.assert_called_once_with(
        fetched,
        'https://example.com/htm/GetDocumentAPI.aspx?F=TRX&DocID=2RLQZBCB4R4R4&token=token.foo',
        None,
    )
    self.assertEqual('picture', fetched['type'])
    self.assertIsInstance(fetched['firstcreated'], datetime)

    # populate ids
    fetched['family_id'] = fetched['guid']
    fetched['unique_id'] = 1

    with patch.dict(superdesk.resources, resources):
        formatter = JimiFormatter()
        xml = formatter.format(fetched, {})[0][1]

    root = etree.fromstring(xml.encode(formatter.ENCODING))
    self.assertEqual('Pictures', root.find('Services').text)
    item = root.find('ContentItem')
    self.assertEqual('Zhang Yuwei', item.find('Byline').text)
    self.assertEqual('I', item.find('Category').text)
    self.assertEqual('News - Optional', item.find('Ranking').text)
    self.assertEqual('5', item.find('RankingValue').text)
    self.assertEqual('THE ASSOCIATED PRESS', item.find('Credit').text)
    self.assertEqual('Virus Outbreak China Vaccine', item.find('SlugProper').text)
    self.assertEqual('Unknown AP', item.find('Source').text)
    self.assertEqual('Beijing', item.find('City').text)
    self.assertEqual('China', item.find('Country').text)
    self.assertEqual('Beijing;;China', item.find('Placeline').text)
    # self.assertEqual('XIN902', item.find('OrigTransRef').text)
    self.assertEqual('SUB', item.find('BylineTitle').text)
    self.assertEqual('NHG', item.find('CaptionWriter').text)
    self.assertEqual('Xinhua', item.find('Copyright').text)
    self.assertIn(
        "In this April 10, 2020, photo released by Xinhua News Agency, a staff",
        item.find('EnglishCaption').text,
    )
    self.assertEqual('2020-04-12T00:09:37', item.find('DateTaken').text)
    self.assertEqual(
        'NO SALES, PHOTO RELEASED BY XINHUA NEWS AGENCY APRIL 10, 2020 PHOTO',
        item.find('SpecialInstructions').text,
    )
    self.assertEqual('Unknown AP', item.find('ArchiveSources').text)
    self.assertEqual('9e627f74b97841b3b8562b6547ada9c7', item.find('CustomField1').text)
    self.assertEqual('Xinhua', item.find('CustomField6').text)
    self.assertEqual('9e627f74b97841b3b8562b6547ada9c7', item.find('SystemSlug').text)
def test_fr(self):
    """A French item is formatted with the French 'Écrit' service in Jimi output."""
    item = self.parse("fr.xml")
    self.assertIsNotNone(item)
    self.assertEqual("fr", item["language"])
    self.assertEqual("Communiqué", item["description_text"])

    item["unique_id"] = 1
    with self.app.app_context():
        with patch.dict(superdesk.resources, resources):
            _, output = JimiFormatter().format(item, {}, None)[0]

    self.assertIn("<Services>Écrit</Services>", output)
import os
import unittest
from unittest.mock import MagicMock, patch

import flask
import requests_mock
import superdesk
from flask import json
from lxml import etree

import settings
from tests.mock import SEQUENCE_NUMBER, resources
from tests.ingest.parser import get_fixture_path
from cp.ingest import CP_APMediaFeedParser
from cp.output.formatter.jimi import JimiFormatter

# NOTE(fix): `os`, `unittest` and `flask` are all used below
# (os.path.join, unittest.TestCase, flask.Flask) but were missing from the
# import block of this section of the file; added here.

parser = CP_APMediaFeedParser()
formatter = JimiFormatter()


def fixture(filename):
    """Return the absolute path of a fixture file stored next to this module."""
    return os.path.join(
        os.path.dirname(__file__),
        "fixtures",
        filename,
    )


class AP2JimiTestCase(unittest.TestCase):
    # Minimal Flask app so code needing an app context/config can run in tests.
    app = flask.Flask(__name__)
    app.locators = MagicMock()
    app.config.update({"AP_TAGS_MAPPING": settings.AP_TAGS_MAPPING})
class JimiFormatterTestCase(BaseXmlFormatterTestCase):
    """Exercise JimiFormatter output for text and picture items."""

    formatter = JimiFormatter()

    # Baseline article used by self.format(); individual tests pass updates.
    article = {
        "_id": "id",
        "guid": "id",
        "family_id": "famid",
        "type": "text",
        "headline": "Headline",
        "slugline": "slug",
        "creditline": "Credit",
        "source": "Source",
        "ednote": "Ednote",
        "word_count": 123,
        "abstract": "<p>Abstract</p>",
        "body_html": "<p>Body HTML<br>test <b>bold</b> and <i>idiom</i></p>",
        "keywords": ["Foo bar", "baz"],
        "anpa_category": [{"name": "National", "qcode": "n"}],
        "subject": [
            {"name": "health", "qcode": "07000000", "scheme": "subject_custom"},
            {"name": "citizens", "qcode": "20000575", "scheme": "subject_custom"},
            {"name": "Foo", "qcode": "1231245", "scheme": "foo"},
            {"name": "Print", "qcode": "Print", "scheme": cp.DISTRIBUTION},
            {
                "name": "The Associated Press",
                "qcode": "ap---",
                "scheme": cp.DESTINATIONS,
            },
        ],
        "urgency": 2,
        "language": "en-CA",
        "unique_id": 123,
        "firstcreated": datetime(2020, 4, 1, 11, 13, 12, 25, tzinfo=UTC),
        "versioncreated": datetime(2020, 4, 1, 11, 23, 12, 25, tzinfo=UTC),
        "firstpublished": datetime(2020, 4, 1, 11, 33, 12, 25, tzinfo=UTC),
        "genre": [{"name": "NewsAlert", "qcode": "NewsAlert"}],
        "extra": {
            cp.HEADLINE2: "headline2",
            cp.FILENAME: "filename",
        },
    }

    def format_item(self, updates=None, return_root=False):
        """Format the baseline article (plus updates) and return the parsed item."""
        xml = self.format(updates)
        root = self.parse(xml)
        if return_root:
            return root
        return root.find("ContentItem")

    def test_can_format(self):
        self.assertTrue(self.formatter.can_format("jimi", {}))

    def test_format(self):
        xml = self.format()
        self.assertIn("<?xml version='1.0' encoding='utf-8'?>", xml)
        self.assertIn("<ContentText><p>Body HTML<br />test", xml)

        root = self.parse(xml)
        self.assertEqual("Publish", root.tag)
        self.assertEqual("false", root.find("Reschedule").text)
        self.assertEqual("false", root.find("IsRegional").text)
        self.assertEqual("true", root.find("CanAutoRoute").text)
        self.assertEqual(str(SEQUENCE_NUMBER), root.find("PublishID").text)
        self.assertEqual("Print", root.find("Services").text)
        self.assertEqual(None, root.find("Username").text)
        self.assertEqual("false", root.find("UseLocalsOut").text)
        self.assertEqual("ap---", root.find("PscCodes").text)
        self.assertEqual("2020-04-01T11:33:12", root.find("PublishDateTime").text)

        item = root.find("ContentItem")
        self.assertEqual(None, item.find("Name").text)
        self.assertEqual("false", item.find("Cachable").text)

        # ids
        self.assertEqual("00000100", item.find("ContentItemID").text)
        self.assertEqual("00000123", item.find("NewsCompID").text)
        self.assertEqual(self.article["guid"], item.find("SystemSlug").text)
        self.assertEqual(self.article["guid"], item.find("FileName").text)
        self.assertEqual(self.article["extra"][cp.FILENAME], item.find("OrigTransRef").text)

        # obvious
        self.assertEqual("Text", item.find("ContentType").text)

        # SDCP-309
        self.assertEqual(self.article["headline"], item.find("Headline2").text)
        self.assertEqual("headline2", item.find("Headline").text)

        self.assertEqual(self.article["creditline"], item.find("Credit").text)
        self.assertEqual(self.article["slugline"], item.find("SlugProper").text)
        self.assertEqual(self.article["source"], item.find("Source").text)
        self.assertEqual(self.article["ednote"], item.find("EditorNote").text)
        self.assertEqual("6", item.find("WordCount").text)
        self.assertEqual("6", item.find("BreakWordCount").text)
        self.assertEqual("6", item.find("Length").text)
        self.assertEqual("Body HTMLtest bold and idiom", item.find("DirectoryText").text)
        self.assertEqual(
            "<p>Body HTML<br />test <strong>bold</strong> and <em>idiom</em></p>",
            item.find("ContentText").text,
        )
        self.assertEqual(None, item.find("Placeline").text)
        self.assertEqual("0", item.find("WritethruValue").text)
        self.assertEqual("Foo bar,baz", item.find("Keyword").text)
        self.assertEqual("National", item.find("Category").text)
        self.assertEqual("National,Health,Politics", item.find("IndexCode").text)
        self.assertEqual(str(self.article["urgency"]), item.find("RankingValue").text)
        self.assertEqual("News - Need to Know", item.find("Ranking").text)
        self.assertEqual("1", item.find("Language").text)

        # timestamps
        self.assertEqual("0001-01-01T00:00:00", item.find("EmbargoTime").text)
        self.assertEqual("2020-04-01T11:33:12", item.find("CreatedDateTime").text)
        self.assertEqual("2020-04-01T07:23:12-04:00", item.find("UpdatedDateTime").text)

        # etc
        self.assertEqual("NewsAlert", item.find("VersionType").text)

    def test_writethru(self):
        expected_data = {
            1: "1st",
            2: "2nd",
            3: "3rd",
            4: "4th",
            5: "5th",
            10: "10th",
            100: "100th",
            101: "101st",
        }
        for val, num in expected_data.items():
            item = self.format_item({"rewrite_sequence": val})
            self.assertEqual(num, item.find("WritethruNum").text)
            self.assertEqual(str(val), item.find("WritethruValue").text)
            self.assertEqual("Writethru", item.find("WriteThruType").text)

    def test_dateline(self):
        item = self.format_item({
            "dateline": {
                "source": "AAP",
                "text": "sample dateline",
                "located": {
                    "alt_name": "",
                    "state": "California",
                    "city_code": "Los Angeles",
                    "city": "Los Angeles",
                    "dateline": "city",
                    "country_code": "US",
                    "country": "USA",
                    "tz": "America/Los_Angeles",
                    "state_code": "CA",
                    "location": {
                        "lat": 34.0522,
                        "lon": -118.2347,
                    },
                },
            },
        })
        self.assertEqual("Los Angeles", item.find("City").text)
        self.assertEqual("California", item.find("Province").text)
        self.assertEqual("USA", item.find("Country").text)
        self.assertEqual("Los Angeles;California;USA", item.find("Placeline").text)
        self.assertEqual("34.0522", item.find("Latitude").text)
        self.assertEqual("-118.2347", item.find("Longitude").text)

    def test_globenewswire(self):
        output = self.format(
            {
                "source": globenewswire.SOURCE,
                "headline": "Foo",
                "keywords": ["TSX VENTURE:AXL", "OTC:NTGSF"],
                "anpa_category": [
                    {"name": globenewswire.DESCRIPTION["en"], "qcode": "p"},
                ],
                "subject": [
                    {"name": "FOO", "qcode": "FOO", "scheme": cp.SERVICE},
                    {"name": "BAR", "qcode": "BAR", "scheme": cp.SERVICE},
                ],
                "extra": {},
            },
            _all=True,
        )
        self.assertEqual(2, len(output))
        root = self.parse(output[0][1])
        item = root.find("ContentItem")
        self.assertEqual("Print", root.find("Services").text)
        self.assertEqual("FOO", root.find("PscCodes").text)
        self.assertEqual("Press Release", item.find("Category").text)
        self.assertEqual("Press Release", item.find("IndexCode").text)
        self.assertEqual("FOO,BAR", item.find("Note").text)
        self.assertEqual("TSX VENTURE:AXL,OTC:NTGSF", item.find("Stocks").text)
        self.assertEqual("Foo", item.find("Headline").text)
        self.assertEqual("Foo", item.find("Headline2").text)

    def test_limits(self):
        long = "foo bar {}".format("x" * 200)
        item = self.format_item({
            "headline": long,
            "extra": {
                "headline2": long,
            },
            "keywords": ["foo", "bar", long],
        })
        self.assertEqual("foo bar", item.find("Headline").text)
        self.assertEqual("foo bar", item.find("Headline2").text)
        self.assertEqual("foo,bar,foo bar", item.find("Keyword").text)

    def test_picture(self):
        updates = {
            "type": "picture",
            "guid": "urn:picture",
            "urgency": 5,
            "byline": "photographer",
            "headline": "some headline",
            "slugline": "slug",
            "firstcreated": datetime(2020, 6, 3, 17, 0, 56, tzinfo=UTC),
            "extra": {
                cp.FILENAME: "NY538",
                "photographer_code": "stf",
            },
            "subject": [
                {"name": "Americas", "qcode": "A", "scheme": "photo_categories"},
            ],
            "creditline": "THE ASSOCIATED PRESS",
            "original_source": "The Associated Press",
            "copyrightnotice": "Copyright 2020 The Associated Press. All rights reserved.",
            "description_text": "Pedestrians are silhouetted",
            "renditions": {
                "original": {
                    "media": "media_id",
                    "mimetype": "image/jpeg",
                },
            },
        }
        root = self.format_item(updates, True)
        self.assertEqual("Pictures", root.find("Services").text)
        self.assertEqual("Online", root.find("PscCodes").text)
        item = root.find("ContentItem")
        self.assertEqual(updates["byline"], item.find("Byline").text)
        self.assertEqual("false", item.find("HeadlineService").text)
        self.assertEqual("A", item.find("Category").text)
        self.assertEqual("None", item.find("VideoType").text)
        self.assertEqual("None", item.find("PhotoType").text)
        self.assertEqual("None", item.find("GraphicType").text)
        self.assertEqual("News - Optional", item.find("Ranking").text)
        self.assertEqual("5", item.find("RankingValue").text)
        self.assertEqual(updates["creditline"], item.find("Credit").text)
        self.assertEqual("Photo", item.find("ContentType").text)
        self.assertEqual(updates["slugline"], item.find("SlugProper").text)
        self.assertEqual(updates["original_source"], item.find("Source").text)
        self.assertEqual(updates["extra"][cp.FILENAME], item.find("OrigTransRef").text)
        self.assertEqual("STF", item.find("BylineTitle").text)
        self.assertEqual(updates["copyrightnotice"][:50], item.find("Copyright").text)
        self.assertEqual(updates["description_text"], item.find("EnglishCaption").text)
        self.assertEqual("2020-06-03T17:00:56", item.find("DateTaken").text)
        self.assertEqual("media_id", item.find("FileName").text)
        self.assertEqual("media_id.jpg", item.find("ViewFile").text)
        self.assertEqual("media_id.jpg", item.find("ContentRef").text)
        self.assertEqual(updates["guid"], item.find("SystemSlug").text)
        self.assertEqual(1, len(item.findall("FileName")))

    def test_picture_amazon(self):
        updates = {
            "type": "picture",
            "renditions": {
                "original": {
                    "media": "20200807100836/5f2d12c8ced0b19f31ea318ajpeg.jpg",
                },
            },
        }
        item = self.format_item(updates)
        filename = updates["renditions"]["original"]["media"].replace("/", "-")
        self.assertEqual(os.path.splitext(filename)[0], item.find("FileName").text)
        self.assertEqual(filename, item.find("ViewFile").text)
        self.assertEqual(filename, item.find("ContentRef").text)

    def test_embargo(self):
        embargo = datetime(2020, 7, 22, 13, 10, 5, tzinfo=UTC)
        updates = {
            SCHEDULE_SETTINGS: {
                "utc_embargo": embargo,
            },
        }
        item = self.format_item(updates)
        self.assertEqual("2020-07-22T09:10:05", item.find("EmbargoTime").text)

        item = self.format_item({"embargoed": embargo})
        self.assertEqual("2020-07-22T09:10:05", item.find("EmbargoTime").text)

    def test_format_credit(self):
        item = self.format_item({"source": "CP", "creditline": None})
        self.assertEqual("THE CANADIAN PRESS", item.find("Credit").text)

    def test_item_with_picture(self):
        updates = {
            "source": "CP",
            "associations": {
                "gallery--1": {
                    "_id": "foo",
                    "type": "picture",
                    "guid": "foo:guid",
                    "renditions": {
                        "original": {
                            "media": "foo",
                            "mimetype": "image/jpeg",
                        },
                    },
                },
                "gallery--2": {
                    "_id": "bar",
                    "type": "picture",
                    "guid": "bar:guid",
                    "renditions": {
                        "original": {
                            "media": "bar",
                            "mimetype": "image/jpeg",
                        },
                    },
                },
                "gallery--3": {  # same picture twice
                    "_id": "bar",
                    "type": "picture",
                    "guid": "bar:guid",
                    "renditions": {
                        "original": {
                            "media": "bar",
                            "mimetype": "image/jpeg",
                        },
                    },
                },
            },
        }
        item = self.format_item(updates)
        self.assertEqual("Many", item.find("PhotoType").text)
        self.assertEqual("foo,bar", item.find("PhotoReference").text)

    def test_format_filename_rewrite(self):
        date_1am_et = datetime(2020, 8, 12, 5, tzinfo=UTC)
        date_2am_et = date_1am_et + timedelta(hours=1)
        date_3am_et = date_1am_et + timedelta(hours=2)
        resources["archive"].service.find_one.side_effect = [
            {
                "guid": "same-cycle",
                "rewrite_of": "prev-cycle",
                "firstcreated": date_2am_et,
                "unique_id": 2,
                "type": "text",
            },
            {
                "guid": "prev-cycle",
                "firstcreated": date_1am_et,
                "unique_id": 1,
                "type": "text",
            },
        ]
        item = self.format_item({
            "guid": "last",
            "rewrite_of": "same-cycle",
            "extra": {},
            "firstcreated": date_3am_et,
            "type": "text",
        })
        resources["archive"].service.find_one.side_effect = None
        self.assertEqual("prev-cycle", item.find("FileName").text)
        self.assertEqual("prev-cycle", item.find("SystemSlug").text)

    def test_format_fr_CA(self):
        updates = {
            "language": "fr-CA",
            "anpa_category": [{"name": "National", "qcode": "g"}],
            "rewrite_sequence": 2,
            "subject": [
                {"name": "Broadcast", "qcode": cp.BROADCAST, "scheme": cp.DISTRIBUTION},
            ],
        }
        item = self.format_item(updates)
        self.assertEqual("2", item.find("Language").text)
        self.assertEqual("Nouvelles Générales", item.find("Category").text)
        self.assertEqual("Nouvelles Générales", item.find("IndexCode").text)
        self.assertEqual("Alerte", item.find("VersionType").text)
        self.assertEqual("Nouvelle - Majeur", item.find("Ranking").text)
        self.assertEqual("Radio", item.find("..").find("Services").text)
        self.assertEqual("2", item.find("WritethruValue").text)
        self.assertEqual("2ème", item.find("WritethruNum").text)
        self.assertEqual("Lead", item.find("WriteThruType").text)

    def test_correction_update(self):
        item = self.format_item({
            "extra": {
                cp.UPDATE: "update text",
                cp.CORRECTION: "correction text",
            }
        })
        self.assertEqual("update text", item.find("UpdateNote").text)
        self.assertEqual("correction text", item.find("Corrections").text)

    def test_writethru_keeps_newscompid(self):
        resources["archive"].service.find_one.side_effect = [
            {
                "guid": "same-cycle",
                "rewrite_of": "prev-cycle",
                "unique_id": 2,
                "type": "text",
            },
            {"guid": "prev-cycle", "unique_id": 1, "type": "text"},
        ]
        item = self.format_item({
            "type": "text",
            "rewrite_of": "same-cycle",
            "unique_id": 3,
        })
        resources["archive"].service.find_one.side_effect = None
        self.assertEqual("00000001", item.find("NewsCompID").text)

    def test_ap_update_keeps_newscomip(self):
        resources["ingest"].service.find_one.side_effect = [{
            "unique_id": 1,
        }]
        item = self.format_item({
            "type": "text",
            "unique_id": 5,
        })
        resources["ingest"].service.find_one.side_effect = None
        self.assertEqual("00000001", item.find("NewsCompID").text)

    def test_picture_container_ids(self):
        resources["news"].service.get.side_effect = [[
            {"guid": "canceled", "pubstatus": "canceled", "type": "text"},
            {"guid": "usable", "pubstatus": "usable", "type": "text"},
            {
                "guid": "usable2",
                "pubstatus": "usable",
                "type": "text",
                "extra": {
                    cp.ORIG_ID: 32 * "a",  # slug constraints
                },
            },
        ]]
        item = self.format_item({
            "type": "picture",
            "unique_id": 3,
        })
        resources["news"].service.get.side_effect = None
        self.assertEqual("{}, usable".format(32 * "a"), item.find("ContainerIDs").text)

    def test_placeline_washington(self):
        item = self.format_item({
            "dateline": {
                "source": "AAP",
                "text": "sample dateline",
                "located": {
                    "dateline": "city",
                    "country_code": "US",
                    "tz": "America/New_York",
                    "city_code": "Washington",
                    "state_code": "DC",
                    "state": "Washington, D.C.",
                    "city": "Washington",
                    "country": "United States",
                    "code": "4140963",
                    "scheme": "geonames",
                },
            },
        })
        self.assertEqual(
            "Washington;District of Columbia;United States",
            item.find("Placeline").text,
        )
        self.assertEqual("District of Columbia", item.find("Province").text)

    def test_format_content(self):
        item = self.format_item({
            "body_html": "<p>Body HTML<br>test remove bold <b> </b> and <b>bold1</b> and <i>idiom</i></p>"
        })
        content_text = item.find("ContentText").text
        self.assertEqual(
            "<p>Body HTML<br />test remove bold and <strong>bold1</strong> and <em>idiom</em></p>",
            str(" ".join(content_text.split())),
        )

    def test_ap_translated(self):
        item = self.format_item({
            "language": "fr-CA",
            "extra": {
                cp.ORIG_ID: 'a' * 32
            },
        })
        self.assertEqual('a' * 30 + 'fa', item.find('SystemSlug').text)
# NOTE(review): this is a second definition of JimiFormatterTestCase in the
# same file — at import time the later binding wins and the other definition's
# tests never run. Looks like a merge/concatenation artifact; confirm which
# version should be kept.
class JimiFormatterTestCase(BaseXmlFormatterTestCase):
    """Exercise JimiFormatter output for text and picture items (older variant)."""

    formatter = JimiFormatter()

    # Baseline article used by self.format(); individual tests pass updates.
    article = {
        '_id': 'id',
        'guid': 'id',
        'family_id': 'famid',
        'type': 'text',
        'headline': 'Headline',
        'slugline': 'slug',
        'creditline': 'Credit',
        'source': 'Source',
        'ednote': 'Ednote',
        'word_count': 123,
        'abstract': '<p>Abstract</p>',
        'body_html': '<p>Body HTML<br>test <b>bold</b> and <i>idiom</i></p>',
        'keywords': ['Foo bar', 'baz'],
        'anpa_category': [{'name': 'National', 'qcode': 'n'}],
        'subject': [
            {'name': 'health', 'qcode': '07000000', 'scheme': 'subject_custom'},
            {'name': 'citizens', 'qcode': '20000575', 'scheme': 'subject_custom'},
            {'name': 'Foo', 'qcode': '1231245', 'scheme': 'foo'},
            {'name': 'Print', 'qcode': 'Print', 'scheme': cp.DISTRIBUTION},
            {
                'name': 'The Associated Press',
                'qcode': 'ap---',
                'scheme': cp.DESTINATIONS,
            },
        ],
        'urgency': 2,
        'language': 'en-CA',
        'unique_id': 123,
        'firstcreated': datetime(2020, 4, 1, 11, 13, 12, 25, tzinfo=UTC),
        'versioncreated': datetime(2020, 4, 1, 11, 23, 12, 25, tzinfo=UTC),
        'firstpublished': datetime(2020, 4, 1, 11, 33, 12, 25, tzinfo=UTC),
        'genre': [{'name': 'NewsAlert', 'qcode': 'NewsAlert'}],
        'extra': {
            cp.HEADLINE2: 'headline2',
            cp.FILENAME: 'filename',
        },
    }

    def format_item(self, updates=None, return_root=False):
        """Format the baseline article (plus updates) and return the parsed item."""
        xml = self.format(updates)
        root = self.parse(xml)
        if return_root:
            return root
        return root.find('ContentItem')

    def test_can_format(self):
        self.assertTrue(self.formatter.can_format('jimi', {}))

    def test_format(self):
        xml = self.format()
        self.assertIn("<?xml version='1.0' encoding='utf-8'?>", xml)
        self.assertIn('<ContentText><p>Body HTML<br />test', xml)

        root = self.parse(xml)
        self.assertEqual('Publish', root.tag)
        self.assertEqual('false', root.find('Reschedule').text)
        self.assertEqual('false', root.find('IsRegional').text)
        self.assertEqual('true', root.find('CanAutoRoute').text)
        self.assertEqual(str(SEQUENCE_NUMBER), root.find('PublishID').text)
        self.assertEqual('Print', root.find('Services').text)
        self.assertEqual(None, root.find('Username').text)
        self.assertEqual('false', root.find('UseLocalsOut').text)
        self.assertEqual('ap---', root.find('PscCodes').text)
        self.assertEqual('2020-04-01T11:33:12', root.find('PublishDateTime').text)

        item = root.find('ContentItem')
        self.assertEqual(None, item.find('Name').text)
        self.assertEqual('false', item.find('Cachable').text)

        # ids
        self.assertEqual('00000100', item.find('ContentItemID').text)
        self.assertEqual('00000123', item.find('NewsCompID').text)
        self.assertEqual(self.article['guid'], item.find('SystemSlug').text)
        self.assertEqual(self.article['guid'], item.find('FileName').text)
        self.assertEqual(self.article['extra'][cp.FILENAME], item.find('OrigTransRef').text)

        # obvious
        self.assertEqual('Text', item.find('ContentType').text)

        # SDCP-309
        self.assertEqual(self.article['headline'], item.find('Headline2').text)
        self.assertEqual('headline2', item.find('Headline').text)

        self.assertEqual(self.article['creditline'], item.find('Credit').text)
        self.assertEqual(self.article['slugline'], item.find('SlugProper').text)
        self.assertEqual(self.article['source'], item.find('Source').text)
        self.assertEqual(self.article['ednote'], item.find('EditorNote').text)
        self.assertEqual('6', item.find('WordCount').text)
        self.assertEqual('6', item.find('BreakWordCount').text)
        self.assertEqual('6', item.find('Length').text)
        self.assertEqual('Body HTMLtest bold and idiom', item.find('DirectoryText').text)
        self.assertEqual(
            '<p>Body HTML<br />test <strong>bold</strong> and <em>idiom</em></p>',
            item.find('ContentText').text)
        self.assertEqual(None, item.find('Placeline').text)
        self.assertEqual('0', item.find('WritethruValue').text)
        self.assertEqual('Foo bar,baz', item.find('Keyword').text)
        self.assertEqual('National', item.find('Category').text)
        self.assertEqual('National,Health,Politics', item.find('IndexCode').text)
        self.assertEqual(str(self.article['urgency']), item.find('RankingValue').text)
        self.assertEqual('News - Need to Know', item.find('Ranking').text)
        self.assertEqual('1', item.find('Language').text)

        # timestamps
        self.assertEqual('0001-01-01T00:00:00', item.find('EmbargoTime').text)
        self.assertEqual('2020-04-01T11:33:12', item.find('CreatedDateTime').text)
        self.assertEqual('2020-04-01T07:23:12-04:00', item.find('UpdatedDateTime').text)

        # etc
        self.assertEqual('NewsAlert', item.find('VersionType').text)

    def test_writethru(self):
        expected_data = {
            1: '1st',
            2: '2nd',
            3: '3rd',
            4: '4th',
            5: '5th',
            10: '10th',
            100: '100th',
            101: '101st',
        }
        for val, num in expected_data.items():
            item = self.format_item({'rewrite_sequence': val})
            self.assertEqual(num, item.find('WritethruNum').text)
            self.assertEqual(str(val), item.find('WritethruValue').text)
            self.assertEqual('Writethru', item.find('WriteThruType').text)

    def test_dateline(self):
        item = self.format_item({
            'dateline': {
                'source': 'AAP',
                'text': 'sample dateline',
                'located': {
                    'alt_name': '',
                    'state': 'California',
                    'city_code': 'Los Angeles',
                    'city': 'Los Angeles',
                    'dateline': 'city',
                    'country_code': 'US',
                    'country': 'USA',
                    'tz': 'America/Los_Angeles',
                    'state_code': 'CA',
                    'location': {
                        'lat': 34.0522,
                        'lon': -118.2347,
                    },
                }
            },
        })
        self.assertEqual('Los Angeles', item.find('City').text)
        self.assertEqual('California', item.find('Province').text)
        self.assertEqual('USA', item.find('Country').text)
        self.assertEqual('Los Angeles;California;USA', item.find('Placeline').text)
        self.assertEqual('34.0522', item.find('Latitude').text)
        self.assertEqual('-118.2347', item.find('Longitude').text)

    def test_globenewswire(self):
        output = self.format(
            {
                'source': globenewswire.SOURCE,
                'headline': 'Foo',
                'keywords': ['TSX VENTURE:AXL', 'OTC:NTGSF'],
                'anpa_category': [
                    {'name': globenewswire.DESCRIPTION['en'], 'qcode': 'p'},
                ],
                'subject': [
                    {'name': 'FOO', 'qcode': 'FOO', 'scheme': cp.SERVICE},
                    {'name': 'BAR', 'qcode': 'BAR', 'scheme': cp.SERVICE},
                ],
                'extra': {},
            },
            _all=True)
        self.assertEqual(2, len(output))
        root = self.parse(output[0][1])
        item = root.find('ContentItem')
        self.assertEqual('Print', root.find('Services').text)
        self.assertEqual('FOO', root.find('PscCodes').text)
        self.assertEqual('Press Release', item.find('Category').text)
        self.assertEqual('Press Release', item.find('IndexCode').text)
        self.assertEqual('FOO,BAR', item.find('Note').text)
        self.assertEqual('TSX VENTURE:AXL,OTC:NTGSF', item.find('Stocks').text)
        self.assertEqual('Foo', item.find('Headline').text)
        self.assertEqual('Foo', item.find('Headline2').text)

    def test_limits(self):
        long = 'foo bar {}'.format('x' * 200)
        item = self.format_item({
            'headline': long,
            'extra': {
                'headline2': long,
            },
            'keywords': ['foo', 'bar', long],
        })
        self.assertEqual('foo bar', item.find('Headline').text)
        self.assertEqual('foo bar', item.find('Headline2').text)
        self.assertEqual('foo,bar,foo bar', item.find('Keyword').text)

    def test_picture(self):
        updates = {
            'type': 'picture',
            'guid': 'urn:picture',
            'urgency': 5,
            'byline': 'photographer',
            'headline': 'some headline',
            'slugline': 'slug',
            'firstcreated': datetime(2020, 6, 3, 17, 0, 56, tzinfo=UTC),
            'extra': {
                cp.FILENAME: 'NY538',
                'photographer_code': 'stf',
            },
            'subject': [
                {'name': 'Americas', 'qcode': 'A', 'scheme': 'photo_categories'},
            ],
            'creditline': 'THE ASSOCIATED PRESS',
            'original_source': 'The Associated Press',
            'copyrightnotice': 'Copyright 2020 The Associated Press. All rights reserved.',
            'description_text': 'Pedestrians are silhouetted',
            'renditions': {
                'original': {
                    'media': 'media_id',
                    'mimetype': 'image/jpeg',
                },
            },
        }
        root = self.format_item(updates, True)
        self.assertEqual('Pictures', root.find('Services').text)
        self.assertEqual('Online', root.find('PscCodes').text)
        item = root.find('ContentItem')
        self.assertEqual(updates['byline'], item.find('Byline').text)
        self.assertEqual('false', item.find('HeadlineService').text)
        self.assertEqual('A', item.find('Category').text)
        self.assertEqual('None', item.find('VideoType').text)
        self.assertEqual('None', item.find('PhotoType').text)
        self.assertEqual('None', item.find('GraphicType').text)
        self.assertEqual('News - Optional', item.find('Ranking').text)
        self.assertEqual('5', item.find('RankingValue').text)
        self.assertEqual(updates['creditline'], item.find('Credit').text)
        self.assertEqual('Photo', item.find('ContentType').text)
        self.assertEqual(updates['slugline'], item.find('SlugProper').text)
        self.assertEqual(updates['original_source'], item.find('Source').text)
        self.assertEqual(updates['extra'][cp.FILENAME], item.find('OrigTransRef').text)
        self.assertEqual('STF', item.find('BylineTitle').text)
        self.assertEqual(updates['copyrightnotice'][:50], item.find('Copyright').text)
        self.assertEqual(updates['description_text'], item.find('EnglishCaption').text)
        self.assertEqual('2020-06-03T17:00:56', item.find('DateTaken').text)
        self.assertEqual('media_id', item.find('FileName').text)
        self.assertEqual('media_id.jpg', item.find('ViewFile').text)
        self.assertEqual('media_id.jpg', item.find('ContentRef').text)
        self.assertEqual(updates['guid'], item.find('SystemSlug').text)
        self.assertEqual(1, len(item.findall('FileName')))

    def test_picture_amazon(self):
        updates = {
            'type': 'picture',
            'renditions': {
                'original': {
                    'media': '20200807100836/5f2d12c8ced0b19f31ea318ajpeg.jpg',
                },
            },
        }
        item = self.format_item(updates)
        filename = updates['renditions']['original']['media'].replace('/', '-')
        self.assertEqual(os.path.splitext(filename)[0], item.find('FileName').text)
        self.assertEqual(filename, item.find('ViewFile').text)
        self.assertEqual(filename, item.find('ContentRef').text)

    def test_embargo(self):
        embargo = datetime(2020, 7, 22, 13, 10, 5, tzinfo=UTC)
        updates = {
            SCHEDULE_SETTINGS: {
                'utc_embargo': embargo,
            },
        }
        item = self.format_item(updates)
        self.assertEqual('2020-07-22T09:10:05', item.find('EmbargoTime').text)

        item = self.format_item({'embargoed': embargo})
        self.assertEqual('2020-07-22T09:10:05', item.find('EmbargoTime').text)

    def test_format_credit(self):
        item = self.format_item({'source': 'CP', 'creditline': None})
        self.assertEqual('THE CANADIAN PRESS', item.find('Credit').text)

    def test_item_with_picture(self):
        updates = {
            'source': 'CP',
            'associations': {
                'gallery--1': {
                    '_id': 'foo',
                    'type': 'picture',
                    'guid': 'foo:guid',
                    'renditions': {
                        'original': {
                            'media': 'foo',
                            'mimetype': 'image/jpeg',
                        },
                    },
                },
                'gallery--2': {
                    '_id': 'bar',
                    'type': 'picture',
                    'guid': 'bar:guid',
                    'renditions': {
                        'original': {
                            'media': 'bar',
                            'mimetype': 'image/jpeg',
                        },
                    },
                },
                'gallery--3': {  # same picture twice
                    '_id': 'bar',
                    'type': 'picture',
                    'guid': 'bar:guid',
                    'renditions': {
                        'original': {
                            'media': 'bar',
                            'mimetype': 'image/jpeg',
                        },
                    },
                },
            },
        }
        item = self.format_item(updates)
        self.assertEqual('Many', item.find('PhotoType').text)
        self.assertEqual('foo,bar', item.find('PhotoReference').text)

    def test_format_filename_rewrite(self):
        date_1am_et = datetime(2020, 8, 12, 5, tzinfo=UTC)
        date_2am_et = date_1am_et + timedelta(hours=1)
        date_3am_et = date_1am_et + timedelta(hours=2)
        resources['archive'].service.find_one.side_effect = [
            {
                'guid': 'same-cycle',
                'rewrite_of': 'prev-cycle',
                'firstcreated': date_2am_et,
                'unique_id': 2,
                'type': 'text'
            },
            {
                'guid': 'prev-cycle',
                'firstcreated': date_1am_et,
                'unique_id': 1,
                'type': 'text'
            },
        ]
        item = self.format_item({
            'guid': 'last',
            'rewrite_of': 'same-cycle',
            'extra': {},
            'firstcreated': date_3am_et,
            'type': 'text'
        })
        self.assertEqual('prev-cycle', item.find('FileName').text)
        self.assertEqual('prev-cycle', item.find('SystemSlug').text)

    def test_format_fr_CA(self):
        updates = {
            'language': 'fr-CA',
            'anpa_category': [{'name': 'National', 'qcode': 'g'}],
            'rewrite_sequence': 2,
            'subject': [
                {'name': 'Broadcast', 'qcode': cp.BROADCAST, 'scheme': cp.DISTRIBUTION},
            ],
        }
        item = self.format_item(updates)
        self.assertEqual('2', item.find('Language').text)
        self.assertEqual("Nouvelles Générales", item.find('Category').text)
        self.assertEqual("Nouvelles Générales", item.find('IndexCode').text)
        self.assertEqual("Alerte", item.find('VersionType').text)
        self.assertEqual("Nouvelle - Majeur", item.find('Ranking').text)
        self.assertEqual("Radio", item.find('..').find('Services').text)
        self.assertEqual('2', item.find('WritethruValue').text)
        self.assertEqual('2ème', item.find('WritethruNum').text)
        self.assertEqual('Lead', item.find('WriteThruType').text)

    def test_correction_update(self):
        item = self.format_item({
            'extra': {
                cp.UPDATE: 'update text',
                cp.CORRECTION: 'correction text',
            }
        })
        self.assertEqual('update text', item.find('UpdateNote').text)
        self.assertEqual('correction text', item.find('Corrections').text)

    def test_writethru_keeps_newscompid(self):
        resources['archive'].service.find_one.side_effect = [
            {
                'guid': 'same-cycle',
                'rewrite_of': 'prev-cycle',
                'unique_id': 2,
                'type': 'text'
            },
            {'guid': 'prev-cycle', 'unique_id': 1, 'type': 'text'},
        ]
        item = self.format_item({
            'type': 'text',
            'rewrite_of': 'same-cycle',
            'unique_id': 3,
        })
        self.assertEqual('00000001', item.find('NewsCompID').text)

    def test_picture_container_ids(self):
        resources['news'].service.get.side_effect = [[
            {'guid': 'canceled', 'pubstatus': 'canceled', 'type': 'text'},
            {'guid': 'usable', 'pubstatus': 'usable', 'type': 'text'},
            {
                'guid': 'usable2',
                'pubstatus': 'usable',
                'type': 'text',
                'extra': {
                    cp.ORIG_ID: 32 * 'a',  # slug constraints
                }
            },
        ]]
        item = self.format_item({
            'type': 'picture',
            'unique_id': 3,
        })
        resources['news'].service.get.side_effect = None
        self.assertEqual('{}, usable'.format(32 * 'a'), item.find('ContainerIDs').text)

    def test_placeline_washington(self):
        item = self.format_item({
            'dateline': {
                'source': 'AAP',
                'text': 'sample dateline',
                'located': {
                    "dateline": "city",
                    "country_code": "US",
                    "tz": "America/New_York",
                    "city_code": "Washington",
                    "state_code": "DC",
                    "state": "Washington, D.C.",
                    "city": "Washington",
                    "country": "United States",
                    "code": "4140963",
                    "scheme": "geonames",
                },
            },
        })
        self.assertEqual(
            'Washington;District of Columbia;United States',
            item.find('Placeline').text,
        )
        self.assertEqual('District of Columbia', item.find('Province').text)
class CP_AP_ParseTestCase(unittest.TestCase):
    """Tests for the CP-flavoured AP media feed parser and its Jimi output.

    Relies on module-level fixtures: ``parser``, ``data``, ``picture_data``,
    ``provider``, ``resources`` and ``get_fixture_path``.
    """

    app = flask.Flask(__name__)
    app.locators = MagicMock()
    app.config.update({"AP_TAGS_MAPPING": settings.AP_TAGS_MAPPING})
    subscriber = {}
    formatter = JimiFormatter()

    def format(self, item):
        """Format *item* with the Jimi formatter and return the XML string."""
        with patch.dict(superdesk.resources, resources):
            item["unique_id"] = 1
            return self.formatter.format(item, self.subscriber)[0][1]

    def test_slugline(self):
        """Slugline normalization: spaces, slashes and spaced dashes -> '-'."""
        parser = CP_APMediaFeedParser()
        self.assertEqual("foo-bar-baz", parser.process_slugline("foo bar/baz"))
        self.assertEqual("foo-bar", parser.process_slugline("foo-bar"))
        self.assertEqual("foo-bar", parser.process_slugline("foo - bar"))

    def test_parse(self):
        """Full text-item parse of the shared ``data`` fixture."""
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(data, provider)
                self.assertEqual("ba7d03f0cd24a17faa81bebc724bcf3f",
                                 item["guid"])
                self.assertEqual("Story", item["profile"])
                self.assertEqual("WY-Exchange-Coronavirus-Tech",
                                 item["slugline"])
                self.assertEqual("headline1", item["headline"])
                self.assertEqual("headline1", item["extra"][cp.HEADLINE2])
                self.assertIn("copyright information",
                              item["copyrightnotice"])
                self.assertIn("editorial use only", item["usageterms"])
                self.assertEqual("The Associated Press", item["source"])
                self.assertEqual(5, item["urgency"])
                self.assertEqual("Margaret Austin", item["byline"])
                self.assertIn("General news", item["keywords"])
                self.assertIn(
                    {
                        "name": "Feature",
                        "qcode": "Feature",
                    },
                    item["genre"],
                )
                self.assertEqual("UPDATES: With AP Photos.",
                                 item["extra"]["update"])
                self.assertEqual("", item["ednote"])
                self.assertEqual("NYSE:WFC", item["extra"]["stocks"])
                self.assertEqual("m0012", item["extra"][cp.FILENAME])
                self.assertEqual(0, item["extra"]["ap_version"])
                self.assertIn(
                    {
                        "name": "International",
                        "qcode": "w",
                        "scheme": "categories",
                        "translations": {
                            "name": {
                                "en-CA": "International",
                                "fr-CA": "International"
                            }
                        }
                    },
                    item["anpa_category"],
                )
                subjects = [
                    s["name"] for s in item["subject"]
                    if s.get("scheme") == "subject_custom"
                ]
                self.assertEqual(["health"], subjects)
                tags = [
                    s["name"] for s in item["subject"]
                    if s.get("scheme") == cp.TAG
                ]
                self.assertEqual(2, len(tags))
                self.assertIn("APV", tags)
                self.assertIn("TSX", tags)
                products = [
                    s["qcode"] for s in item["subject"]
                    if s.get("scheme") == cp.AP_PRODUCT
                ]
                self.assertEqual(6, len(products))
                self.assertIn("33381", products)
                dateline = item["dateline"]
                self.assertEqual("Wyoming Tribune Eagle", dateline["source"])
                self.assertEqual("CHEYENNE, Wyo.", dateline["text"])
                self.assertIn("located", dateline)
                self.assertEqual("Cheyenne", dateline["located"]["city"])
                self.assertEqual("Wyoming", dateline["located"]["state"])
                self.assertEqual("WY", dateline["located"]["state_code"])
                self.assertEqual("United States",
                                 dateline["located"]["country"])
                self.assertEqual("USA", dateline["located"]["country_code"])
                self.assertEqual(41.13998,
                                 dateline["located"]["location"]["lat"])
                self.assertEqual(-104.82025,
                                 dateline["located"]["location"]["lon"])
                self.assertIn("associations", item)
                self.assertIn("media-gallery--1", item["associations"])
                self.assertIn("media-gallery--2", item["associations"])
                self.assertEqual(1, len(item["place"]))
                self.assertEqual(
                    {
                        "name": "Cheyenne",
                        "qcode": "Cheyenne",
                        "state": "Wyoming",
                        "country": "United States",
                        "world_region": "North America",
                        "location": {
                            "lat": 41.13998,
                            "lon": -104.82025,
                        },
                    },
                    item["place"][0],
                )
                self.assertRegex(item["body_html"], r"^<p>.*</p>$")

    def test_parse_ignore_associations_based_on_type_config(self):
        """Associations are dropped when the provider only accepts text."""
        _provider = {
            "content_types": ["text"],
        }
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(data, _provider)
                self.assertFalse(item.get("associations"))

    def test_parse_picture(self):
        """Picture parse: preview download is served from a local fixture."""
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                with requests_mock.mock() as mock:
                    with open(get_fixture_path("preview.jpg", "ap"),
                              "rb") as f:
                        mock.get(
                            picture_data["data"]["item"]["renditions"]
                            ["preview"]["href"],
                            content=f.read(),
                        )
                    item = parser.parse(picture_data, provider)
                    self.assertEqual("Jae C. Hong", item["byline"])
                    self.assertEqual(5, item["urgency"])
                    self.assertEqual("ASSOCIATED PRESS", item["creditline"])
                    self.assertEqual("America Protests Racial Economics",
                                     item["headline"])
                    self.assertEqual("stf",
                                     item["extra"]["photographer_code"])
                    self.assertIn("Pedestrians are silhouetted",
                                  item["description_text"])
                    self.assertEqual("AP", item["extra"]["provider"])

    def test_parse_embargoed(self):
        """Future embargo sets hold status/schedule; a past one clears it."""
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                source = copy.deepcopy(data)
                embargoed = datetime.now(
                    pytz.utc).replace(microsecond=0) + timedelta(hours=2)
                source["data"]["item"]["embargoed"] = embargoed.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
                source["data"]["item"]["pubstatus"] = "embargoed"
                item = parser.parse(source, provider)
                self.assertEqual(embargoed, item["embargoed"])
                self.assertIn("embargo", item)
                self.assertEqual(
                    {
                        "utc_embargo": embargoed,
                        "time_zone": cp.TZ,
                    },
                    item[SCHEDULE_SETTINGS],
                )
                self.assertEqual(PUB_STATUS.HOLD, item["pubstatus"])
                self.assertEqual(["Advance"],
                                 [genre["name"] for genre in item["genre"]])

                # An embargo in the past must not set the embargo fields.
                embargoed = embargoed - timedelta(hours=5)
                source["data"]["item"]["embargoed"] = embargoed.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
                item = parser.parse(source, provider)
                self.assertEqual(embargoed, item["embargoed"])
                self.assertNotIn("embargo", item)

    def test_category_politics_international(self):
        """Politics fixture maps to the International ('w') category."""
        with open(get_fixture_path("politics.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual(
                    [{
                        "name": "International",
                        "qcode": "w",
                        "scheme": CATEGORY_SCHEME,
                        "translations": {
                            "name": {
                                "en-CA": "International",
                                "fr-CA": "International"
                            }
                        }
                    }],
                    item["anpa_category"],
                )
                self.assertEqual("US-Biden-Staff", item["slugline"])

    def test_category_apv(self):
        """APV fixture also maps to the International ('w') category."""
        with open(get_fixture_path("apv.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual(
                    [{
                        "name": "International",
                        "qcode": "w",
                        "scheme": CATEGORY_SCHEME,
                        "translations": {
                            "name": {
                                "en-CA": "International",
                                "fr-CA": "International"
                            }
                        }
                    }],
                    item["anpa_category"],
                )
                self.assertEqual("EU-Spain-Storm-Aftermath", item["slugline"])

    def test_category_tennis(self):
        """Sports agate fixture maps to the Agate ('r') category only."""
        with open(get_fixture_path("ap-sports.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual(
                    [{
                        "name": "Agate",
                        "qcode": "r",
                        "scheme": CATEGORY_SCHEME,
                        "translations": {
                            "name": {
                                "en-CA": "Agate",
                                "fr-CA": "Statistiques"
                            }
                        }
                    }],
                    item["anpa_category"],
                )
                self.assertEqual([], [
                    s["name"] for s in item["subject"]
                    if s.get("scheme") == AP_SUBJECT_CV
                ])
                output = self.format(item)
                self.assertIn("<Category>Agate</Category>", output)
                self.assertIn("<IndexCode>Agate</IndexCode>", output)

    def test_ignore_slugline_to_subject_map(self):
        """Agate items ignore the slugline -> subject mapping."""
        with open(get_fixture_path("ap-sports.json", "ap")) as fp:
            _data = json.load(fp)

        # Prefix slugline with `BC` so slugline -> subject mapping works
        # in this case, slugline -> "BC-TEN-" -> "15065000"
        _data["data"]["item"][
            "slugline"] = "BC" + _data["data"]["item"]["slugline"][2:]

        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual(
                    [{
                        "name": "Agate",
                        "qcode": "r",
                        "scheme": CATEGORY_SCHEME,
                        "translations": {
                            "name": {
                                "en-CA": "Agate",
                                "fr-CA": "Statistiques"
                            }
                        }
                    }],
                    item["anpa_category"],
                )
                self.assertEqual([], [
                    s["name"] for s in item["subject"]
                    if s.get("scheme") == AP_SUBJECT_CV
                ])
                output = self.format(item)
                self.assertIn("<Category>Agate</Category>", output)
                # Make sure `IndexCode` only contains `Agate` and not `Sport` or `Tennis`
                self.assertIn("<IndexCode>Agate</IndexCode>", output)

    def test_slugline_prev_version(self):
        """An ingested update keeps the slugline of its previous version."""
        with open(get_fixture_path("ap-sports.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                resources["ingest"].service.find_one.return_value = {
                    "slugline": "prev-slugline",
                }
                item = parser.parse(_data, {})
                # Reset the shared mock for subsequent tests.
                resources["ingest"].service.find_one.return_value = None
                self.assertEqual("prev-slugline", item["slugline"])

    def test_aps_category(self):
        """APS fixture maps to the Advisory category."""
        with open(get_fixture_path("ap-aps.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual("Advisory", item["anpa_category"][0]["name"])

    def test_parse_agate_headings(self):
        """Agate headings from the NITF payload survive into body_html."""
        with open(get_fixture_path("ap-agate.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            xml = etree.parse(get_fixture_path("ap-agate-nitf.xml", "ap"))
            parsed = nitf.NITFFeedParser().parse(xml)
            _data["nitf"] = parsed
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertIn("<p>Atlantic Division</p>", item["body_html"])

    def test_parse_table(self):
        """NITF tables are kept in body_html and pass through the formatter."""
        with open(get_fixture_path("ap-table.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            xml = etree.parse(get_fixture_path("ap-table-nitf.xml", "ap"))
            parsed = nitf.NITFFeedParser().parse(xml)
            _data["nitf"] = parsed
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertIn("<table>", item["body_html"])
                output = self.format(item)
                jimi = etree.fromstring(output.encode("utf-8"))
                # NOTE: removed leftover debug `print("jimi", jimi)`.
                content = jimi.find("ContentItem").find("ContentText").text
                self.assertIn("table", content)

    def test_parse_subject_duplicates(self):
        """Parsed subjects contain no duplicate qcodes."""
        with open(get_fixture_path("ap-subject.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                qcodes = [subj["qcode"] for subj in item["subject"]]
                self.assertEqual(len(qcodes), len(set(qcodes)))

    def test_parse_aps_right_now(self):
        """APS 'MI Right Now' fixture maps to the International category."""
        with open(get_fixture_path("ap-aps-mi-right-now.json", "ap")) as fp:
            _data = json.load(fp)
        with self.app.app_context():
            with patch.dict(superdesk.resources, resources):
                item = parser.parse(_data, {})
                self.assertEqual("International",
                                 item["anpa_category"][0]["name"])
def test_fetch_to_jimi(self, update_renditions_mock): service = OrangelogicSearchProvider(self.provider) update_renditions_mock.side_effects = set_rendition self.app.media.get.return_value = io.BytesIO( read_fixture( "9e627f74b97841b3b8562b6547ada9c7-d1538139479c43e88021152.jpg", "rb")) with HTTMock(auth_ok, fetch_ok): with patch.dict(superdesk.resources, resources): fetched = service.fetch({}) update_renditions_mock.assert_called_once_with( fetched, "https://example.com/htm/GetDocumentAPI.aspx?F=TRX&DocID=2RLQZBCB4R4R4&token=token.foo", None, ) self.assertEqual("picture", fetched["type"]) self.assertIsInstance(fetched["firstcreated"], datetime) # populate ids fetched["family_id"] = fetched["guid"] fetched["unique_id"] = 1 with patch.dict(superdesk.resources, resources): formatter = JimiFormatter() xml = formatter.format(fetched, {})[0][1] root = etree.fromstring(xml.encode(formatter.ENCODING)) self.assertEqual("Pictures", root.find("Services").text) item = root.find("ContentItem") self.assertEqual("Zhang Yuwei", item.find("Byline").text) self.assertEqual("I", item.find("Category").text) self.assertEqual("News - Optional", item.find("Ranking").text) self.assertEqual("5", item.find("RankingValue").text) self.assertEqual("THE ASSOCIATED PRESS", item.find("Credit").text) self.assertEqual("Virus Outbreak China Vaccine", item.find("SlugProper").text) self.assertEqual("Unknown AP", item.find("Source").text) self.assertEqual("Beijing", item.find("City").text) self.assertEqual("China", item.find("Country").text) self.assertEqual("Beijing;;China", item.find("Placeline").text) # self.assertEqual('XIN902', item.find('OrigTransRef').text) self.assertEqual("SUB", item.find("BylineTitle").text) self.assertEqual("NHG", item.find("CaptionWriter").text) self.assertEqual("Xinhua", item.find("Copyright").text) self.assertIn( "In this April 10, 2020, photo released by Xinhua News Agency, a staff", item.find("EnglishCaption").text, ) self.assertEqual("2020-04-12T00:09:37", 
item.find("DateTaken").text) self.assertEqual( "NO SALES, PHOTO RELEASED BY XINHUA NEWS AGENCY APRIL 10, 2020 PHOTO", item.find("SpecialInstructions").text, ) self.assertEqual("Unknown AP", item.find("ArchiveSources").text) self.assertEqual("9e627f74b97841b3b8562b6547ada9c7", item.find("CustomField1").text) self.assertEqual("Xinhua", item.find("CustomField6").text) self.assertEqual("9e627f74b97841b3b8562b6547ada9c7", item.find("SystemSlug").text)