def parse_message(self, tree, provider):
    """Parse a NewsMessage tree into a flat list of items.

    Every child element of every ``itemSet`` found under *tree* is handed to
    ``self.parse_item``; the parsed items are returned in document order.
    ``self.root`` is pointed at *tree* before any item is parsed.

    :param tree: root element of the message
    :param provider: ingest provider, forwarded to the error factory
    :return: list with one parsed item per ``itemSet`` child
    :raises ParserError: wraps any exception raised while parsing
    """
    try:
        self.root = tree
        return [
            self.parse_item(item_tree)
            for item_set in tree.findall(self.qname("itemSet"))
            for item_tree in item_set
        ]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a NewsML-G2 message and stamp every item with the header priority.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items, each with ``priority`` taken from the header
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    parsed_items = []
    try:
        header = self.parse_header(xml)
        # Flatten "itemSet -> children" into one stream of item elements.
        item_trees = (
            item_tree
            for item_set in xml.findall(self.qname('itemSet'))
            for item_tree in item_set
        )
        for item_tree in item_trees:
            entry = self.parse_item(item_tree)
            entry['priority'] = header['priority']
            parsed_items.append(entry)
        return parsed_items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a BusinessDesk NewsML-G2 message into a list of items.

    Only ``itemSet`` children carrying a ``guid`` attribute are parsed.  Each
    item gets fixed priority, category, subject and urgency values, a
    Wellington dateline when exactly one matching city is found, and the
    ``NZ`` entries of the ``locators`` vocabulary as its place.  When the item
    has both a body and a dateline, the body paragraphs are scanned for a
    leading ``By `` byline, the `` (BusinessDesk) - `` dateline marker and a
    trailing ``(BusinessDesk)`` sign-off.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = 6
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                item.setdefault('word_count', get_word_count(item['body_html']))
                # Hard code the urgency
                item['urgency'] = 3

                # Dateline is always Wellington in NZ
                located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2')
                           if c.get('city', '').lower() == 'wellington']
                if len(located) == 1:
                    item['dateline'] = dict()
                    item['dateline']['located'] = located[0]

                # ``.get`` rather than indexing: the dateline is only created
                # above when the city lookup is unambiguous, so the key may be
                # missing and must not abort the whole parse.
                if item.get('body_html') and item.get('dateline'):
                    parsed = parse_html(item.get('body_html'), content='xml')
                    pars = parsed.xpath('//p')
                    last_index = len(pars) - 1
                    for index, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if index == 0 and par.text.startswith('By '):
                            # Drop only the leading marker; later "By "
                            # occurrences belong to the byline itself.
                            item['byline'] = par.text[len('By '):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                        if source:
                            item['dateline']['date'] = date_parser(date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == '(BusinessDesk)' and index == last_index:
                            par.getparent().remove(par)
                    item['body_html'] = to_string(parsed, remove_root_div=True)

                locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                if locator_map:
                    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_raise_newsmlTwoParserError(self):
    """Check that ``ParserError.newsmlTwoParserError`` wraps its cause and logs once.

    The factory is called from inside an ``except`` block, then the resulting
    error's code, message, wrapped exception and the single logged error line
    are verified.
    """
    with assert_raises(ParserError) as error_context:
        try:
            ex = Exception("Testing newsmlTwoParserError")
            raise ex
        except Exception:
            raise ParserError.newsmlTwoParserError(ex, self.provider)
    exception = error_context.exception
    # assertEqual (not assertTrue(a == b)) so a failure reports both values.
    self.assertEqual(exception.code, 1005)
    self.assertEqual(exception.message, "NewsML2 input could not be processed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing newsmlTwoParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "ParserError Error 1005 - NewsML2 input could not be processed: "
                     "Testing newsmlTwoParserError on channel TestProvider")
def parse(self, xml, provider=None):
    """Parse a single news item, filling in headline and abstract fallbacks.

    A missing headline is replaced by the first 100 characters of the body
    text.  The abstract is taken from the first ``description`` element whose
    role is ``drol:summary``, when that element has text.

    :param xml: root element of the item; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: one-element list holding the parsed item
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    try:
        item = self.parse_item(xml)

        if not item.get('headline'):
            body_text = text_utils.get_text(item.get('body_html', ''), 'html')
            item['headline'] = body_text[:100]

        summaries = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})
        if summaries:
            abstract = summaries[0].text
            if abstract:
                item['abstract'] = abstract

        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a NewsML-G2 message, keeping only item elements that have a guid.

    Each kept item receives the header priority, a fixed ``anpa_category`` and
    ``subject``, and a ``word_count`` computed from its body when the parsed
    item does not already carry one.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    collected = []
    try:
        header = self.parse_header(xml)
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue
                entry = self.parse_item(item_tree)
                entry['priority'] = header['priority']
                entry['anpa_category'] = [{'qcode': 'f'}]
                subject_name = subject_codes['04000000']
                entry['subject'] = [{'qcode': '04000000', 'name': subject_name}]
                # The count is computed up front (as setdefault's argument) and
                # only stored when the item has no word_count yet.
                word_count = get_word_count(entry['body_html'])
                entry.setdefault('word_count', word_count)
                collected.append(entry)
        return collected
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse either a full NewsML-G2 message or a standalone item.

    Items found inside ``itemSet`` elements always get the header priority.
    When *xml* itself is a ``newsItem`` or ``packageItem`` it is parsed too,
    and the header priority is used only if the item has none of its own.

    :param xml: root element; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    collected = []
    try:
        header = self.parse_header(xml)
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                entry = self.parse_item(item_tree)
                entry['priority'] = header['priority']
                collected.append(entry)

        # The loop above never breaks, so this check runs on every call.
        if xml.tag.endswith(('newsItem', 'packageItem')):
            standalone = self.parse_item(xml)
            standalone.setdefault('priority', header['priority'])
            collected.append(standalone)

        return collected
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a single STT news item and enrich it with STT-specific metadata.

    On top of ``self.parse_item`` this fills in: a headline fallback from the
    body text, ``firstpublished`` (defaulting to ``versioncreated``), the
    abstract, genres and versions resolved through ``self.getVocabulary``,
    places built from ``sttlocmeta:`` assertions, and the public / private
    editorial notes.

    :param xml: root element of the item; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: one-element list holding the parsed item
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    try:
        ns = {'iptc': IPTC_NS}
        item = self.parse_item(xml)

        if not item.get('headline'):
            item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

        # populate published for newsroom archive
        item.setdefault('firstpublished', item.get('versioncreated'))

        # abstract
        summaries = xml.xpath("//iptc:description[@role='drol:summary']", namespaces=ns)
        if summaries:
            abstract = summaries[0].text
            if abstract:
                item['abstract'] = abstract

        # genre: "sttgenre:" codes go to item['genre'], "sttversion:" codes to
        # item['subject']; anything else is ignored.
        for genre_elt in xml.xpath("//iptc:genre", namespaces=ns):
            full_qcode = genre_elt.get('qcode')
            if full_qcode is None:
                continue
            if full_qcode.startswith('sttgenre:'):
                entry = {'qcode': full_qcode[9:]}
                vocabulary, target = "genre", 'genre'
            elif full_qcode.startswith('sttversion:'):
                entry = {'qcode': full_qcode[11:], 'scheme': 'sttversion'}
                vocabulary, target = "sttgenre", 'subject'
            else:
                continue
            name_elt = genre_elt.find(self.qname('name'))
            label = name_elt.text if name_elt is not None and name_elt.text else ""
            try:
                entry['name'] = self.getVocabulary(vocabulary, entry['qcode'], label)
            except ValueError:
                # the vocabulary lookup rejected this code: skip the element
                continue
            item.setdefault(target, []).append(entry)

        # location
        for location_elt in xml.xpath("//iptc:assert", namespaces=ns):
            location_qcode = location_elt.get("qcode")
            if not location_qcode or not location_qcode.startswith("sttlocmeta:"):
                continue
            place = {"scheme": "sttlocmeta", "qcode": location_qcode.split(':')[-1]}
            location_name = location_elt.find(self.qname('name'))
            if location_name is not None:
                place['name'] = location_name.text

            broader_elts = location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']", namespaces=ns)
            for broader_elt in broader_elts:
                qcode = broader_elt.get('qcode')
                if not qcode:
                    continue
                for key, mapping in STT_LOCATION_MAP.items():
                    if not qcode.startswith(key + ":"):
                        continue
                    if "qcode" in mapping:
                        qcode = qcode[len(key) + 1:]
                    try:
                        name = broader_elt.find(self.qname('name')).text
                    except AttributeError:
                        name = ""
                    try:
                        name = self.getVocabulary(key, qcode, name)
                    except ValueError:
                        continue
                    place[mapping["qcode"]] = qcode
                    if "name" in mapping:
                        place[mapping["name"]] = name
            item.setdefault('place', []).append(place)

        # public editorial note
        # stt has specific roles for public and private editorial notes
        # so we remove ednote found by parent parser, as it takes first one
        # as a public note
        item.pop('ednote', None)
        public_notes = xml.xpath("//iptc:edNote[@role='sttnote:public']", namespaces=ns)
        if public_notes:
            ednote = public_notes[0].text
            if ednote:
                item['ednote'] = ednote

        # private editorial note
        private_notes = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces=ns)
        if private_notes:
            private_note = private_notes[0].text
            if private_note:
                item.setdefault('extra', {})['sttnote_private'] = private_note

        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a single STT news item and enrich it with STT-specific metadata.

    On top of ``self.parse_item`` this fills in: a headline fallback from the
    body text, the abstract, genres and versions resolved through
    ``self.getVocabulary``, places built from ``sttlocmeta:default:``
    assertions, and the public / private editorial notes.

    :param xml: root element of the item; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: one-element list holding the parsed item
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    try:
        ns = {'iptc': IPTC_NS}
        item = self.parse_item(xml)

        if not item.get('headline'):
            item['headline'] = text_utils.get_text(item.get('body_html', ''), 'html')[:100]

        # abstract
        summaries = xml.xpath("//iptc:description[@role='drol:summary']", namespaces=ns)
        if summaries:
            abstract = summaries[0].text
            if abstract:
                item['abstract'] = abstract

        # genre: "sttgenre:" codes go to item['genre'], "sttversion:" codes to
        # item['subject']; anything else is ignored.
        for genre_elt in xml.xpath("//iptc:genre", namespaces=ns):
            full_qcode = genre_elt.get('qcode')
            if full_qcode is None:
                continue
            if full_qcode.startswith('sttgenre:'):
                entry = {'qcode': full_qcode[9:]}
                vocabulary, target = "genre", 'genre'
            elif full_qcode.startswith('sttversion:'):
                entry = {'qcode': full_qcode[11:], 'scheme': 'sttversion'}
                vocabulary, target = "sttgenre", 'subject'
            else:
                continue
            name_elt = genre_elt.find(self.qname('name'))
            label = name_elt.text if name_elt is not None and name_elt.text else ""
            try:
                entry['name'] = self.getVocabulary(vocabulary, entry['qcode'], label)
            except ValueError:
                # the vocabulary lookup rejected this code: skip the element
                continue
            item.setdefault(target, []).append(entry)

        # location
        for location_elt in xml.xpath("//iptc:assert", namespaces=ns):
            location_qcode = location_elt.get("qcode")
            if not location_qcode or not location_qcode.startswith("sttlocmeta:default:"):
                continue
            place = {"scheme": "sttlocmeta:default", "qcode": location_qcode[19:]}

            broader_elts = location_elt.xpath(".//iptc:broader[@type='cpnat:geoArea']", namespaces=ns)
            for broader_elt in broader_elts:
                qcode = broader_elt.get('qcode')
                if not qcode:
                    continue
                for key, mapping in STT_LOCATION_MAP.items():
                    if not qcode.startswith(key + ":"):
                        continue
                    if "qcode" in mapping:
                        qcode = qcode[len(key) + 1:]
                    try:
                        name = broader_elt.find(self.qname('name')).text
                    except AttributeError:
                        name = ""
                    try:
                        name = self.getVocabulary(key, qcode, name)
                    except ValueError:
                        continue
                    place[mapping["qcode"]] = qcode
                    if "name" in mapping:
                        place[mapping["name"]] = name
            item.setdefault('place', []).append(place)

        # public editorial note
        # stt has specific roles for public and private editorial notes
        # so we remove ednote found by parent parser, as it takes first one
        # as a public note
        item.pop('ednote', None)
        public_notes = xml.xpath("//iptc:edNote[@role='sttnote:public']", namespaces=ns)
        if public_notes:
            ednote = public_notes[0].text
            if ednote:
                item['ednote'] = ednote

        # private editorial note
        private_notes = xml.xpath("//iptc:edNote[@role='sttnote:private']", namespaces=ns)
        if private_notes:
            private_note = private_notes[0].text
            if private_note:
                item.setdefault('extra', {})['sttnote_private'] = private_note

        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a BusinessDesk NewsML-G2 message into a list of items.

    Only ``itemSet`` children carrying a ``guid`` attribute are parsed.  Each
    item gets fixed priority, category, subject and urgency values, a
    Wellington dateline when exactly one matching city is found, and the
    ``NZ`` entries of the ``locators`` vocabulary as its place.  When the item
    has both a body and a dateline, the body paragraphs are scanned for a
    leading ``By `` byline, the `` (BusinessDesk) - `` dateline marker and a
    trailing ``(BusinessDesk)`` sign-off.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = 6
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [{
                    'qcode': '04000000',
                    'name': subject_codes['04000000'],
                }]
                item.setdefault('word_count',
                                get_word_count(item['body_html']))
                # Hard code the urgency
                item['urgency'] = 3

                # Dateline is always Wellington in NZ
                located = [
                    c for c in app.locators.find_cities(
                        country_code='NZ', state_code='NZ.G2')
                    if c.get('city', '').lower() == 'wellington'
                ]
                if len(located) == 1:
                    item['dateline'] = dict()
                    item['dateline']['located'] = located[0]

                # ``.get`` rather than indexing: the dateline is only created
                # above when the city lookup is unambiguous, so the key may be
                # missing and must not abort the whole parse.
                if item.get('body_html') and item.get('dateline'):
                    parsed = parse_html(item.get('body_html'), content='xml')
                    pars = parsed.xpath('//p')
                    last_index = len(pars) - 1
                    for index, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if index == 0 and par.text.startswith('By '):
                            # Drop only the leading marker; later "By "
                            # occurrences belong to the byline itself.
                            item['byline'] = par.text[len('By '):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(
                            ' (BusinessDesk) - ')
                        if source:
                            item['dateline']['date'] = date_parser(
                                date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == '(BusinessDesk)' and index == last_index:
                            par.getparent().remove(par)
                    item['body_html'] = to_string(parsed,
                                                  remove_root_div=True)

                locator_map = superdesk.get_resource_service(
                    'vocabularies').find_one(req=None, _id='locators')
                if locator_map:
                    item['place'] = [
                        x for x in locator_map.get('items', [])
                        if x['qcode'].upper() == 'NZ'
                    ]

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a DPA NewsML-G2 message into a list of items.

    For every ``itemSet`` child the parsed item is post-processed:

    * ``firstcreated`` comes from the ``publicationDate`` time element of the
      item body, falling back to ``versioncreated``; both dates are converted
      to UTC.
    * urgency 4 is lowered to 3.
    * a ``services-products`` subject is derived from the first
      ``anpa_category`` (``NEWS/GENERAL`` when there is none or it is not
      mapped), and ``sources`` / ``distribution`` subjects are appended.
    * slugline and keywords are emptied.
    * the headline is prefixed with the display name of each of the item's
      own genres, except ``dpatextgenre:1``.
    * duplicated subjects are removed.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                item = self.parse_item(item_tree)

                try:
                    published = item_tree.xpath(
                        './/xhtml:body/xhtml:header/'
                        'xhtml:time[@class="publicationDate"]/@data-datetime',
                        namespaces=NS)[0]
                except IndexError:
                    item['firstcreated'] = item['versioncreated']
                else:
                    item['firstcreated'] = dateutil.parser.parse(published)

                item['firstcreated'] = item['firstcreated'].astimezone(pytz.utc)
                item['versioncreated'] = item['versioncreated'].astimezone(pytz.utc)

                if item['urgency'] == 4:
                    item['urgency'] = 3

                # mapping services-products: only the first category counts,
                # NEWS/GENERAL is used when the item has no category at all.
                service_qcode = 'NEWS/GENERAL'
                for cat in item.get('anpa_category', []):
                    service_qcode = self.MAPPING_CATEGORY.get(
                        cat.get('qcode', '').upper(), 'NEWS/GENERAL'
                    )
                    break
                item.setdefault('subject', []).append({
                    'name': service_qcode,
                    'qcode': service_qcode,
                    'parent': 'NEWS',
                    'scheme': 'services-products'
                })

                # Source is DPA
                credit = {"name": 'DPA', "qcode": 'DPA', "scheme": "sources"}
                item.setdefault('subject', []).append(credit)

                # Distribution is default
                dist = {"name": 'default', "qcode": 'default', "scheme": "distribution"}
                item.setdefault('subject', []).append(dist)

                # Slugline and keywords are empty
                item['slugline'] = None
                item['keywords'] = []

                # Find genres and verify their roles and qcodes to acceptance criteria.
                # The path is relative (".//"): an absolute "//" would match the
                # genres of every item in the document, not just this one.
                genres = item_tree.xpath('.//iptc:genre', namespaces=NS)
                for genre in genres:
                    genre_qcode = genre.get('qcode')
                    if not genre_qcode or genre_qcode == 'dpatextgenre:1':
                        continue
                    for genre_name in genre.findall(self.qname('name')):
                        try:
                            genre_role = genre_name.attrib['role']
                            if genre_role == 'nrol:display':
                                item['headline'] = "({genre}): {headline}".format(
                                    genre=genre_name.text,
                                    headline=item['headline']
                                )
                                break
                        except KeyError:
                            continue

                # remove duplicated subject
                item['subject'] = [
                    dict(i) for i, _ in itertools.groupby(sorted(item['subject'], key=lambda k: k['qcode']))
                ]

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a BusinessDesk NewsML-G2 message into a list of items.

    Only ``itemSet`` children carrying a ``guid`` attribute are parsed.  Each
    item gets fixed priority, category, subject and urgency values, a
    Wellington dateline when exactly one matching city is found, and the
    ``NZ`` entries of the ``locators`` vocabulary as its place.  When the item
    has both a body and a dateline, the body paragraphs are scanned for a
    leading ``By `` byline, the `` (BusinessDesk) - `` dateline marker and a
    trailing ``(BusinessDesk)`` sign-off.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname("itemSet")):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if "guid" not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item["priority"] = 6
                item["anpa_category"] = [{"qcode": "f"}]
                item["subject"] = [{
                    "qcode": "04000000",
                    "name": subject_codes["04000000"],
                }]
                item.setdefault("word_count",
                                get_word_count(item["body_html"]))
                # Hard code the urgency
                item["urgency"] = 3

                # Dateline is always Wellington in NZ
                located = [
                    c for c in app.locators.find_cities(
                        country_code="NZ", state_code="NZ.G2")
                    if c.get("city", "").lower() == "wellington"
                ]
                if len(located) == 1:
                    item["dateline"] = dict()
                    item["dateline"]["located"] = located[0]

                # ``.get`` rather than indexing: the dateline is only created
                # above when the city lookup is unambiguous, so the key may be
                # missing and must not abort the whole parse.
                if item.get("body_html") and item.get("dateline"):
                    parsed = parse_html(item.get("body_html"), content="xml")
                    pars = parsed.xpath("//p")
                    last_index = len(pars) - 1
                    for index, par in enumerate(pars):
                        if not par.text:
                            continue
                        # check the first par for a byline
                        if index == 0 and par.text.startswith("By "):
                            # Drop only the leading marker; later "By "
                            # occurrences belong to the byline itself.
                            item["byline"] = par.text[len("By "):]
                            par.getparent().remove(par)
                        date, source, the_rest = par.text.partition(
                            " (BusinessDesk) - ")
                        if source:
                            item["dateline"]["date"] = date_parser(
                                date, fuzzy=True)
                            par.text = the_rest
                        # remove the signoff if in the last par
                        if par.text == "(BusinessDesk)" and index == last_index:
                            par.getparent().remove(par)
                    item["body_html"] = to_string(parsed,
                                                  remove_root_div=True)

                locator_map = superdesk.get_resource_service(
                    "vocabularies").find_one(req=None, _id="locators")
                if locator_map:
                    item["place"] = [
                        x for x in locator_map.get("items", [])
                        if x["qcode"].upper() == "NZ"
                    ]

                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a DPA NewsML-G2 message into a list of items.

    For every ``itemSet`` child the parsed item is post-processed:
    ``firstcreated`` is read from the ``publicationDate`` time element of the
    body (falling back to ``versioncreated``) and both dates are converted to
    UTC; urgency 4 becomes 3; ``services-products``, ``sources`` and
    ``distribution`` subjects are appended; slugline and keywords are emptied.

    :param xml: root element of the message; also stored on ``self.root``
    :param provider: ingest provider, forwarded to the error factory
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    collected = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                entry = self.parse_item(item_tree)

                publication_dates = item_tree.xpath(
                    './/xhtml:body/xhtml:header/'
                    'xhtml:time[@class="publicationDate"]/@data-datetime',
                    namespaces=NS)
                if publication_dates:
                    first_created = dateutil.parser.parse(publication_dates[0])
                else:
                    first_created = entry['versioncreated']
                entry['firstcreated'] = first_created.astimezone(pytz.utc)
                entry['versioncreated'] = entry['versioncreated'].astimezone(pytz.utc)

                if entry['urgency'] == 4:
                    entry['urgency'] = 3

                # mapping services-products: only the first category counts,
                # NEWS/GENERAL is used when the item has no category at all.
                service_qcode = 'NEWS/GENERAL'
                for cat in entry.get('anpa_category', []):
                    service_qcode = self.MAPPING_CATEGORY.get(
                        cat.get('qcode', '').upper(), 'NEWS/GENERAL')
                    break

                subjects = entry.setdefault('subject', [])
                subjects.append({
                    'name': service_qcode,
                    'qcode': service_qcode,
                    'parent': 'NEWS',
                    'scheme': 'services-products'
                })
                # Source is DPA
                subjects.append({"name": 'DPA', "qcode": 'DPA', "scheme": "sources"})
                # Distribution is default
                subjects.append({"name": 'default', "qcode": 'default', "scheme": "distribution"})

                # Slugline and keywords are empty
                entry['slugline'] = None
                entry['keywords'] = []

                collected.append(entry)
        return collected
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)