def _update(self, provider):
    """Ingest the files found in the provider's configured folder.

    This is a generator. For every regular file in ``self.path`` (in the
    order returned by ``get_sorted_files`` sorted by creation) whose
    modification time passes ``self.is_latest_content``, the file is parsed
    with ``self.parser.parse_message``, timestamps are added, the file is
    moved with ``success=True`` and ``[item]`` is yielded. Files that are not
    "latest" are moved with ``success=True`` without being parsed.

    Nothing is yielded (and no notification is pushed) when the provider has
    no ``config.path``.

    :param provider: ingest provider dict; ``provider['config']['path']`` is
        the folder to scan and ``provider['last_updated']`` is passed to
        ``is_latest_content``
    :raises ParserError: ``newsmlOneParserError`` when the XML cannot be parsed
    :raises ProviderError: ``ingestError`` on any other unexpected exception
    """
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)

    if not self.path:
        return

    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            # Build the path once instead of re-joining it for every call.
            filepath = os.path.join(self.path, filename)
            if not os.path.isfile(filepath):
                continue

            last_updated = datetime.fromtimestamp(os.lstat(filepath).st_mtime, tz=utc)
            if not self.is_latest_content(last_updated, provider.get('last_updated')):
                self.move_file(self.path, filename, provider=provider, success=True)
                continue

            # Read the content and close the handle before the file is moved
            # and before control is handed to the consumer of the generator.
            with open(filepath, 'r') as f:
                content = f.read()

            item = self.parser.parse_message(etree.fromstring(content), provider)
            self.add_timestamps(item)
            self.move_file(self.path, filename, provider=provider, success=True)
            yield [item]
        except etreeParserError as ex:
            # logger.exception already attaches the active traceback; the
            # exception must not be passed as a %-format argument.
            logger.exception("Ingest Type: AFP - File: %s could not be processed", filename)
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.newsmlOneParserError(ex, provider)
        except ParserError:
            # Best effort: park the file as failed and carry on with the next one.
            self.move_file(self.path, filename, provider=provider, success=False)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ProviderError.ingestError(ex, provider)

    push_notification('ingest:update')
def _update(self, provider):
    """Ingest the files found in the provider's configured folder.

    This is a generator. For every regular file in ``self.path`` (in the
    order returned by ``get_sorted_files`` sorted by creation) whose
    modification time passes ``self.is_latest_content``, the file is parsed
    with ``self.parser.parse_message``, timestamps are added, the file is
    moved with ``success=True`` and ``[item]`` is yielded. Files that are not
    "latest" are moved with ``success=True`` without being parsed.

    Nothing is yielded (and no notification is pushed) when the provider has
    no ``config.path``.

    :param provider: ingest provider dict; ``provider['config']['path']`` is
        the folder to scan and ``provider['last_updated']`` is passed to
        ``is_latest_content``
    :raises ParserError: ``newsmlOneParserError`` when the XML cannot be parsed
    :raises ProviderError: ``ingestError`` on any other unexpected exception
    """
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)

    if not self.path:
        return

    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            # Build the path once instead of re-joining it for every call.
            filepath = os.path.join(self.path, filename)
            if not os.path.isfile(filepath):
                continue

            last_updated = datetime.fromtimestamp(os.lstat(filepath).st_mtime, tz=utc)
            if not self.is_latest_content(last_updated, provider.get('last_updated')):
                self.move_file(self.path, filename, provider=provider, success=True)
                continue

            # Read the content and close the handle before the file is moved
            # and before control is handed to the consumer of the generator.
            with open(filepath, 'r') as f:
                content = f.read()

            item = self.parser.parse_message(etree.fromstring(content), provider)
            self.add_timestamps(item)
            self.move_file(self.path, filename, provider=provider, success=True)
            yield [item]
        except etreeParserError as ex:
            # logger.exception already attaches the active traceback; the
            # exception must not be passed as a %-format argument.
            logger.exception("Ingest Type: AFP - File: %s could not be processed", filename)
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.newsmlOneParserError(ex, provider)
        except ParserError:
            # Best effort: park the file as failed and carry on with the next one.
            self.move_file(self.path, filename, provider=provider, success=False)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ProviderError.ingestError(ex, provider)

    push_notification('ingest:update')
def parse(self, xml, provider=None):
    """Convert a NewsML 1.2 document into an ingest item.

    Reads source, transmission id, priority, language, keywords, subjects,
    body, word count, usage terms and genre from ``xml`` and returns the
    result of ``self.populate_fields`` on the collected dict.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the raised error
    :raises ParserError: ``newsmlOneParserError`` wrapping any exception
    """
    item = {}
    try:
        self.root = xml

        source_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
        if source_el is not None:
            party_el = source_el.find('Party')
            item['original_source'] = party_el.get('FormalName', '')

        transmission_el = xml.find('NewsEnvelope/TransmissionId')
        if transmission_el is not None:
            item['ingest_provider_sequence'] = transmission_el.text

        priority_el = xml.find('NewsEnvelope/Priority')
        if priority_el is None:
            raw_priority = None
        else:
            raw_priority = priority_el.text
        item['priority'] = self.map_priority(raw_priority)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        language_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if language_els is not None:
            languages = self.parse_attributes_as_dictionary(language_els)
            if len(languages):
                item['language'] = languages[0]['FormalName']
            else:
                item['language'] = ''

        keyword_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
        item['keywords'] = self.parse_attribute_values(keyword_els, 'Keyword')

        # Subject elements are gathered detail first, then matter, then subject.
        subject_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        for subject_path in (
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter',
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject',
        ):
            subject_els += xml.findall(subject_path)
        item['subject'] = self.format_subjects(subject_els)

        # Serialise the body and strip the enclosing <body.content> wrapper tags.
        content_el = xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
        markup = etree.tostring(content_el, encoding='unicode')
        for wrapper in ('<body.content>', '</body.content>'):
            markup = markup.replace(wrapper, '')
        item['body_html'] = markup

        property_els = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
        word_counts = self.parse_attribute_values(property_els, 'Words')
        if len(word_counts):
            item['word_count'] = word_counts[0]
        else:
            item['word_count'] = None

        usage_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if usage_el is not None:
            item.setdefault('usageterms', usage_el.text)

        genre_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        if genre_els is not None:
            item['genre'] = [{'name': genre_el.get('FormalName')} for genre_el in genre_els]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse_message(self, tree, provider):
    """Parse a NewsMessage tree into an ingest item.

    :param tree: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the raised error
    :return: whatever ``self.populate_fields`` returns for the collected dict
    :raises ParserError: ``newsmlOneParserError`` wrapping any exception
    """
    item = {}
    try:
        self.root = tree
        find = tree.find
        findall = tree.findall

        source = find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
        if source is not None:
            item['original_source'] = source.find('Party').get('FormalName', '')

        transmission_id = find('NewsEnvelope/TransmissionId')
        if transmission_id is not None:
            item['ingest_provider_sequence'] = transmission_id.text

        # Identifier, news lines and management data are filled in by helpers.
        for fill in (self.parse_news_identifier, self.parse_newslines, self.parse_news_management):
            fill(item, tree)

        language_nodes = findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if language_nodes is not None:
            language = self.parse_attributes_as_dictionary(language_nodes)
            item['language'] = language[0]['FormalName'] if len(language) else ''

        item['keywords'] = self.parse_attribute_values(
            findall('NewsItem/NewsComponent/DescriptiveMetadata/Property'), 'Keyword')

        subjects = findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        subjects.extend(findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter'))
        subjects.extend(findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject'))
        item['subject'] = self.format_subjects(subjects)

        # Body markup without its enclosing <body.content> element tags.
        body = etree.tostring(
            find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'),
            encoding='unicode')
        body = body.replace('<body.content>', '')
        item['body_html'] = body.replace('</body.content>', '')

        characteristics = self.parse_attribute_values(
            findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property'), 'Words')
        item['word_count'] = characteristics[0] if len(characteristics) else None

        usage_type = find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if usage_type is not None:
            item.setdefault('usageterms', usage_type.text)

        genres = findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        if genres is not None:
            item['genre'] = [{'name': genre.get('FormalName')} for genre in genres]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Turn a NewsML 1.2 document into an ingest item.

    Metadata is read directly from ``xml``; the body is delegated to
    ``self.parse_content``. The collected dict is passed through
    ``self.populate_fields`` and that result is returned.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the raised error
    :raises ParserError: ``newsmlOneParserError`` wrapping any exception
    """
    item = {}
    try:
        self.root = xml

        source_node = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
        if source_node is not None:
            party_node = source_node.find("Party")
            item["original_source"] = party_node.get("FormalName", "")

        sequence_node = xml.find("NewsEnvelope/TransmissionId")
        if sequence_node is not None:
            item["ingest_provider_sequence"] = sequence_node.text

        priority_node = xml.find("NewsEnvelope/Priority")
        priority_text = None
        if priority_node is not None:
            priority_text = priority_node.text
        item["priority"] = self.map_priority(priority_text)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        language_nodes = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
        if language_nodes is not None:
            parsed_languages = self.parse_attributes_as_dictionary(language_nodes)
            item["language"] = "" if not len(parsed_languages) else parsed_languages[0]["FormalName"]

        keyword_nodes = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
        item["keywords"] = self.parse_attribute_values(keyword_nodes, "Keyword")

        subject_nodes = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
        subject_nodes += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
        subject_nodes += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
        item["subject"] = self.format_subjects(subject_nodes)

        # Body extraction lives in its own hook.
        self.parse_content(item, xml)

        characteristic_nodes = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
        words = self.parse_attribute_values(characteristic_nodes, "Words")
        item["word_count"] = None if not len(words) else words[0]

        usage_node = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
        if usage_node is not None:
            item.setdefault("usageterms", usage_node.text)

        genre_nodes = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
        if genre_nodes is not None:
            item["genre"] = [{"name": node.get("FormalName")} for node in genre_nodes]

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse the content of a NewsML XML document into a list of items.

    Example of the expected document layout::

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
              .......
            </Identification>
            <NewsManagement>
              ......
            </NewsManagement>
            <NewsComponent>
              ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

    The envelope values are stored in ``self._item_seed``; each ``NewsItem``
    is handed to ``self.parse_newsitem`` and items raising
    ``SkipItemException`` are ignored.

    :param xml: root element of the NewsML document
    :param provider: ingest provider; an empty dict is used when ``None``
    :return: ``self._items``
    :raises ParserError: ``newsmlOneParserError`` wrapping any other exception
    """
    self._provider = {} if provider is None else provider
    try:
        self.root = xml
        self._items = []
        seed = self._item_seed = {}

        # Parse the NewsEnvelope element into the shared seed.
        envelope_el = xml.find('NewsEnvelope')
        seed.update(self.parse_newsenvelop(envelope_el))

        # Parse every NewsItem element, skipping those flagged as skippable.
        newsitem_els = xml.findall('NewsItem')
        for newsitem_el in newsitem_els:
            try:
                self.parse_newsitem(newsitem_el)
            except SkipItemException:
                pass

        return self._items
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, self._provider)
def parse(self, xml, provider=None):
    """Parse the content of a NewsML XML document into a list of items.

    Example of the expected document layout::

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
              .......
            </Identification>
            <NewsManagement>
              ......
            </NewsManagement>
            <NewsComponent>
              ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

    Every ``NewsItem`` starts from a copy of the parsed envelope, is completed
    by ``self.parser_newsitem`` and ``self.populate_fields``; items raising
    ``SkipItemException`` are left out.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the raised error
    :return: list of parsed items
    :raises ParserError: ``newsmlOneParserError`` wrapping any other exception
    """
    try:
        parsed = []
        self.root = xml

        # Parse the NewsEnvelope element once; it seeds every item.
        envelope = self.parser_newsenvelop(xml.find('NewsEnvelope'))

        # Parse each NewsItem element.
        for newsitem_el in xml.findall('NewsItem'):
            try:
                entry = envelope.copy()
                self.parser_newsitem(entry, newsitem_el)
                entry = self.populate_fields(entry)
            except SkipItemException:
                continue
            else:
                parsed.append(entry)

        return parsed
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def test_raise_newsmlOneParserError(self):
    """``ParserError.newsmlOneParserError`` wraps the cause and logs one error.

    Checks the error code, the message, the wrapped system exception and the
    single error record captured by ``self.mock_logger_handler``.
    """
    with assert_raises(ParserError) as error_context:
        try:
            raise Exception("Testing newsmlOneParserError")
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, self.provider)

    exception = error_context.exception
    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1004)
    self.assertEqual(exception.message, "NewsML1 input could not be processed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing newsmlOneParserError")

    error_messages = self.mock_logger_handler.messages['error']
    self.assertEqual(len(error_messages), 1)
    self.assertEqual(error_messages[0],
                     "ParserError Error 1004 - NewsML1 input could not be processed: "
                     "Testing newsmlOneParserError on channel TestProvider")
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event file feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Event File Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _update(self, provider, update):
        """Parse the event files found in the provider's configured folder.

        This is a generator. Each regular file whose modification time passes
        ``self.is_latest_content`` is parsed, handed to
        ``self.after_extracting``, moved with ``success=True`` and yielded as
        a list of items. Files that are not "latest" are moved with
        ``success=True`` without being parsed.

        :param provider: ingest provider dict (``config.path``, ``name``,
            ``last_updated`` are read)
        :param update: not used by this implementation
        :raises ParserError: ``parseFileError`` wrapping any exception raised
            while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # Kept as in the original; inside a generator this simply ends iteration.
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        parser = self.get_feed_parser(provider, file_path)
                        logger.info('Ingesting events with {} parser'.format(
                            parser.__class__.__name__))

                        # A default is required here: without it a parser lacking
                        # ``parse_file`` raises AttributeError and the fallback to
                        # ``parse`` can never run.
                        if getattr(parser, 'parse_file', None):
                            with open(file_path, 'rb') as f:
                                item = parser.parse_file(f, provider)
                        else:
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path, filename, provider=provider, success=True)

                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except Exception as ex:
                # Only files considered old are parked as failed; newer ones are left in place.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File Feed'

    fields = [{
        'id': 'path',
        'type': 'text',
        'label': 'Server Folder',
        'placeholder': 'path to folder',
        'required': True,
        'errors': {
            3003: 'Path not found on server.',
            3004: 'Path should be directory.'
        }
    }]

    def _test(self, provider):
        """Validate that the provider's configured path is an existing directory.

        :param provider: ingest provider dict; ``config.path`` is checked
        :raises IngestFileError: ``notExistsError`` when the path is missing or
            does not exist, ``isNotDirError`` when it is not a directory
        """
        path = provider.get('config', {}).get('path', None)

        # A missing path is reported like a non-existing one; os.path.exists(None)
        # would raise TypeError instead.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()
        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Parse the files found in the provider's configured folder.

        This is a generator. Each regular file whose modification time passes
        ``self.is_latest_content`` is parsed (as XML when the registered parser
        is an ``XMLFeedParser``, otherwise by file path), handed to
        ``self.after_extracting``, moved with ``success=True`` and yielded as
        a list of items. Files that are not "latest" are moved with
        ``success=True`` without being parsed.

        :param provider: ingest provider dict (``config.path``, ``name``,
            ``last_updated`` are read)
        :param update: not used by this implementation
        :raises ParserError: ``parseFileError`` wrapping any exception raised
            while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # Kept as in the original; inside a generator this simply ends iteration.
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path, filename, provider=provider, success=True)

                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except Exception as ex:
                # Only files considered old are parked as failed; newer ones are left in place.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the file from the current directory to _PROCESSED if successful, else to _ERROR.

        Creates the _PROCESSED and _ERROR directories within the current directory if they don't exist.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _PROCESSED directory else _ERROR directory.
        :raises IngestFileError.folderCreateError() if creation of _PROCESSED or _ERROR directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """
        try:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            for folder in ("_PROCESSED/", "_ERROR/"):
                os.makedirs(os.path.join(file_path, folder), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider)

        source = os.path.join(file_path, filename)
        target_folder = "_PROCESSED/" if success else "_ERROR/"
        try:
            shutil.copy2(source, os.path.join(file_path, target_folder))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider)
        finally:
            # NOTE(review): the source is removed even when the copy failed -
            # confirm this is intended, since the file is then lost.
            os.remove(source)

    def is_latest_content(self, last_updated, provider_last_updated=None):
        """Tell whether a file is recent enough to be parsed.

        A file qualifies when it is newer than the provider's last update minus
        10 minutes; when the provider has no last update, 7 days ago is used.

        :param last_updated: file last updated datetime
        :param provider_last_updated: provider last updated datetime, if any
        """
        if not provider_last_updated:
            provider_last_updated = utcnow() - timedelta(days=7)

        return provider_last_updated - timedelta(minutes=10) < last_updated

    def is_old_content(self, last_updated):
        """Test if file is old so it wouldn't probably work in is_latest_content next time.

        Such files can be moved to `_ERROR` folder, it wouldn't be ingested anymore.

        :param last_updated: file last updated datetime
        """
        return last_updated < utcnow() - timedelta(minutes=10)
def parse(self, xml, provider=None):
    """Convert an ANA-MPE NewsML document into an ingest item.

    Reads source, priority, language, IPTC subjects, body, word count and the
    city/country properties from ``xml``. When exactly one known city matches,
    the item's dateline is completed with location, source and text.

    :param xml: root element of the NewsML document
    :param provider: ingest provider; its ``source`` is used for the dateline
    :return: result of ``self.populate_fields`` on the collected dict
    :raises ParserError: ``newsmlOneParserError`` wrapping any exception
    """
    item = {}
    try:
        self.root = xml

        party_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
        if party_el is not None:
            item["original_source"] = party_el.attrib.get("FormalName", "ANA")

        priority_el = xml.find("NewsEnvelope/Priority")
        if priority_el is None:
            raw_priority = None
        else:
            raw_priority = priority_el.text
        item["priority"] = self.map_priority(raw_priority)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        language_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
        if language_els is not None:
            languages = self.parse_attributes_as_dictionary(language_els)
            if len(languages):
                item["language"] = languages[0]["FormalName"]
            else:
                item["language"] = ""

        # Only subjects from the IPTC subject code scheme are considered.
        subject_els = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]')
        for subject_path in (
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]',
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]',
        ):
            subject_els += xml.findall(subject_path)
        item["subject"] = self.format_subjects(subject_els)

        # Serialise the content, unescape entities, drop the wrapper element
        # tags and lower-case the paragraph tags.
        data_content_el = xml.find("NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent")
        body = html.unescape(etree.tostring(data_content_el, encoding="unicode"))
        for old, new in (
            ("<DataContent>", ""),
            ("</DataContent>", ""),
            ("<P>", "<p>"),
            ("</P>", "</p>"),
        ):
            body = body.replace(old, new)
        item["body_html"] = body

        # Remove the agency's fixed copyright paragraph from the body.
        copyright_paragraph = (
            "<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο "
            "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
            "για συγκεκριμένη χρήση.</p>"
        )
        item["body_html"] = item.get("body_html").replace(copyright_paragraph, "").strip()

        property_els = xml.findall(
            "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property")
        word_counts = self.parse_attribute_values(property_els, "WordCount")
        if len(word_counts):
            item["word_count"] = word_counts[0]
        else:
            item["word_count"] = None

        # City used for the dateline; the Greek name for Athens is anglicised.
        city_el = xml.find('NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]')
        city = city_el.attrib.get("Value")
        if city == "Αθήνα":
            city = "Athens"

        # Country code, with "GRC" normalised to "GR".
        country_el = xml.find('NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]')
        country = country_el.attrib.get("Value")
        if country == "GRC":
            country = "GR"

        matches = [
            candidate for candidate in app.locators.find_cities()
            if candidate["city"] == city and candidate["country_code"] == country
        ]
        if len(matches) == 1:
            dateline = item["dateline"]
            dateline["located"] = matches[0]
            dateline["source"] = provider.get("source")
            dateline["text"] = format_dateline_to_locmmmddsrc(
                dateline["located"],
                item.get("dateline", {}).get("date"),
                provider.get("source"))

        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a stock-exchange NewsML document into a single-item list.

    The item starts from fixed category/genre/subject/ednote values and is
    completed by ``self.do_mapping`` from one ``NewsItem`` of the document.
    When several ``NewsItem`` elements exist (one per language) the order of
    preference is Norwegian, then English, then the first item that declares a
    language (cf. SDNTB-573). Items without a language are ignored.

    :param xml: root element of the NewsML document (not modified; a deep
        copy is used)
    :param provider: ingest provider, only forwarded to the raised error
    :return: ``[item]``
    :raises ParserError: ``newsmlOneParserError`` wrapping any exception,
        including the ``parseFileError`` raised when no usable item is found
    """
    item = {
        'versioncreated': utcnow(),
        'anpa_category': [{
            "name": "Formidlingstjenester",
            "qcode": "r"
        }],
        'genre': [{
            "name": "Fulltekstmeldinger",
            "qcode": "Fulltekstmeldinger",
            "scheme": "genre_custom"
        }],
        'subject': [{
            'qcode': 'Børsmelding',
            'name': 'Børsmelding',
            'scheme': 'category'
        }],
        'ednote': '*** Dette er en børsmelding formidlet av NTB pva. andre ***'
    }
    self.populate_fields(item)

    try:
        # We remove the newsml namespace for convenience (to avoid writing the
        # prefix each time). We deepcopy first to avoid modifying the original.
        xml = deepcopy(xml)
        ns_prefix = '{' + NEWSML_NS + '}'
        for elt in xml.iter():
            # lxml comment / processing-instruction nodes expose a non-string
            # ``tag``; calling ``.replace`` on it would abort the whole parse.
            if isinstance(elt.tag, str):
                elt.tag = elt.tag.replace(ns_prefix, '')

        selected = None
        for news_item in xml.findall('NewsItem'):
            try:
                lang = news_item.xpath(
                    'NewsComponent/DescriptiveMetadata/Language/@FormalName',
                )[0]
            except IndexError:
                logger.warning(
                    "missing language in item, ignoring it.\nxml: {xml}".format(
                        xml=etree.tostring(news_item, encoding="unicode")))
                continue
            if selected is None or lang in ('no', 'en'):
                selected = news_item
            if lang == 'no':
                break

        if selected is None:
            # Log the whole document: the loop variable is unbound when the
            # document contains no NewsItem at all.
            xml_source = etree.tostring(xml, encoding="unicode")
            logger.warning("can't find any valid item\nxml={xml}".format(xml=xml_source))
            raise ParserError.parseFileError(source=xml_source)

        self.do_mapping(item, selected)
        return [item]
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse the content of a NewsML XML document into a list of items.

    Example of the expected document layout::

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <!--AFP NewsML text-photo profile evolution2-->
          <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification>
              .......
            </Identification>
            <NewsManagement>
              ......
            </NewsManagement>
            <NewsComponent>
              ......
            </NewsComponent>
          </NewsItem>
        </NewsML>

    Every ``NewsItem`` starts from a copy of the parsed envelope and is
    completed by ``self.parse_newsitem``. A ``NEWS/GENERAL`` product is added
    when no ``services-products`` subject is present, the ``default``
    distribution is always added, slugline/keywords are reset, and subjects
    are sorted by ``qcode`` with duplicates removed. Items raising
    ``SkipItemException`` are left out.

    :param xml: root element of the NewsML document
    :param provider: ingest provider, only forwarded to the raised error
    :return: list of parsed items
    :raises ParserError: ``newsmlOneParserError`` wrapping any other exception
    """
    try:
        items = []
        self.root = xml

        # Parse the NewsEnvelope element once; it seeds every item.
        item_envelop = self.parse_newsenvelop(xml.find('NewsEnvelope'))

        # Parse each NewsItem element.
        for newsitem_el in xml.findall('NewsItem'):
            try:
                item = item_envelop.copy()
                self.parse_newsitem(item, newsitem_el)

                subjects = item.setdefault('subject', [])

                # Default the product to NEWS/GENERAL when none was parsed.
                if not any(subj.get('scheme') == 'services-products' for subj in subjects):
                    subjects.append({
                        'name': 'NEWS/GENERAL',
                        'qcode': 'NEWS/GENERAL',
                        'parent': 'NEWS',
                        'scheme': 'services-products'
                    })

                # The distribution is always "default".
                subjects.append({
                    "name": 'default',
                    "qcode": 'default',
                    "scheme": "distribution"
                })

                # Slugline and keywords are left empty.
                item['slugline'] = None
                item['keywords'] = []

                # Remove duplicated subjects. Equal entries are not guaranteed
                # to be adjacent after sorting by qcode alone, so every entry
                # is checked against all the ones already kept.
                unique_subjects = []
                for subj in sorted(subjects, key=lambda k: k['qcode']):
                    if subj not in unique_subjects:
                        unique_subjects.append(dict(subj))
                item['subject'] = unique_subjects

                item = self.populate_fields(item)
            except SkipItemException:
                continue
            else:
                items.append(item)

        return items
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
class EventFileFeedingService(FileFeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'event_file'
    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'Event File Feed'

    # Defines the collection service to be used with this ingest feeding service.
    service = 'events'

    def _update(self, provider, update):
        """Parse the event files found in the provider's configured folder.

        This is a generator. Each regular file whose modification time passes
        ``self.is_latest_content`` is loaded according to the registered
        parser type (XML tree for ``NTBEventXMLFeedParser``, iCalendar object
        for ``IcsTwoFeedParser``, plain file path otherwise), parsed, handed
        to ``self.after_extracting``, moved with ``success=True`` and yielded
        as a list of items. Files that are not "latest" are moved with
        ``success=True`` without being parsed.

        :param provider: ingest provider dict (``config.path``, ``name``,
            ``last_updated`` are read)
        :param update: not used by this implementation
        :raises ParserError: ``parseFileError`` wrapping any exception raised
            while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            # Kept as in the original; inside a generator this simply ends iteration.
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, NTBEventXMLFeedParser):
                            logger.info('Ingesting xml events')
                            with open(file_path, 'rb') as f:
                                xml = ElementTree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        elif isinstance(registered_parser, IcsTwoFeedParser):
                            logger.info('Ingesting ics events')
                            with open(file_path, 'rb') as f:
                                cal = Calendar.from_ical(f.read())
                                parser = self.get_feed_parser(provider, cal)
                                item = parser.parse(cal, provider)
                        else:
                            logger.info('Ingesting events with unknown parser')
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)
                        self.move_file(self.path, filename, provider=provider, success=True)

                        if isinstance(item, list):
                            yield item
                        else:
                            yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except Exception as ex:
                # Only files considered old are parked as failed; newer ones are left in place.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')
import logging
from datetime import datetime
from .newsml_1_2 import NewsMLOneParser
from superdesk.io.file_ingest_service import FileIngestService
from superdesk.utils import get_sorted_files, FileSortAttributes
from ..utc import utc
from ..etree import etree, ParseError as etreeParserError
from superdesk.notification import push_notification
from superdesk.io import register_provider
from superdesk.errors import ParserError, ProviderError

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Provider identifier for this ingest service.
PROVIDER = 'afp'

# Error descriptions built from the NewsML parser error and the generic ingest error.
errors = [ParserError.newsmlOneParserError().get_error_description(),
          ProviderError.ingestError().get_error_description()]


class AFPIngestService(FileIngestService):
    """AFP ingest service: a file ingest service that uses the NewsML 1.2 parser."""

    def __init__(self):
        # Parser instance used for the ingested files.
        self.parser = NewsMLOneParser()

    def _update(self, provider):
        # Remember the provider and the folder configured for it.
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        # Nothing to do when the provider has no configured path.
        # NOTE(review): the method body ends here in this view; it may be truncated - confirm.
        if not self.path:
            return
class FileFeedingService(FeedingService):
    """
    Feeding Service class which can read the configured local file system for article(s).
    """

    NAME = 'file'

    ERRORS = [
        ParserError.IPTC7901ParserError().get_error_description(),
        ParserError.nitfParserError().get_error_description(),
        ParserError.newsmlOneParserError().get_error_description(),
        ProviderError.ingestError().get_error_description(),
        ParserError.parseFileError().get_error_description()
    ]

    label = 'File feed'

    fields = [
        {
            'id': 'path',
            'type': 'text',
            'label': 'Server Folder',
            'placeholder': 'path to folder',
            'required': True,
            'errors': {
                3003: 'Path not found on server.',
                3004: 'Path should be directory.'
            }
        }
    ]

    def _test(self, provider):
        """Check that the configured path exists and is a directory.

        :param provider: dict - ingest provider details; ``config.path`` is read
        :raises IngestFileError.notExistsError: if the path is missing from the
            configuration or does not exist
        :raises IngestFileError.isNotDirError: if the path is not a directory
        """
        path = provider.get('config', {}).get('path', None)

        # os.path.exists(None) raises TypeError, so an unset path is reported
        # as "not found" instead of crashing.
        if not path or not os.path.exists(path):
            raise IngestFileError.notExistsError()

        if not os.path.isdir(path):
            raise IngestFileError.isNotDirError()

    def _update(self, provider, update):
        """Yield the parsed items of every regular file in the provider's folder.

        This is a generator: each processed file produces one list of items.
        The value sent back into the generator after a yield decides where the
        file goes: a truthy value marks the batch as failed (``_ERROR``), while
        plain iteration sends ``None`` and the file goes to ``_PROCESSED``.

        :param provider: dict - ingest provider details; ``config.path`` is the
            folder to scan, ``name`` and ``last_updated`` are also read
        :param update: not used by this implementation
        :raises ParserError.parseFileError: if anything fails while handling a
            file; the file is moved to the error folder first when it counts
            as old content
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # Logger.warn is a deprecated alias; Logger.warning is the supported call.
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                # Reset per file so the except branch never sees a stale timestamp.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if not os.path.isfile(file_path):
                    continue

                last_updated = self.get_last_updated(file_path)

                if not self.is_latest_content(last_updated, provider.get('last_updated')):
                    self.move_file(self.path, filename, provider=provider, success=False)
                    continue

                if isinstance(registered_parser, XMLFeedParser):
                    with open(file_path, 'rb') as f:
                        xml = etree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                else:
                    parser = self.get_feed_parser(provider, file_path)
                    item = parser.parse(file_path, provider)

                self.after_extracting(item, provider)

                # Callers always receive a list, whatever the parser returned.
                batch = item if isinstance(item, list) else [item]
                failed = yield batch

                self.move_file(self.path, filename, provider=provider, success=not failed)
            except Exception as ex:
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                # Chain the original exception so its traceback is kept.
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex, provider) from ex

        push_notification('ingest:update')

    def after_extracting(self, article, provider):
        """Sub-classes should override this method if something needs to be done to the given article.

        For example, if the article comes from DPA provider the system needs to derive dateline
        from the properties in the article.

        Invoked after parser parses the article received from the provider.

        :param article: dict having properties that can be saved into ingest collection
        :type article: dict
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :type provider: dict :py:class: `superdesk.io.ingest_provider_model.IngestProviderResource`
        """
        pass

    def move_file(self, file_path, filename, provider, success=True):
        """Move the files from the current directory to the _Processed if successful, else _Error if unsuccessful.

        Creates _Processed and _Error directories within current directory if they don't exist.

        :param file_path: str - current directory location
        :param filename: str - file name in the current directory to move
        :param provider: dict - Ingest provider details to which the current directory has been configured
        :param success: bool - default value is True. When True moves to _Processed directory else _Error directory.
        :raises IngestFileError.folderCreateError() if creation of _Processed or _Error directories fails
        :raises IngestFileError.fileMoveError() if failed to move the file pointed by filename
        """
        try:
            # exist_ok avoids the race between checking for the folder and creating it.
            for folder in ("_PROCESSED/", "_ERROR/"):
                os.makedirs(os.path.join(file_path, folder), exist_ok=True)
        except Exception as ex:
            raise IngestFileError.folderCreateError(ex, provider) from ex

        source = os.path.join(file_path, filename)
        target_folder = "_PROCESSED/" if success else "_ERROR/"
        try:
            shutil.copy2(source, os.path.join(file_path, target_folder))
        except Exception as ex:
            raise IngestFileError.fileMoveError(ex, provider) from ex
        finally:
            # NOTE(review): the source file is removed even when the copy failed;
            # kept as in the original - confirm this is intended.
            os.remove(source)

    def get_last_updated(self, file_path):
        """Get last updated time for file.

        Using both mtime and ctime timestamps not to miss
        old files being copied around and recent files after
        changes done in place.

        :param file_path: str - path of the file to inspect
        :return: timezone-aware datetime built from the later of the two timestamps
        """
        stat = os.lstat(file_path)
        timestamp = max(stat.st_mtime, stat.st_ctime)
        return datetime.fromtimestamp(timestamp, tz=utc)
def parse(self, xml, provider=None):
    """Build an item dict from a NewsML 1.x document.

    Reads source, priority, language, subjects, body and word count from
    the document, then tries to derive a dateline from the ``City`` and
    ``Country`` descriptive properties. The dateline is only set when exactly
    one known city matches.

    :param xml: root element of the NewsML document
    :param provider: dict - ingest provider details; ``source`` is read for
        the dateline. May be None.
    :return: the result of ``self.populate_fields`` for the collected item
    :raises ParserError.newsmlOneParserError: if anything fails while parsing
    """
    item = {}
    try:
        self.root = xml

        parsed_el = xml.find(
            'NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
        if parsed_el is not None:
            item['original_source'] = parsed_el.attrib.get('FormalName', 'ANA')

        parsed_el = xml.find('NewsEnvelope/Priority')
        item['priority'] = self.map_priority(
            parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        parsed_el = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item['language'] = language[0]['FormalName'] if len(language) else ''

        subjects = xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
        )
        subjects += xml.findall(
            'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
        )
        item['subject'] = self.format_subjects(subjects)

        # Serialise the content element, drop its wrapper tag and lower-case
        # the paragraph tags.
        item['body_html'] = html.unescape(
            etree.tostring(
                xml.find('NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent'),
                encoding='unicode')
        ).replace('<DataContent>', '').replace('</DataContent>', '').replace(
            '<P>', '<p>').replace('</P>', '</p>')

        # Strip the fixed trailing paragraph from the body.
        item['body_html'] = item.get('body_html').replace(
            '<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
            'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
            'για συγκεκριμένη χρήση.</p>', '').strip()

        parsed_el = xml.findall(
            'NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property'
        )
        characteristics = self.parse_attribute_values(parsed_el, 'WordCount')
        item['word_count'] = characteristics[0] if len(characteristics) else None

        # Extract the city for setting into the dateline. find() returns None
        # when the property is absent, so guard before reading attributes.
        city_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
        )
        city = city_el.attrib.get('Value') if city_el is not None else None
        # Anglicise the greek for Athens if required
        city = 'Athens' if city == 'Αθήνα' else city

        country_el = xml.find(
            'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
        )
        country = country_el.attrib.get('Value') if country_el is not None else None
        # Normalise the country code
        country = 'GR' if country == 'GRC' else country

        cities = app.locators.find_cities()
        located = [
            c for c in cities
            if c['city'] == city and c['country_code'] == country
        ]
        if len(located) == 1:
            # provider defaults to None, so do not assume it is a dict.
            source = provider.get('source') if provider else None
            # Earlier steps may not have created a dateline; make sure it exists.
            dateline = item.setdefault('dateline', {})
            dateline['located'] = located[0]
            dateline['source'] = source
            dateline['text'] = format_dateline_to_locmmmddsrc(
                located[0], dateline.get('date'), source)

        return self.populate_fields(item)
    except Exception as ex:
        # Chain the original exception so its traceback is kept.
        raise ParserError.newsmlOneParserError(ex, provider) from ex
def parse(self, xml, provider=None):
    """Build an item dict from a NewsML 1.x document.

    Reads source, transmission id, priority, language, keywords, subjects,
    body, word count, usage terms and genre from the document.

    :param xml: root element of the NewsML document
    :param provider: ingest provider details, passed on to the error raised
        on failure
    :return: the result of ``self.populate_fields`` for the collected item
    :raises ParserError.newsmlOneParserError: if anything fails while parsing
    """
    item = {}
    try:
        self.root = xml

        parsed_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
        if parsed_el is not None:
            # A Source element without a Party child must not abort the parse.
            party_el = parsed_el.find("Party")
            if party_el is not None:
                item["original_source"] = party_el.get("FormalName", "")

        parsed_el = xml.find("NewsEnvelope/TransmissionId")
        if parsed_el is not None:
            item["ingest_provider_sequence"] = parsed_el.text

        parsed_el = xml.find("NewsEnvelope/Priority")
        # Compare with None explicitly: an element without children is falsy,
        # so a plain truth test would discard the priority text.
        item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)

        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)

        parsed_el = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item["language"] = language[0]["FormalName"] if len(language) else ""

        keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
        item["keywords"] = self.parse_attribute_values(keywords, "Keyword")

        subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
        item["subject"] = self.format_subjects(subjects)

        # Serialise the body element and drop its own wrapper tag.
        item["body_html"] = (
            etree.tostring(
                xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content"),
                encoding="unicode",
            )
            .replace("<body.content>", "")
            .replace("</body.content>", "")
        )

        parsed_el = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
        characteristics = self.parse_attribute_values(parsed_el, "Words")
        item["word_count"] = characteristics[0] if len(characteristics) else None

        parsed_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
        if parsed_el is not None:
            # setdefault keeps any usage terms set by an earlier step.
            item.setdefault("usageterms", parsed_el.text)

        parsed_el = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
        if parsed_el is not None:
            item["genre"] = [{"name": el.get("FormalName")} for el in parsed_el]

        return self.populate_fields(item)
    except Exception as ex:
        # Chain the original exception so its traceback is kept.
        raise ParserError.newsmlOneParserError(ex, provider) from ex