def _update(self, provider):
    """Ingest new DPA files from the provider's configured path.

    :param provider: ingest provider configuration dict
    :return: yields a single-item list per ingested file
    :raises ParserError.IPTC7901ParserError: when a file cannot be parsed
    :raises ProviderError.ingestError: for any other processing failure
    """
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        logger.info('No path')
        return []
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            filepath = os.path.join(self.path, filename)
            if os.path.isfile(filepath):
                stat = os.lstat(filepath)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                if self.is_latest_content(last_updated, provider.get('last_updated')):
                    item = self.parser.parse_file(filepath)
                    self.move_file(self.path, filename, provider=provider, success=True)
                    yield [item]
                else:
                    # Stale file: archive it without ingesting.
                    self.move_file(self.path, filename, provider=provider, success=True)
        except ParserError as ex:
            # BUG FIX: the original clause was
            # ``except ParserError.IPTC7901ParserError() as ex`` which *calls*
            # the error factory and uses the returned exception *instance* as
            # an except target -- that raises TypeError during exception
            # handling, so this branch could never run. Catch the
            # ``ParserError`` class and re-raise the wrapped error as intended.
            logger.exception("Ingest Type: DPA - File: {0} could not be processed".format(filename))
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.IPTC7901ParserError(ex, provider)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ProviderError.ingestError(ex, provider)
def parse_message(self, tree, provider):
    """Build a content item dict from a parsed NITF tree.

    :param tree: parsed NITF element tree
    :param provider: ingest provider dict (used for error reporting)
    :raises ParserError.nitfParserError: if anything goes wrong
    """
    item = {}
    try:
        docdata = tree.find('head/docdata')
        # Plain text is the default content type.
        item['type'] = ITEM_CLASS_TEXT
        item['guid'] = item['uri'] = docdata.find('doc-id').get('id-string')
        item['urgency'] = docdata.find('urgency').get('ed-urg', '5')
        item['pubstatus'] = docdata.attrib.get('management-status', 'usable')
        item['firstcreated'] = get_norm_datetime(docdata.find('date.issue'))
        item['versioncreated'] = get_norm_datetime(docdata.find('date.issue'))
        item['expiry'] = get_norm_datetime(docdata.find('date.expire'))
        item['subject'] = get_subjects(tree)
        item['body_html'] = get_content(tree)
        item['place'] = get_places(docdata)
        item['keywords'] = get_keywords(docdata)
        ed_msg = docdata.find('ed-msg')
        if ed_msg is not None:
            item['ednote'] = ed_msg.attrib.get('info')
        item['headline'] = tree.find('body/body.head/hedline/hl1').text
        abstract_el = tree.find('body/body.head/abstract')
        item['abstract'] = '' if abstract_el is None else abstract_el.text
        city_el = tree.find('body/body.head/dateline/location/city')
        item['dateline'] = '' if city_el is None else city_el.text
        item['byline'] = get_byline(tree)
        parse_meta(tree, item)
        item.setdefault('word_count', get_word_count(item['body_html']))
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def _update(self, provider):
    """Scan the configured path for new AFP files, ingest them and notify."""
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        return
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        file_path = os.path.join(self.path, filename)
        try:
            if os.path.isfile(file_path):
                file_stat = os.lstat(file_path)
                modified = datetime.fromtimestamp(file_stat.st_mtime, tz=utc)
                if not self.is_latest_content(modified, provider.get('last_updated')):
                    # Stale file: archive without ingesting.
                    self.move_file(self.path, filename, provider=provider, success=True)
                else:
                    with open(file_path, 'r') as f:
                        item = self.parser.parse_message(etree.fromstring(f.read()), provider)
                    self.add_timestamps(item)
                    self.move_file(self.path, filename, provider=provider, success=True)
                    yield [item]
        except etreeParserError as ex:
            logger.exception("Ingest Type: AFP - File: {0} could not be processed".format(filename), ex)
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.newsmlOneParserError(ex, provider)
        except ParserError:
            # Parser-level failures are logged via the moved file only.
            self.move_file(self.path, filename, provider=provider, success=False)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ProviderError.ingestError(ex, provider)
    push_notification('ingest:update')
def _update(self, provider, update):
    """Ingest new files from the provider's configured path.

    :param provider: ingest provider configuration dict
    :param update: update document (unused here, part of the service API)
    :return: generator yielding lists of parsed items; the generator receives
        back a truthy value when ingestion of the yielded items failed
    :raises ParserError.parseFileError: when an old file fails to process
    """
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        # BUG FIX: logger.warn() is a deprecated alias -- use warning().
        logger.warning(
            'File Feeding Service {} is configured without path. Please check the configuration'
            .format(provider['name']))
        return []
    registered_parser = self.get_feed_parser(provider)
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            last_updated = None
            file_path = os.path.join(self.path, filename)
            if os.path.isfile(file_path):
                stat = os.lstat(file_path)
                last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                if self.is_latest_content(last_updated, provider.get('last_updated')):
                    if isinstance(registered_parser, XMLFeedParser):
                        # XML parsers get the parsed root; others get the path.
                        with open(file_path, 'rb') as f:
                            xml = etree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                    else:
                        parser = self.get_feed_parser(provider, file_path)
                        item = parser.parse(file_path, provider)
                    self.after_extracting(item, provider)
                    # The consumer sends back a failure flag for the yielded batch.
                    if isinstance(item, list):
                        failed = yield item
                    else:
                        failed = yield [item]
                    self.move_file(self.path, filename, provider=provider, success=not failed)
                else:
                    self.move_file(self.path, filename, provider=provider, success=True)
        except Exception as ex:
            if last_updated and self.is_old_content(last_updated):
                self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError(
                '{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)
    push_notification('ingest:update')
def _update(self, provider):
    """Check the RSS feed for updates and return new items (if any).

    :param provider: data provider instance
    :return: a list containing a list of new content items
    :rtype: list
    :raises IngestApiError: if data retrieval error occurs
    :raises ParserError: if retrieved RSS data cannot be parsed
    """
    config = provider.get('config', {})
    if config.get('auth_required'):
        self.auth_info = {
            'username': config.get('username', ''),
            'password': config.get('password', ''),
        }
    try:
        xml_data = self._fetch_data(config, provider)
        feed = feedparser.parse(xml_data)
    except IngestApiError:
        raise  # retrieval errors propagate unchanged
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider)
    # Fall back to the epoch when the provider was never updated, and strip
    # tzinfo (values are UTC anyway) so naive comparison works.
    last_ingested = provider.get('last_updated', utcfromtimestamp(0)).replace(tzinfo=None)
    collected = []
    aliases = config.get('field_aliases')
    for entry in feed.entries:
        if utcfromtimestamp(timegm(entry.updated_parsed)) <= last_ingested:
            continue
        item = self._create_item(entry, aliases)
        self.add_timestamps(item)
        # Entries referencing images become picture items plus a package
        # linking them with the text entry; otherwise the entry stays a
        # simple text item (other media types, e.g. videos, are ignored).
        linked_images = self._extract_image_links(entry)
        if linked_images:
            pictures = self._create_image_items(linked_images, item)
            collected.extend(pictures)
            collected.append(item)
            item = self._create_package(item, pictures)
        collected.append(item)
    return [collected]
def parse_email(self, content, content_type, provider):
    """Parse an e-mail attachment body as XML and delegate to :meth:`parse`.

    :param content: file-like object holding the attachment payload
    :param content_type: MIME type; only ``text/xml`` is accepted
    :param provider: ingest provider configuration dict
    :raises ParserError.parseMessageError: for unsupported content types
    """
    if content_type != 'text/xml':
        # BUG FIX: pass the provider so the raised error is associated with
        # it, consistent with every other ParserError call in this module.
        raise ParserError.parseMessageError('Not supported content type.', provider)
    content.seek(0)
    xml = ET.parse(content)
    return self.parse(xml.getroot(), provider)
def _test(self, provider):
    """Test the connection by fetching and parsing the feed once."""
    config = provider.get('config', {})
    raw = self._fetch_data(config, provider)
    parsed = feedparser.parse(raw)
    # feedparser flags malformed feeds via the ``bozo`` bit.
    if parsed.bozo:
        raise ParserError.parseMessageError(parsed.bozo_exception, provider)
def parse(self, xml, provider=None):
    """Parse a single item tree; return it wrapped in a one-element list."""
    self.root = xml
    try:
        return [self.parse_item(xml)]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def _update(self, provider):
    """Ingest DPA files from the configured path, deriving datelines."""
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        logger.info('No path')
        return []
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            full_path = os.path.join(self.path, filename)
            if not os.path.isfile(full_path):
                continue
            modified = datetime.fromtimestamp(os.lstat(full_path).st_mtime, tz=utc)
            if self.is_latest_content(modified, provider.get('last_updated')):
                item = self.parser.parse_file(full_path, provider)
                dpa_derive_dateline(item)
                self.move_file(self.path, filename, provider=provider, success=True)
                yield [item]
            else:
                # Stale file: archive without ingesting.
                self.move_file(self.path, filename, provider=provider, success=True)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('DPA', filename, ex, provider)
def parse(self, xml, provider=None):
    """Parse a dpa NewsML-G2 document into a list of content items.

    :param xml: parsed NewsML-G2 root element
    :param provider: ingest provider dict (used for error reporting)
    :raises ParserError.newsmlTwoParserError: if anything goes wrong
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                item = self.parse_item(item_tree)
                try:
                    published = item_tree.xpath(
                        './/xhtml:body/xhtml:header/'
                        'xhtml:time[@class="publicationDate"]/@data-datetime',
                        namespaces=NS)[0]
                except IndexError:
                    # No explicit publication date -- reuse versioncreated.
                    item['firstcreated'] = item['versioncreated']
                else:
                    item['firstcreated'] = dateutil.parser.parse(published)
                items.append(item)
                # SDNTB-463 requires that slugline is removed.
                # BUG FIX: use pop() with a default so a missing slugline
                # does not raise KeyError (the original ``del`` would).
                item.pop('slugline', None)
                sport = bool(item_tree.xpath(
                    './/iptc:subject[@type="dpatype:category" and @qcode="dpacat:sp"]',
                    namespaces=NS))
                cat = utils.SPORT_CATEGORY if sport else utils.DEFAULT_CATEGORY
                category = {'qcode': cat, 'name': cat, 'scheme': 'category'}
                item['subject'] = utils.filter_missing_subjects(item.get('subject'))
                item['subject'].append(category)
                utils.set_default_service(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, file_path, provider=None):
    """Parse a raw ANPA 1312 file into a text content item.

    :param file_path: path to the raw ANPA file
    :param provider: unused here; kept for the feed-parser interface
    :return: item dict
    :raises ParserError.anpaParseFileError: if anything goes wrong
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
        with open(file_path, 'rb') as f:
            lines = [line for line in f]
        # parse first header line (SYN SYN SOH, category letter, 4-digit
        # sequence, US separator, service id)
        m = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if m:
            item['provider_sequence'] = m.group(2).decode()
        # parse second header line (priority, category, control chars,
        # cycle prefix, slug, date and word count)
        m = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1], flags=re.I)
        if m:
            item['priority'] = self.map_priority(m.group(1).decode())
            item['anpa_category'] = [{'qcode': m.group(2).decode()}]
            item['word_count'] = int(m.group(10).decode())
            # DC2 (0x12) marks preformatted (tabular) content.
            if m.group(4) == b'\x12':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
        # parse created date at the end of file (after the ETX marker)
        m = re.search(b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                      lines[-4], flags=re.I)
        if m:
            item['firstcreated'] = datetime.strptime(
                m.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)
        # parse anpa content: everything between STX (0x02) and ETX (0x03)
        body = b''.join(lines[2:])
        m = re.match(b'\x02(.*)\x03', body, flags=re.M + re.S)
        if m:
            text = m.group(1).decode().split('\n')
            # body text: tab-indented lines
            body_lines = [l.strip() for l in text if l.startswith('\t')]
            item['body_text'] = '\n'.join(body_lines)
            # content metadata: lines starting with '^'
            header_lines = [l.strip('^<= ') for l in text if l.startswith('^')]
            if len(header_lines) > 3:
                item['headline'] = header_lines[1]
                item['byline'] = header_lines[-2]
            # slugline, e.g. "BC-XX--some-slug"
            if len(header_lines) > 1:
                m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
                if m:
                    item['slugline'] = m.group(1)
            # editorial note
            for line in header_lines:
                m = re.search("EDITOR'S NOTE _(.*)", line)
                if m:
                    item['ednote'] = m.group(1).strip()
        return item
    except Exception as ex:
        raise ParserError.anpaParseFileError(file_path, ex)
def parse(self, xml, provider=None):
    """Map an NTB press-release document onto an item with NTB defaults."""
    self.provider = provider
    item = {
        ITEM_TYPE: CONTENT_TYPE.TEXT,  # default content type
        'versioncreated': utcnow(),
        'anpa_category': [{'name': 'Formidlingstjenester', 'qcode': 'r'}],
        'genre': [{'name': 'Fulltekstmeldinger', 'qcode': 'Fulltekstmeldinger', 'scheme': 'genre_custom'}],
        'subject': [{'qcode': 'PRM-NTB', 'name': 'PRM-NTB', 'scheme': 'category'}],
        'urgency': 6,
        'ednote': '*** Dette er en pressemelding formidlet av NTB pva. andre ***',
    }
    try:
        self.do_mapping(item, xml)
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider)
    return [item]
def parse_message(self, tree, provider):
    """Build a content item dict from a parsed NITF tree (trims headline)."""
    item = {}
    try:
        docdata = tree.find("head/docdata")
        # Plain text is the default content type.
        item["type"] = ITEM_CLASS_TEXT
        item["guid"] = item["uri"] = docdata.find("doc-id").get("id-string")
        item["urgency"] = docdata.find("urgency").get("ed-urg", "5")
        item["pubstatus"] = docdata.attrib.get("management-status", "usable")
        item["firstcreated"] = get_norm_datetime(docdata.find("date.issue"))
        item["versioncreated"] = get_norm_datetime(docdata.find("date.issue"))
        item["expiry"] = get_norm_datetime(docdata.find("date.expire"))
        item["subject"] = get_subjects(tree)
        item["body_html"] = get_content(tree)
        item["place"] = get_places(docdata)
        item["keywords"] = get_keywords(docdata)
        ed_msg = docdata.find("ed-msg")
        if ed_msg is not None:
            item["ednote"] = ed_msg.attrib.get("info")
        item["headline"] = super().trim_headline(tree.find("body/body.head/hedline/hl1").text)
        abstract_el = tree.find("body/body.head/abstract")
        item["abstract"] = "" if abstract_el is None else abstract_el.text
        city_el = tree.find("body/body.head/dateline/location/city")
        item["dateline"] = "" if city_el is None else city_el.text
        item["byline"] = get_byline(tree)
        parse_meta(tree, item)
        item.setdefault("word_count", get_word_count(item["body_html"]))
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def _update(self, provider):
    """Scan the configured path, ingest fresh AAP NITF files and notify."""
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        return []
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        file_path = os.path.join(self.path, filename)
        try:
            if os.path.isfile(file_path):
                file_stat = os.lstat(file_path)
                modified = datetime.fromtimestamp(file_stat.st_mtime, tz=utc)
                if not self.is_latest_content(modified, provider.get('last_updated')):
                    # Stale file: archive without ingesting.
                    self.move_file(self.path, filename, provider=provider, success=True)
                else:
                    with open(file_path, 'r') as f:
                        item = self.parser.parse_message(etree.fromstring(f.read()), provider)
                    self.move_file(self.path, filename, provider=provider, success=True)
                    yield [item]
        except etreeParserError as ex:
            logger.exception("Ingest Type: AAP - File: {0} could not be processed".format(filename))
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.nitfParserError(ex, provider)
        except ParserError:
            self.move_file(self.path, filename, provider=provider, success=False)
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ProviderError.ingestError(ex, provider)
    push_notification('ingest:update')
def _test(self, provider):
    """Test the connection by fetching and parsing the feed once."""
    self.provider = provider
    raw = self._fetch_data()
    parsed = feedparser.parse(raw)
    # feedparser signals malformed XML through the ``bozo`` flag.
    if parsed.bozo:
        raise ParserError.parseMessageError(parsed.bozo_exception, provider)
def _update(self, provider):
    """Check the RSS feed for updates and return new items (if any).

    :param provider: data provider instance
    :return: a list containing a list of new content items
    :rtype: list
    :raises IngestApiError: if data retrieval error occurs
    :raises ParserError: if retrieved RSS data cannot be parsed
    """
    config = provider.get('config', {})
    if config.get('auth_required'):
        self.auth_info = {
            'username': config.get('username', ''),
            'password': config.get('password', ''),
        }
    try:
        raw_feed = self._fetch_data(config, provider)
        parsed_feed = feedparser.parse(raw_feed)
    except IngestApiError:
        raise  # retrieval errors propagate untouched
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider)
    # No last-updated info means "never ingested": compare against the epoch.
    # tzinfo is stripped because the timestamps are UTC anyway.
    updated_threshold = provider.get('last_updated', utcfromtimestamp(0)).replace(tzinfo=None)
    ingested = []
    aliases = config.get('field_aliases')
    for entry in parsed_feed.entries:
        if utcfromtimestamp(timegm(entry.updated_parsed)) <= updated_threshold:
            continue
        item = self._create_item(entry, aliases, provider.get('source', None))
        self.add_timestamps(item)
        # Referenced images become picture items plus a package tying them
        # to the text entry; otherwise the entry stays a plain text item.
        picture_urls = self._extract_image_links(entry)
        if picture_urls:
            picture_items = self._create_image_items(picture_urls, item)
            ingested.extend(picture_items)
            ingested.append(item)
            item = self._create_package(item, picture_items)
        ingested.append(item)
    return [ingested]
def parse_email(self, content, content_type, provider):
    """Parse an e-mail attachment as an iCalendar and delegate to :meth:`parse`.

    :param content: file-like object holding the attachment payload
    :param content_type: MIME type; only ``text/calendar`` is accepted
    :param provider: ingest provider configuration dict
    :raises ParserError.parseMessageError: for unsupported content types
    """
    if content_type != 'text/calendar':
        # BUG FIX: pass the provider so the raised error is associated with
        # it, consistent with every other ParserError call in this module.
        raise ParserError.parseMessageError('Not supported content type.', provider)
    content.seek(0)
    cal = Calendar.from_ical(content.read())
    return self.parse(cal, provider)
def parse(self, xml, provider=None):
    """Parse a NewsML 1.x document into a content item dict.

    :param xml: parsed NewsML root element
    :param provider: ingest provider dict (used for error reporting)
    :return: item dict, post-processed by ``populate_fields``
    :raises ParserError.newsmlOneParserError: if anything goes wrong
    """
    item = {}
    try:
        self.root = xml
        parsed_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
        if parsed_el is not None:
            item['original_source'] = parsed_el.find('Party').get('FormalName', '')
        parsed_el = xml.find('NewsEnvelope/TransmissionId')
        if parsed_el is not None:
            item['ingest_provider_sequence'] = parsed_el.text
        parsed_el = xml.find('NewsEnvelope/Priority')
        item['priority'] = self.map_priority(parsed_el.text if parsed_el is not None else None)
        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)
        # NOTE(review): findall() returns a list, never None, so the
        # ``is not None`` guards on findall results below always pass.
        parsed_el = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item['language'] = language[0]['FormalName'] if len(language) else ''
        keywords = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
        item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')
        subjects = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        subjects += xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter')
        subjects += xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject')
        item['subject'] = self.format_subjects(subjects)
        # item['ContentItem'] = self.parse_attributes_as_dictionary(
        #     tree.find('NewsItem/NewsComponent/ContentItem'))
        # item['Content'] = etree.tostring(
        #     tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'))
        # Serialize the NITF body, stripping the wrapping body.content tags.
        item['body_html'] = etree.tostring(
            xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'),
            encoding='unicode').replace('<body.content>', '').replace('</body.content>', '')
        parsed_el = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
        characteristics = self.parse_attribute_values(parsed_el, 'Words')
        item['word_count'] = characteristics[0] if len(characteristics) else None
        parsed_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if parsed_el is not None:
            item.setdefault('usageterms', parsed_el.text)
        parsed_el = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        if parsed_el is not None:
            item['genre'] = []
            for el in parsed_el:
                item['genre'].append({'name': el.get('FormalName')})
        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Map the XML document onto a new text item via ``do_mapping``."""
    # Plain text is the default content type.
    item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
    try:
        self.do_mapping(item, xml, namespaces=NS)
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider)
    return item
def parse_message(self, tree, provider):
    """Parse NewsMessage.

    :param tree: parsed NewsML 1.x NewsMessage root element
    :param provider: ingest provider dict (used for error reporting)
    :return: item dict, post-processed by ``populate_fields``
    :raises ParserError.newsmlOneParserError: if anything goes wrong
    """
    item = {}
    try:
        self.root = tree
        parsed_el = tree.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
        if parsed_el is not None:
            item['original_source'] = parsed_el.find('Party').get('FormalName', '')
        parsed_el = tree.find('NewsEnvelope/TransmissionId')
        if parsed_el is not None:
            item['ingest_provider_sequence'] = parsed_el.text
        self.parse_news_identifier(item, tree)
        self.parse_newslines(item, tree)
        self.parse_news_management(item, tree)
        # NOTE(review): findall() returns a list, never None, so the
        # ``is not None`` guards on findall results below always pass.
        parsed_el = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item['language'] = language[0]['FormalName'] if len(language) else ''
        keywords = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
        item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')
        subjects = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
        subjects += tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter')
        subjects += tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject')
        item['subject'] = self.format_subjects(subjects)
        # item['ContentItem'] = self.parse_attributes_as_dictionary(
        #     tree.find('NewsItem/NewsComponent/ContentItem'))
        # item['Content'] = etree.tostring(
        #     tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'))
        # Serialize the NITF body, stripping the wrapping body.content tags.
        item['body_html'] = etree.tostring(
            tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'),
            encoding='unicode').replace('<body.content>', '').replace('</body.content>', '')
        parsed_el = tree.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
        characteristics = self.parse_attribute_values(parsed_el, 'Words')
        item['word_count'] = characteristics[0] if len(characteristics) else None
        parsed_el = tree.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
        if parsed_el is not None:
            item.setdefault('usageterms', parsed_el.text)
        parsed_el = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
        if parsed_el is not None:
            item['genre'] = []
            for el in parsed_el:
                item['genre'].append({'name': el.get('FormalName')})
        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse with the base parser, then apply category/service defaults."""
    parsed = super().parse(xml, provider)
    try:
        # Derive the category from the subjects *before* ensuring the list
        # exists, so a missing subject field is passed through as-is.
        extra_category = utils.ingest_category_from_subject(parsed.get('subject'))
        parsed.setdefault('subject', []).append(extra_category)
        utils.set_default_service(parsed)
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider)
    return parsed
def parse(self, xml, provider=None):
    """Parse a NewsML 1.x document into a content item dict.

    Delegates body extraction to ``parse_content``.

    :param xml: parsed NewsML root element
    :param provider: ingest provider dict (used for error reporting)
    :return: item dict, post-processed by ``populate_fields``
    :raises ParserError.newsmlOneParserError: if anything goes wrong
    """
    item = {}
    try:
        self.root = xml
        parsed_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
        if parsed_el is not None:
            item["original_source"] = parsed_el.find("Party").get("FormalName", "")
        parsed_el = xml.find("NewsEnvelope/TransmissionId")
        if parsed_el is not None:
            item["ingest_provider_sequence"] = parsed_el.text
        parsed_el = xml.find("NewsEnvelope/Priority")
        item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)
        self.parse_news_identifier(item, xml)
        self.parse_newslines(item, xml)
        self.parse_news_management(item, xml)
        # NOTE(review): findall() returns a list, never None, so the
        # ``is not None`` guards on findall results below always pass.
        parsed_el = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
        if parsed_el is not None:
            language = self.parse_attributes_as_dictionary(parsed_el)
            item["language"] = language[0]["FormalName"] if len(language) else ""
        keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
        item["keywords"] = self.parse_attribute_values(keywords, "Keyword")
        subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
        subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
        item["subject"] = self.format_subjects(subjects)
        # item['ContentItem'] = self.parse_attributes_as_dictionary(
        #     tree.find('NewsItem/NewsComponent/ContentItem'))
        # item['Content'] = etree.tostring(
        #     tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'))
        self.parse_content(item, xml)
        parsed_el = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
        characteristics = self.parse_attribute_values(parsed_el, "Words")
        item["word_count"] = characteristics[0] if len(characteristics) else None
        parsed_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
        if parsed_el is not None:
            item.setdefault("usageterms", parsed_el.text)
        parsed_el = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
        if parsed_el is not None:
            item["genre"] = []
            for el in parsed_el:
                item["genre"].append({"name": el.get("FormalName")})
        return self.populate_fields(item)
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def _update(self, provider, update):
    """Check data provider for data updates and return new items (if any).

    :param provider: data provider instance
    :param update: update document (part of the feeding-service API)
    :return: a list containing a list of new content items
    :rtype: list
    :raises ParserError: if retrieved RSS data cannot be parsed
    """
    xml_data = self._fetch_data()
    try:
        feed = feedparser.parse(xml_data)
    except Exception as ex:
        raise ParserError.parseMessageError(ex, provider, data=xml_data)
    # Missing last-update info means "never ingested" -> compare against
    # the epoch; drop tzinfo because the values are UTC anyway.
    threshold = provider.get(LAST_ITEM_UPDATE, utcfromtimestamp(0)).replace(tzinfo=None)
    collected = []
    aliases = self.config.get("field_aliases")
    for entry in feed.entries:
        try:
            if utcfromtimestamp(timegm(entry.updated_parsed)) <= threshold:
                continue
        except (AttributeError, TypeError):
            pass  # missing updated info, so better ingest it
        item = self._create_item(entry, aliases, provider.get("source", None))
        self.localize_timestamps(item)
        # Image references become picture items plus a package linking them
        # with the text entry; otherwise the entry stays a plain text item.
        urls = self._extract_image_links(entry)
        if urls:
            pictures = self._create_image_items(urls, item)
            collected.extend(pictures)
            collected.append(item)
            item = self._create_package(item, pictures)
        collected.append(item)
    return [collected]
def parse_file(self, filename, provider):
    """Parse a ZCZC-framed wire file into a content item.

    The file is scanned line by line as a small state machine: the
    START_OF_MESSAGE marker switches into header mode, single-character
    header codes are mapped onto item fields, and the first non-header
    line starts the body, which runs until END_OF_MESSAGE.

    :param filename: path of the file to parse
    :param provider: ingest provider dict
    :return: item dict, post-processed by ``post_process_item``
    :raises ParserError.ZCZCParserError: if anything goes wrong
    """
    try:
        item = {}
        self.set_item_defaults(item, provider)
        with open(filename, 'r', encoding='ascii') as f:
            lines = f.readlines()
        header = False
        body = False
        for line in lines:
            if self.START_OF_MESSAGE in line and not header:
                # New message: derive a unique guid from the file name.
                item['guid'] = filename + str(uuid.uuid4())
                header = True
                continue
            if header:
                # Generic one-character header codes mapped onto fields.
                if line[0] in self.header_map:
                    if self.header_map[line[0]]:
                        item[self.header_map[line[0]]] = line[1:-1]
                    continue
                if line[0] == self.CATEGORY:
                    item[self.ITEM_ANPA_CATEGORY] = [{
                        'qcode': line[1]
                    }]
                    continue
                if line[0] == self.FORMAT:
                    if line[1] == self.TEXT:
                        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                        continue
                    if line[1] == self.TABULAR:
                        item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
                        continue
                    continue
                if line[0] == self.IPTC:
                    iptc_code = line[1:-1]
                    item[self.ITEM_SUBJECT] = [{
                        'qcode': iptc_code,
                        'name': subject_codes[iptc_code]
                    }]
                    continue
                # First non-header line: switch to body mode.
                header = False
                body = True
                item['body_html'] = line
            else:
                if self.END_OF_MESSAGE in line:
                    break
                if body:
                    item['body_html'] = item.get('body_html', '') + line
        return self.post_process_item(item, provider)
    except Exception as ex:
        raise ParserError.ZCZCParserError(exception=ex, provider=provider)
def parse(self, xml, provider=None):
    """Parse an AFP NewsML document into a list of items.

    Example of the expected XML layout::

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
            <!--AFP NewsML text-photo profile evolution2-->
            <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
            <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
            <NewsEnvelope> ...... </NewsEnvelope>
            <NewsItem xml:lang="fr">
                <Identification> ....... </Identification>
                <NewsManagement> ...... </NewsManagement>
                <NewsComponent> ...... </NewsComponent>
            </NewsItem>
        </NewsML>

    :param xml: parsed NewsML root element
    :param provider: ingest provider dict; defaults to an empty dict
    :return: list of parsed items accumulated in ``self._items``
    :raises ParserError.newsmlOneParserError: if anything goes wrong
    """
    self._provider = provider
    if self._provider is None:
        self._provider = {}
    try:
        self.root = xml
        self._items = []
        self._item_seed = {}
        # Parse the NewsEnvelope element; its fields seed every item.
        self._item_seed.update(
            self.parse_newsenvelop(xml.find('NewsEnvelope'))
        )
        # Parse each NewsItem element; items flagged for skipping are dropped.
        for newsitem_el in xml.findall('NewsItem'):
            try:
                self.parse_newsitem(newsitem_el)
            except SkipItemException:
                continue
        return self._items
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, self._provider)
def _update(self, provider, update): json_items = self._fetch_data() parsed_items = [] for item in json_items: try: parser = self.get_feed_parser(provider, item) parsed_items.append(parser.parse(item)) except Exception as ex: raise ParserError.parseMessageError(ex, provider, data=item) return parsed_items
def parse_file(self, filename, provider):
    """Parse one file from the provider's configured path via the inner parser."""
    try:
        base_path = provider.get('config', {}).get('path', None)
        if not base_path:
            return []
        parsed = self.parser.parse_file(os.path.join(base_path, filename), provider)
        return [parsed]
    except Exception as ex:
        raise ParserError.parseFileError('Teletype', filename, ex, provider)
def parse(self, file_path, provider=None):
    """Parse an IPTC 7901 file into a text content item.

    The header line is matched first, then the file is walked line by line:
    STX (0x02) starts the headline/body, ETX (0x03) ends the story, and a
    "not for publication" sentence switches the remainder into the ednote.

    :param file_path: path to the raw IPTC 7901 file
    :param provider: ingest provider dict (used for error reporting)
    :return: item dict
    :raises ParserError.IPTC7901ParserError: if anything goes wrong
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT, 'guid': generate_guid(type=GUID_TAG),
                'versioncreated': utcnow()}
        with open(file_path, 'rb') as f:
            lines = [line for line in f]
        # parse first header line: SOH, source, sequence, priority,
        # category, word count, service information
        m = re.match(b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)', lines[0], flags=re.I)
        if m:
            item['original_source'] = m.group(1).decode('latin-1', 'replace')
            item['ingest_provider_sequence'] = m.group(2).decode()
            item['priority'] = self.map_priority(m.group(3).decode())
            item['anpa_category'] = [{'qcode': self.map_category(m.group(4).decode())}]
            item['word_count'] = int(m.group(5).decode())
        inHeader = True
        inText = False
        inNote = False
        for line in lines[1:]:
            # STX starts the body of the story
            if line[0:1] == b'\x02':
                # pick the rest of the line off as the headline
                item['headline'] = line[1:].decode('latin-1', 'replace').rstrip('\r\n')
                item['body_html'] = ''
                inText = True
                inHeader = False
                continue
            # ETX denotes the end of the story
            if line[0:1] == b'\x03':
                break
            if inText:
                # A "not for publication" sentence flips into ednote mode.
                if line.decode('latin-1', 'replace')\
                        .find('The following information is not for publication') != -1 \
                        or line.decode('latin-1', 'replace').find(
                            'The following information is not intended for publication') != -1:
                    inNote = True
                    inText = False
                    item['ednote'] = ''
                    continue
                item['body_html'] += line.decode('latin-1', 'replace')
            if inNote:
                item['ednote'] += line.decode('latin-1', 'replace')
                continue
            if inHeader:
                # Header lines before STX accumulate into the slugline.
                if 'slugline' not in item:
                    item['slugline'] = ''
                item['slugline'] += line.decode('latin-1', 'replace').rstrip('/\r\n')
                continue
        return item
    except Exception as ex:
        raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)
def parse_message(self, tree, provider):
    """Parse NewsMessage.

    Returns every item found in every ``itemSet`` of the message.
    """
    collected = []
    try:
        self.root = tree
        for group in tree.findall(self.qname("itemSet")):
            collected.extend(self.parse_item(node) for node in group)
        return collected
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def _update(self, provider, update):
    """Ingest new files from the provider's configured path.

    Honours the deprecated ``FILE_INGEST_OLD_CONTENT_MINUTES`` setting by
    copying it over ``OLD_CONTENT_MINUTES`` when they differ.

    :param provider: ingest provider configuration dict
    :param update: update document (part of the feeding-service API)
    :return: generator yielding lists of parsed items; receives back a
        truthy value when ingestion of the yielded items failed
    :raises ParserError.parseFileError: when an old file fails to process
    """
    # Map the deprecated setting onto the new config key, warning once.
    if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
        deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
        cont_min = app.config[OLD_CONTENT_MINUTES]
        if deprecated_cont_min != cont_min:
            logger.warning(
                "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                .format(new_name=OLD_CONTENT_MINUTES))
            app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min
    self.provider = provider
    self.path = provider.get('config', {}).get('path', None)
    if not self.path:
        # BUG FIX: logger.warn() is a deprecated alias -- use warning().
        logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                       .format(provider['name']))
        return []
    registered_parser = self.get_feed_parser(provider)
    for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
        try:
            last_updated = None
            file_path = os.path.join(self.path, filename)
            if os.path.isfile(file_path):
                last_updated = self.get_last_updated(file_path)
                if self.is_latest_content(last_updated, provider.get('last_updated')):
                    if isinstance(registered_parser, XMLFeedParser):
                        # XML parsers get the parsed root; others get the path.
                        with open(file_path, 'rb') as f:
                            xml = etree.parse(f)
                        parser = self.get_feed_parser(provider, xml.getroot())
                        item = parser.parse(xml.getroot(), provider)
                    else:
                        parser = self.get_feed_parser(provider, file_path)
                        item = parser.parse(file_path, provider)
                    self.after_extracting(item, provider)
                    # The consumer sends back a failure flag for the batch.
                    if isinstance(item, list):
                        failed = yield item
                    else:
                        failed = yield [item]
                    self.move_file(self.path, filename, provider=provider, success=not failed)
                else:
                    # NOTE(review): stale files are moved with success=False
                    # here, while the sibling service uses success=True --
                    # confirm which is intended.
                    self.move_file(self.path, filename, provider=provider, success=False)
        except Exception as ex:
            if last_updated and self.is_old_content(last_updated):
                self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME),
                                             filename, ex, provider)
    push_notification('ingest:update')
def parse_message(self, tree, provider):
    """Parse a NewsMessage tree and return the items it contains."""
    results = []
    try:
        self.root = tree
        for group in tree.findall(self.qname('itemSet')):
            # Every child of an itemSet is an individual item tree.
            results.extend(self.parse_item(node) for node in group)
        return results
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse a NITF document into a Superdesk item dict.

    Extracts identity, dates, subjects, body, headline/abstract fallbacks
    and dateline from the document.

    :param xml: root element of the NITF document
    :param provider: ingest provider, used when raising ParserError
    :return: the populated item dict
    :raises ParserError.nitfParserError: on any failure during extraction
    """
    item = {}
    try:
        docdata = xml.find('head/docdata')
        # set the default type.
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['guid'] = item['uri'] = docdata.find('doc-id').get('id-string')
        if docdata.find('urgency') is not None:
            item['urgency'] = int(docdata.find('urgency').get('ed-urg', '5'))
        item['pubstatus'] = (docdata.attrib.get('management-status', 'usable')).lower()
        # Both created/versioncreated come from date.issue for NITF input.
        item['firstcreated'] = get_norm_datetime(docdata.find('date.issue'))
        item['versioncreated'] = get_norm_datetime(docdata.find('date.issue'))
        if docdata.find('date.expire') is not None:
            item['expiry'] = get_norm_datetime(docdata.find('date.expire'))
        item['subject'] = get_subjects(xml)
        item['body_html'] = get_content(xml)
        item['place'] = get_places(docdata)
        item['keywords'] = get_keywords(docdata)
        # Map the tobject.property genre through the 'genre' vocabulary.
        if xml.find('head/tobject/tobject.property') is not None:
            genre = xml.find('head/tobject/tobject.property').get('tobject.property.type')
            genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
            if genre_map is not None:
                item['genre'] = [x for x in genre_map.get('items', []) if x['name'] == genre]
        if docdata.find('ed-msg') is not None:
            item['ednote'] = docdata.find('ed-msg').attrib.get('info')
        # Headline: prefer body hedline, fall back to head/title.
        if xml.find('body/body.head/hedline/hl1') is not None:
            item['headline'] = xml.find('body/body.head/hedline/hl1').text
        else:
            if xml.find('head/title') is not None:
                item['headline'] = xml.find('head/title').text
        # Abstract: prefer abstract/p, fall back to the abstract element text.
        elem = xml.find('body/body.head/abstract/p')
        item['abstract'] = elem.text if elem is not None else ''
        if elem is None:
            elem = xml.find('body/body.head/abstract')
            item['abstract'] = elem.text if elem is not None else ''
        elem = xml.find('body/body.head/dateline/location/city')
        if elem is not None:
            self.set_dateline(item, city=elem.text)
        item['byline'] = get_byline(xml)
        parse_meta(xml, item)
        # Only compute word_count if parse_meta did not already set it.
        item.setdefault('word_count', get_word_count(item['body_html']))
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def parse(self, xml, provider=None):
    """Parse the content of a NewsML XML file into a list of item dicts.

    Example of the XML newsml file content:
    <?xml version="1.0" encoding="utf-8"?>
    <NewsML Version="1.2">
      <!--AFP NewsML text-photo profile evolution2-->
      <!--Processed by Xafp1-4ToNewsML1-2 rev21-->
      <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
      <NewsEnvelope>
          ......
      </NewsEnvelope>
      <NewsItem xml:lang="fr">
          <Identification>
              .......
          </Identification>
          <NewsManagement>
              ......
          </NewsManagement>
          <NewsComponent>
              ......
          </NewsComponent>
      </NewsItem>
    </NewsML>
    :param xml: root element of the NewsML document
    :param provider: ingest provider, used when raising ParserError
    :return: list of parsed item dicts
    :raises ParserError.newsmlOneParserError: on any failure during parsing
    """
    try:
        items = []
        self.root = xml
        # Parse the NewsEnvelope element; it provides the metadata
        # shared by every NewsItem in the message.
        item_envelop = self.parser_newsenvelop(xml.find('NewsEnvelope'))
        # Parse each NewsItem element on top of a copy of the envelope.
        l_newsitem_el = xml.findall('NewsItem')
        for newsitem_el in l_newsitem_el:
            try:
                item = item_envelop.copy()
                self.parser_newsitem(item, newsitem_el)
                item = self.populate_fields(item)
            except SkipItemException:
                # The parser decided this item should be dropped entirely.
                continue
            items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlOneParserError(ex, provider)
def test_parse_message_error_save_data(self):
    """parseMessageError raised with data= should log an error naming a
    dump file that contains the raw data which failed to parse."""
    data = 'some data'
    with assert_raises(ParserError):
        try:
            raise Exception("Err message")
        except Exception as ex:
            raise ParserError.parseMessageError(ex, self.provider, data=data)
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    message = self.mock_logger_handler.messages['error'][0]
    # The log message embeds the dump file path after 'file='.
    self.assertIn('file=', message)
    filename = message.split('file=')[1]
    # The dump file must contain exactly the original data.
    with open(filename, 'r') as file:
        self.assertEqual(data, file.read())
def parse(self, xml, provider=None):
    """Map a NITF document onto an item dict via the settings mapping."""
    # Text is the default content type.
    item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
    try:
        self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)
        city_elem = xml.find('body/body.head/dateline/location/city')
        if city_elem is not None:
            self.set_dateline(item, city=city_elem.text)
        # Only compute word_count if the mapping did not already set it.
        item.setdefault('word_count', get_word_count(item['body_html'], no_html=True))
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
    return item
def _update(self, provider, update):
    """Fetch raw items from the remote service and return them parsed.

    :raises ParserError.parseMessageError: when any single item fails to parse
    """
    config = provider.get('config', {})
    raw_items = self._fetch_data(config, provider)
    results = []
    for raw in raw_items:
        try:
            feed_parser = self.get_feed_parser(provider, raw)
            results.append(feed_parser.parse(raw))
        except Exception as ex:
            # Include the offending payload so it can be dumped for debugging.
            raise ParserError.parseMessageError(ex, provider, data=raw)
    return results
def parse(self, xml, provider=None):
    """Parse a news message, stamping each item with the header priority."""
    self.root = xml
    parsed = []
    try:
        header = self.parse_header(xml)
        for group in xml.findall(self.qname('itemSet')):
            for node in group:
                entry = self.parse_item(node)
                entry['priority'] = header['priority']
                parsed.append(entry)
        return parsed
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_raise_parseMessageError(self):
    """parseMessageError should carry code 1001, its message, the causing
    exception, and log exactly one formatted error message."""
    with assert_raises(ParserError) as error_context:
        ex = Exception("Testing parseMessageError")
        raise ParserError.parseMessageError(ex, self.provider)
    exception = error_context.exception
    # assertEqual gives a useful diff on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1001)
    self.assertEqual(exception.message, "Message could not be parsed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing parseMessageError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "ParserError Error 1001 - Message could not be parsed: "
                     "Testing parseMessageError on channel TestProvider")
def parse_file(self, filename, provider):
    """Read a single ingest file and return the parsed item in a list.

    On any failure the file is moved aside and a parseFileError is raised.
    """
    try:
        self.path = provider.get('config', {}).get('path', None)
        if not self.path:
            return []
        full_name = os.path.join(self.path, filename)
        with open(full_name, 'r') as stream:
            tree = etree.fromstring(stream.read())
        return [self.parser.parse_message(tree, provider)]
    except Exception as ex:
        self.move_file(self.path, filename, provider=provider, success=False)
        raise ParserError.parseFileError('AAP', filename, ex, provider)
def parse(self, xml, provider=None):
    """Build one item per Atom ``entry`` element in the feed."""
    results = []
    try:
        for entry in xml.findall(self.qname('entry', self.ATOM_NS)):
            item = {}
            self.set_item_defaults(item)
            self.parse_content_management(item, entry)
            self.parse_news_management(item, entry)
            content = entry.find(self.qname('content', self.ATOM_NS))
            item['body_html'] = self.get_elem_content(content)
            results.append(item)
    except Exception as ex:
        raise ParserError.wennParserError(ex, provider)
    return results
def parse_message(self, tree, provider):
    """Turn each Atom ``entry`` in the message into an item dict."""
    parsed = []
    try:
        entry_tag = self.qname('entry', self.ATOM_NS)
        content_tag = self.qname('content', self.ATOM_NS)
        for entry in tree.findall(entry_tag):
            item = {}
            self.set_item_defaults(item)
            self.parse_content_management(item, entry)
            self.parse_news_management(item, entry)
            item['body_html'] = self.get_elem_content(entry.find(content_tag))
            parsed.append(item)
        return parsed
    except Exception as ex:
        raise ParserError.wennParserError(ex, provider)
def test_raise_newsmlOneParserError(self):
    """newsmlOneParserError should carry code 1004, its message, the causing
    exception, and log exactly one formatted error message."""
    with assert_raises(ParserError) as error_context:
        try:
            raise Exception("Testing newsmlOneParserError")
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, self.provider)
    exception = error_context.exception
    # assertEqual gives a useful diff on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1004)
    self.assertEqual(exception.message, "NewsML1 input could not be processed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing newsmlOneParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "ParserError Error 1004 - NewsML1 input could not be processed: "
                     "Testing newsmlOneParserError on channel TestProvider")
def parse(self, xml, provider=None):
    """Parse a BusinessDesk NewsML-2 message into Superdesk items.

    Applies NZ-specific defaults (finance category, subject, urgency,
    Wellington dateline) and strips the BusinessDesk byline/signoff
    boilerplate out of the body HTML.

    :param xml: root element of the news message
    :param provider: ingest provider, used when raising ParserError
    :return: list of parsed item dicts
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' in item_tree.attrib:
                    item = self.parse_item(item_tree)
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                    item.setdefault('word_count', get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2')
                               if c.get('city', '').lower() == 'wellington']
                    if len(located) == 1:
                        item['dateline'] = dict()
                        item['dateline']['located'] = located[0]
                    # BUGFIX: use .get() — 'dateline' is only set when exactly
                    # one city matched above, so direct indexing could raise
                    # KeyError and abort the whole message.
                    if item.get('body_html') and item.get('dateline'):
                        parsed = parse_html(item.get('body_html'), content='xml')
                        pars = parsed.xpath('//p')
                        for par in pars:
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if pars.index(par) == 0 and par.text.startswith('By '):
                                item['byline'] = par.text.replace('By ', '')
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if par.text == '(BusinessDesk)' and pars.index(par) + 1 == len(pars):
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed, remove_root_div=True)
                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']
                    items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Build items from Atom entries, converting the plain-text body to HTML."""
    results = []
    try:
        for entry in xml.findall(self.qname('entry', self.ATOM_NS)):
            item = {}
            self.set_item_defaults(item)
            self.parse_content_management(item, entry)
            self.parse_news_management(item, entry)
            body = self.get_elem_content(entry.find(self.qname('content', self.ATOM_NS)))
            # Blank-line-plus-space separators become paragraph breaks;
            # remaining single newlines become <br>.
            body = body.replace('\n\n ', '</p><p>').replace('\n', '<br>')
            item['body_html'] = '<p>' + body + '</p>'
            results.append(item)
        return results
    except Exception as ex:
        raise ParserError.wennParserError(ex, provider)
def _get_decsription(self, lines, provider):
    """Lookup the bom product to determine the descriptive string, not finding this is a fatal error

    :param lines: raw message lines; the product code is on the first line
    :param provider: ingest provider, used when raising ParserError
    :return: the product's descriptive name
    :raises ParserError.parseMessageError: when no active mapping exists
    """
    warning_str = 'Unknown'
    code = lines[0].strip()
    bom_products_map = get_resource_service('vocabularies').find_one(req=None, _id='bom_products')
    # BUGFIX: find_one returns None when the vocabulary is missing; treat
    # that like an unmapped product instead of raising AttributeError.
    vocab_items = bom_products_map.get('items', []) if bom_products_map else []
    product = [x for x in vocab_items if x['qcode'] == code and x['is_active']]
    if len(product) > 0:
        warning_str = product[0].get('name', '')
    else:
        logger.error('No BOM product mapping found for {}'.format(code))
        raise ParserError.parseMessageError(Exception('No BOM product'), provider, data=lines[0])
    return warning_str
def test_raise_nitfParserError(self):
    """nitfParserError should carry code 1006, its message, the causing
    exception, and log exactly one formatted error message."""
    with assert_raises(ParserError) as error_context:
        try:
            raise Exception("Testing nitfParserError")
        except Exception as ex:
            # Bind the cause in the except clause instead of relying on a
            # variable assigned inside the try block.
            raise ParserError.nitfParserError(ex, self.provider)
    exception = error_context.exception
    # assertEqual (assertEquals is a deprecated alias) gives a useful diff
    # on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1006)
    self.assertEqual(exception.message, "NITF input could not be processed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing nitfParserError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "ParserError Error 1006 - NITF input could not be processed: "
                     "Testing nitfParserError on channel TestProvider")
def parse(self, xml, provider=None):
    """Parse a single NewsML-2 item with headline and abstract fallbacks."""
    self.root = xml
    try:
        item = self.parse_item(xml)
        # Fall back to the first 100 chars of the body text when no headline.
        if not item.get('headline'):
            body_text = text_utils.get_text(item.get('body_html', ''), 'html')
            item['headline'] = body_text[:100]
        try:
            summaries = xml.xpath("//iptc:description[@role='drol:summary']",
                                  namespaces={'iptc': IPTC_NS})
            abstract = summaries[0].text
        except IndexError:
            # No summary description present; leave abstract untouched.
            pass
        else:
            if abstract:
                item['abstract'] = abstract
        return [item]
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_raise_parseFileError(self):
    """parseFileError should carry code 1002, its message, the causing
    exception, and log one error naming the source and file."""
    with assert_raises(ParserError) as error_context:
        try:
            raise Exception("Testing parseFileError")
        except Exception as ex:
            raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)
    exception = error_context.exception
    # assertEqual gives a useful diff on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1002)
    self.assertEqual(exception.message, "Ingest file could not be parsed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing parseFileError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
    message = self.mock_logger_handler.messages['error'][0]
    self.assertIn("ParserError Error 1002 - Ingest file could not be parsed", message)
    self.assertIn("Testing parseFileError on channel TestProvider", message)
    self.assertIn("source=afp", message)
    self.assertIn("file=test.txt", message)
def parse_file(self, filename, provider):
    """Parse one ingest file, normalising its dates to the service timezone.

    :param filename: name of the file within the provider's configured path
    :param provider: ingest provider config dict
    :return: list with the single parsed item, or [] when no path configured
    :raises ParserError.parseFileError: when the file cannot be read/parsed
    """
    path = None
    try:
        path = provider.get('config', {}).get('path', None)
        if not path:
            return []
        with open(os.path.join(path, filename), 'r') as f:
            item = self.parser.parse_message(etree.fromstring(f.read()), provider)
        item['firstcreated'] = normalize_date(item.get('firstcreated'), self.tz)
        item['versioncreated'] = normalize_date(item.get('versioncreated'), self.tz)
        return [item]
    except Exception as ex:
        # BUGFIX: the original referenced self.path, which this method never
        # assigns (it uses the local 'path'), so a parse failure raised
        # AttributeError and masked the real error.
        self.move_file(path, filename, provider=provider, success=False)
        raise ParserError.parseFileError('AAP', filename, ex, provider)
def parse(self, xml, provider=None):
    """Parse a news message (or a bare newsItem/packageItem) into items.

    :param xml: root element — either a message containing itemSet elements
        or a single newsItem/packageItem
    :param provider: ingest provider, used when raising ParserError
    :return: list of parsed item dicts
    :raises ParserError.newsmlTwoParserError: on any failure during parsing
    """
    self.root = xml
    items = []
    try:
        header = self.parse_header(xml)
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                item = self.parse_item(item_tree)
                item['priority'] = header['priority']
                items.append(item)
        else:
            # NOTE(review): this is a for/else — the loop has no break, so
            # this branch ALWAYS runs. It serves as the fallback for input
            # that is a bare newsItem/packageItem (no itemSet elements);
            # when itemSets were found, the root tag presumably does not end
            # with newsItem/packageItem so the condition is false — confirm.
            if xml.tag.endswith('newsItem') or xml.tag.endswith('packageItem'):
                item = self.parse_item(xml)
                item.setdefault('priority', header['priority'])
                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_raise_parseFileError(self):
    """parseFileError should carry code 1002, its message, the causing
    exception, and log two errors (the source-type line and the formatted
    ParserError line)."""
    with assert_raises(ParserError) as error_context:
        try:
            raise Exception("Testing parseFileError")
        except Exception as ex:
            # Bind the cause in the except clause instead of relying on a
            # variable assigned inside the try block.
            raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)
    exception = error_context.exception
    # assertEqual gives a useful diff on failure, unlike assertTrue(a == b).
    self.assertEqual(exception.code, 1002)
    self.assertEqual(exception.message, "Ingest file could not be parsed")
    self.assertIsNotNone(exception.system_exception)
    self.assertEqual(exception.system_exception.args[0], "Testing parseFileError")
    self.assertEqual(len(self.mock_logger_handler.messages['error']), 2)
    self.assertEqual(self.mock_logger_handler.messages['error'][0],
                     "Source Type: afp - File: test.txt could not be processed")
    self.assertEqual(self.mock_logger_handler.messages['error'][1],
                     "ParserError Error 1002 - Ingest file could not be parsed: "
                     "Testing parseFileError on channel TestProvider")
def parse(self, xml, provider=None):
    """Parse a NewsML-2 message, applying finance defaults to each item."""
    self.root = xml
    results = []
    try:
        header = self.parse_header(xml)
        for group in xml.findall(self.qname('itemSet')):
            for node in group:
                # Ignore the packageItem, it has no guid
                if 'guid' not in node.attrib:
                    continue
                item = self.parse_item(node)
                item['priority'] = header['priority']
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                item.setdefault('word_count', get_word_count(item['body_html']))
                results.append(item)
        return results
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)