Exemplo n.º 1
0
    def _update(self, provider):
        """Parse the files in the provider's configured directory and yield the items.

        Generator. Every regular file accepted by ``is_latest_content`` is parsed,
        moved as a success and yielded as a one-element list. Files rejected by
        ``is_latest_content`` are moved as a success without being parsed.

        :param provider: ingest provider dict; ``config.path`` is the directory to scan
        :raises ProviderError: wrapping any non-``ParserError`` exception raised
            while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        item = self.parser.parse_file(filepath)

                        self.move_file(self.path, filename, provider=provider, success=True)
                        yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except ParserError:
                # The except clause must name an exception class. The previous
                # ``ParserError.IPTC7901ParserError()`` call produced a value that
                # cannot be matched against, so any error surfaced as a TypeError
                # and the file was never moved.
                # NOTE(review): parser errors are logged and skipped here, as the
                # plain ``except ParserError`` handler already did - confirm that
                # skipping (rather than re-raising) is the intended policy.
                logger.exception("Ingest Type: DPA - File: {0} could not be processed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)
Exemplo n.º 2
0
    def parse_message(self, tree, provider):
        """Build an item dict from a NITF document tree.

        :param tree: root element of the NITF document
        :param provider: ingest provider, only used when reporting an error
        :return: dict with the extracted item fields
        :raises ParserError: wrapping any exception raised during extraction
        """
        try:
            docdata = tree.find('head/docdata')
            doc_id = docdata.find('doc-id').get('id-string')
            item = {
                # set the default type.
                'type': ITEM_CLASS_TEXT,
                'guid': doc_id,
                'uri': doc_id,
                'urgency': docdata.find('urgency').get('ed-urg', '5'),
                'pubstatus': docdata.attrib.get('management-status', 'usable'),
                'firstcreated': get_norm_datetime(docdata.find('date.issue')),
                'versioncreated': get_norm_datetime(docdata.find('date.issue')),
                'expiry': get_norm_datetime(docdata.find('date.expire')),
                'subject': get_subjects(tree),
                'body_html': get_content(tree),
                'place': get_places(docdata),
                'keywords': get_keywords(docdata),
            }

            # The editorial note is optional.
            ed_msg = docdata.find('ed-msg')
            if ed_msg is not None:
                item['ednote'] = ed_msg.attrib.get('info')

            item['headline'] = tree.find('body/body.head/hedline/hl1').text

            # Abstract and dateline city fall back to an empty string when absent.
            abstract_el = tree.find('body/body.head/abstract')
            item['abstract'] = '' if abstract_el is None else abstract_el.text

            city_el = tree.find('body/body.head/dateline/location/city')
            item['dateline'] = '' if city_el is None else city_el.text
            item['byline'] = get_byline(tree)

            parse_meta(tree, item)
            # Only fills word_count when parse_meta did not already set it.
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
Exemplo n.º 3
0
    def _update(self, provider):
        """Parse the XML files in the provider's configured directory and yield the items.

        Generator. Every regular file accepted by ``is_latest_content`` is parsed
        with ``self.parser.parse_message``, timestamped, moved as a success and
        yielded as a one-element list. Files rejected by ``is_latest_content`` are
        moved as a success without being parsed. ``ParserError`` raised while
        handling a file moves the file as a failure and continues with the next one.

        :param provider: ingest provider dict; ``config.path`` is the directory to scan
        :raises ParserError: if the file is not parseable XML (``etreeParserError``)
        :raises ProviderError: wrapping any other unexpected exception
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)
        if not self.path:
            return

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        # Read everything up front so the handle is closed before the
                        # file is moved and before the generator suspends at ``yield``.
                        with open(filepath, 'r') as f:
                            content = f.read()

                        item = self.parser.parse_message(etree.fromstring(content), provider)

                        self.add_timestamps(item)
                        self.move_file(self.path, filename, provider=provider, success=True)
                        yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except etreeParserError as ex:
                # logger.exception already records the active exception; passing it
                # as an extra %-argument without a placeholder breaks log formatting.
                logger.exception("Ingest Type: AFP - File: {0} could not be processed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.newsmlOneParserError(ex, provider)
            except ParserError as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)

        push_notification('ingest:update')
0
    def _update(self, provider, update):
        """Parse the files in the provider's configured directory and yield the items.

        Generator. For each regular file accepted by ``is_latest_content`` the
        file is parsed and the result is yielded as a list. The value the consumer
        sends back into the generator is treated as a "failed" flag and decides
        whether the file is moved as a success or as a failure. Files rejected by
        ``is_latest_content`` are moved as a success without being parsed.

        :param provider: ingest provider dict; ``config.path`` is the directory to scan
        :param update: not used by this method
        :raises ParserError: wrapping any exception raised while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported name.
            logger.warning(
                'File Feeding Service {} is configured without path. Please check the configuration'
                .format(provider['name']))
            return []

        # Looked up once, only to decide whether files are handled as XML.
        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path,
                                         sort_by=FileSortAttributes.created):
            try:
                # Reset per file so the except handler never sees a stale value.
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    stat = os.lstat(file_path)
                    last_updated = datetime.fromtimestamp(stat.st_mtime,
                                                          tz=utc)

                    if self.is_latest_content(last_updated,
                                              provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML feeds: the parser is selected from the document root.
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(
                                    provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            # Other feeds: the parser is selected from and given the file path.
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # Always yield a list; ``failed`` is whatever the consumer
                        # sends back (None when the generator is simply iterated).
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=not failed)
                    else:
                        self.move_file(self.path,
                                       filename,
                                       provider=provider,
                                       success=True)
            except Exception as ex:
                # The file is only moved as a failure once ``is_old_content``
                # accepts its timestamp; the error is raised either way.
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path,
                                   filename,
                                   provider=provider,
                                   success=False)
                raise ParserError.parseFileError(
                    '{}-{}'.format(provider['name'], self.NAME), filename, ex,
                    provider)

        push_notification('ingest:update')
Exemplo n.º 5
0
    def _update(self, provider):
        """
        Fetch the provider's feed and build items for entries newer than the last update.

        :param provider: data provider instance
        :return: a list containing a single list with the new content items
        :rtype: list

        :raises IngestApiError: propagated unchanged from the data retrieval
        :raises ParserError: wrapping any other error raised while fetching or
            parsing the feed data
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            username = config.get('username', '')
            password = config.get('password', '')
            self.auth_info = {'username': username, 'password': password}

        try:
            feed = feedparser.parse(self._fetch_data(config, provider))
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # A provider without a last update time is compared against the epoch,
        # so every entry counts as new. The tzinfo is stripped to get a naive
        # datetime that can be compared with the naive entry timestamps.
        last_ingest = provider.get('last_updated', utcfromtimestamp(0)).replace(tzinfo=None)

        aliases = config.get('field_aliases')
        collected = []

        for entry in feed.entries:
            if utcfromtimestamp(timegm(entry.updated_parsed)) <= last_ingest:
                continue

            item = self._create_item(entry, aliases)
            self.add_timestamps(item)

            image_urls = self._extract_image_links(entry)
            if not image_urls:
                # No image references: the entry is ingested as a plain item.
                collected.append(item)
                continue

            # Image references: emit the picture items, the entry itself, and
            # then a package that references all of them.
            image_items = self._create_image_items(image_urls, item)
            collected.extend(image_items)
            collected.append(item)
            collected.append(self._create_package(item, image_items))

        return [collected]
Exemplo n.º 6
0
    def parse_email(self, content, content_type, provider):
        """Parse an XML stream and hand its root element to ``self.parse``.

        :param content: seekable file-like object holding the XML document
        :param content_type: must be ``'text/xml'``
        :param provider: ingest provider, passed through to ``self.parse``
        :return: whatever ``self.parse`` returns for the document root
        :raises ParserError: if ``content_type`` is anything but ``'text/xml'``
        """
        if content_type == 'text/xml':
            # Rewind so the document is parsed from its first byte.
            content.seek(0)
            document = ET.parse(content)
            return self.parse(document.getroot(), provider)

        raise ParserError.parseMessageError('Not supported content type.')
Exemplo n.º 7
0
 def _test(self, provider):
     """Fetch the provider's feed and check that feedparser accepts it.

     :param provider: ingest provider dict; its ``config`` is passed to ``_fetch_data``
     :raises ParserError: if feedparser sets ``bozo`` on the parsed result
     """
     feed = feedparser.parse(self._fetch_data(provider.get('config', {}), provider))
     if not feed.bozo:
         return
     raise ParserError.parseMessageError(feed.bozo_exception, provider)
Exemplo n.º 8
0
 def parse(self, xml, provider=None):
     """Parse a single item from ``xml`` and return it wrapped in a list.

     :param xml: element handed to ``parse_item``; also stored as ``self.root``
     :param provider: ingest provider, only used when reporting an error
     :return: one-element list with the parsed item
     :raises ParserError: wrapping any exception raised by ``parse_item``
     """
     self.root = xml
     try:
         return [self.parse_item(xml)]
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 9
0
 def _test(self, provider):
     """Fetch the provider's feed and check that feedparser accepts it.

     :param provider: ingest provider dict; its ``config`` is passed to ``_fetch_data``
     :raises ParserError: if feedparser sets ``bozo`` on the parsed result
     """
     feed = feedparser.parse(self._fetch_data(provider.get('config', {}), provider))
     if not feed.bozo:
         return
     raise ParserError.parseMessageError(feed.bozo_exception, provider)
Exemplo n.º 10
0
    def _update(self, provider):
        """Parse the files in the provider's configured directory and yield the items.

        Generator. Every regular file accepted by ``is_latest_content`` is parsed,
        passed through ``dpa_derive_dateline``, moved as a success and yielded as a
        one-element list. Files rejected by ``is_latest_content`` are moved as a
        success without being parsed.

        :param provider: ingest provider dict; ``config.path`` is the directory to scan
        :raises ParserError: wrapping any exception raised while handling a file
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            logger.info('No path')
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if not os.path.isfile(filepath):
                    continue

                modified = datetime.fromtimestamp(os.lstat(filepath).st_mtime, tz=utc)
                if not self.is_latest_content(modified, provider.get('last_updated')):
                    # Rejected content is still moved away as a success.
                    self.move_file(self.path, filename, provider=provider, success=True)
                    continue

                item = self.parser.parse_file(filepath, provider)
                dpa_derive_dateline(item)

                self.move_file(self.path, filename, provider=provider, success=True)
                yield [item]
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('DPA', filename, ex, provider)
 def parse(self, xml, provider=None):
     """Parse every item found in the ``itemSet`` elements of ``xml``.

     For each item the publication date is read from the XHTML header when
     present (otherwise ``versioncreated`` is reused), the slugline is removed,
     and a category subject plus the default service are added.

     :param xml: root element; also stored as ``self.root``
     :param provider: ingest provider, only used when reporting an error
     :return: list of parsed items (empty when there is no ``itemSet``)
     :raises ParserError: wrapping any exception raised during parsing
     """
     self.root = xml
     items = []
     try:
         for item_set in xml.findall(self.qname('itemSet')):
             for item_tree in item_set:
                 item = self.parse_item(item_tree)
                 try:
                     published = item_tree.xpath('.//xhtml:body/xhtml:header/'
                                                 'xhtml:time[@class="publicationDate"]/@data-datetime',
                                                 namespaces=NS)[0]
                 except IndexError:
                     # No publication date in the document: reuse versioncreated.
                     item['firstcreated'] = item['versioncreated']
                 else:
                     item['firstcreated'] = dateutil.parser.parse(published)
                 items.append(item)
                 # SDNTB-463 requires that slugline is removed. ``pop`` with a
                 # default keeps an item without a slugline from failing the
                 # whole parse with a KeyError.
                 item.pop('slugline', None)
                 sport = bool(item_tree.xpath('.//iptc:subject[@type="dpatype:category" and @qcode="dpacat:sp"]',
                                              namespaces=NS))
                 cat = utils.SPORT_CATEGORY if sport else utils.DEFAULT_CATEGORY
                 category = {'qcode': cat, 'name': cat, 'scheme': 'category'}
                 item['subject'] = utils.filter_missing_subjects(item.get('subject'))
                 item['subject'].append(category)
                 utils.set_default_service(item)
         return items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 12
0
    def parse(self, file_path, provider=None):
        """Parse an ANPA wire file into an item dict.

        The two header lines, the creation stamp on the fourth line from the end
        and the body enclosed between the ``\\x02`` and ``\\x03`` bytes are each
        matched with a regular expression; a field is only set when its
        expression matches.

        :param file_path: path of the file to read (opened in binary mode)
        :param provider: not used by this method
        :return: dict with the extracted item fields
        :raises ParserError: wrapping any exception raised while reading or parsing
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}

            with open(file_path, 'rb') as f:
                lines = f.readlines()

            # first header line: provider sequence number
            first_header = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if first_header:
                item['provider_sequence'] = first_header.group(2).decode()

            # second header line: priority, category, format marker, word count
            second_header = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1], flags=re.I)
            if second_header:
                item['priority'] = self.map_priority(second_header.group(1).decode())
                item['anpa_category'] = [{'qcode': second_header.group(2).decode()}]
                item['word_count'] = int(second_header.group(10).decode())
                if second_header.group(4) == b'\x12':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

            # creation stamp near the end of the file
            created = re.search(b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT', lines[-4], flags=re.I)
            if created:
                stamp = datetime.strptime(created.group(3).decode(), '%m-%d-%y %H%M')
                item['firstcreated'] = stamp.replace(tzinfo=utc)

            # everything after the two header lines is the ANPA content
            content = re.match(b'\x02(.*)\x03', b''.join(lines[2:]), flags=re.M + re.S)
            if not content:
                return item

            text = content.group(1).decode().split('\n')

            # body text: the tab-indented lines
            item['body_text'] = '\n'.join(row.strip() for row in text if row.startswith('\t'))

            # content metadata: the lines starting with a caret
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            if len(header_lines) > 3:
                item['headline'] = header_lines[1]
                item['byline'] = header_lines[-2]

            # slugline
            if len(header_lines) > 1:
                slug = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
                if slug:
                    item['slugline'] = slug.group(1)

            # ednote: the last matching header line wins
            for header in header_lines:
                note = re.search("EDITOR'S NOTE _(.*)", header)
                if note:
                    item['ednote'] = note.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
Exemplo n.º 13
0
    def parse(self, xml, provider=None):
        """Create an item with fixed default metadata and apply the mapping to it.

        :param xml: document passed to ``do_mapping``
        :param provider: ingest provider; stored on ``self`` and used when
            reporting an error
        :return: one-element list with the item
        :raises ParserError: wrapping any exception raised by ``do_mapping``
        """
        self.provider = provider

        # Defaults set before the mapping runs.
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,  # set the default type.
            'versioncreated': utcnow(),
            'anpa_category': [{"name": "Formidlingstjenester", "qcode": "r"}],
            'genre': [{
                "name": "Fulltekstmeldinger",
                "qcode": "Fulltekstmeldinger",
                "scheme": "genre_custom",
            }],
            'subject': [{'qcode': 'PRM-NTB', 'name': 'PRM-NTB', 'scheme': 'category'}],
            'urgency': 6,
            'ednote': '*** Dette er en pressemelding formidlet av NTB pva. andre ***',
        }

        try:
            self.do_mapping(item, xml)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)
        else:
            return [item]
Exemplo n.º 14
0
    def parse_message(self, tree, provider):
        """Build an item dict from a NITF document tree.

        :param tree: root element of the NITF document
        :param provider: ingest provider, only used when reporting an error
        :return: dict with the extracted item fields
        :raises ParserError: wrapping any exception raised during extraction
        """
        try:
            docdata = tree.find('head/docdata')
            doc_id = docdata.find('doc-id').get('id-string')
            item = {
                # set the default type.
                'type': ITEM_CLASS_TEXT,
                'guid': doc_id,
                'uri': doc_id,
                'urgency': docdata.find('urgency').get('ed-urg', '5'),
                'pubstatus': docdata.attrib.get('management-status', 'usable'),
                'firstcreated': get_norm_datetime(docdata.find('date.issue')),
                'versioncreated': get_norm_datetime(docdata.find('date.issue')),
                'expiry': get_norm_datetime(docdata.find('date.expire')),
                'subject': get_subjects(tree),
                'body_html': get_content(tree),
                'place': get_places(docdata),
                'keywords': get_keywords(docdata),
            }

            # The editorial note is optional.
            ed_msg = docdata.find('ed-msg')
            if ed_msg is not None:
                item['ednote'] = ed_msg.attrib.get('info')

            item['headline'] = tree.find('body/body.head/hedline/hl1').text

            # Abstract and dateline city fall back to an empty string when absent.
            abstract_el = tree.find('body/body.head/abstract')
            item['abstract'] = '' if abstract_el is None else abstract_el.text

            city_el = tree.find('body/body.head/dateline/location/city')
            item['dateline'] = '' if city_el is None else city_el.text
            item['byline'] = get_byline(tree)

            parse_meta(tree, item)
            # Only fills word_count when parse_meta did not already set it.
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
Exemplo n.º 15
0
    def parse_message(self, tree, provider):
        """Build an item dict from a NITF document tree, trimming the headline.

        :param tree: root element of the NITF document
        :param provider: ingest provider, only used when reporting an error
        :return: dict with the extracted item fields
        :raises ParserError: wrapping any exception raised during extraction
        """
        try:
            docdata = tree.find("head/docdata")
            doc_id = docdata.find("doc-id").get("id-string")
            item = {
                # set the default type.
                "type": ITEM_CLASS_TEXT,
                "guid": doc_id,
                "uri": doc_id,
                "urgency": docdata.find("urgency").get("ed-urg", "5"),
                "pubstatus": docdata.attrib.get("management-status", "usable"),
                "firstcreated": get_norm_datetime(docdata.find("date.issue")),
                "versioncreated": get_norm_datetime(docdata.find("date.issue")),
                "expiry": get_norm_datetime(docdata.find("date.expire")),
                "subject": get_subjects(tree),
                "body_html": get_content(tree),
                "place": get_places(docdata),
                "keywords": get_keywords(docdata),
            }

            # The editorial note is optional.
            ed_msg = docdata.find("ed-msg")
            if ed_msg is not None:
                item["ednote"] = ed_msg.attrib.get("info")

            # The headline goes through the parent class's trim_headline.
            raw_headline = tree.find("body/body.head/hedline/hl1").text
            item["headline"] = super().trim_headline(raw_headline)

            # Abstract and dateline city fall back to an empty string when absent.
            abstract_el = tree.find("body/body.head/abstract")
            item["abstract"] = "" if abstract_el is None else abstract_el.text

            city_el = tree.find("body/body.head/dateline/location/city")
            item["dateline"] = "" if city_el is None else city_el.text
            item["byline"] = get_byline(tree)

            parse_meta(tree, item)
            # Only fills word_count when parse_meta did not already set it.
            item.setdefault("word_count", get_word_count(item["body_html"]))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
Exemplo n.º 16
0
    def _update(self, provider):
        """Parse the XML files in the provider's configured directory and yield the items.

        Generator. Every regular file accepted by ``is_latest_content`` is parsed
        with ``self.parser.parse_message``, moved as a success and yielded as a
        one-element list. Files rejected by ``is_latest_content`` are moved as a
        success without being parsed. ``ParserError`` raised while handling a file
        moves the file as a failure and continues with the next one.

        :param provider: ingest provider dict; ``config.path`` is the directory to scan
        :raises ParserError: if the file is not parseable XML (``etreeParserError``)
        :raises ProviderError: wrapping any other unexpected exception
        """
        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            return []

        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                filepath = os.path.join(self.path, filename)
                if os.path.isfile(filepath):
                    stat = os.lstat(filepath)
                    last_updated = datetime.fromtimestamp(stat.st_mtime, tz=utc)
                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        # Read everything up front so the handle is closed before the
                        # file is moved and before the generator suspends at ``yield``.
                        with open(filepath, 'r') as f:
                            content = f.read()

                        item = self.parser.parse_message(etree.fromstring(content), provider)
                        self.move_file(self.path, filename, provider=provider, success=True)
                        yield [item]
                    else:
                        self.move_file(self.path, filename, provider=provider, success=True)
            except etreeParserError as ex:
                logger.exception("Ingest Type: AAP - File: {0} could not be processed".format(filename))
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.nitfParserError(ex, provider)
            except ParserError as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                self.move_file(self.path, filename, provider=provider, success=False)
                raise ProviderError.ingestError(ex, provider)

        push_notification('ingest:update')
Exemplo n.º 17
0
 def _test(self, provider):
     """Fetch the feed for ``provider`` and check that feedparser accepts it.

     :param provider: ingest provider; stored on ``self`` before fetching
     :raises ParserError: if feedparser sets ``bozo`` on the parsed result
     """
     self.provider = provider
     feed = feedparser.parse(self._fetch_data())
     if not feed.bozo:
         return
     raise ParserError.parseMessageError(feed.bozo_exception, provider)
Exemplo n.º 18
0
    def _update(self, provider):
        """
        Fetch the provider's feed and build items for entries newer than the last update.

        :param provider: data provider instance
        :return: a list containing a single list with the new content items
        :rtype: list

        :raises IngestApiError: propagated unchanged from the data retrieval
        :raises ParserError: wrapping any other error raised while fetching or
            parsing the feed data
        """
        config = provider.get('config', {})

        if config.get('auth_required'):
            username = config.get('username', '')
            password = config.get('password', '')
            self.auth_info = {'username': username, 'password': password}

        try:
            feed = feedparser.parse(self._fetch_data(config, provider))
        except IngestApiError:
            raise
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider)

        # A provider without a last update time is compared against the epoch,
        # so every entry counts as new. The tzinfo is stripped to get a naive
        # datetime that can be compared with the naive entry timestamps.
        last_ingest = provider.get('last_updated', utcfromtimestamp(0)).replace(tzinfo=None)

        aliases = config.get('field_aliases')
        collected = []

        for entry in feed.entries:
            if utcfromtimestamp(timegm(entry.updated_parsed)) <= last_ingest:
                continue

            # The provider's ``source`` (None when unset) is passed to the item factory.
            item = self._create_item(entry, aliases, provider.get('source', None))
            self.add_timestamps(item)

            image_urls = self._extract_image_links(entry)
            if not image_urls:
                # No image references: the entry is ingested as a plain item.
                collected.append(item)
                continue

            # Image references: emit the picture items, the entry itself, and
            # then a package that references all of them.
            image_items = self._create_image_items(image_urls, item)
            collected.extend(image_items)
            collected.append(item)
            collected.append(self._create_package(item, image_items))

        return [collected]
Exemplo n.º 19
0
    def parse_email(self, content, content_type, provider):
        """Parse an iCalendar stream and hand the calendar to ``self.parse``.

        :param content: seekable file-like object holding the calendar data
        :param content_type: must be ``'text/calendar'``
        :param provider: ingest provider, passed through to ``self.parse``
        :return: whatever ``self.parse`` returns for the calendar
        :raises ParserError: if ``content_type`` is anything but ``'text/calendar'``
        """
        if content_type == 'text/calendar':
            # Rewind so the calendar is read from its first byte.
            content.seek(0)
            raw = content.read()
            return self.parse(Calendar.from_ical(raw), provider)

        raise ParserError.parseMessageError('Not supported content type.')
Exemplo n.º 20
0
    def parse(self, xml, provider=None):
        """Build an item dict from a NewsML 1.x document.

        :param xml: root element of the document; also stored as ``self.root``
        :param provider: ingest provider, only used when reporting an error
        :return: the result of ``populate_fields`` applied to the extracted item
        :raises ParserError: wrapping any exception raised during extraction
        """
        item = {}
        try:
            self.root = xml

            source_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
            if source_el is not None:
                item['original_source'] = source_el.find('Party').get('FormalName', '')

            transmission_el = xml.find('NewsEnvelope/TransmissionId')
            if transmission_el is not None:
                item['ingest_provider_sequence'] = transmission_el.text

            # map_priority receives None when the envelope carries no priority.
            priority_el = xml.find('NewsEnvelope/Priority')
            priority_text = None if priority_el is None else priority_el.text
            item['priority'] = self.map_priority(priority_text)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            language_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
            if language_els is not None:
                languages = self.parse_attributes_as_dictionary(language_els)
                item['language'] = languages[0]['FormalName'] if len(languages) else ''

            property_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(property_els, 'Keyword')

            # Subjects are collected from three sibling element types, in this order.
            subjects = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
            for subject_path in (
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject',
            ):
                subjects += xml.findall(subject_path)
            item['subject'] = self.format_subjects(subjects)

            # Serialize the body and strip the wrapping <body.content> tags.
            body_el = xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
            markup = etree.tostring(body_el, encoding='unicode')
            item['body_html'] = markup.replace('<body.content>', '').replace('</body.content>', '')

            characteristic_els = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            word_counts = self.parse_attribute_values(characteristic_els, 'Words')
            item['word_count'] = word_counts[0] if len(word_counts) else None

            usage_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if usage_el is not None:
                item.setdefault('usageterms', usage_el.text)

            genre_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            if genre_els is not None:
                item['genre'] = [{'name': el.get('FormalName')} for el in genre_els]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Exemplo n.º 21
0
 def parse(self, xml, provider=None):
     """Create a text item and fill it by applying the mapping to ``xml``.

     :param xml: document passed to ``do_mapping`` together with the ``NS`` namespaces
     :param provider: ingest provider, only used when reporting an error
     :return: the item dict
     :raises ParserError: wrapping any exception raised by ``do_mapping``
     """
     # set the default type.
     item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
     try:
         self.do_mapping(item, xml, namespaces=NS)
     except Exception as ex:
         raise ParserError.parseMessageError(ex, provider)
     else:
         return item
Exemplo n.º 22
0
 def parse(self, xml, provider=None):
     """Create a text item and fill it by applying the mapping to ``xml``.

     :param xml: document passed to ``do_mapping`` together with the ``NS`` namespaces
     :param provider: ingest provider, only used when reporting an error
     :return: the item dict
     :raises ParserError: wrapping any exception raised by ``do_mapping``
     """
     # set the default type.
     item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
     try:
         self.do_mapping(item, xml, namespaces=NS)
     except Exception as ex:
         raise ParserError.parseMessageError(ex, provider)
     else:
         return item
Exemplo n.º 23
0
    def parse_message(self, tree, provider):
        """Build an item dict from a NewsML 1.x NewsMessage.

        :param tree: root element of the document; also stored as ``self.root``
        :param provider: ingest provider, only used when reporting an error
        :return: the result of ``populate_fields`` applied to the extracted item
        :raises ParserError: wrapping any exception raised during extraction
        """
        item = {}
        try:
            self.root = tree

            source_el = tree.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
            if source_el is not None:
                item['original_source'] = source_el.find('Party').get('FormalName', '')

            transmission_el = tree.find('NewsEnvelope/TransmissionId')
            if transmission_el is not None:
                item['ingest_provider_sequence'] = transmission_el.text

            self.parse_news_identifier(item, tree)
            self.parse_newslines(item, tree)
            self.parse_news_management(item, tree)

            language_els = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
            if language_els is not None:
                languages = self.parse_attributes_as_dictionary(language_els)
                item['language'] = languages[0]['FormalName'] if len(languages) else ''

            property_els = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(property_els, 'Keyword')

            # Subjects are collected from three sibling element types, in this order.
            subjects = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail')
            for subject_path in (
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject',
            ):
                subjects += tree.findall(subject_path)
            item['subject'] = self.format_subjects(subjects)

            # Serialize the body and strip the wrapping <body.content> tags.
            body_el = tree.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
            markup = etree.tostring(body_el, encoding='unicode')
            item['body_html'] = markup.replace('<body.content>', '').replace('</body.content>', '')

            characteristic_els = tree.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            word_counts = self.parse_attribute_values(characteristic_els, 'Words')
            item['word_count'] = word_counts[0] if len(word_counts) else None

            usage_el = tree.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if usage_el is not None:
                item.setdefault('usageterms', usage_el.text)

            genre_els = tree.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            if genre_els is not None:
                item['genre'] = [{'name': el.get('FormalName')} for el in genre_els]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Exemplo n.º 24
0
 def parse(self, xml, provider=None):
     """Parse ``xml`` with the parent parser, then enrich the resulting item.

     The value returned by ``utils.ingest_category_from_subject`` is appended
     to the item's ``subject`` list (created when absent) and
     ``utils.set_default_service`` is called on the item.

     :param xml: document handed through to the parent ``parse``
     :param provider: passed to the parent ``parse`` and to the error factory
     :return: the item returned by the parent parser, after enrichment
     :raises ParserError: if the enrichment step raises
     """
     item = super().parse(xml, provider)
     try:
         # Look the category up before ``setdefault`` so the helper sees the
         # subject value exactly as the parent parser left it.
         derived_category = utils.ingest_category_from_subject(item.get('subject'))
         subject_list = item.setdefault('subject', [])
         subject_list.append(derived_category)
         utils.set_default_service(item)
     except Exception as err:
         raise ParserError.parseMessageError(err, provider)
     return item
Exemplo n.º 25
0
    def parse(self, xml, provider=None):
        """Build an item dict from a NewsML 1.x document tree.

        :param xml: root element of the document; also stored on ``self.root``
        :param provider: only used when reporting a failure
        :return: whatever ``self.populate_fields`` returns for the built item
        :raises ParserError: wraps any exception raised while parsing
        """
        item = {}
        try:
            self.root = xml

            source_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
            if source_el is not None:
                item["original_source"] = source_el.find("Party").get("FormalName", "")

            transmission_el = xml.find("NewsEnvelope/TransmissionId")
            if transmission_el is not None:
                item["ingest_provider_sequence"] = transmission_el.text

            priority_el = xml.find("NewsEnvelope/Priority")
            raw_priority = None if priority_el is None else priority_el.text
            item["priority"] = self.map_priority(raw_priority)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            language_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
            if language_els is not None:
                languages = self.parse_attributes_as_dictionary(language_els)
                if len(languages) > 0:
                    item["language"] = languages[0]["FormalName"]
                else:
                    item["language"] = ""

            keyword_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
            item["keywords"] = self.parse_attribute_values(keyword_els, "Keyword")

            # Subjects are collected in a fixed order: detail, matter, subject.
            subject_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
            subject_els += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
            subject_els += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
            item["subject"] = self.format_subjects(subject_els)

            self.parse_content(item, xml)

            characteristic_els = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
            word_counts = self.parse_attribute_values(characteristic_els, "Words")
            item["word_count"] = word_counts[0] if len(word_counts) > 0 else None

            usage_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
            if usage_el is not None:
                # setdefault: a value set by an earlier step is left untouched
                item.setdefault("usageterms", usage_el.text)

            genre_els = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
            if genre_els is not None:
                item["genre"] = [{"name": genre_el.get("FormalName")} for genre_el in genre_els]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Exemplo n.º 26
0
    def _update(self, provider, update):
        """Fetch the feed and build items for entries newer than the last update.

        :param provider: data provider instance
        :param update: not used by this method
        :return: a list containing a single list of new content items
        :rtype: list

        :raises ParserError: if the fetched data cannot be parsed by feedparser
        """
        raw_feed = self._fetch_data()

        try:
            feed = feedparser.parse(raw_feed)
        except Exception as ex:
            raise ParserError.parseMessageError(ex, provider, data=raw_feed)

        # Without a stored last-update time fall back to the epoch, so every
        # entry counts as new. tzinfo is stripped so the value can be compared
        # with the naive entry timestamps computed below.
        last_seen = provider.get(LAST_ITEM_UPDATE, utcfromtimestamp(0))
        last_seen = last_seen.replace(tzinfo=None)

        aliases = self.config.get("field_aliases")
        collected = []

        for entry in feed.entries:
            try:
                entry_time = utcfromtimestamp(timegm(entry.updated_parsed))
                already_seen = entry_time <= last_seen
            except (AttributeError, TypeError):
                # no usable "updated" information: ingest the entry anyway
                already_seen = False
            if already_seen:
                continue

            item = self._create_item(entry, aliases, provider.get("source", None))
            self.localize_timestamps(item)

            # Entries that reference images become a package: the picture
            # items and the text item are emitted first, then the package
            # that references them. Entries without images stay plain items.
            image_urls = self._extract_image_links(entry)
            if image_urls:
                image_items = self._create_image_items(image_urls, item)
                collected.extend(image_items)
                collected.append(item)
                item = self._create_package(item, image_items)

            collected.append(item)

        return [collected]
Exemplo n.º 27
0
    def parse_file(self, filename, provider):
        """Parse a line-oriented wire file into an item dict.

        The file is read as ASCII text and walked line by line. Lines before
        ``self.START_OF_MESSAGE`` are ignored; after it, header lines are
        dispatched on their first character until a line matches no header
        rule, which starts the body. The body runs until a line containing
        ``self.END_OF_MESSAGE``.

        :param filename: path of the file to open; also used as the guid prefix
        :param provider: passed to ``set_item_defaults``, ``post_process_item``
            and the error factory
        :return: the result of ``self.post_process_item(item, provider)``
        :raises ParserError: wraps any exception raised while reading or parsing
        """
        try:
            item = {}
            self.set_item_defaults(item, provider)

            with open(filename, 'r', encoding='ascii') as f:
                lines = f.readlines()
                # header: currently inside the header section
                # body: the first body line has been seen
                header = False
                body = False
                for line in lines:
                    if self.START_OF_MESSAGE in line and not header:
                        # guid is the file name plus a random UUID
                        item['guid'] = filename + str(uuid.uuid4())
                        header = True
                        continue
                    if header:
                        # Header lines are identified by their first character.
                        if line[0] in self.header_map:
                            # A falsy mapping means the header is recognised but dropped.
                            if self.header_map[line[0]]:
                                # line[1:-1] drops the marker and the last character
                                item[self.header_map[line[0]]] = line[1:-1]
                            continue
                        if line[0] == self.CATEGORY:
                            # only the single character after the marker is used
                            item[self.ITEM_ANPA_CATEGORY] = [{
                                'qcode': line[1]
                            }]
                            continue
                        if line[0] == self.FORMAT:
                            if line[1] == self.TEXT:
                                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
                                continue
                            if line[1] == self.TABULAR:
                                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
                                continue
                            # any other format value leaves the item type unchanged
                            continue
                        if line[0] == self.IPTC:
                            iptc_code = line[1:-1]
                            # an unknown code raises KeyError, handled by the outer except
                            item[self.ITEM_SUBJECT] = [{
                                'qcode':
                                iptc_code,
                                'name':
                                subject_codes[iptc_code]
                            }]
                            continue
                        # No header rule matched: this line is the first body line.
                        header = False
                        body = True
                        item['body_html'] = line
                    else:
                        if self.END_OF_MESSAGE in line:
                            break
                        if body:
                            item['body_html'] = item.get('body_html',
                                                         '') + line
            return self.post_process_item(item, provider)

        except Exception as ex:
            raise ParserError.ZCZCParserError(exception=ex, provider=provider)
Exemplo n.º 28
0
    def parse(self, xml, provider=None):
        """
        Parse a NewsML document tree and return the collected items.

        Expected overall document shape:

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification> ...... </Identification>
            <NewsManagement> ...... </NewsManagement>
            <NewsComponent> ...... </NewsComponent>
          </NewsItem>
        </NewsML>

        :param xml: root element of the document; stored on ``self.root``
        :param provider: provider dict; an empty dict is stored when ``None``
        :return: ``self._items`` after every ``NewsItem`` element was handed
            to ``self.parse_newsitem``
        :raises ParserError: wraps any exception other than a skipped item
        """
        self._provider = {} if provider is None else provider

        try:
            self.root = xml
            self._items = []

            # the NewsEnvelope result is kept in ``self._item_seed``
            self._item_seed = {}
            envelope_values = self.parse_newsenvelop(xml.find('NewsEnvelope'))
            self._item_seed.update(envelope_values)

            for newsitem_el in xml.findall('NewsItem'):
                try:
                    self.parse_newsitem(newsitem_el)
                except SkipItemException:
                    # a skipped item must not abort the remaining ones
                    pass

            return self._items
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, self._provider)
Exemplo n.º 29
0
    def _update(self, provider, update):
        json_items = self._fetch_data()
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items
Exemplo n.º 30
0
    def parse_file(self, filename, provider):
        """Parse ``filename`` from the provider's configured directory.

        :param filename: file name relative to ``provider['config']['path']``
        :param provider: provider dict holding the ``config.path`` setting
        :return: ``[]`` when no path is configured, else a one-item list
        :raises ParserError: wraps any exception raised while parsing
        """
        try:
            base_dir = provider.get('config', {}).get('path', None)
            if base_dir:
                full_path = os.path.join(base_dir, filename)
                return [self.parser.parse_file(full_path, provider)]
            return []
        except Exception as ex:
            raise ParserError.parseFileError('Teletype', filename, ex, provider)
Exemplo n.º 31
0
    def parse_file(self, filename, provider):
        """Run the configured parser on a file inside the provider's path.

        :param filename: name of the file inside the configured path
        :param provider: provider dict; its ``config.path`` gives the directory
        :return: a list with the parsed item, or ``[]`` if no path is set
        :raises ParserError: wraps any exception raised along the way
        """
        try:
            config = provider.get('config', {})
            directory = config.get('path', None)
            if not directory:
                return []

            target = os.path.join(directory, filename)
            parsed = self.parser.parse_file(target, provider)
            return [parsed]
        except Exception as ex:
            raise ParserError.parseFileError('Teletype', filename, ex, provider)
Exemplo n.º 32
0
    def parse(self, file_path, provider=None):
        """Parse a wire file framed by SOH/STX/ETX control bytes into an item.

        The first line is matched against the header pattern. The lines up to
        the STX byte build the slugline, the STX line carries the headline,
        the following lines form the body until a "not for publication"
        marker switches to the editorial note, and ETX ends parsing.

        :param file_path: path of the file, read in binary mode
        :param provider: only used when reporting a failure
        :return: the item dict
        :raises ParserError: wraps any exception raised while parsing
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, 'guid': generate_guid(type=GUID_TAG),
                    'versioncreated': utcnow()}

            with open(file_path, 'rb') as f:
                lines = list(f)

            # parse first header line
            header_match = re.match(b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)', lines[0], flags=re.I)
            if header_match:
                source, sequence, priority, category, words = header_match.group(1, 2, 3, 4, 5)
                item['original_source'] = source.decode('latin-1', 'replace')
                item['ingest_provider_sequence'] = sequence.decode()
                item['priority'] = self.map_priority(priority.decode())
                item['anpa_category'] = [{'qcode': self.map_category(category.decode())}]
                item['word_count'] = int(words.decode())

            # The flags are kept separate on purpose: a second STX line does
            # not clear ``in_note``.
            in_header, in_text, in_note = True, False, False
            for raw_line in lines[1:]:
                first_byte = raw_line[0:1]
                # STX starts the body of the story
                if first_byte == b'\x02':
                    # the rest of the line is the headline
                    item['headline'] = raw_line[1:].decode('latin-1', 'replace').rstrip('\r\n')
                    item['body_html'] = ''
                    in_text = True
                    in_header = False
                    continue
                # ETX denotes the end of the story
                if first_byte == b'\x03':
                    break

                text = raw_line.decode('latin-1', 'replace')
                if in_text:
                    if ('The following information is not for publication' in text
                            or 'The following information is not intended for publication' in text):
                        in_note = True
                        in_text = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if in_note:
                    item['ednote'] += text
                    continue
                if in_header:
                    item.setdefault('slugline', '')
                    item['slugline'] += text.rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            raise ParserError.IPTC7901ParserError(exception=ex, provider=provider)
Exemplo n.º 33
0
 def parse_message(self, tree, provider):
     """Parse every child of every ``itemSet`` element of a NewsMessage.

     :param tree: root element of the message; stored on ``self.root``
     :param provider: only used when reporting a failure
     :return: list with the result of ``self.parse_item`` for each child
     :raises ParserError: wraps any exception raised while parsing
     """
     try:
         self.root = tree
         return [
             self.parse_item(item_tree)
             for item_set in tree.findall(self.qname("itemSet"))
             for item_tree in item_set
         ]
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 34
0
    def _update(self, provider, update):
        """Parse the files found in the provider's path and yield their items.

        This is a generator. For every file that ``is_latest_content`` accepts
        it yields a list of items (a single parsed item is wrapped in a list).
        The value sent back into the generator is read as a "failed" flag and
        decides whether the file is moved as a success or as a failure; plain
        iteration sends ``None``, i.e. success. Files that are not the latest
        content are moved as failures without being parsed.

        :param provider: provider dict; ``config.path`` names the directory
        :param update: not used by this method
        :return: nothing useful; the generator finishes at once when no path
            is configured
        :raises ParserError: wraps any exception raised while handling a file
        """
        # check if deprecated FILE_INGEST_OLD_CONTENT_MINUTES setting is still used
        if "FILE_INGEST_OLD_CONTENT_MINUTES" in app.config:
            deprecated_cont_min = app.config["FILE_INGEST_OLD_CONTENT_MINUTES"]
            cont_min = app.config[OLD_CONTENT_MINUTES]
            if deprecated_cont_min != cont_min:
                logger.warning(
                    "'FILE_INGEST_OLD_CONTENT_MINUTES' is deprecated, please update settings.py to use {new_name!r}"
                    .format(new_name=OLD_CONTENT_MINUTES))
                # the deprecated setting wins so existing deployments keep working
                app.config[OLD_CONTENT_MINUTES] = deprecated_cont_min

        self.provider = provider
        self.path = provider.get('config', {}).get('path', None)

        if not self.path:
            # ``warning`` rather than the deprecated ``warn`` alias, which
            # no longer exists in recent Python versions.
            logger.warning('File Feeding Service {} is configured without path. Please check the configuration'
                           .format(provider['name']))
            return []

        registered_parser = self.get_feed_parser(provider)
        for filename in get_sorted_files(self.path, sort_by=FileSortAttributes.created):
            try:
                last_updated = None
                file_path = os.path.join(self.path, filename)
                if os.path.isfile(file_path):
                    last_updated = self.get_last_updated(file_path)

                    if self.is_latest_content(last_updated, provider.get('last_updated')):
                        if isinstance(registered_parser, XMLFeedParser):
                            # XML parsers get the parsed root element, and the
                            # parser is looked up again for that element
                            with open(file_path, 'rb') as f:
                                xml = etree.parse(f)
                                parser = self.get_feed_parser(provider, xml.getroot())
                                item = parser.parse(xml.getroot(), provider)
                        else:
                            parser = self.get_feed_parser(provider, file_path)
                            item = parser.parse(file_path, provider)

                        self.after_extracting(item, provider)

                        # the consumer may send a truthy value to flag a failure
                        if isinstance(item, list):
                            failed = yield item
                        else:
                            failed = yield [item]

                        self.move_file(self.path, filename, provider=provider, success=not failed)
                    else:
                        self.move_file(self.path, filename, provider=provider, success=False)
            except Exception as ex:
                # only old files are moved away on error
                if last_updated and self.is_old_content(last_updated):
                    self.move_file(self.path, filename, provider=provider, success=False)
                raise ParserError.parseFileError('{}-{}'.format(provider['name'], self.NAME), filename, ex, provider)

        push_notification('ingest:update')
Exemplo n.º 35
0
 def parse_message(self, tree, provider):
     """Collect the parsed items of all ``itemSet`` elements in a NewsMessage.

     :param tree: root element of the message; stored on ``self.root``
     :param provider: only used when reporting a failure
     :return: list of ``self.parse_item`` results, in document order
     :raises ParserError: wraps any exception raised while parsing
     """
     collected = []
     try:
         self.root = tree
         item_set_tag = self.qname('itemSet')
         for item_set in tree.findall(item_set_tag):
             collected.extend(self.parse_item(child) for child in item_set)
         return collected
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 36
0
    def parse(self, xml, provider=None):
        """Build an item dict from a NITF document tree.

        :param xml: root element of the NITF document
        :param provider: only used when reporting a failure
        :return: the item dict
        :raises ParserError: wraps any exception raised while parsing
        """
        item = {}
        try:
            docdata = xml.find('head/docdata')
            # set the default type.
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT

            doc_id = docdata.find('doc-id').get('id-string')
            item['guid'] = doc_id
            item['uri'] = doc_id

            urgency_el = docdata.find('urgency')
            if urgency_el is not None:
                item['urgency'] = int(urgency_el.get('ed-urg', '5'))
            item['pubstatus'] = docdata.attrib.get('management-status', 'usable').lower()

            # both timestamps are taken from the issue date
            issue_el = docdata.find('date.issue')
            item['firstcreated'] = get_norm_datetime(issue_el)
            item['versioncreated'] = get_norm_datetime(issue_el)

            expire_el = docdata.find('date.expire')
            if expire_el is not None:
                item['expiry'] = get_norm_datetime(expire_el)

            item['subject'] = get_subjects(xml)
            item['body_html'] = get_content(xml)
            item['place'] = get_places(docdata)
            item['keywords'] = get_keywords(docdata)

            genre_el = xml.find('head/tobject/tobject.property')
            if genre_el is not None:
                genre = genre_el.get('tobject.property.type')
                genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
                if genre_map is not None:
                    item['genre'] = [x for x in genre_map.get('items', []) if x['name'] == genre]

            ed_msg_el = docdata.find('ed-msg')
            if ed_msg_el is not None:
                item['ednote'] = ed_msg_el.attrib.get('info')

            # headline: prefer hl1, fall back to the document title
            headline_el = xml.find('body/body.head/hedline/hl1')
            if headline_el is None:
                headline_el = xml.find('head/title')
            if headline_el is not None:
                item['headline'] = headline_el.text

            # abstract: prefer the <p> child, fall back to the abstract element
            abstract_el = xml.find('body/body.head/abstract/p')
            if abstract_el is None:
                abstract_el = xml.find('body/body.head/abstract')
            item['abstract'] = '' if abstract_el is None else abstract_el.text

            city_el = xml.find('body/body.head/dateline/location/city')
            if city_el is not None:
                self.set_dateline(item, city=city_el.text)

            item['byline'] = get_byline(xml)

            parse_meta(xml, item)
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
Exemplo n.º 37
0
    def parse(self, xml, provider=None):
        """
        Parse a NewsML document tree into a list of item dicts.

        Expected overall document shape:

        <?xml version="1.0" encoding="utf-8"?>
        <NewsML Version="1.2">
          <Catalog Href="http://www.afp.com/dtd/AFPCatalog.xml"/>
          <NewsEnvelope>
            ......
          </NewsEnvelope>
          <NewsItem xml:lang="fr">
            <Identification> ...... </Identification>
            <NewsManagement> ...... </NewsManagement>
            <NewsComponent> ...... </NewsComponent>
          </NewsItem>
        </NewsML>

        :param xml: root element of the document; stored on ``self.root``
        :param provider: only used when reporting a failure
        :return: one populated item per ``NewsItem`` element that was not skipped
        :raises ParserError: wraps any exception other than a skipped item
        """
        try:
            items = []
            self.root = xml

            # values parsed from the NewsEnvelope are the starting point of every item
            envelope_values = self.parser_newsenvelop(xml.find('NewsEnvelope'))

            for newsitem_el in xml.findall('NewsItem'):
                try:
                    # copy, so one item's fields never leak into the next
                    candidate = envelope_values.copy()
                    self.parser_newsitem(candidate, newsitem_el)
                    candidate = self.populate_fields(candidate)
                except SkipItemException:
                    continue
                else:
                    items.append(candidate)
            return items

        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
Exemplo n.º 38
0
 def test_parse_message_error_save_data(self):
     """The logged error names a file that contains the data passed in."""
     payload = 'some data'
     with assert_raises(ParserError):
         try:
             raise Exception("Err message")
         except Exception as ex:
             raise ParserError.parseMessageError(ex, self.provider, data=payload)

     error_log = self.mock_logger_handler.messages['error']
     self.assertEqual(len(error_log), 1)

     logged = error_log[0]
     self.assertIn('file=', logged)
     # everything after the marker is the path of the dump file
     dump_path = logged.split('file=')[1]
     with open(dump_path, 'r') as dump:
         self.assertEqual(payload, dump.read())
Exemplo n.º 39
0
    def parse(self, xml, provider=None):
        """Build an item from ``xml`` using the configured field mapping.

        :param xml: root element of the document
        :param provider: only used when reporting a failure
        :return: the item dict
        :raises ParserError: wraps any exception raised while parsing
        """
        # set the default type.
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
        try:
            self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)

            city_el = xml.find('body/body.head/dateline/location/city')
            if city_el is not None:
                self.set_dateline(item, city=city_el.text)

            # the count is always computed; it is stored only when missing
            counted_words = get_word_count(item['body_html'], no_html=True)
            item.setdefault('word_count', counted_words)
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
        return item
Exemplo n.º 40
0
    def _update(self, provider, update):
        config = provider.get('config', {})
        json_items = self._fetch_data(config, provider)
        parsed_items = []

        for item in json_items:
            try:
                parser = self.get_feed_parser(provider, item)
                parsed_items.append(parser.parse(item))
            except Exception as ex:
                raise ParserError.parseMessageError(ex, provider, data=item)

        return parsed_items
Exemplo n.º 41
0
 def parse(self, xml, provider=None):
     """Parse every item of every ``itemSet`` and stamp the header priority.

     :param xml: root element of the message; stored on ``self.root``
     :param provider: only used when reporting a failure
     :return: list of parsed items, each with ``priority`` from the header
     :raises ParserError: wraps any exception raised while parsing
     """
     self.root = xml
     items = []
     try:
         header = self.parse_header(xml)
         item_trees = (
             item_tree
             for item_set in xml.findall(self.qname('itemSet'))
             for item_tree in item_set
         )
         for item_tree in item_trees:
             parsed = self.parse_item(item_tree)
             # looked up per item: a header without priority only matters
             # when there is at least one item
             parsed['priority'] = header['priority']
             items.append(parsed)
         return items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 42
0
 def parse(self, xml, provider=None):
     """Return the parsed items of all ``itemSet`` elements with header priority.

     :param xml: root element of the message; stored on ``self.root``
     :param provider: only used when reporting a failure
     :return: list of parsed items carrying the header's ``priority``
     :raises ParserError: wraps any exception raised while parsing
     """
     self.root = xml
     collected = []
     try:
         header = self.parse_header(xml)
         item_set_tag = self.qname('itemSet')
         for item_set in xml.findall(item_set_tag):
             for child in item_set:
                 entry = self.parse_item(child)
                 entry['priority'] = header['priority']
                 collected.append(entry)
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
     return collected
Exemplo n.º 43
0
 def test_raise_parseMessageError(self):
     """parseMessageError carries code 1001, the cause and logs one error."""
     with assert_raises(ParserError) as error_context:
         ex = Exception("Testing parseMessageError")
         raise ParserError.parseMessageError(ex, self.provider)

     raised = error_context.exception
     self.assertEqual(raised.code, 1001)
     self.assertEqual(raised.message, "Message could not be parsed")
     self.assertIsNotNone(raised.system_exception)
     self.assertEqual(raised.system_exception.args[0], "Testing parseMessageError")

     error_log = self.mock_logger_handler.messages['error']
     self.assertEqual(len(error_log), 1)
     self.assertEqual(error_log[0],
                      "ParserError Error 1001 - Message could not be parsed: "
                      "Testing parseMessageError on channel TestProvider")
Exemplo n.º 44
0
    def parse_file(self, filename, provider):
        """Parse an XML file from the provider's path into a one-item list.

        :param filename: name of the file inside ``provider['config']['path']``
        :param provider: provider dict; the path is also stored on ``self.path``
        :return: ``[]`` when no path is configured, else a list holding the
            result of ``self.parser.parse_message``
        :raises ParserError: wraps any exception; the file is first moved as a
            failure
        """
        try:
            self.path = provider.get('config', {}).get('path', None)

            if not self.path:
                return []

            # Read raw bytes so the XML parser applies the encoding declared
            # in the document itself. Text mode would decode with the locale's
            # default encoding and can fail on non-ASCII content.
            with open(os.path.join(self.path, filename), 'rb') as f:
                tree = etree.fromstring(f.read())

            item = self.parser.parse_message(tree, provider)
            return [item]
        except Exception as ex:
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('AAP', filename, ex, provider)
Exemplo n.º 45
0
    def parse(self, xml, provider=None):
        """Build one item per Atom ``entry`` element found in ``xml``.

        :param xml: element whose ``entry`` children are parsed
        :param provider: only used when reporting a failure
        :return: list of item dicts, in document order
        :raises ParserError: wraps any exception raised while parsing
        """
        def build_item(entry):
            # defaults first, then the two metadata passes, then the body
            item = {}
            self.set_item_defaults(item)
            self.parse_content_management(item, entry)
            self.parse_news_management(item, entry)
            content_el = entry.find(self.qname('content', self.ATOM_NS))
            item['body_html'] = self.get_elem_content(content_el)
            return item

        try:
            entries = xml.findall(self.qname('entry', self.ATOM_NS))
            return [build_item(entry) for entry in entries]
        except Exception as ex:
            raise ParserError.wennParserError(ex, provider)
Exemplo n.º 46
0
    def parse_message(self, tree, provider):
        """Turn every Atom ``entry`` element of ``tree`` into an item dict.

        :param tree: element whose ``entry`` children are parsed
        :param provider: only used when reporting a failure
        :return: list of item dicts, in document order
        :raises ParserError: wraps any exception raised while parsing
        """
        parsed_items = []
        try:
            entry_tag = self.qname('entry', self.ATOM_NS)
            for entry in tree.findall(entry_tag):
                current = {}
                self.set_item_defaults(current)
                self.parse_content_management(current, entry)
                self.parse_news_management(current, entry)

                content_el = entry.find(self.qname('content', self.ATOM_NS))
                current['body_html'] = self.get_elem_content(content_el)
                parsed_items.append(current)
        except Exception as ex:
            raise ParserError.wennParserError(ex, provider)
        return parsed_items
Exemplo n.º 47
0
 def test_raise_newsmlOneParserError(self):
     """newsmlOneParserError carries code 1004, the cause and logs one error."""
     with assert_raises(ParserError) as error_context:
         try:
             raise Exception("Testing newsmlOneParserError")
         except Exception as ex:
             raise ParserError.newsmlOneParserError(ex, self.provider)

     raised = error_context.exception
     self.assertEqual(raised.code, 1004)
     self.assertEqual(raised.message, "NewsML1 input could not be processed")
     self.assertIsNotNone(raised.system_exception)
     self.assertEqual(raised.system_exception.args[0], "Testing newsmlOneParserError")

     error_log = self.mock_logger_handler.messages['error']
     self.assertEqual(len(error_log), 1)
     self.assertEqual(error_log[0],
                      "ParserError Error 1004 - NewsML1 input could not be processed: "
                      "Testing newsmlOneParserError on channel TestProvider")
    def parse(self, xml, provider=None):
        """Parse the items of a NewsML-G2 message and apply fixed defaults.

        Every child of an ``itemSet`` that carries a ``guid`` attribute is
        parsed and given hard-coded priority, category, subject and urgency.
        When exactly one city named Wellington is located, it becomes the
        dateline location; in that case the body paragraphs are scanned for a
        leading byline, a "(BusinessDesk)" dateline prefix and a trailing
        "(BusinessDesk)" sign-off.

        :param xml: root element of the message; stored on ``self.root``
        :param provider: only used when reporting a failure
        :return: list of item dicts
        :raises ParserError: wraps any exception raised while parsing
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' not in item_tree.attrib:
                        continue

                    item = self.parse_item(item_tree)
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                    item.setdefault('word_count', get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2') if
                               c.get('city', '').lower() == 'wellington']
                    if len(located) == 1:
                        item['dateline'] = dict()
                        item['dateline']['located'] = located[0]

                    # ``get`` on purpose: without a unique Wellington match
                    # there may be no dateline at all, and a plain subscript
                    # would abort the whole message with a KeyError.
                    if item.get('body_html') and item.get('dateline'):
                        parsed = parse_html(item.get('body_html'), content='xml')
                        pars = parsed.xpath('//p')
                        last_index = len(pars) - 1
                        for index, par in enumerate(pars):
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if index == 0 and par.text.startswith('By '):
                                # strip only the leading marker, not every 'By ' in the name
                                item['byline'] = par.text[len('By '):]
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if par.text == '(BusinessDesk)' and index == last_index:
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed, remove_root_div=True)

                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 49
0
    def parse(self, xml, provider=None):
        """Build one item per Atom ``entry``, wrapping the body in paragraphs.

        In the body, a blank line followed by two spaces becomes a paragraph
        break and every remaining newline becomes ``<br>``; the result is
        wrapped in ``<p>...</p>``.

        :param xml: element whose ``entry`` children are parsed
        :param provider: only used when reporting a failure
        :return: list of item dicts, in document order
        :raises ParserError: wraps any exception raised while parsing
        """
        parsed_items = []
        try:
            entry_tag = self.qname('entry', self.ATOM_NS)
            for entry in xml.findall(entry_tag):
                current = {}
                self.set_item_defaults(current)
                self.parse_content_management(current, entry)
                self.parse_news_management(current, entry)

                raw_body = self.get_elem_content(entry.find(self.qname('content', self.ATOM_NS)))
                # paragraph breaks are replaced first, so their newlines
                # are not turned into <br>
                marked_up = raw_body.replace('\n\n  ', '</p><p>').replace('\n', '<br>')
                current['body_html'] = '<p>' + marked_up + '</p>'
                parsed_items.append(current)
            return parsed_items

        except Exception as ex:
            raise ParserError.wennParserError(ex, provider)
Exemplo n.º 50
0
    def _get_decsription(self, lines, provider):
        """Look up the BOM product named on the first line and return its name.

        The product code is ``lines[0]`` with surrounding whitespace removed.
        It is matched against the active entries of the ``bom_products``
        vocabulary. Not finding a product is a fatal error, and a missing
        vocabulary is treated the same way.

        :param lines: lines of the message; only the first one is used
        :param provider: only used when reporting a failure
        :return: the ``name`` of the first matching product ('' when it has none)
        :raises ParserError: if no active product matches the code
        """
        product_code = lines[0].strip()
        bom_products_map = get_resource_service('vocabularies').find_one(req=None, _id='bom_products')
        # No vocabulary means no candidates, which leads to the documented
        # ParserError below instead of an AttributeError on ``None``.
        candidates = bom_products_map.get('items', []) if bom_products_map else []
        product = next(
            (x for x in candidates if x['qcode'] == product_code and x['is_active']),
            None,
        )
        if product is None:
            logger.error('No BOM product mapping found for {}'.format(product_code))
            raise ParserError.parseMessageError(Exception('No BOM product'), provider, data=lines[0])
        return product.get('name', '')
Exemplo n.º 51
0
 def test_raise_nitfParserError(self):
     """Check the code, message, wrapped exception and log line of nitfParserError."""
     with assert_raises(ParserError) as error_context:
         try:
             ex = Exception("Testing nitfParserError")
             raise ex
         except Exception:
             raise ParserError.nitfParserError(ex, self.provider)
     exception = error_context.exception
     # assertEqual reports both operands on failure, unlike assertTrue(a == b).
     self.assertEqual(exception.code, 1006)
     self.assertEqual(exception.message, "NITF input could not be processed")
     self.assertIsNotNone(exception.system_exception)
     # assertEquals was a deprecated alias and no longer exists on Python 3.12+.
     self.assertEqual(exception.system_exception.args[0], "Testing nitfParserError")
     self.assertEqual(len(self.mock_logger_handler.messages['error']), 1)
     self.assertEqual(self.mock_logger_handler.messages['error'][0],
                      "ParserError Error 1006 - NITF input could not be processed: "
                      "Testing nitfParserError on channel TestProvider")
Exemplo n.º 52
0
    def parse(self, xml, provider=None):
        """Parse *xml* into a single item, filling in headline and abstract.

        A missing or empty headline is replaced by the first 100 characters of
        the body text. When the document has a ``drol:summary`` description with
        non-empty text, that text is stored as the item's abstract.

        :param xml: root element of the document; also stored on ``self.root``
        :param provider: ingest provider, only forwarded to the error raised on failure
        :return: single-element list containing the parsed item
        :raises ParserError: ``newsmlTwoParserError`` wrapping any exception raised while parsing
        """
        self.root = xml
        try:
            item = self.parse_item(xml)

            headline = item.get('headline')
            if not headline:
                body_text = text_utils.get_text(item.get('body_html', ''), 'html')
                item['headline'] = body_text[:100]

            try:
                summary_node = xml.xpath("//iptc:description[@role='drol:summary']", namespaces={'iptc': IPTC_NS})[0]
            except IndexError:
                # no summary description in this document
                summary_node = None

            if summary_node is not None:
                abstract = summary_node.text
                if abstract:
                    item['abstract'] = abstract

            return [item]
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 53
0
 def test_raise_parseFileError(self):
     """parseFileError exposes code 1002 and logs one error naming source and file."""
     with assert_raises(ParserError) as ctx:
         try:
             ex = Exception("Testing parseFileError")
             raise ex
         except Exception:
             raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)

     raised = ctx.exception
     self.assertEqual(raised.code, 1002)
     self.assertEqual(raised.message, "Ingest file could not be parsed")
     self.assertIsNotNone(raised.system_exception)
     self.assertEqual(raised.system_exception.args[0], "Testing parseFileError")

     # Exactly one error is logged and it carries every piece of context.
     logged_errors = self.mock_logger_handler.messages['error']
     self.assertEqual(len(logged_errors), 1)
     message = logged_errors[0]
     for expected_fragment in (
         "ParserError Error 1002 - Ingest file could not be parsed",
         "Testing parseFileError on channel TestProvider",
         "source=afp",
         "file=test.txt",
     ):
         self.assertIn(expected_fragment, message)
Exemplo n.º 54
0
    def parse_file(self, filename, provider):
        """Parse one XML ingest file from the provider's configured directory.

        :param filename: name of the file inside the provider's ``config.path`` directory
        :param provider: provider dict; ``provider['config']['path']`` is the directory
        :return: single-element list with the parsed item, or ``[]`` when the
            provider has no path configured
        :raises ParserError: ``parseFileError`` wrapping any failure; before raising,
            the file is handed to ``move_file(..., success=False)``
        """
        try:
            path = provider.get('config', {}).get('path', None)

            if not path:
                return []

            # Read raw bytes so the XML parser honours the encoding declared in the
            # document instead of decoding with the platform's default text encoding.
            with open(os.path.join(path, filename), 'rb') as f:
                item = self.parser.parse_message(etree.fromstring(f.read()), provider)

            item['firstcreated'] = normalize_date(item.get('firstcreated'), self.tz)
            item['versioncreated'] = normalize_date(item.get('versioncreated'), self.tz)

            return [item]
        except Exception as ex:
            # NOTE(review): the move uses self.path, not the provider path read
            # above -- confirm the two always refer to the same directory.
            self.move_file(self.path, filename, provider=provider, success=False)
            raise ParserError.parseFileError('AAP', filename, ex, provider)
Exemplo n.º 55
0
 def parse(self, xml, provider=None):
     """Parse *xml* into a list of items.

     Every child of every ``itemSet`` element is parsed and given the header's
     priority. After that, if the root element itself is a ``newsItem`` or
     ``packageItem``, it is parsed as an item too, taking the header's priority
     only when it has none of its own.

     :param xml: root element of the document; also stored on ``self.root``
     :param provider: ingest provider, only forwarded to the error raised on failure
     :return: list of parsed items
     :raises ParserError: ``newsmlTwoParserError`` wrapping any exception raised while parsing
     """
     self.root = xml
     try:
         header = self.parse_header(xml)
         collected = []

         for item_set in xml.findall(self.qname('itemSet')):
             for child in item_set:
                 parsed = self.parse_item(child)
                 parsed['priority'] = header['priority']
                 collected.append(parsed)

         # Always checked once the item sets are done: a bare newsItem or
         # packageItem root is itself the item.
         if xml.tag.endswith(('newsItem', 'packageItem')):
             root_item = self.parse_item(xml)
             root_item.setdefault('priority', header['priority'])
             collected.append(root_item)

         return collected
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
Exemplo n.º 56
0
 def test_raise_parseFileError(self):
     """parseFileError exposes code 1002 and logs a file-level line plus the error line."""
     with assert_raises(ParserError) as ctx:
         try:
             raise Exception("Testing parseFileError")
         except Exception as ex:
             raise ParserError.parseFileError('afp', 'test.txt', ex, self.provider)

     raised = ctx.exception
     self.assertEqual(raised.code, 1002)
     self.assertEqual(raised.message, "Ingest file could not be parsed")
     self.assertIsNotNone(raised.system_exception)
     self.assertEqual(raised.system_exception.args[0], "Testing parseFileError")

     # Two error lines are expected: the file-level notice, then the ParserError itself.
     logged_errors = self.mock_logger_handler.messages['error']
     self.assertEqual(len(logged_errors), 2)
     self.assertEqual(logged_errors[0],
                      "Source Type: afp - File: test.txt could not be processed")
     self.assertEqual(logged_errors[1],
                      "ParserError Error 1002 - Ingest file could not be parsed: "
                      "Testing parseFileError on channel TestProvider")
Exemplo n.º 57
0
 def parse(self, xml, provider=None):
     """Parse the guid-bearing children of every ``itemSet`` in *xml* into items.

     Each parsed item gets the header's priority, ANPA category ``f`` and
     subject ``04000000``; ``word_count`` is filled in from the body only when
     the item does not already have one.

     :param xml: root element of the document; also stored on ``self.root``
     :param provider: ingest provider, only forwarded to the error raised on failure
     :return: list of parsed items
     :raises ParserError: ``newsmlTwoParserError`` wrapping any exception raised while parsing
     """
     self.root = xml
     parsed_items = []
     try:
         header = self.parse_header(xml)
         for item_set in xml.findall(self.qname('itemSet')):
             for node in item_set:
                 if 'guid' not in node.attrib:
                     # Ignore the packageItem, it has no guid
                     continue

                 item = self.parse_item(node)
                 item.update({
                     'priority': header['priority'],
                     'anpa_category': [{'qcode': 'f'}],
                     'subject': [{'qcode': '04000000', 'name': subject_codes['04000000']}],
                 })
                 item.setdefault('word_count', get_word_count(item['body_html']))
                 parsed_items.append(item)
         return parsed_items
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)