예제 #1
0
    def parse(self, xml, provider=None):
        """Collect the metadata and body of a NewsML document into an item dictionary.

        :param xml: root element of the NewsML document
        :param provider: ingest provider, only forwarded to the error built on failure
        :return: the value returned by ``self.populate_fields`` for the collected item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing
        """
        item = {}
        try:
            self.root = xml

            source_el = xml.find('NewsItem/NewsComponent/AdministrativeMetadata/Source')
            if source_el is not None:
                party_el = source_el.find('Party')
                item['original_source'] = party_el.get('FormalName', '')

            transmission_el = xml.find('NewsEnvelope/TransmissionId')
            if transmission_el is not None:
                item['ingest_provider_sequence'] = transmission_el.text

            priority_el = xml.find('NewsEnvelope/Priority')
            raw_priority = None
            if priority_el is not None:
                raw_priority = priority_el.text
            item['priority'] = self.map_priority(raw_priority)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            language_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Language')
            if language_els is not None:
                languages = self.parse_attributes_as_dictionary(language_els)
                if len(languages):
                    item['language'] = languages[0]['FormalName']
                else:
                    item['language'] = ''

            property_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(property_els, 'Keyword')

            # Subjects are gathered most specific first: detail, matter, subject.
            subject_els = []
            for subject_path in (
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter',
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject',
            ):
                subject_els.extend(xml.findall(subject_path))
            item['subject'] = self.format_subjects(subject_els)

            # Serialize the NITF body and drop the bare wrapper tags around it.
            body_el = xml.find('NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content')
            body_markup = etree.tostring(body_el, encoding='unicode')
            item['body_html'] = body_markup.replace('<body.content>', '').replace('</body.content>', '')

            characteristic_els = xml.findall('NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            word_counts = self.parse_attribute_values(characteristic_els, 'Words')
            item['word_count'] = word_counts[0] if len(word_counts) else None

            usage_el = xml.find('NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if usage_el is not None:
                # setdefault: a value stored by one of the earlier steps wins
                item.setdefault('usageterms', usage_el.text)

            genre_els = xml.findall('NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            if genre_els is not None:
                item['genre'] = [{'name': genre_el.get('FormalName')} for genre_el in genre_els]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
예제 #2
0
 def parse_content(self, item, xml):
     """Store the markup of the NITF ``body.content`` element in ``item["body_html"]``.

     The element is serialized to a string and the literal ``<body.content>`` /
     ``</body.content>`` tags are removed from it.

     :param item: dictionary receiving the ``body_html`` key
     :param xml: element searched for the ``body.content`` element
     """
     content_el = xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content")
     markup = etree.tostring(content_el, encoding="unicode")
     # Strip the wrapper tags so only the inner markup is kept.
     for wrapper_tag in ("<body.content>", "</body.content>"):
         markup = markup.replace(wrapper_tag, "")
     item["body_html"] = markup
예제 #3
0
def get_word_count(html):
    """Get word count for given html.

    The markup is wrapped in a synthetic ``<doc>`` root and parsed as XML so that
    only its text content is counted. When it cannot be parsed, the raw string is
    counted instead.

    :param html: html string to count
    :return: the value of ``get_text_word_count`` for the extracted text
    """
    try:
        # fromstring parses the whole document in one call; fromstringlist expects
        # a sequence of fragments and would feed a plain string character by character.
        root = etree.fromstring('<doc>{0}</doc>'.format(html))
        text = etree.tostring(root, encoding='unicode', method='text')
        return get_text_word_count(text)
    except ParseError:
        return get_text_word_count(html)
예제 #4
0
    def parse_content(self, item, xml):
        """Fill ``item`` from the nested news components and the source party.

        Every third-level ``NewsComponent`` whose ``Role`` maps to a field name
        through ``self.COMPONENT_ROLE_MAPPING`` is copied into that field:
        ``headline`` as stripped text, ``abstract`` as the text of ``DataContent``,
        anything else as the HTML of the xhtml body children joined by newlines.
        A source party with a ``FormalName`` is appended to ``item["subject"]``.

        :param item: dictionary being populated
        :param xml: element searched for components and the source party
        """
        for component in xml.findall("NewsItem/NewsComponent/NewsComponent/NewsComponent"):
            role_el = component.find("Role")
            field = None
            if role_el is not None:
                field = self.COMPONENT_ROLE_MAPPING.get(role_el.get("FormalName"))
            if not field:
                # no role, or a role this parser does not map to a field
                continue

            xhtml_body = component.find(
                "ContentItem/DataContent/xhtml:html/xhtml:body", namespaces=NS
            )
            if field == "abstract":
                item[field] = component.find("ContentItem/DataContent").text
            elif field == "headline":
                headline_text = etree.tostring(xhtml_body, encoding="unicode", method="text")
                item[field] = headline_text.strip()
            else:
                fragments = []
                for child in xhtml_body:
                    markup = etree.tostring(child, encoding="unicode", method="html")
                    # the serializer repeats the namespace declaration on each child
                    fragments.append(markup.replace(' xmlns="http://www.w3.org/1999/xhtml"', ""))
                item[field] = "\n".join(fragments)

        party_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
        organisation = party_el.get("FormalName") if party_el is not None else None
        if organisation:
            item.setdefault("subject", []).append(
                {
                    "name": organisation,
                    "qcode": organisation,
                    "scheme": cp.ORGANISATION,
                }
            )
예제 #5
0
    def get_body(self, news_item):
        """Build the body markup of an announcement from its ``NewsItem`` element.

        The ``announcement_html`` content is parsed, its ``h1`` child (if any) is
        removed, the first "Message Category" value is inserted as leading
        paragraph and a paragraph linking to the first "nordicAgencyWebsite"
        value is appended.

        :param news_item: element queried with XPath for content and metadata
        :return: the serialized content, or ``""`` when no content is found
        :raises ValueError: when the website url does not start with ``http``
        """
        content_nodes = news_item.xpath(
            'NewsComponent/ContentItem[@Euid="announcement_html"]/DataContent/text()'
        )
        try:
            raw_content = content_nodes[0]
        except IndexError:
            logger.warning("No content found in element: {xml}".format(
                xml=etree.tostring(news_item, encoding="unicode")))
            return ""

        content_elt = sd_etree.parse_html(raw_content)
        title_elt = content_elt.find('h1')
        if title_elt is not None:
            content_elt.remove(title_elt)

        categories = news_item.xpath(
            'NewsComponent/Metadata/Property[@FormalName="Message Category"]/@Value'
        )
        if categories:
            category_elt = etree.Element('p')
            category_elt.text = categories[0]
            content_elt.insert(0, category_elt)

        website_urls = news_item.xpath(
            'NewsComponent/Metadata/Property[@FormalName="nordicAgencyWebsite"]/@Value'
        )
        if website_urls:
            url = website_urls[0]
            if not url.startswith('http'):
                raise ValueError("Invalid url: {url}".format(url=url))
            link_paragraph = etree.SubElement(content_elt, "p")
            link_paragraph.text = 'Se saken i sin helhet: '
            anchor = etree.SubElement(link_paragraph, "a", attrib={'href': url})
            anchor.text = url

        return sd_etree.to_string(content_elt)
예제 #6
0
    def parse(self, xml, provider=None):
        """Collect the metadata, body and dateline of a NewsML document into an item.

        The dateline is only filled when the City/Country properties of the
        document match exactly one entry of ``app.locators.find_cities()``.

        :param xml: root element of the NewsML document
        :param provider: ingest provider; its ``source`` is read when a dateline
            is built, and it is forwarded to the error built on failure
        :return: the value returned by ``self.populate_fields`` for the collected item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing
        """
        item = {}
        try:
            self.root = xml
            party_el = xml.find(
                "NewsItem/NewsComponent/AdministrativeMetadata/Source/Party")
            if party_el is not None:
                item["original_source"] = party_el.attrib.get(
                    "FormalName", "ANA")

            priority_el = xml.find("NewsEnvelope/Priority")
            item["priority"] = self.map_priority(
                priority_el.text if priority_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall returns a (possibly empty) list, so the language is always set
            languages = self.parse_attributes_as_dictionary(xml.findall(
                "NewsItem/NewsComponent/DescriptiveMetadata/Language"))
            item["language"] = languages[0]["FormalName"] if len(
                languages) else ""

            subjects = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
            )
            item["subject"] = self.format_subjects(subjects)

            # The body is escaped markup inside DataContent: serialize, unescape,
            # drop the wrapper tags and lower-case the paragraph tags.
            content_el = xml.find(
                "NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent")
            body_html = html.unescape(
                etree.tostring(content_el, encoding="unicode"))
            for old, new in (
                ("<DataContent>", ""),
                ("</DataContent>", ""),
                ("<P>", "<p>"),
                ("</P>", "</p>"),
            ):
                body_html = body_html.replace(old, new)
            # Remove the agency copyright paragraph from the body.
            body_html = body_html.replace(
                "<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο "
                "ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον "
                "για συγκεκριμένη χρήση.</p>",
                "",
            )
            item["body_html"] = body_html.strip()

            characteristics = self.parse_attribute_values(
                xml.findall(
                    "NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property"
                ), "WordCount")
            item["word_count"] = characteristics[0] if len(
                characteristics) else None

            # Extract the city and country for setting into the dateline.
            city_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
            )
            country_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
            )
            # The dateline is optional: a document without these properties must
            # not be rejected because of an attribute access on None.
            if city_el is not None and country_el is not None:
                city = city_el.attrib.get("Value")
                # Anglicise the greek for Athens if required
                city = "Athens" if city == "Αθήνα" else city
                country = country_el.attrib.get("Value")
                # Normalise the country code
                country = "GR" if country == "GRC" else country

                located = [
                    c for c in app.locators.find_cities()
                    if c["city"] == city and c["country_code"] == country
                ]
                if len(located) == 1:
                    # setdefault: do not assume an earlier step created the dateline
                    dateline = item.setdefault("dateline", {})
                    dateline["located"] = located[0]
                    dateline["source"] = provider.get("source")
                    dateline["text"] = format_dateline_to_locmmmddsrc(
                        dateline["located"],
                        dateline.get("date"),
                        provider.get("source"))
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
예제 #7
0
    def parse(self, xml, provider=None):
        """Parse a NewsML document that may hold the same item in several languages.

        An item pre-filled with fixed metadata is completed by ``self.do_mapping``
        from the preferred ``NewsItem``: Norwegian first, then English, then the
        first one that declares a language.

        :param xml: root element of the NewsML document; it is copied, not modified
        :param provider: ingest provider, only forwarded to the error built on failure
        :return: a list holding the single parsed item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing, including the case where no
            ``NewsItem`` declares a language
        """
        item = {
            'versioncreated': utcnow(),
            'anpa_category': [{
                "name": "Formidlingstjenester",
                "qcode": "r"
            }],
            'genre': [{
                "name": "Fulltekstmeldinger",
                "qcode": "Fulltekstmeldinger",
                "scheme": "genre_custom"
            }],
            'subject': [{
                'qcode': 'Børsmelding',
                'name': 'Børsmelding',
                'scheme': 'category'
            }],
            'ednote': '*** Dette er en børsmelding formidlet av NTB pva. andre ***'
        }
        self.populate_fields(item)

        try:
            # we remove newsml namespace for convenience (to avoid to write prefix each time)
            # we deepcopy first to avoid modifying original item
            xml = deepcopy(xml)
            namespace_prefix = '{' + NEWSML_NS + '}'
            for elt in xml.iter():
                # iter() can also yield comments and processing instructions,
                # whose tag is not a string; only real elements are renamed
                if isinstance(elt.tag, str):
                    elt.tag = elt.tag.replace(namespace_prefix, '')
            news_items = xml.findall('NewsItem')

            # there may be several items (for different languages), we keep in order of
            # preference: Norwegian, English, first item (cf. SDNTB-573)
            selected = None
            for news_item in news_items:
                try:
                    lang = news_item.xpath(
                        'NewsComponent/DescriptiveMetadata/Language/@FormalName',
                    )[0]
                except IndexError:
                    logger.warning(
                        "missing language in item, ignoring it.\nxml: {xml}".format(
                            xml=etree.tostring(news_item, encoding="unicode")))
                    continue

                if selected is None or lang in ('no', 'en'):
                    selected = news_item

                if lang == 'no':
                    break

            if selected is None:
                # Log the whole document: the loop variable is unbound when there
                # is no NewsItem at all, and otherwise only holds the last item.
                source = etree.tostring(xml, encoding="unicode")
                logger.warning("can't find any valid item\nxml={xml}".format(
                    xml=source))
                raise ParserError.parseFileError(source=source)

            self.do_mapping(item, selected)
            return [item]
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
예제 #8
0
    def parse(self, xml, provider=None):
        """Collect the metadata, body and dateline of a NewsML document into an item.

        The dateline is only filled when the City/Country properties of the
        document match exactly one entry of ``app.locators.find_cities()``.

        :param xml: root element of the NewsML document
        :param provider: ingest provider; its ``source`` is read when a dateline
            is built, and it is forwarded to the error built on failure
        :return: the value returned by ``self.populate_fields`` for the collected item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing
        """
        item = {}
        try:
            self.root = xml
            party_el = xml.find(
                'NewsItem/NewsComponent/AdministrativeMetadata/Source/Party')
            if party_el is not None:
                item['original_source'] = party_el.attrib.get(
                    'FormalName', 'ANA')

            priority_el = xml.find('NewsEnvelope/Priority')
            item['priority'] = self.map_priority(
                priority_el.text if priority_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall returns a (possibly empty) list, so the language is always set
            languages = self.parse_attributes_as_dictionary(xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = languages[0]['FormalName'] if len(
                languages) else ''

            subjects = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter[@Scheme="IptcSubjectCodes"]'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject[@Scheme="IptcSubjectCodes"]'
            )
            item['subject'] = self.format_subjects(subjects)

            # The body is escaped markup inside DataContent: serialize, unescape,
            # drop the wrapper tags and lower-case the paragraph tags.
            content_el = xml.find(
                'NewsItem/NewsComponent/NewsComponent/ContentItem/DataContent')
            body_html = html.unescape(
                etree.tostring(content_el, encoding='unicode'))
            for old, new in (
                ('<DataContent>', ''),
                ('</DataContent>', ''),
                ('<P>', '<p>'),
                ('</P>', '</p>'),
            ):
                body_html = body_html.replace(old, new)
            # Remove the agency copyright paragraph from the body.
            body_html = body_html.replace(
                '<p>© ΑΠΕ-ΜΠΕ ΑΕ. Τα πνευματικά δικαιώματα ανήκουν στο '
                'ΑΠΕ-ΜΠΕ ΑΕ και παραχωρούνται σε συνδρομητές μόνον '
                'για συγκεκριμένη χρήση.</p>', '')
            item['body_html'] = body_html.strip()

            characteristics = self.parse_attribute_values(
                xml.findall(
                    'NewsItem/NewsComponent/NewsComponent/ContentItem/Characteristics/Property'
                ), 'WordCount')
            item['word_count'] = characteristics[0] if len(
                characteristics) else None

            # Extract the city and country for setting into the dateline.
            city_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="City"]'
            )
            country_el = xml.find(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property[@FormalName="Country"]'
            )
            # The dateline is optional: a document without these properties must
            # not be rejected because of an attribute access on None.
            if city_el is not None and country_el is not None:
                city = city_el.attrib.get('Value')
                # Anglicise the greek for Athens if required
                city = 'Athens' if city == 'Αθήνα' else city
                country = country_el.attrib.get('Value')
                # Normalise the country code
                country = 'GR' if country == 'GRC' else country

                located = [
                    c for c in app.locators.find_cities()
                    if c['city'] == city and c['country_code'] == country
                ]
                if len(located) == 1:
                    # setdefault: do not assume an earlier step created the dateline
                    dateline = item.setdefault('dateline', {})
                    dateline['located'] = located[0]
                    dateline['source'] = provider.get('source')
                    dateline['text'] = format_dateline_to_locmmmddsrc(
                        dateline['located'],
                        dateline.get('date'),
                        provider.get('source'))
            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
예제 #9
0
    def parse(self, xml, provider=None):
        """Collect the metadata and body of a NewsML document into an item dictionary.

        :param xml: root element of the NewsML document
        :param provider: ingest provider, only forwarded to the error built on failure
        :return: the value returned by ``self.populate_fields`` for the collected item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing
        """
        item = {}
        try:
            self.root = xml
            parsed_el = xml.find(
                'NewsItem/NewsComponent/AdministrativeMetadata/Source')
            if parsed_el is not None:
                item['original_source'] = parsed_el.find('Party').get(
                    'FormalName', '')

            parsed_el = xml.find('NewsEnvelope/TransmissionId')
            if parsed_el is not None:
                item['ingest_provider_sequence'] = parsed_el.text

            parsed_el = xml.find('NewsEnvelope/Priority')
            # Compare with None explicitly: an element without children is falsy,
            # so a plain truth test would throw away the priority of a leaf element.
            item['priority'] = self.map_priority(
                parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall returns a (possibly empty) list, so the language is always set
            language = self.parse_attributes_as_dictionary(xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/Language'))
            item['language'] = language[0]['FormalName'] if len(
                language) else ''

            keywords = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/Property')
            item['keywords'] = self.parse_attribute_values(keywords, 'Keyword')

            subjects = xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter'
            )
            subjects += xml.findall(
                'NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject'
            )
            item['subject'] = self.format_subjects(subjects)

            # Serialize the NITF body and drop the bare wrapper tags around it.
            body_el = xml.find(
                'NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content'
            )
            item['body_html'] = etree.tostring(
                body_el, encoding='unicode'
            ).replace('<body.content>', '').replace('</body.content>', '')

            parsed_el = xml.findall(
                'NewsItem/NewsComponent/ContentItem/Characteristics/Property')
            characteristics = self.parse_attribute_values(parsed_el, 'Words')
            item['word_count'] = characteristics[0] if len(
                characteristics) else None

            parsed_el = xml.find(
                'NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType')
            if parsed_el is not None:
                # setdefault: a value stored by one of the earlier steps wins
                item.setdefault('usageterms', parsed_el.text)

            # As with the language, the genre list is always set (possibly empty).
            item['genre'] = [
                {'name': el.get('FormalName')}
                for el in xml.findall(
                    'NewsItem/NewsComponent/DescriptiveMetadata/Genre')
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)
예제 #10
0
    def parser_contentitem(self, item, content_el):
        """
        Copy the data of a ContentItem element into ``item``.

        Example:
        <ContentItem>
            <MediaType FormalName="Text"/>
            <Format FormalName="NITF3.1"/>
            <Characteristics>
              <SizeInBytes>2520</SizeInBytes>
              <Property FormalName="Words" Value="420"/>
            </Characteristics>
            <DataContent>
              <nitf>
                <body>
                  <body.content>
                    <p>...</p>
                    <p>
                      <org idsrc="isin" value="US38259P5089">GOOGLE</org>
                    </p>
                  </body.content>
                </body>
              </nitf>
            </DataContent>
        </ContentItem>

        Keys written when the matching element exists: ``type``, ``mimetype``,
        ``format``, ``characteristics``, ``body_html``, ``header_content`` and
        ``body_head``.

        :param item: dictionary being populated
        :param content_el: the ContentItem element; nothing is done when it is None
        :return: None
        """
        if content_el is None:
            return

        # Direct children whose FormalName attribute is copied to an item key.
        for tag, key in (('MediaType', 'type'), ('MimeType', 'mimetype'), ('Format', 'format')):
            child = content_el.find(tag)
            if child is not None:
                item[key] = child.get('FormalName', '')

        characteristics_el = content_el.find('Characteristics')
        if characteristics_el is not None:
            characteristics = item['characteristics'] = {}
            size_el = characteristics_el.find('SizeInBytes')
            if size_el is not None:
                characteristics['size_bytes'] = size_el.text
            # FormalName of a Property -> key that receives its Value; a
            # SizeInBytes property overrides the element read above.
            property_keys = {
                'Words': 'word_count',
                'SizeInBytes': 'size_bytes',
                'Creator': 'creator',
                'Characters': 'characters',
            }
            for property_el in characteristics_el.findall('Property'):
                key = property_keys.get(property_el.attrib.get('FormalName'))
                if key is not None:
                    characteristics[key] = property_el.attrib.get('Value')

        body_content_el = content_el.find('DataContent/nitf/body/body.content')
        if body_content_el is not None:
            markup = etree.tostring(body_content_el, encoding='unicode')
            # keep only what is between the bare wrapper tags
            item['body_html'] = markup.replace('<body.content>', '').replace('</body.content>', '')

        head_el = content_el.find('DataContent/nitf/head')
        if head_el is not None:
            item['header_content'] = etree.tostring(head_el, encoding='unicode')

        body_head_el = content_el.find('DataContent/nitf/body/body.head')
        if body_head_el is not None:
            item['body_head'] = etree.tostring(body_head_el, encoding='unicode')
0
    def parse(self, xml, provider=None):
        """Collect the metadata of a NewsML document into an item dictionary.

        :param xml: root element of the NewsML document
        :param provider: ingest provider, only forwarded to the error built on failure
        :return: the value returned by ``self.populate_fields`` for the collected item
        :raises: the error built by ``ParserError.newsmlOneParserError`` for any
            exception raised while parsing
        """
        item = {}
        try:
            self.root = xml
            parsed_el = xml.find("NewsItem/NewsComponent/AdministrativeMetadata/Source")
            if parsed_el is not None:
                item["original_source"] = parsed_el.find("Party").get("FormalName", "")

            parsed_el = xml.find("NewsEnvelope/TransmissionId")
            if parsed_el is not None:
                item["ingest_provider_sequence"] = parsed_el.text

            parsed_el = xml.find("NewsEnvelope/Priority")
            # Compare with None explicitly: an element without children is falsy,
            # so a plain truth test would throw away the priority of a leaf element.
            item["priority"] = self.map_priority(parsed_el.text if parsed_el is not None else None)

            self.parse_news_identifier(item, xml)
            self.parse_newslines(item, xml)
            self.parse_news_management(item, xml)

            # findall returns a (possibly empty) list, so the language is always set
            language = self.parse_attributes_as_dictionary(
                xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Language")
            )
            item["language"] = language[0]["FormalName"] if len(language) else ""

            keywords = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Property")
            item["keywords"] = self.parse_attribute_values(keywords, "Keyword")

            subjects = xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectDetail")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/SubjectMatter")
            subjects += xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/SubjectCode/Subject")
            item["subject"] = self.format_subjects(subjects)

            # Serialize the NITF body and drop the bare wrapper tags around it.
            body_el = xml.find("NewsItem/NewsComponent/ContentItem/DataContent/nitf/body/body.content")
            item["body_html"] = (
                etree.tostring(body_el, encoding="unicode")
                .replace("<body.content>", "")
                .replace("</body.content>", "")
            )

            parsed_el = xml.findall("NewsItem/NewsComponent/ContentItem/Characteristics/Property")
            characteristics = self.parse_attribute_values(parsed_el, "Words")
            item["word_count"] = characteristics[0] if len(characteristics) else None

            parsed_el = xml.find("NewsItem/NewsComponent/RightsMetadata/UsageRights/UsageType")
            if parsed_el is not None:
                # setdefault: a value stored by one of the earlier steps wins
                item.setdefault("usageterms", parsed_el.text)

            # As with the language, the genre list is always set (possibly empty).
            item["genre"] = [
                {"name": el.get("FormalName")}
                for el in xml.findall("NewsItem/NewsComponent/DescriptiveMetadata/Genre")
            ]

            return self.populate_fields(item)
        except Exception as ex:
            raise ParserError.newsmlOneParserError(ex, provider)