예제 #1
0
    def test_get_word_count(self):
        """Check the count for plain text, a simple paragraph and an XML document with inline markup."""
        plain = 'plain text'
        self.assertEqual(2, get_word_count(plain), 'plain text')

        paragraph = '<p> html text </p>'
        self.assertEqual(2, get_word_count(paragraph), 'paragraph')

        # Attributes and inline elements are part of the markup; the expected total is 22.
        document = (
            '<doc><p xml:lang="en-US">The weather was superb today in Norfolk, Virginia. Made me want to take\n'
            'out my boat, manufactured by the <org value="acm" idsrc="iptc.org">Acme Boat Company</org>.</p></doc>'
        )
        self.assertEqual(22, get_word_count(document))
예제 #2
0
    def test_get_word_count(self):
        """Check the count for plain text, a simple paragraph and an XML document with inline markup."""
        plain = 'plain text'
        self.assertEqual(2, get_word_count(plain), 'plain text')

        paragraph = '<p> html text </p>'
        self.assertEqual(2, get_word_count(paragraph), 'paragraph')

        # Attributes and inline elements are part of the markup; the expected total is 22.
        document = (
            '<doc><p xml:lang="en-US">The weather was superb today in Norfolk, Virginia. Made me want to take\n'
            'out my boat, manufactured by the <org value="acm" idsrc="iptc.org">Acme Boat Company</org>.</p></doc>'
        )
        self.assertEqual(22, get_word_count(document))
예제 #3
0
    def parse_message(self, tree, provider):
        """Build an item dict from a parsed message tree.

        Metadata is read from ``head/docdata``; headline, abstract and the
        dateline city are read from ``body/body.head``. The remaining fields
        are filled by the module's helper functions.

        :param tree: root element of the parsed message
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        try:
            meta = tree.find("head/docdata")
            # set the default type.
            item = {"type": ITEM_CLASS_TEXT}
            doc_id = meta.find("doc-id").get("id-string")
            item["guid"] = doc_id
            item["uri"] = doc_id
            item.update({
                "urgency": meta.find("urgency").get("ed-urg", "5"),
                "pubstatus": meta.attrib.get("management-status", "usable"),
                # both timestamps are taken from the same date.issue element
                "firstcreated": get_norm_datetime(meta.find("date.issue")),
                "versioncreated": get_norm_datetime(meta.find("date.issue")),
                "expiry": get_norm_datetime(meta.find("date.expire")),
                "subject": get_subjects(tree),
                "body_html": get_content(tree),
                "place": get_places(meta),
                "keywords": get_keywords(meta),
            })

            editor_msg = meta.find("ed-msg")
            if editor_msg is not None:
                item["ednote"] = editor_msg.attrib.get("info")

            headline_text = tree.find("body/body.head/hedline/hl1").text
            item["headline"] = super().trim_headline(headline_text)

            abstract = tree.find("body/body.head/abstract")
            item["abstract"] = "" if abstract is None else abstract.text

            city = tree.find("body/body.head/dateline/location/city")
            item["dateline"] = "" if city is None else city.text
            item["byline"] = get_byline(tree)

            parse_meta(tree, item)
            # keep a word_count that is already present on the item
            item.setdefault("word_count", get_word_count(item["body_html"]))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #4
0
    def parse_message(self, tree, provider):
        """Build an item dict from a parsed message tree.

        Metadata is read from ``head/docdata``; headline, abstract and the
        dateline city are read from ``body/body.head``. The remaining fields
        are filled by the module's helper functions.

        :param tree: root element of the parsed message
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        try:
            meta = tree.find('head/docdata')
            # set the default type.
            item = {'type': ITEM_CLASS_TEXT}
            doc_id = meta.find('doc-id').get('id-string')
            item['guid'] = doc_id
            item['uri'] = doc_id
            item.update({
                'urgency': meta.find('urgency').get('ed-urg', '5'),
                'pubstatus': meta.attrib.get('management-status', 'usable'),
                # both timestamps are taken from the same date.issue element
                'firstcreated': get_norm_datetime(meta.find('date.issue')),
                'versioncreated': get_norm_datetime(meta.find('date.issue')),
                'expiry': get_norm_datetime(meta.find('date.expire')),
                'subject': get_subjects(tree),
                'body_html': get_content(tree),
                'place': get_places(meta),
                'keywords': get_keywords(meta),
            })

            editor_msg = meta.find('ed-msg')
            if editor_msg is not None:
                item['ednote'] = editor_msg.attrib.get('info')

            item['headline'] = tree.find('body/body.head/hedline/hl1').text

            abstract = tree.find('body/body.head/abstract')
            item['abstract'] = '' if abstract is None else abstract.text

            city = tree.find('body/body.head/dateline/location/city')
            item['dateline'] = '' if city is None else city.text
            item['byline'] = get_byline(tree)

            parse_meta(tree, item)
            # keep a word_count that is already present on the item
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #5
0
def update_word_count(doc):
    """Fill in the word count of a document from its HTML body.

    The count is only computed when the document has a non-empty
    ``body_html`` and no ``word_count`` key yet; an existing ``word_count``
    value is left untouched.

    :param doc: created/updated document
    """
    body = doc.get('body_html')
    # Check for the key first so get_word_count is not called when its
    # result would be discarded anyway.
    if body and 'word_count' not in doc:
        doc['word_count'] = get_word_count(body)
예제 #6
0
    def parse_message(self, tree, provider):
        """Build an item dict from a parsed message tree.

        Metadata is read from ``head/docdata``; headline, abstract and the
        dateline city are read from ``body/body.head``. The remaining fields
        are filled by the module's helper functions.

        :param tree: root element of the parsed message
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        try:
            meta = tree.find('head/docdata')
            # set the default type.
            item = {'type': ITEM_CLASS_TEXT}
            doc_id = meta.find('doc-id').get('id-string')
            item['guid'] = doc_id
            item['uri'] = doc_id
            item.update({
                'urgency': meta.find('urgency').get('ed-urg', '5'),
                'pubstatus': meta.attrib.get('management-status', 'usable'),
                # both timestamps are taken from the same date.issue element
                'firstcreated': get_norm_datetime(meta.find('date.issue')),
                'versioncreated': get_norm_datetime(meta.find('date.issue')),
                'expiry': get_norm_datetime(meta.find('date.expire')),
                'subject': get_subjects(tree),
                'body_html': get_content(tree),
                'place': get_places(meta),
                'keywords': get_keywords(meta),
            })

            editor_msg = meta.find('ed-msg')
            if editor_msg is not None:
                item['ednote'] = editor_msg.attrib.get('info')

            item['headline'] = tree.find('body/body.head/hedline/hl1').text

            abstract = tree.find('body/body.head/abstract')
            item['abstract'] = '' if abstract is None else abstract.text

            city = tree.find('body/body.head/dateline/location/city')
            item['dateline'] = '' if city is None else city.text
            item['byline'] = get_byline(tree)

            parse_meta(tree, item)
            # keep a word_count that is already present on the item
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #7
0
    def parse(self, filename, provider=None):
        """Parse a windows-1252 encoded text file into an item dict.

        The file content is matched against a regular expression assembled
        from ``self.field_list``; each entry supplies a pattern prefix
        (index 0), the item key to fill or ``None`` (index 1) and the regex
        group number to read (index 2).

        :param filename: path of the file to read
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``AAPParserError.NewsBitesParserError``,
            wrapping any exception raised while parsing
        """
        try:
            item = {}
            self.set_item_defaults(item, filename)
            with open(filename, 'r', encoding='windows-1252') as source:
                # read the whole file into a single string
                text = source.read()
                # a leading capture group followed by one "<prefix>(.*)" group per field
                pattern = '(.*)\n' + ''.join(field[0] + '(.*)\n' for field in self.field_list)
                match = re.match(pattern, text, re.MULTILINE | re.DOTALL)
                if match:
                    for field in self.field_list:
                        target_key = field[1]
                        if target_key is not None:
                            item[target_key] = match.group(field[2])

            # fix the formatting
            item[self.ITEM_VERSION_CREATED] = self.datetime(item[self.ITEM_VERSION_CREATED])
            escaped_body = html.escape(item[self.ITEM_BODY_HTML].strip())
            # every newline in the escaped body becomes a paragraph break
            item[self.ITEM_BODY_HTML] = '<p>' + escaped_body.replace('\n', '</p><p>') + '</p>'
            item.setdefault('word_count', get_word_count(item['body_html']))

            return item
        except Exception as ex:
            raise AAPParserError.NewsBitesParserError(exception=ex, provider=provider)
예제 #8
0
def update_word_count(doc):
    """Fill in the word count of a document from its HTML body.

    The count is only computed when the document has a non-empty
    ``body_html`` and no ``word_count`` key yet; an existing ``word_count``
    value is left untouched.

    :param doc: created/updated document
    """
    body = doc.get('body_html')
    # Check for the key first so get_word_count is not called when its
    # result would be discarded anyway.
    if body and 'word_count' not in doc:
        doc['word_count'] = get_word_count(body)
예제 #9
0
 def test_word_count_nitf(self):
     """A paragraph with inline location/person/money/chron tags is expected to count 40 words."""
     markup = """
     <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location>
     with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their
     hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side
     sealed their place at the finals in <chron>October 2015</chron>.</p>"""
     self.assertEqual(40, get_word_count(markup))
예제 #10
0
 def test_word_count_ul(self):
     """A list with three one-word items and one empty item is expected to count 3 words."""
     markup = """
         <ul>
             <li>foo</li>
             <li>bar</li>
             <li>baz</li>
             <li></li>
         </ul>
     """
     self.assertEqual(3, sd_etree.get_word_count(markup))
예제 #11
0
def update_word_count(update, original=None):
    """Update word count if there was change in content.

    With a non-empty ``body_html`` in ``update`` the count is computed, unless
    ``update`` already carries a ``word_count``, which is then kept. With an
    empty ``body_html`` the count is reset to zero, but only when ``update``
    contains the ``body_html`` key and ``original`` had a ``word_count``.

    :param update: created/updated document
    :param original: original document if updated
    """
    body = update.get('body_html')
    if body:
        # Check for the key first so get_word_count is not called when its
        # result would be discarded anyway.
        if 'word_count' not in update:
            update['word_count'] = get_word_count(body)
    elif original and 'word_count' in original and 'body_html' in update:
        # If the body is removed then set the count to zero
        update['word_count'] = 0
예제 #12
0
def update_word_count(update, original=None):
    """Update word count if there was change in content.

    With a non-empty ``body_html`` in ``update`` the count is computed, unless
    ``update`` already carries a ``word_count``, which is then kept. With an
    empty ``body_html`` the count is reset to zero, but only when ``update``
    contains the ``body_html`` key and ``original`` had a ``word_count``.

    :param update: created/updated document
    :param original: original document if updated
    """
    body = update.get('body_html')
    if body:
        # Check for the key first so get_word_count is not called when its
        # result would be discarded anyway.
        if 'word_count' not in update:
            update['word_count'] = get_word_count(body)
    elif original and 'word_count' in original and 'body_html' in update:
        # If the body is removed then set the count to zero
        update['word_count'] = 0
예제 #13
0
    def parse(self, xml, provider=None):
        """Build an item dict from a parsed XML document.

        Metadata is read from ``head/docdata``; urgency, expiry, genre, ednote
        and headline are only set when the corresponding element exists. The
        headline falls back from ``hl1`` to ``head/title``, and the abstract
        from ``abstract/p`` to ``abstract``.

        :param xml: root element of the parsed document
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        try:
            docdata = xml.find('head/docdata')
            # set the default type.
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
            doc_id = docdata.find('doc-id').get('id-string')
            item['guid'] = doc_id
            item['uri'] = doc_id

            urgency = docdata.find('urgency')
            if urgency is not None:
                item['urgency'] = int(urgency.get('ed-urg', '5'))
            item['pubstatus'] = docdata.attrib.get('management-status', 'usable').lower()
            # both timestamps are taken from the same date.issue element
            for stamp in ('firstcreated', 'versioncreated'):
                item[stamp] = get_norm_datetime(docdata.find('date.issue'))

            expire = docdata.find('date.expire')
            if expire is not None:
                item['expiry'] = get_norm_datetime(expire)
            item['subject'] = get_subjects(xml)
            item['body_html'] = get_content(xml)
            item['place'] = get_places(docdata)
            item['keywords'] = get_keywords(docdata)

            tobject_property = xml.find('head/tobject/tobject.property')
            if tobject_property is not None:
                genre = tobject_property.get('tobject.property.type')
                genre_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='genre')
                if genre_map is not None:
                    # keep the vocabulary entries whose name equals the document's genre
                    item['genre'] = [entry for entry in genre_map.get('items', []) if entry['name'] == genre]

            editor_msg = docdata.find('ed-msg')
            if editor_msg is not None:
                item['ednote'] = editor_msg.attrib.get('info')

            headline = xml.find('body/body.head/hedline/hl1')
            if headline is None:
                headline = xml.find('head/title')
            if headline is not None:
                item['headline'] = headline.text

            abstract = xml.find('body/body.head/abstract/p')
            if abstract is None:
                abstract = xml.find('body/body.head/abstract')
            item['abstract'] = '' if abstract is None else abstract.text

            city = xml.find('body/body.head/dateline/location/city')
            if city is not None:
                self.set_dateline(item, city=city.text)

            item['byline'] = get_byline(xml)

            parse_meta(xml, item)
            # keep a word_count that is already present on the item
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #14
0
    def parse(self, xml, provider=None):
        """Build an item dict from a parsed XML document.

        Field extraction is delegated to ``self.do_mapping``; the dateline is
        set from the dateline city element when present, and ``word_count``
        is filled in from ``body_html`` unless it is already set.

        :param xml: root element of the parsed document
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        # set the default type.
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
        try:
            self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)

            city = xml.find('body/body.head/dateline/location/city')
            if city is not None:
                self.set_dateline(item, city=city.text)

            counted = get_word_count(item['body_html'], no_html=True)
            item.setdefault('word_count', counted)
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
        else:
            return item
예제 #15
0
 def parse(self, xml, provider=None):
     """Parse every child of the document's itemSet elements that has a guid.

     Each parsed item gets the header's priority, a fixed ``anpa_category``
     and ``subject``, and a ``word_count`` computed from ``body_html``
     unless one is already set.

     :param xml: root element of the parsed document; also stored on ``self.root``
     :param provider: passed on to the error factory when parsing fails
     :return: list of item dicts
     :raises: the error built by ``ParserError.newsmlTwoParserError``,
         wrapping any exception raised while parsing
     """
     self.root = xml
     parsed = []
     try:
         header = self.parse_header(xml)
         for item_set in xml.findall(self.qname('itemSet')):
             for node in item_set:
                 # Ignore the packageItem, it has no guid
                 if 'guid' not in node.attrib:
                     continue
                 item = self.parse_item(node)
                 item['priority'] = header['priority']
                 item['anpa_category'] = [{'qcode': 'f'}]
                 subject = {'qcode': '04000000', 'name': subject_codes['04000000']}
                 item['subject'] = [subject]
                 item.setdefault('word_count', get_word_count(item['body_html']))
                 parsed.append(item)
         return parsed
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
예제 #16
0
 def test_word_count_nitf_2(self):
     """A multi-paragraph article is expected to count 316 words."""
     article = """
     <p>Rio Tinto has kept intact its target for iron ore shipments in 2017 after hitting the mid-point
     of its revised guidance range for 2016. </p><p>The world's second largest iron ore exporter shipped
     327.6 million tonnes of iron ore from its Pilbara operations in 2016, in line with the slightly lowered
     full-year guidance of between 325 and 330 million tonnes.</p><p>It expects to ship between 330 to 340
     million tonnes in 2017 from its main mining hub in WA.</p><p>"We have delivered a strong operational
     performance in 2016, underpinned by our drive for efficiency and maximising cash flow," chief executive
     Jean Sebastien Jacques said in a statement.</p><p>"Our disciplined approach remains in place in 2017,
     with the continued focus on productivity, cost reduction and commercial excellence."</p><p>Rio shipped
     87.7 million tonnes of iron ore in the December quarter - up eight per cent from the preceding three
     months - mainly helped by minimal weather disruption.</p><p>Fourth-quarter production was also up four
     per cent from a year ago to 85.5 million tonnes.</p><p>Sales in the quarter exceeded production by 2.2
     million tonnes, primarily through a drawdown on inventories built at the ports in the third quarter,
     the company said.</p><p>The miner also looks to have capitalised on a strong rebound in iron ore prices
     in 2016, saying 80 per cent of its sales were either on the spot market or on current quarter or current
     month average.</p><p>Rio’s copper production rose four per cent from a year ago to 523,000 tonnes, but
     still came in below its guidance range of 535,000 to 565,000 tonnes due to lower-than-expected production
     at its Kennecott mine in the US and no supplies from the Grasberg joint venture in Indonesia.</p><p>It has
     forecast a wide guidance range of 525,000 to 665,000 tonnes for 2017.</p><p>The miner topped production
     forecasts for bauxite and coking coal, while aluminium output jumped 10 per cent in 2016.</p>"""
     self.assertEqual(316, sd_etree.get_word_count(article))
예제 #17
0
 def parse(self, xml, provider=None):
     """Parse every child of the document's itemSet elements that has a guid.

     Each parsed item gets the header's priority, a fixed ``anpa_category``
     and ``subject``, and a ``word_count`` computed from ``body_html``
     unless one is already set.

     :param xml: root element of the parsed document; also stored on ``self.root``
     :param provider: passed on to the error factory when parsing fails
     :return: list of item dicts
     :raises: the error built by ``ParserError.newsmlTwoParserError``,
         wrapping any exception raised while parsing
     """
     self.root = xml
     parsed = []
     try:
         header = self.parse_header(xml)
         for item_set in xml.findall(self.qname('itemSet')):
             for node in item_set:
                 # Ignore the packageItem, it has no guid
                 if 'guid' not in node.attrib:
                     continue
                 item = self.parse_item(node)
                 item['priority'] = header['priority']
                 item['anpa_category'] = [{'qcode': 'f'}]
                 subject = {'qcode': '04000000', 'name': subject_codes['04000000']}
                 item['subject'] = [subject]
                 item.setdefault('word_count', get_word_count(item['body_html']))
                 parsed.append(item)
         return parsed
     except Exception as ex:
         raise ParserError.newsmlTwoParserError(ex, provider)
예제 #18
0
    def parse(self, xml, provider=None):
        """Build an item dict from ``xml`` by applying ``self.metadata_mapping``.

        The mapping is generated on first use. Each non-empty mapping entry is
        a dict that is resolved as follows:

        * without ``xpath``: ``callback(xml)`` supplies the value; an entry
          with neither key is logged and skipped, and a ``SkipValue`` raised
          by the callback skips the key;
        * with ``xpath`` but no matching element: ``default`` is used, or the
          key is skipped when there is no default;
        * with a matching element: ``filter(elem)`` if given, else the
          ``attribute`` of the element (falling back to ``default_attr``),
          else the element's concatenated text content.

        ``filter_value``, when present, post-processes the value. The result
        is stored under the key, or handed to ``key_hook(item, value)`` when
        the mapping defines one.

        :param xml: root element of the parsed document
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        if self.metadata_mapping is None:
            self._generate_mapping()
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}  # set the default type.
        try:
            for key, mapping in self.metadata_mapping.items():
                if not mapping:
                    # key is ignored
                    continue
                try:
                    xpath = mapping["xpath"]
                except KeyError:
                    # no xpath, we must have a callable
                    # NOTE(review): the try covers the call as well as the lookup, so a
                    # KeyError raised inside the callback is reported as an invalid mapping.
                    try:
                        value = mapping["callback"](xml)
                    except KeyError:
                        # logging.warn is a deprecated alias of logging.warning
                        logging.warning("invalid mapping for key {}, ignoring it".format(key))
                        continue
                    except SkipValue:
                        continue
                else:
                    elem = xml.find(xpath)
                    if elem is None:
                        try:
                            value = mapping["default"]
                        except KeyError:
                            # if there is not default value we skip the key
                            continue
                    else:
                        # we have an element,
                        # do we want a filter, an attribute or the content?
                        # NOTE(review): a KeyError raised inside the filter itself also
                        # falls through to the attribute/content branches below.
                        try:
                            # filter
                            value = mapping["filter"](elem)
                        except KeyError:
                            try:
                                attribute = mapping["attribute"]
                            except KeyError:
                                # content
                                value = "".join(elem.itertext())
                            else:
                                # attribute
                                value = elem.get(attribute, mapping.get("default_attr"))

                try:
                    # filter_value is applied on found value
                    value = mapping["filter_value"](value)
                except KeyError:
                    pass

                if "key_hook" in mapping:
                    mapping["key_hook"](item, value)
                else:
                    item[key] = value

            elem = xml.find("body/body.head/dateline/location/city")
            if elem is not None:
                self.set_dateline(item, city=elem.text)

            # keep a word_count that the mapping already produced
            item.setdefault("word_count", get_word_count(item["body_html"]))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #19
0
 def test_word_count_brs(self):
     """Two words separated by a pair of <br> tags, in either spelling, are expected to count 2."""
     for markup in ('<p>foo<br><br>bar</p>', '<p>foo<br /><br />bar</p>'):
         self.assertEqual(2, get_word_count(markup))
예제 #20
0
 def test_word_count_hrs(self):
     """Two words separated by <br> and <hr> tags, in either spelling, are expected to count 2."""
     for markup in ('<p>foo<br><hr>bar</p>', '<p>foo<br /><hr />bar</p>'):
         self.assertEqual(2, sd_etree.get_word_count(markup))
예제 #21
0
 def test_word_count_nitf(self):
     """A paragraph with inline location/person/money/chron tags is expected to count 40 words."""
     markup = """
     <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location>
     with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their
     hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side
     sealed their place at the finals in <chron>October 2015</chron>.</p>"""
     self.assertEqual(40, sd_etree.get_word_count(markup))
예제 #22
0
    def parse(self, xml, provider=None):
        """Build an item dict from ``xml`` by applying ``self.metadata_mapping``.

        The mapping is generated on first use. Each non-empty mapping entry is
        a dict that is resolved as follows:

        * without ``xpath``: ``callback(xml)`` supplies the value; an entry
          with neither key is logged and skipped, and a ``SkipValue`` raised
          by the callback skips the key;
        * with ``xpath`` but no matching element: ``default`` is used, or the
          key is skipped when there is no default;
        * with a matching element: ``filter(elem)`` if given, else the
          ``attribute`` of the element (falling back to ``default_attr``),
          else the element's concatenated text content.

        ``filter_value``, when present, post-processes the value. The result
        is stored under the key, or handed to ``key_hook(item, value)`` when
        the mapping defines one.

        :param xml: root element of the parsed document
        :param provider: passed on to the error factory when parsing fails
        :return: dict with the extracted item fields
        :raises: the error built by ``ParserError.nitfParserError``, wrapping
            any exception raised while parsing
        """
        if self.metadata_mapping is None:
            self._generate_mapping()
        item = {
            ITEM_TYPE: CONTENT_TYPE.TEXT,  # set the default type.
        }
        try:
            for key, mapping in self.metadata_mapping.items():
                if not mapping:
                    # key is ignored
                    continue
                try:
                    xpath = mapping['xpath']
                except KeyError:
                    # no xpath, we must have a callable
                    # NOTE(review): the try covers the call as well as the lookup, so a
                    # KeyError raised inside the callback is reported as an invalid mapping.
                    try:
                        value = mapping['callback'](xml)
                    except KeyError:
                        # logging.warn is a deprecated alias of logging.warning
                        logging.warning(
                            "invalid mapping for key {}, ignoring it".format(
                                key))
                        continue
                    except SkipValue:
                        continue
                else:
                    elem = xml.find(xpath)
                    if elem is None:
                        try:
                            value = mapping['default']
                        except KeyError:
                            # if there is not default value we skip the key
                            continue
                    else:
                        # we have an element,
                        # do we want a filter, an attribute or the content?
                        # NOTE(review): a KeyError raised inside the filter itself also
                        # falls through to the attribute/content branches below.
                        try:
                            # filter
                            value = mapping['filter'](elem)
                        except KeyError:
                            try:
                                attribute = mapping['attribute']
                            except KeyError:
                                # content
                                value = ''.join(elem.itertext())
                            else:
                                # attribute
                                value = elem.get(attribute,
                                                 mapping.get('default_attr'))

                try:
                    # filter_value is applied on found value
                    value = mapping['filter_value'](value)
                except KeyError:
                    pass

                if 'key_hook' in mapping:
                    mapping['key_hook'](item, value)
                else:
                    item[key] = value

            elem = xml.find('body/body.head/dateline/location/city')
            if elem is not None:
                self.set_dateline(item, city=elem.text)

            # keep a word_count that the mapping already produced
            item.setdefault('word_count', get_word_count(item['body_html']))
            return item
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
예제 #23
0
 def test_word_count_p_tags(self):
     """Two adjacent paragraphs, the first with an inline <strong>, are expected to count 2 words."""
     markup = '<p>foo<strong>s</strong></p><p>bar</p>'
     self.assertEqual(2, sd_etree.get_word_count(markup))