def test_get_word_count(self):
    """Word counts for plain text, a simple HTML paragraph and an XML document."""
    plain_count = get_word_count('plain text')
    self.assertEqual(2, plain_count, 'plain text')

    paragraph_count = get_word_count('<p> html text </p>')
    self.assertEqual(2, paragraph_count, 'paragraph')

    # Markup with attributes and a line break inside the paragraph text.
    xml_doc = (
        '<doc><p xml:lang="en-US">The weather was superb today in Norfolk, Virginia. Made me want to take\n'
        'out my boat, manufactured by the <org value="acm" idsrc="iptc.org">Acme Boat Company</org>.</p></doc>'
    )
    self.assertEqual(22, get_word_count(xml_doc))
def parse_message(self, tree, provider):
    """Translate a parsed message tree into an item dictionary.

    :param tree: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: passed through to ``ParserError.nitfParserError`` on failure
    :return: dict holding the extracted metadata, body and word count
    :raises: the error built by ``ParserError.nitfParserError`` for any
        exception raised while extracting values
    """
    item = {}
    try:
        docdata = tree.find("head/docdata")

        # Default item type.
        item["type"] = ITEM_CLASS_TEXT

        doc_id = docdata.find("doc-id").get("id-string")
        item["guid"] = doc_id
        item["uri"] = doc_id

        item["urgency"] = docdata.find("urgency").get("ed-urg", "5")
        item["pubstatus"] = docdata.attrib.get("management-status", "usable")

        # Both timestamps are taken from the same date.issue element.
        issued = docdata.find("date.issue")
        item["firstcreated"] = get_norm_datetime(issued)
        item["versioncreated"] = get_norm_datetime(issued)
        item["expiry"] = get_norm_datetime(docdata.find("date.expire"))

        item["subject"] = get_subjects(tree)
        item["body_html"] = get_content(tree)
        item["place"] = get_places(docdata)
        item["keywords"] = get_keywords(docdata)

        editor_note = docdata.find("ed-msg")
        if editor_note is not None:
            item["ednote"] = editor_note.attrib.get("info")

        headline = tree.find("body/body.head/hedline/hl1")
        item["headline"] = super().trim_headline(headline.text)

        abstract = tree.find("body/body.head/abstract")
        item["abstract"] = "" if abstract is None else abstract.text

        city = tree.find("body/body.head/dateline/location/city")
        item["dateline"] = "" if city is None else city.text

        item["byline"] = get_byline(tree)

        parse_meta(tree, item)
        # Only fills word_count when parse_meta() did not already provide one.
        computed_count = get_word_count(item["body_html"])
        item.setdefault("word_count", computed_count)
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def parse_message(self, tree, provider):
    """Extract item fields from a parsed message tree.

    :param tree: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: forwarded to ``ParserError.nitfParserError`` on failure
    :return: dict with the parsed fields, including ``word_count``
    :raises: the error built by ``ParserError.nitfParserError`` whenever
        any step of the extraction raises
    """
    item = {}
    try:
        docdata = tree.find('head/docdata')

        # Default item type.
        item['type'] = ITEM_CLASS_TEXT

        identifier = docdata.find('doc-id').get('id-string')
        item['guid'] = identifier
        item['uri'] = identifier

        item['urgency'] = docdata.find('urgency').get('ed-urg', '5')
        item['pubstatus'] = docdata.attrib.get('management-status', 'usable')

        # firstcreated and versioncreated share the date.issue element.
        issue_elem = docdata.find('date.issue')
        item['firstcreated'] = get_norm_datetime(issue_elem)
        item['versioncreated'] = get_norm_datetime(issue_elem)
        item['expiry'] = get_norm_datetime(docdata.find('date.expire'))

        item['subject'] = get_subjects(tree)
        item['body_html'] = get_content(tree)
        item['place'] = get_places(docdata)
        item['keywords'] = get_keywords(docdata)

        ed_msg = docdata.find('ed-msg')
        if ed_msg is not None:
            item['ednote'] = ed_msg.attrib.get('info')

        item['headline'] = tree.find('body/body.head/hedline/hl1').text

        abstract_elem = tree.find('body/body.head/abstract')
        if abstract_elem is not None:
            item['abstract'] = abstract_elem.text
        else:
            item['abstract'] = ''

        city_elem = tree.find('body/body.head/dateline/location/city')
        if city_elem is not None:
            item['dateline'] = city_elem.text
        else:
            item['dateline'] = ''

        item['byline'] = get_byline(tree)

        parse_meta(tree, item)
        # A word_count already set by parse_meta() is left untouched.
        body_count = get_word_count(item['body_html'])
        item.setdefault('word_count', body_count)
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def update_word_count(doc):
    """Fill in ``word_count`` for a document that has body content.

    The count is only stored when ``doc`` has no ``word_count`` key yet;
    an existing value is kept as is.

    :param doc: created/updated document
    """
    body = doc.get('body_html')
    if not body:
        return
    counted = get_word_count(body)
    doc.setdefault('word_count', counted)
def parse(self, filename, provider=None):
    """Parse a text file into an item dictionary.

    The file is read as windows-1252 and matched against a regular
    expression assembled from ``self.field_list``. Each entry of that list
    is indexed as ``entry[0]`` (pattern fragment preceding the value),
    ``entry[1]`` (item key, or ``None`` to discard the value) and
    ``entry[2]`` (regex group number holding the value).

    :param filename: path of the file to parse
    :param provider: forwarded to ``AAPParserError.NewsBitesParserError`` on failure
    :return: dict with the extracted fields, the body wrapped in ``<p>``
        tags and a ``word_count``
    :raises: the error built by ``AAPParserError.NewsBitesParserError`` for
        any exception raised while reading or parsing the file
    """
    try:
        item = {}
        self.set_item_defaults(item, filename)

        # Read the whole file into a single string; the handle is not needed
        # afterwards, so it is released before the regex work starts.
        with open(filename, 'r', encoding='windows-1252') as source:
            content = source.read()

        # Construct the pattern: a leading capture group, then one capture
        # group per configured field, each introduced by the field's marker.
        pattern = '(.*)\n' + ''.join(field[0] + '(.*)\n' for field in self.field_list)

        match = re.match(pattern, content, re.MULTILINE | re.DOTALL)
        if match:
            for field in self.field_list:
                if field[1] is not None:
                    item[field[1]] = match.group(field[2])

        # Fix the formatting: normalise the timestamp and turn each line of
        # the escaped body into its own paragraph.
        item[self.ITEM_VERSION_CREATED] = self.datetime(item[self.ITEM_VERSION_CREATED])
        escaped_body = html.escape(item[self.ITEM_BODY_HTML].strip())
        item[self.ITEM_BODY_HTML] = '<p>' + escaped_body.replace('\n', '</p><p>') + '</p>'

        item.setdefault('word_count', get_word_count(item['body_html']))
        return item
    except Exception as ex:
        raise AAPParserError.NewsBitesParserError(exception=ex, provider=provider) from ex
def test_word_count_nitf(self):
    """Inline NITF entity tags must not change the number of counted words."""
    nitf_body = """ <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location> with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side sealed their place at the finals in <chron>October 2015</chron>.</p>"""
    counted = get_word_count(nitf_body)
    self.assertEqual(40, counted)
def test_word_count_ul(self):
    """Each non-empty list item counts as one word; the empty item adds nothing."""
    markup = """ <ul> <li>foo</li> <li>bar</li> <li>baz</li> <li></li> </ul> """
    counted = sd_etree.get_word_count(markup)
    self.assertEqual(3, counted)
def update_word_count(update, original=None):
    """Keep ``word_count`` in step with the body of a created/updated document.

    When ``update`` carries a non-empty ``body_html``, a word count is
    computed and stored unless ``update`` already has a ``word_count`` key.
    When ``update`` explicitly carries an empty ``body_html`` and the
    original document had a ``word_count``, the count is reset to zero.

    :param update: created/updated document
    :param original: original document if updated
    """
    body = update.get('body_html')
    if body:
        counted = get_word_count(body)
        update.setdefault('word_count', counted)
        return

    # If the body is removed then set the count to zero
    body_removed = 'body_html' in update
    if original and 'word_count' in original and body_removed:
        update['word_count'] = 0
def parse(self, xml, provider=None):
    """Build an item dictionary from a parsed document.

    Optional elements (urgency, expiry, genre, editorial note, headline,
    dateline city) are only copied when they are present in the document.

    :param xml: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: forwarded to ``ParserError.nitfParserError`` on failure
    :return: dict with the parsed fields, including ``word_count``
    :raises: the error built by ``ParserError.nitfParserError`` for any
        exception raised during extraction
    """
    item = {}
    try:
        docdata = xml.find('head/docdata')

        # Default item type.
        item[ITEM_TYPE] = CONTENT_TYPE.TEXT

        doc_id = docdata.find('doc-id').get('id-string')
        item['guid'] = doc_id
        item['uri'] = doc_id

        urgency = docdata.find('urgency')
        if urgency is not None:
            item['urgency'] = int(urgency.get('ed-urg', '5'))

        status = docdata.attrib.get('management-status', 'usable')
        item['pubstatus'] = status.lower()

        # Both timestamps are taken from the same date.issue element.
        issued = docdata.find('date.issue')
        item['firstcreated'] = get_norm_datetime(issued)
        item['versioncreated'] = get_norm_datetime(issued)

        expires = docdata.find('date.expire')
        if expires is not None:
            item['expiry'] = get_norm_datetime(expires)

        item['subject'] = get_subjects(xml)
        item['body_html'] = get_content(xml)
        item['place'] = get_places(docdata)
        item['keywords'] = get_keywords(docdata)

        # Genre: keep the vocabulary entries whose name matches the
        # tobject.property.type attribute.
        genre_property = xml.find('head/tobject/tobject.property')
        if genre_property is not None:
            genre = genre_property.get('tobject.property.type')
            vocabularies = superdesk.get_resource_service('vocabularies')
            genre_map = vocabularies.find_one(req=None, _id='genre')
            if genre_map is not None:
                item['genre'] = [
                    entry for entry in genre_map.get('items', []) if entry['name'] == genre
                ]

        editor_note = docdata.find('ed-msg')
        if editor_note is not None:
            item['ednote'] = editor_note.attrib.get('info')

        # Headline: prefer hl1, fall back to the head title.
        headline = xml.find('body/body.head/hedline/hl1')
        if headline is None:
            headline = xml.find('head/title')
        if headline is not None:
            item['headline'] = headline.text

        # Abstract: prefer the nested <p>, fall back to the abstract element.
        abstract = xml.find('body/body.head/abstract/p')
        if abstract is None:
            abstract = xml.find('body/body.head/abstract')
        item['abstract'] = '' if abstract is None else abstract.text

        city = xml.find('body/body.head/dateline/location/city')
        if city is not None:
            self.set_dateline(item, city=city.text)

        item['byline'] = get_byline(xml)

        parse_meta(xml, item)
        # A word_count already set by parse_meta() is left untouched.
        body_count = get_word_count(item['body_html'])
        item.setdefault('word_count', body_count)
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
def parse(self, xml, provider=None):
    """Build an item from ``xml`` through ``self.do_mapping``.

    After the mapping step the dateline city (when present) is applied via
    ``self.set_dateline`` and a word count is added unless the mapping
    already produced one.

    :param xml: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: forwarded to ``ParserError.nitfParserError`` on failure
    :return: the populated item dict
    :raises: the error built by ``ParserError.nitfParserError`` for any
        exception raised while mapping
    """
    # Start from the default item type.
    item = {ITEM_TYPE: CONTENT_TYPE.TEXT}
    try:
        self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)

        city = xml.find('body/body.head/dateline/location/city')
        if city is not None:
            self.set_dateline(item, city=city.text)

        body_count = get_word_count(item['body_html'], no_html=True)
        item.setdefault('word_count', body_count)
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider)
    return item
def parse(self, xml, provider=None):
    """Parse every guid-bearing entry of the document's item sets.

    Each parsed item receives the header priority, a fixed ``anpa_category``
    and ``subject``, and a word count when it does not already have one.

    :param xml: document root exposing ``findall(path)``; also stored on
        ``self.root``
    :param provider: forwarded to ``ParserError.newsmlTwoParserError`` on failure
    :return: list of parsed item dicts
    :raises: the error built by ``ParserError.newsmlTwoParserError`` for
        any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        header = self.parse_header(xml)
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' not in item_tree.attrib:
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = header['priority']
                item['anpa_category'] = [{'qcode': 'f'}]

                subject = {'qcode': '04000000', 'name': subject_codes['04000000']}
                item['subject'] = [subject]

                body_count = get_word_count(item['body_html'])
                item.setdefault('word_count', body_count)
                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def test_word_count_nitf_2(self):
    """Word count of a multi-paragraph article with numbers and quotations."""
    # The text contains one real line break, written here as an explicit
    # escape so the literal does not depend on trailing whitespace.
    article = (
        """ <p>Rio Tinto has kept intact its target for iron ore shipments in 2017 after hitting the mid-point of its revised guidance range for 2016. </p><p>The world's second largest iron ore exporter shipped 327.6 million tonnes of iron ore from its Pilbara operations in 2016, in line with the slightly lowered full-year guidance of between 325 and 330 million tonnes.</p><p>It expects to ship between 330 to 340 million tonnes in 2017 from its main mining hub in WA.</p><p>"We have delivered a strong operational performance in 2016, underpinned by our drive for efficiency and maximising cash flow," chief executive Jean Sebastien Jacques said in a statement.</p><p>"Our disciplined approach remains in place in 2017, with the continued focus on productivity, cost reduction and commercial excellence."</p><p>Rio shipped 87.7 million tonnes of iron ore in the December quarter - up eight per cent from the preceding three months - mainly helped by minimal weather disruption.</p><p>Fourth-quarter production was also up four per cent from a year ago to 85.5 million tonnes.</p><p>Sales in the quarter exceeded production by 2.2 million tonnes, primarily through a drawdown on inventories built at the ports in the third quarter, the company said.</p><p>The miner also looks to have capitalised on a strong rebound in iron ore prices in 2016, saying 80 per cent of its sales were either on the spot market or on current quarter or current month average.</p><p>Rio’s copper production rose four per cent from a year ago to 523,000 tonnes, but still came in below its guidance range of 535,000 to 565,000 tonnes due to lower-than-expected production at its Kennecott mine in the US and no supplies from the Grasberg joint venture in Indonesia.</p><p>It has forecast a wide guidance range of 525,000 to 665,000 tonnes for 2017.</p><p>The miner topped production forecasts for bauxite and coking coal, while aluminium output \n"""
        """jumped 10 per cent in 2016.</p>"""
    )
    counted = sd_etree.get_word_count(article)
    self.assertEqual(316, counted)
def parse(self, xml, provider=None):
    """Collect the items found in the document's item sets.

    Entries without a ``guid`` attribute are skipped. Every parsed item is
    given the header priority, a fixed ``anpa_category`` and ``subject``,
    and a word count unless one is already present.

    :param xml: document root exposing ``findall(path)``; also stored on
        ``self.root``
    :param provider: forwarded to ``ParserError.newsmlTwoParserError`` on failure
    :return: list of parsed item dicts
    :raises: the error built by ``ParserError.newsmlTwoParserError`` for
        any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        header = self.parse_header(xml)
        item_sets = xml.findall(self.qname('itemSet'))
        for item_set in item_sets:
            for item_tree in item_set:
                has_guid = 'guid' in item_tree.attrib
                if not has_guid:
                    # Ignore the packageItem, it has no guid
                    continue

                item = self.parse_item(item_tree)
                item['priority'] = header['priority']
                item['anpa_category'] = [{'qcode': 'f'}]
                item['subject'] = [
                    {'qcode': '04000000', 'name': subject_codes['04000000']},
                ]

                counted = get_word_count(item['body_html'])
                item.setdefault('word_count', counted)
                items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def parse(self, xml, provider=None):
    """Build an item dict from ``xml`` by applying ``self.metadata_mapping``.

    The mapping is generated on first use via ``self._generate_mapping()``.
    For every ``key -> mapping`` entry:

    * a falsy mapping means the key is ignored;
    * without ``"xpath"``, ``mapping["callback"](xml)`` supplies the value
      (raising ``SkipValue`` skips the key);
    * with ``"xpath"``, the element is looked up; when it is missing the
      ``"default"`` is used (or the key is skipped if there is none);
      otherwise the value comes from ``"filter"`` (called with the element),
      the ``"attribute"`` of the element, or the element's text content,
      in that order of precedence;
    * ``"filter_value"``, when present, post-processes the value;
    * ``"key_hook"``, when present, stores the value itself; otherwise the
      value is written to ``item[key]``.

    :param xml: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: forwarded to ``ParserError.nitfParserError`` on failure
    :return: the populated item dict, including ``word_count``
    :raises: the error built by ``ParserError.nitfParserError`` for any
        exception raised while applying the mapping
    """
    if self.metadata_mapping is None:
        self._generate_mapping()

    item = {ITEM_TYPE: CONTENT_TYPE.TEXT}  # set the default type.
    try:
        for key, mapping in self.metadata_mapping.items():
            if not mapping:
                # key is ignored
                continue
            try:
                xpath = mapping["xpath"]
            except KeyError:
                # no xpath, we must have a callable
                try:
                    value = mapping["callback"](xml)
                except KeyError:
                    # NOTE(review): this also fires when the callback itself
                    # raises KeyError, not only when "callback" is missing.
                    # logging.warn was a deprecated alias (removed in
                    # Python 3.13); logging.warning is the supported call.
                    logging.warning("invalid mapping for key {}, ignoring it".format(key))
                    continue
                except SkipValue:
                    continue
            else:
                elem = xml.find(xpath)
                if elem is None:
                    # if there is no default value we skip the key
                    if "default" not in mapping:
                        continue
                    value = mapping["default"]
                else:
                    # we have an element,
                    # do we want a filter, an attribute or the content?
                    try:
                        # filter
                        value = mapping["filter"](elem)
                    except KeyError:
                        # NOTE(review): a KeyError raised inside the filter
                        # also lands here and falls back to attribute/content.
                        try:
                            attribute = mapping["attribute"]
                        except KeyError:
                            # content
                            value = "".join(elem.itertext())
                        else:
                            # attribute
                            value = elem.get(attribute, mapping.get("default_attr"))

            try:
                # filter_value is applied on found value
                value = mapping["filter_value"](value)
            except KeyError:
                pass

            if "key_hook" in mapping:
                mapping["key_hook"](item, value)
            else:
                item[key] = value

        elem = xml.find("body/body.head/dateline/location/city")
        if elem is not None:
            self.set_dateline(item, city=elem.text)

        # Keep a word_count produced by the mapping; otherwise compute it.
        item.setdefault("word_count", get_word_count(item["body_html"]))
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider) from ex
def test_word_count_brs(self):
    """Consecutive <br> tags separate words in both HTML and XHTML spelling."""
    samples = (
        '<p>foo<br><br>bar</p>',
        '<p>foo<br /><br />bar</p>',
    )
    for markup in samples:
        self.assertEqual(2, get_word_count(markup))
def test_word_count_hrs(self):
    """A <br> followed by <hr> separates words in both HTML and XHTML spelling."""
    samples = (
        '<p>foo<br><hr>bar</p>',
        '<p>foo<br /><hr />bar</p>',
    )
    for markup in samples:
        self.assertEqual(2, sd_etree.get_word_count(markup))
def test_word_count_nitf(self):
    """Inline NITF entity tags must not change the number of counted words."""
    nitf_body = """ <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location> with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side sealed their place at the finals in <chron>October 2015</chron>.</p>"""
    counted = sd_etree.get_word_count(nitf_body)
    self.assertEqual(40, counted)
def parse(self, xml, provider=None):
    """Build an item dict from ``xml`` by applying ``self.metadata_mapping``.

    The mapping is generated on first use via ``self._generate_mapping()``.
    For every ``key -> mapping`` entry:

    * a falsy mapping means the key is ignored;
    * without ``'xpath'``, ``mapping['callback'](xml)`` supplies the value
      (raising ``SkipValue`` skips the key);
    * with ``'xpath'``, the element is looked up; when it is missing the
      ``'default'`` is used (or the key is skipped if there is none);
      otherwise the value comes from ``'filter'`` (called with the element),
      the ``'attribute'`` of the element, or the element's text content,
      in that order of precedence;
    * ``'filter_value'``, when present, post-processes the value;
    * ``'key_hook'``, when present, stores the value itself; otherwise the
      value is written to ``item[key]``.

    :param xml: document root exposing ``find(path)``
        (presumably an ElementTree element - confirm against the caller)
    :param provider: forwarded to ``ParserError.nitfParserError`` on failure
    :return: the populated item dict, including ``word_count``
    :raises: the error built by ``ParserError.nitfParserError`` for any
        exception raised while applying the mapping
    """
    if self.metadata_mapping is None:
        self._generate_mapping()

    item = {
        ITEM_TYPE: CONTENT_TYPE.TEXT,  # set the default type.
    }
    try:
        for key, mapping in self.metadata_mapping.items():
            if not mapping:
                # key is ignored
                continue
            try:
                xpath = mapping['xpath']
            except KeyError:
                # no xpath, we must have a callable
                try:
                    value = mapping['callback'](xml)
                except KeyError:
                    # NOTE(review): this also fires when the callback itself
                    # raises KeyError, not only when 'callback' is missing.
                    # logging.warn was a deprecated alias (removed in
                    # Python 3.13); logging.warning is the supported call.
                    logging.warning(
                        "invalid mapping for key {}, ignoring it".format(key))
                    continue
                except SkipValue:
                    continue
            else:
                elem = xml.find(xpath)
                if elem is None:
                    # if there is no default value we skip the key
                    if 'default' not in mapping:
                        continue
                    value = mapping['default']
                else:
                    # we have an element,
                    # do we want a filter, an attribute or the content?
                    try:
                        # filter
                        value = mapping['filter'](elem)
                    except KeyError:
                        # NOTE(review): a KeyError raised inside the filter
                        # also lands here and falls back to attribute/content.
                        try:
                            attribute = mapping['attribute']
                        except KeyError:
                            # content
                            value = ''.join(elem.itertext())
                        else:
                            # attribute
                            value = elem.get(attribute, mapping.get('default_attr'))

            try:
                # filter_value is applied on found value
                value = mapping['filter_value'](value)
            except KeyError:
                pass

            if 'key_hook' in mapping:
                mapping['key_hook'](item, value)
            else:
                item[key] = value

        elem = xml.find('body/body.head/dateline/location/city')
        if elem is not None:
            self.set_dateline(item, city=elem.text)

        # Keep a word_count produced by the mapping; otherwise compute it.
        item.setdefault('word_count', get_word_count(item['body_html']))
        return item
    except Exception as ex:
        raise ParserError.nitfParserError(ex, provider) from ex
def test_word_count_p_tags(self):
    """Inline markup inside a paragraph does not split a word; paragraphs do."""
    markup = '<p>foo<strong>s</strong></p><p>bar</p>'
    counted = sd_etree.get_word_count(markup)
    self.assertEqual(2, counted)