def lint(name):
    """Run the linter against the harvester registered under ``name``.

    The harvester is looked up in ``registry`` and its ``harvest`` and
    ``normalize`` attributes are handed to ``linter.lint``.  Any
    ``Exception`` raised while doing so is reported on stdout instead of
    being propagated.  The registry lookup itself happens before the
    ``try`` and is therefore not covered by that handler.

    :param name: key of the harvester in ``registry``.
    """
    target = registry[name]
    try:
        linter.lint(target.harvest, target.normalize)
    except Exception as err:
        # Report the failure and carry on rather than aborting the caller.
        report = 'Harvester {} raise the following exception'.format(
            target.short_name)
        print(report)
        print(err)
date = parse(date_updated).isoformat() return copy_to_unicode(date) def normalize(raw_doc): raw_doc_text = raw_doc.get('doc') xml_doc = etree.XML(raw_doc_text) # Title title = (xml_doc.xpath('//official_title/node()') or xml_doc.xpath('//brief_title/node()') or [''])[0] # abstract abstract = (xml_doc.xpath('//brief_summary/textblock/node()') or xml_doc.xpath('//brief_summary/textblock/node()') or [''])[0] normalized_dict = { 'title': copy_to_unicode(title), 'contributors': get_contributors(xml_doc), 'properties': get_properties(xml_doc), 'description': copy_to_unicode(abstract), 'id': get_ids(raw_doc, xml_doc), 'source': NAME, 'tags': get_tags(xml_doc), 'dateUpdated': get_date_updated(xml_doc) } return NormalizedDocument(normalized_dict) if __name__ == '__main__': print(lint(consume, normalize))
def normalize(raw_doc):
    """Build a ``NormalizedDocument`` from a raw harvested record.

    The XML held under the ``'doc'`` key of ``raw_doc`` is parsed with
    ``etree.XML``.  Title and abstract are read from ``str`` elements
    selected by their ``name`` attribute; the remaining fields are
    produced by the ``get_*`` helpers.

    :param raw_doc: object whose ``get('doc')`` returns the XML text.
    :returns: ``None`` when the extracted id has an empty ``'url'``,
        otherwise ``NormalizedDocument`` wrapping the field dict.
    """
    root = etree.XML(raw_doc.get('doc'))

    def first_node(expression):
        # First XPath match, or '' when nothing matched.
        matches = root.xpath(expression)
        return matches[0] if matches else ''

    title_node = first_node("str[@name='title']/node()")
    abstract_node = first_node("str[@name='abstract']/node()")

    fields = {}
    fields['title'] = copy_to_unicode(title_node)
    fields['contributors'] = get_contributors(root)
    fields['properties'] = get_properties(root)
    fields['description'] = copy_to_unicode(abstract_node)
    fields['id'] = get_ids(root, raw_doc)
    fields['tags'] = get_tags(root)
    fields['source'] = NAME
    fields['dateCreated'] = get_date_created(root)
    fields['dateUpdated'] = get_date_updated(root)

    # A record whose id carries no URL is dropped instead of normalized.
    if fields['id']['url'] == u'':
        return None

    return NormalizedDocument(fields)


if __name__ == '__main__':
    print(lint(consume, normalize))