def test_zaincremental_harvester(self):
    """Run gather and fetch for an incremental source and expect records.

    The OAI-PMH client class is replaced by a mock that returns a
    ``ServerClient`` wrapping a local ``BatchingServer``.  The harvest job
    has ``gather_started`` set one day ahead of now, and the ``footest``
    package is created on a revision stamped two days ahead of now.
    """
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    fake_client = mock.Mock(
        return_value=ServerClient(serv, metadata_registry))

    # Patch through a context manager so the real Client class is restored
    # when the test ends instead of leaking into whatever runs afterwards.
    with mock.patch.object(oaipmh.client, 'Client', fake_client):
        harv = OAIPMHHarvester()

        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)

        rev = model.repo.new_revision()
        rev.timestamp = datetime.now() + timedelta(days=2)
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()

        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()

        gathered = harv.gather_stage(harvest_job)
        # Fail with a clear assertion rather than an IndexError below.
        self.assertTrue(gathered)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)

    harvobj = json.loads(harvest_object.content)
    # assertTrue replaces the deprecated assert_ alias.
    self.assertTrue(harvobj['records'])
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    The object's content is JSON with a ``domain`` key and, when the fetch
    succeeded, ``records`` and ``set_name`` keys.  Each record is an
    ``(identifier, metadata, _)`` triple; records with empty metadata are
    skipped.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)

    if 'records' not in master_data:
        self._save_object_error('Could not receive any objects from fetch!',
                                harvest_object, stage='Import')
        return False

    records = master_data['records']
    set_name = master_data['set_name']
    slug_chars = string.ascii_letters + '_'

    for identifier, metadata, _ in records:
        if not metadata:
            continue

        titles = metadata['title']
        descriptions = metadata['description']
        creators = metadata['creator']

        title = titles[0] if len(titles) else identifier
        # NOTE(review): calling str methods on the encoded value looks
        # written for Python 2 byte strings - confirm before porting.
        norm_title = unicodedata.normalize('NFKD', title)\
            .encode('ASCII', 'ignore')\
            .lower().replace(' ', '_')[:35]
        # Only ASCII letters and underscores survive, so digits and
        # punctuation are dropped and distinct titles can share one name.
        name = ''.join(ch for ch in norm_title if ch in slug_chars)
        creator = creators[0] if len(creators) else ''
        description = descriptions[0] if len(descriptions) else ''

        pkg = Package.by_name(name)
        if not pkg:
            pkg = Package(name=name, title=title)

        extras = {}
        for key, value in metadata.items():
            if not value:
                continue
            if key in ('subject', 'type'):
                # Subject and type values become tags, not extras.
                for tag in value:
                    if not tag:
                        continue
                    tag = munge_tag(tag[:100])
                    tag_obj = model.Tag.by_name(tag)
                    if not tag_obj:
                        tag_obj = model.Tag(name=tag)
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    Session.add(tag_obj)
                    Session.add(pkgtag)
            else:
                extras[key] = ' '.join(value)

        pkg.author = creator
        pkg.author_email = creator
        pkg.title = title
        pkg.notes = description
        pkg.extras = extras
        pkg.url = \
            "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
            % (harvest_object.job.source.url, identifier)
        pkg.save()
        harvest_object.package_id = pkg.id
        Session.add(harvest_object)
        setup_default_user_roles(pkg)

        # The last identifier that looks like a web link becomes the
        # resource URL.  https links are accepted as well as http ones.
        url = ''
        for ids in metadata['identifier']:
            if ids.startswith(('http://', 'https://')):
                url = ids
        # Unlike the package title, the resource name has no identifier
        # fallback: it is empty when the record carries no title.
        resource_name = titles[0] if len(titles) else ''
        pkg.add_resource(url, description=description, name=resource_name)

        group.add_package_by_name(pkg.name)
        subg_name = "%s - %s" % (domain, set_name)
        subgroup = Group.by_name(subg_name)
        if not subgroup:
            subgroup = Group(name=subg_name, description=subg_name)
        subgroup.add_package_by_name(pkg.name)
        Session.add(group)
        Session.add(subgroup)
        setup_default_user_roles(group)
        setup_default_user_roles(subgroup)

    model.repo.commit()
    return True
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from harvested OAI-PMH metadata.

    ``data`` is read for the keys ``identifier``, ``metadata`` (a mapping of
    metadata prefix to field dict, which must contain ``oai_dc``),
    ``package_name``, ``package_url``, ``package_resource`` and
    ``package_xml_save`` (the last two keyed by metadata prefix).

    :param data: dict describing one harvested record (see above)
    :param namespaces: passed through to the ``_handle_*`` helpers
    :param group: group the package is added to; skipped when falsy
    :param harvest_object: HarvestObject to link to the package; skipped
        when falsy
    :returns: id of the created or updated package
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = titles.get('title_0', identifier)
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Still-relevant resources are added again below, so mark every
        # existing resource as deleted first.
        for res in pkg.resources:
            res.state = 'deleted'

    # The title dict doubles as the extras dict; everything below adds to it.
    extras = titles
    idx = 0
    for field in ('subject', 'type'):
        for raw_tag in metadata_oai_dc.get(field, []):
            # Turn each subject or type field into its own tag.
            tagi = raw_tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith(('http://', 'https://')):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tag_name in tags:
                tag_name = tag_name[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tag_name)
                if not tag_obj:
                    tag_obj = model.Tag(name=tag_name)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    # Saving now avoids duplicates if tags have duplicates.
                    pkgtag.save()

    for lastidx, auth in enumerate(metadata_oai_dc.get('creator', [])):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth

    extras.update(_handle_contributor(
        metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(
        metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs to the package itself, not to extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras.pop('package.maintainer_email')
    extras.update(_handle_rights(
        metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras.pop('package.license')

    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    identifiers = metadata_oai_dc.get('identifier', [])
    for id_idx, ident in enumerate(identifiers):
        extras['identifier_%i' % id_idx] = ident

    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]

    # The rest: every remaining non-empty field of every metadata format is
    # stored as an extra.  Description goes to pkg.notes instead.
    handled_keys = ('titleNode', 'subject', 'type', 'rightsNode',
                    'publisherNode', 'creator', 'contributorNode',
                    'description', 'identifier', 'language', 'formatNode')
    for fields in data['metadata'].values():
        for key, value in fields.items():
            if key in handled_keys or value is None or len(value) == 0:
                continue
            extras[key] = ' '.join(value)

    # Extras are assigned only after all of them have been collected.
    # NOTE(review): this assumes assigning pkg.extras copies the dict, so
    # keys added to the local dict afterwards would be lost - confirm.
    if 'date' in extras:
        pkg.version = extras.pop('date')
    pkg.extras = extras
    pkg.url = data['package_url']

    # Metadata may have different identifiers, pick link, if exists.
    # Reuses the .get() result so a missing key does not raise KeyError.
    for ids in identifiers:
        if ids.startswith(('http://', 'https://')):
            pkg.add_resource(ids, name=pkg.title, format='html')

    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)

    # Take the description from oai_dc explicitly instead of from whichever
    # metadata format the loop above happened to visit last.
    notes = ' '.join(metadata_oai_dc.get('description', []))
    # Flatten newlines and collapse the double spaces this can produce.
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')

    for mdp, resource in data['package_resource'].items():
        xml_save = data['package_xml_save'][mdp]
        ofs = get_ofs()
        ofs.put_stream(BUCKET, xml_save['label'], xml_save['xml'], {})
        pkg.add_resource(**resource)

    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    model.repo.commit()
    return pkg.id
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from one harvested oai_dc record.

    ``data`` is read for the keys ``identifier``, ``metadata`` (field name
    to list of values), ``package_name``, ``package_url`` and, optionally,
    ``package_resource`` together with ``package_xml_save``.

    :param data: dict describing one harvested record (see above)
    :param namespaces: passed through to the ``_handle_*`` helpers
    :param group: group the package is added to; skipped when None
    :param harvest_object: HarvestObject to link to the package; skipped
        when None
    :returns: id of the created or updated package
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    # Fall back to the whole identifier when there is no title; indexing the
    # fallback itself would yield only its first character.
    titles = metadata.get('title') or []
    title = titles[0] if titles else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Still-relevant resources are added again below, so mark every
        # existing resource as deleted first.
        for res in pkg.resources:
            res.state = 'deleted'

    extras = {}
    idx = 0
    for field in ('subject', 'type'):
        for raw_tag in metadata.get(field, []):
            # Turn each subject or type field into its own tag.
            tagi = raw_tag.strip()
            if tagi.startswith(('http://', 'https://')):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tag_name in tags:
                tag_name = tag_name[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tag_name)
                if not tag_obj:
                    tag_obj = model.Tag(name=tag_name)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id
                ).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    # Saving now avoids duplicates if tags have duplicates.
                    pkgtag.save()

    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs to the package itself, not to extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras.pop('package.maintainer_email')
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras.pop('package.license')

    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang is not None and len(lang) and len(lang[0]) > 1:
        pkg.language = lang[0]

    # The rest: the first value of every remaining non-empty field is stored
    # as an extra.  Description goes to pkg.notes instead.
    handled_keys = ('title', 'description', 'publisherNode',
                    'contributorNode', 'formatNode', 'identifier', 'source',
                    'rightsNode')
    for key, value in metadata.items():
        if key in handled_keys or value is None or len(value) == 0:
            continue
        extras[key] = value[0]

    notes = ' '.join(metadata.get('description', []))
    # Flatten newlines and collapse the double spaces this can produce.
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')

    if 'date' in extras:
        # The date is used as the version and kept in extras as "modified".
        pkg.version = extras['date']
        extras['modified'] = extras.pop('date')
    pkg.extras = extras
    pkg.url = data['package_url']

    if 'package_resource' in data:
        try:
            ofs = get_ofs()
            ofs.put_stream(BUCKET, data['package_xml_save']['label'],
                           data['package_xml_save']['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError as err:
            # Storing the XML stays best-effort, but leave a trace instead
            # of discarding the failure silently.
            log.debug('Skipping XML resource for %s, missing key: %s'
                      % (name, err))

    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    # Metadata may have different identifiers, pick link, if exists.
    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv',
                         'txt', 'xml', 'json', 'html']
    default_format = 'html'
    # .get() keeps a record without identifiers from raising KeyError.
    for ids in metadata.get('identifier', []):
        if not ids.startswith(('http://', 'https://')):
            continue
        # The end of the URL must be the format, otherwise it will use
        # "html" by default.  When several formats match, the one listed
        # last in available_formats wins.
        infer_format = default_format
        for ext in available_formats:
            if ids.endswith(ext):
                infer_format = ext
        pkg.add_resource(ids, name=pkg.title, format=infer_format)

    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)

    model.repo.commit()
    return pkg.id