def test_zaincremental_harvester(self):
    """Gather and fetch an incremental OAI-PMH harvest against a mocked server.

    ``oaipmh.client.Client`` is replaced by a mock that returns a
    ``ServerClient`` wrapping an in-process ``BatchingServer``.  The
    replacement is scoped to this test so that it does not leak into
    other tests.  The test passes when the fetched harvest object content
    holds a non-empty ``records`` entry.
    """
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    fake_client = mock.Mock(
        return_value=ServerClient(serv, metadata_registry))

    # patch.object restores the real Client class when the block exits,
    # even if an assertion or the harvester itself raises.
    with mock.patch.object(oaipmh.client, 'Client', fake_client):
        harv = OAIPMHHarvester()

        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        # The job start is placed one day in the future ...
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)

        # ... and the package revision two days in the future.
        # NOTE(review): presumably so the package counts as changed after
        # the job start for the incremental harvest -- confirm against
        # OAIPMHHarvester.
        rev = model.repo.new_revision()
        rev.timestamp = datetime.now() + timedelta(days=2)
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()

        # NOTE(review): assumes a group named 'roger' already exists in
        # the test fixtures -- Group.get() is not checked for None here.
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()

        gathered = harv.gather_stage(harvest_job)
        # Fail with a clear assertion instead of an IndexError/TypeError
        # when the gather stage produced nothing.
        self.assertTrue(gathered)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assertTrue(harvobj['records'])
class MetadataHarvester(HarvesterBase):
    """Harvester front end that delegates to an OAI-PMH or a DDI harvester.

    The gather stage probes the source URL with an OAI-PMH ``identify``
    request and picks the delegate from the outcome.  The chosen delegate
    is serialized with ``jsonpickle`` into the ``'harv'`` key of every
    harvest object's JSON content, and the fetch and import stages restore
    it from there before delegating.
    """

    config = None
    harvester = None

    def __init__(self):
        self.harvester = None

    def info(self):
        """Return the name, title and description of this harvester."""
        return {
            'name': 'Metadata',
            'title': 'Metadata harvester',
            'description': 'Universal metadata harvester for various formats',
        }

    def _store_harvester(self, harvest_object):
        """Serialize the current delegate into the object's content and save."""
        content = json.loads(harvest_object.content)
        content['harv'] = jsonpickle.encode(self.harvester)
        harvest_object.content = json.dumps(content)
        harvest_object.save()

    def _load_harvester(self, harvest_object):
        """Restore the delegate stored in the object's content and return it."""
        content = json.loads(harvest_object.content)
        # NOTE(security): jsonpickle.decode can instantiate arbitrary
        # classes.  The 'harv' value is written by _store_harvester, but
        # anything able to alter stored harvest object content could
        # inject a payload here -- review before exposing this to
        # untrusted data.
        self.harvester = jsonpickle.decode(content['harv'])
        return self.harvester

    def gather_stage(self, harvest_job):
        """Select a delegate for the job's source and run its gather stage.

        The delegate is ``OAIPMHHarvester`` when ``identify`` succeeds and
        ``DDIHarvester`` when the response raises ``XMLSyntaxError``.

        :param harvest_job: job whose ``source.url`` is probed.
        :returns: list of harvest object ids; ``None`` when the source
            cannot be reached (a gather error is saved); or the delegate's
            own falsy result when it gathers nothing.
        """
        url = harvest_job.source.url
        # Test whether we should use OAI-PMH or DDI.
        metadata_registry = MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(url, metadata_registry)
        try:
            client.identify()
        except XMLSyntaxError:
            self.harvester = DDIHarvester()
        except urllib2.URLError:
            self._save_gather_error('Could not identify source!', harvest_job)
            return None
        else:
            # Always pick a fresh delegate: a harvester left on the
            # instance by an earlier gather or fetch must not be reused
            # for a source that answers OAI-PMH.
            self.harvester = OAIPMHHarvester()

        object_ids = self.harvester.gather_stage(harvest_job)
        if not object_ids:
            # Nothing gathered (or the delegate returned None); pass the
            # result through instead of failing while iterating it.
            return object_ids

        gathered_ids = []
        for object_id in object_ids:
            harvest_object = HarvestObject.get(object_id)
            self._store_harvester(harvest_object)
            gathered_ids.append(harvest_object.id)
        return gathered_ids

    def fetch_stage(self, harvest_object):
        """Run the stored delegate's fetch stage on ``harvest_object``.

        The delegate is serialized back into the object's content
        afterwards, so any state it changed while fetching is available to
        the import stage.

        :returns: whatever the delegate's ``fetch_stage`` returned.
        """
        harvester = self._load_harvester(harvest_object)
        fetched = harvester.fetch_stage(harvest_object)
        self._store_harvester(harvest_object)
        return fetched

    def import_stage(self, harvest_object):
        """Run the stored delegate's import stage and return its result."""
        harvester = self._load_harvester(harvest_object)
        return harvester.import_stage(harvest_object)