Exemplo n.º 1
0
    def test_zaincremental_harvester(self):
        """Gather and fetch from a source configured as incremental.

        Builds a harvest job whose source config is ``{"incremental":"True"}``,
        runs the gather and fetch stages of ``OAIPMHHarvester`` against a
        mocked OAI-PMH client, and checks that the fetched harvest object's
        JSON content has a truthy ``'records'`` entry.
        """
        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        # Any code that instantiates oaipmh.client.Client from here on gets
        # the ServerClient wrapping the batching server built above.
        # NOTE(review): this replacement is not undone in this method, so it
        # outlives the test unless it is restored elsewhere -- confirm.
        oaipmh.client.Client = mock.Mock(
            return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()

        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        # The job's gather start is placed one day in the future.
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)

        # The package revision is stamped two days in the future, i.e. later
        # than gather_started above; presumably this makes the package count
        # as changed for the incremental harvest -- TODO confirm.
        rev = model.repo.new_revision()
        rev.timestamp = datetime.now() + timedelta(days=2)
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()

        # Assumes a group named 'roger' already exists (fixture data) --
        # verify against the test setup.
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()

        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        # assertTrue replaces the deprecated ``assert_`` alias.
        self.assertTrue(harvobj['records'])
Exemplo n.º 2
0
class MetadataHarvester(HarvesterBase):
    """Harvester that delegates every stage to an OAI-PMH or a DDI harvester.

    ``gather_stage`` picks the delegate by calling ``identify()`` on an
    OAI-PMH client for the source URL.  The jsonpickle-encoded delegate is
    stored under the ``'harv'`` key of each harvest object's JSON content so
    that ``fetch_stage`` and ``import_stage`` can restore it.
    """

    config = None
    # Delegate harvester selected by gather_stage or restored from a harvest
    # object's content by the later stages.
    harvester = None

    def __init__(self):
        self.harvester = None

    def info(self):
        """Return the name, title and description of this harvester."""
        return {
            'name': 'Metadata',
            'title': 'Metadata harvester',
            'description': 'Universal metadata harvester for various formats',
            }

    def _attach_harvester(self, harvest_object):
        """Write the encoded delegate into the object's content and save it."""
        content = json.loads(harvest_object.content)
        content['harv'] = jsonpickle.encode(self.harvester)
        harvest_object.content = json.dumps(content)
        harvest_object.save()

    def _restore_harvester(self, harvest_object):
        """Set ``self.harvester`` from the ``'harv'`` entry of the content."""
        # NOTE(security): jsonpickle.decode can instantiate arbitrary objects.
        # This is only safe if harvest object content can never be written
        # by an untrusted party -- confirm.
        stored = json.loads(harvest_object.content)['harv']
        self.harvester = jsonpickle.decode(stored)

    def gather_stage(self, harvest_job):
        """Choose the delegate harvester and run its gather stage.

        :param harvest_job: job whose ``source.url`` is probed and gathered.
        :returns: list of harvest object ids; ``None`` when the source could
            not be identified; or the delegate's own result unchanged when
            that result is falsy (``None`` or empty).
        """
        url = harvest_job.source.url
        # Test whether we should use OAI-PMH or DDI
        metadata_registry = MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(url, metadata_registry)
        try:
            client.identify()
        except XMLSyntaxError:
            self.harvester = DDIHarvester()
        except urllib2.URLError:
            self._save_gather_error('Could not identify source!', harvest_job)
            return None
        else:
            # Choose the delegate afresh on every call: reusing a delegate
            # left over from an earlier stage call on this instance could run
            # a DDI harvester against an OAI-PMH source.
            self.harvester = OAIPMHHarvester()

        gathered_ids = self.harvester.gather_stage(harvest_job)
        if not gathered_ids:
            # Nothing gathered or the delegate reported failure; pass that
            # through instead of iterating over None.
            return gathered_ids

        ret = []
        for object_id in gathered_ids:
            harvest_object = HarvestObject.get(object_id)
            self._attach_harvester(harvest_object)
            ret.append(harvest_object.id)
        return ret

    def fetch_stage(self, harvest_object):
        """Restore the delegate, run its fetch stage and re-store the delegate.

        :param harvest_object: object whose JSON content has a ``'harv'`` key.
        :returns: whatever the delegate's ``fetch_stage`` returned.
        """
        self._restore_harvester(harvest_object)
        fetched = self.harvester.fetch_stage(harvest_object)
        # Re-read the content after the delegate's fetch stage (it is handed
        # the object and may have changed it) and store the delegate again.
        self._attach_harvester(harvest_object)
        return fetched

    def import_stage(self, harvest_object):
        """Restore the delegate and return the result of its import stage.

        :param harvest_object: object whose JSON content has a ``'harv'`` key.
        """
        self._restore_harvester(harvest_object)
        return self.harvester.import_stage(harvest_object)