def _run_import(self, xml, job):
    """Build a HarvestObject from the CMDI file *xml*, run the
    harvester's import stage on it and return the (saved) object."""
    # The harvester needs a sysadmin user and a target organization;
    # create both lazily on first use.
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')({'user': '******'},
                                          {'name': 'test'})

    metadata = CmdiReader()(_get_record(xml))
    metadata['unified']['owner_org'] = "test"

    hobj = HarvestObject()
    hobj.content = json.dumps(metadata.getMap())
    hobj.id = xml
    hobj.guid = xml
    hobj.source = job.source
    hobj.harvest_source_id = None
    hobj.job = job
    hobj.save()

    self.harvester.import_stage(hobj)
    return hobj
def test_import(self):
    """End-to-end import of two CMDI records followed by a delete."""
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    # First record: check every mapped field.
    harvest_object = self._run_import("cmdi_1.xml", job)
    self.assertEquals(
        len(harvest_object.errors), 0,
        u"\n".join(unicode(error.message)
                   for error in (harvest_object.errors or [])))

    package = get_action('package_show')(
        {'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'})
    self.assertEquals(package.get('id', None),
                      'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('name', None),
                      'urn-nbn-fi-lb-20140730180')
    self.assertEquals(package.get('notes', None),
                      u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                    u'provider': provider,
                    u'type': u'metadata'}
    self.assertTrue(expected_pid in package.get('pids'))

    model.Session.flush()

    # Second record: temporal coverage and license.
    harvest_object = self._run_import("cmdi_2.xml", job)
    self.assertEquals(
        len(harvest_object.errors), 0,
        u"\n".join(unicode(error.message)
                   for error in (harvest_object.errors or [])))

    package = get_action('package_show')(
        {'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'})
    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete the second package via a "deleted" harvest object.
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()
    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def test_harvester_urlerror(self): harv, job = self._create_harvester() urllib2.urlopen = realopen self.assert_(harv.gather_stage(job) == None) errs = Session.query(HarvestGatherError).all() self.assert_(len(errs) == 1) harv_obj = HarvestObject() harv_obj.job = job harv_obj.content = json.dumps({'url': "http://foo"}) # XML error and URL error, also the lack of url in content self.assert_(harv.import_stage(harv_obj) == False) errs = Session.query(HarvestObjectError).all() print errs self.assert_(len(errs) == 1)
def gather_stage(self, harvest_job):
    """Gather the URLs to fetch from a URL which has a list of links to
    XML documents containing the DDI documents.

    Returns a list of saved HarvestObject ids, or None (after
    recording a gather error) when the source URL cannot be read.
    """
    self._set_config(self.config)

    # Most recent finished job for the same source, excluding this
    # one — its presence switches this run to incremental mode.
    previous_job = (
        Session.query(HarvestJob)
        .filter(HarvestJob.source == harvest_job.source)
        .filter(HarvestJob.gather_finished != None)
        .filter(HarvestJob.id != harvest_job.id)
        .order_by(HarvestJob.gather_finished.desc())
        .limit(1)
        .first()
    )
    if previous_job:
        self.incremental = True

    gather_url = harvest_job.source.url
    try:
        # The source URL serves a plain list of document URLs,
        # one per line.
        urls = urllib2.urlopen(gather_url)
        harvest_objs = []
        for url in urls.readlines():
            gather = True
            if self.incremental:
                # HEAD request: read Last-Modified without
                # downloading the document body.
                request = urllib2.Request(url)
                request.get_method = lambda: "HEAD"
                doc_url = urllib2.urlopen(request)
                lastmod = parser.parse(
                    doc_url.headers["last-modified"], ignoretz=True)
                # NOTE(review): this skips documents modified AFTER
                # the previous gather, which looks inverted for an
                # incremental harvest — confirm intended direction.
                if previous_job.gather_finished < lastmod:
                    log.debug("Gather false")
                    gather = False
            # NOTE(review): `not self.incremental` means incremental
            # runs never create harvest objects, leaving the `gather`
            # flag above dead — presumably `if gather:` was intended;
            # verify before changing.
            if gather and not self.incremental:
                harvest_obj = HarvestObject()
                harvest_obj.content = json.dumps({"url": url})
                harvest_obj.job = harvest_job
                harvest_obj.save()
                harvest_objs.append(harvest_obj.id)
    except urllib2.URLError:
        self._save_gather_error(
            "Could not gather XML files from URL!", harvest_job)
        return None
    return harvest_objs
def _run_import(self, xml, job):
    """Run the harvester import stage for the CMDI file *xml* under
    *job* and return the saved HarvestObject."""
    # Bootstrap the sysadmin user and owner organization if missing.
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')(
            {'user': '******'}, {'name': 'test'})

    record = _get_record(xml)
    metadata = CmdiReader()(record)
    metadata['unified']['owner_org'] = "test"

    obj = HarvestObject()
    obj.content = json.dumps(metadata.getMap())
    obj.id = obj.guid = xml
    obj.source = job.source
    obj.harvest_source_id = None
    obj.job = job
    obj.save()

    self.harvester.import_stage(obj)
    return obj
def test_import(self):
    """Import two CMDI records, check their mapped fields, then
    delete the second one via a "deleted" harvest object."""
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    def _errors_as_text(hobj):
        # Collapse all harvest object errors into one message string.
        return u"\n".join(unicode(error.message)
                          for error in (hobj.errors or []))

    # First record.
    harvest_object = self._run_import("cmdi_1.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']
    self.assertEquals(len(harvest_object.errors), 0,
                      _errors_as_text(harvest_object))

    package = get_action('package_show')({'user': '******'},
                                         {'id': package_id})
    self.assertEquals(package.get('name', None),
                      utils.pid_to_name(package.get('id', None)))
    self.assertEquals(utils.get_primary_pid(package),
                      u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('notes', None),
                      u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    unexpected_pid = {
        u'id': u'http://islrn.org/resources/248-895-085-557-0',
        u'provider': provider,
        u'type': u'relation',
        u'relation': u'generalRelation'
    }
    self.assertTrue(unexpected_pid not in package.get('pids'))

    model.Session.flush()

    # Second record.
    harvest_object = self._run_import("cmdi_2.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']
    self.assertEquals(len(harvest_object.errors), 0,
                      _errors_as_text(harvest_object))

    package = get_action('package_show')({'user': '******'},
                                         {'id': package_id})
    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()
    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
obj = HarvestObject(guid=pkg_dict['id'], job=harvest_job, content=json.dumps(pkg_dict)) obj.save() object_ids.append(obj.id) for deleted_id in deleted_ids: # Original harvest object needs to be updated log.debug('Creating deleting HarvestObject for %s', deleted_id) obj = model.Session.query(HarvestObject)\ .filter( HarvestObject.current == True # noqa )\ .filter(HarvestObject.guid == deleted_id).one() obj.job = harvest_job obj.content = '{"id":"%s", "delete":true}' % deleted_id obj.save() object_ids.append(obj.id) return object_ids except Exception, e: self._save_gather_error('%r' % e.message, harvest_job) def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None): '''Does a dataset search on a remote CKAN and returns the results. Deals with paging to return all the results, not just the first page. ''' base_search_url = remote_ckan_base_url + self._get_search_api_offset() params = {'rows': '100', 'start': '0'}