def setup_class(cls):
    try:
        from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    cls.content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content2)

    hoe = HarvestObjectExtra(
        key='original_document',
        value=cls.original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    cls.object_id_1 = ho1.id
    cls.object_id_2 = ho2.id
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('z3950Harvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    source_url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get current objects out of the db
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = dict((res[0], res[1]) for res in query)
    current_guids = set(guid_to_package_id.keys())
    current_guids_in_harvest = set()

    # Get contents
    try:
        conn = zoom.Connection(source_url,
                               int(self.source_config.get('port', 210)))
        conn.databaseName = self.source_config.get('database', '')
        conn.preferredRecordSyntax = 'XML'
        conn.elementSetName = 'T'
        query = zoom.Query('CCL', 'metadata')
        res = conn.search(query)
        ids = []
        for num, result in enumerate(res):
            hash = hashlib.md5(result.data).hexdigest()
            if hash in current_guids:
                current_guids_in_harvest.add(hash)
            else:
                obj = HarvestObject(
                    job=harvest_job,
                    guid=hash,
                    extras=[
                        HOExtra(key='status', value='new'),
                        HOExtra(key='original_document',
                                value=result.data.decode('latin-1')),
                        HOExtra(key='original_format', value='fgdc')
                    ])
                obj.save()
                ids.append(obj.id)
        for guid in (current_guids - current_guids_in_harvest):
            obj = HarvestObject(
                job=harvest_job,
                guid=guid,
                package_id=guid_to_package_id[guid],
                extras=[HOExtra(key='status', value='delete')])
            obj.save()
            ids.append(obj.id)
        return ids
    except Exception, e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (source_url, e), harvest_job)
        return None
def _gather_entry(self, entry, auth=None):
    # Create a harvest object for each entry
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package_query = Session.query(Package)
    query_filtered = package_query.filter(Package.name == entry_name)
    package = query_filtered.first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))
            status = 'unchanged'

        obj = HarvestObject(guid=entry_guid,
                            job=self.job,
                            extras=[
                                HOExtra(key='status', value=status),
                                HOExtra(key='restart_date', value=entry_restart_date)
                            ])
        obj.content = entry['content']
        obj.package = package
        obj.save()
        return obj.id

    elif not package:
        # It's a product we haven't harvested before.
        log.debug(
            '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
            format(entry_name))
        obj = HarvestObject(guid=entry_guid,
                            job=self.job,
                            extras=[
                                HOExtra(key='status', value='new'),
                                HOExtra(key='restart_date', value=entry_restart_date)
                            ])
        obj.content = entry['content']
        obj.package = None
        obj.save()
        return obj.id
def gather_stage(self, harvest_job):
    if harvest_job.source.url.startswith('basic_test'):
        obj = HarvestObject(guid='test1', job=harvest_job)
        obj.extras.append(HarvestObjectExtra(key='key', value='value'))
        obj2 = HarvestObject(guid='test2', job=harvest_job)
        obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
        obj.add()
        obj2.add()
        obj3.save()  # this will commit both
        return [obj.id, obj2.id, obj3.id]
    return []
def gather_stage(self, harvest_job):
    log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True
    package_ids = []

    data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    doc = html.parse(data)

    for td in doc.findall("//td[@class='left_p12_title']/a"):
        link = td.get('href')
        if re.match(r"/search/fsciitem", link):
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            package_ids.append(obj.id)

    self._set_config(harvest_job.source.config)

    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source == harvest_job.source) \
        .filter(HarvestJob.gather_finished != None) \
        .filter(HarvestJob.id != harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    return package_ids
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In KoelnCKANHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/3/action/package_list' content = self._get_content(package_list_url) content_json = json.loads(content) package_ids = content_json['result'] try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)

    url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
    get_all_packages = True

    try:
        package_ids = []
        dataset_count = self._get_ntpc_dataset_count(url)
        msg_count = 0

        for x in range(dataset_count / 10 + 1):
            page_url = url + '?currentPage=%s' % (x + 1)
            data = urllib2.urlopen(page_url)
            doc = html.parse(data)
            for div in doc.findall("//a[@href]"):
                if '/NTPC/od/query;' in div.attrib['href']:
                    link = div.attrib['href']
                    id = sha1(link).hexdigest()
                    obj = HarvestObject(guid=id, job=harvest_job, content=link)
                    obj.save()
                    package_ids.append(obj.id)
                    msg_count = msg_count + 1

        if msg_count == 0:
            self._save_gather_error('No packages received for URL: %s' % url,
                                    harvest_job)
            return None
        return package_ids
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True

    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)

    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % dcatUrl, harvest_job)
            return None
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(guid=data.get('guid'),
                        content=data.get('content'),
                        job=data['job_id'],
                        harvest_source_id=data.get('source_id'),
                        package_id=data.get('package_id'),
                        extras=[
                            HarvestObjectExtra(key=k, value=v)
                            for k, v in data.get('extras', {}).items()
                        ])

    obj.save()
    return harvest_object_dictize(obj, context)
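# A minimal usage sketch (not part of the original source): the action above
# would normally be reached through CKAN's action layer. The job id, guid and
# content values below are hypothetical placeholders.
def _example_call_harvest_object_create(job_id):
    from ckan.plugins import toolkit

    # ignore_auth bypasses check_access for this illustrative call
    return toolkit.get_action('harvest_object_create')(
        {'ignore_auth': True},
        {'job_id': job_id,
         'guid': 'example-guid',
         'content': '<xml>example</xml>',
         'extras': {'status': 'new'}})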
def gather_stage(self, harvest_job):
    log.debug('In ZhstatHarvester gather_stage')

    ids = []
    parser = etree.XMLParser(encoding='utf-8')
    for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):
        # Get the german data if one is available,
        # otherwise get the first one
        base_datas = dataset.xpath("data[@xml:lang='de']")
        if len(base_datas) != 0:
            base_data = base_datas[0]
        else:
            base_data = dataset.find('data')

        metadata = self._generate_metadata(base_data, dataset)

        if metadata:
            obj = HarvestObject(guid=dataset.get('id'),
                                job=harvest_job,
                                content=json.dumps(metadata))
            obj.save()
            log.debug('adding ' + dataset.get('id') + ' to the queue')
            ids.append(obj.id)
        else:
            log.debug(
                'Skipping %s since no resources or groups are available'
                % dataset.get('id'))

    return ids
def _gather_object(self, job, url, size, start_date, forecast_date):
    filename = parse_filename(url)
    filename_id = (filename.replace('-v02.0-fv02.0', '')
                           .replace('-fv02.0', '')
                           .replace('-sv01.00', '')
                           .replace('-sv05.00', '')
                           .replace('-v02', '')
                           .replace('-sv10.00', '')
                           .replace('-sv09.00', '')
                           .replace('-sv07.00', ''))

    status, package = self._was_harvested(filename_id, self.update_all)

    extras = [HOExtra(key='status', value=status)]

    assert start_date
    content = json.dumps(
        {
            'identifier': filename_id,
            'ftp_link': url,
            'size': size,
            'start_date': start_date,
            'forecast_date': forecast_date,
            'restart_date': start_date
        },
        default=str)

    obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
    obj.package = package
    obj.save()
    return obj.id
def _run_import(self, xml, job):
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')({'user': '******'}, {'name': 'test'})

    record = _get_record(xml)

    metadata = CmdiReader()(record)
    metadata['unified']['owner_org'] = "test"

    harvest_object = HarvestObject()
    harvest_object.content = json.dumps(metadata.getMap())
    harvest_object.id = xml
    harvest_object.guid = xml
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.save()

    self.harvester.import_stage(harvest_object)
    return harvest_object
def _create_harvest_object(self, package_name, ref):
    package = model.Package.by_name(unicode(package_name))
    model.Session.add(
        HarvestObject(guid='not important',
                      current=True,
                      source=self.source,
                      job=self.job,
                      harvest_source_reference=ref_prefix + ref,
                      package=package)
    )
def doi_update(context, data_dict):
    model = context['model']
    new_package = data_dict
    source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest()

    old_package = p.toolkit.get_action('package_show')(
        {'model': model, 'ignore_auth': True},
        {"id": new_package['id']})

    for extra in old_package['extras']:
        if extra['key'] == 'source_hash':
            old_source_hash = extra['value']
            break
    else:
        old_source_hash = None

    if source_hash == old_source_hash and old_package.get('state') == 'active':
        print str(datetime.datetime.now()) + ' No change for doi id ' + new_package['id']
        return

    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({
        "key": "source_doi_import_identifier",
        "value": True
    })
    new_package.pop("name", None)

    owner_org = model.Group.get(
        ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(datetime.datetime.now()) + ' Fail to update doi id ' + \
            new_package['id'] + '. Organization ' + \
            new_package['organization']['name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name

    group_name = new_package.pop('owner_name', None)

    resources = []
    for resource in new_package['resources']:
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources

    obj = HarvestObject(guid=uuid.uuid4().hex,
                        job=context['harvest_job'],
                        content=context['harvestobj'])
    obj.save()

    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})

    context['return_id_only'] = True
    p.toolkit.get_action('package_update')(context, new_package)

    print str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id']
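# A small illustrative check (not from the original code) of why the
# source_hash above is built with sort_keys=True: the digest then depends only
# on the dict's content, not on key order. The dicts and helper name are
# hypothetical.
def _source_hash_is_order_independent():
    import hashlib
    import json

    a = {'title': 'x', 'id': '1'}
    b = {'id': '1', 'title': 'x'}
    digest_a = hashlib.sha1(json.dumps(a, sort_keys=True).encode('utf-8')).hexdigest()
    digest_b = hashlib.sha1(json.dumps(b, sort_keys=True).encode('utf-8')).hexdigest()
    return digest_a == digest_b  # True: identical content, identical hash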
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    url = harvest_job.source.url

    # Get contents
    try:
        content = self._get_content(url)
    except Exception as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (url, e), harvest_job)
        return None

    ids = []
    try:
        for url in self._extract_urls(content, url):
            try:
                content = self._get_content(url)
            except Exception as e:
                msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e)
                self._save_gather_error(msg, harvest_job)
                continue
            else:
                # We need to extract the guid to pass it to the next stage
                try:
                    gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                        content, url)
                    if gemini_guid:
                        log.debug('Got GUID %s' % gemini_guid)
                        # Create a new HarvestObject for this identifier.
                        # Generally the content will be set in the fetch stage,
                        # but as we already have it, we might as well save a request
                        obj = HarvestObject(guid=gemini_guid,
                                            job=harvest_job,
                                            content=gemini_string)
                        obj.save()
                        ids.append(obj.id)
                except Exception as e:
                    msg = 'Could not get GUID for source %s: %r' % (url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
    except Exception as e:
        msg = 'Error extracting URLs from %s' % url
        self._save_gather_error(msg, harvest_job)
        return None

    if len(ids) > 0:
        return ids
    else:
        self._save_gather_error(
            'Couldn\'t find any links to metadata files', harvest_job)
        return None
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.CSW.gather')
    log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)

    # Get source URL
    url = harvest_job.source.url

    try:
        self._setup_csw_client(url)
    except Exception as e:
        self._save_gather_error('Error contacting the CSW server: %s' % e,
                                harvest_job)
        return None

    log.debug('Starting gathering for %s' % url)
    used_identifiers = []
    ids = []
    try:
        for identifier in self.csw.getidentifiers(page=10):
            try:
                log.info('Got identifier %s from the CSW', identifier)
                if identifier in used_identifiers:
                    log.error('CSW identifier %r already used, skipping...'
                              % identifier)
                    continue
                if identifier is None:
                    log.error('CSW returned identifier %r, skipping...'
                              % identifier)
                    ## log an error here? happens with the dutch data
                    continue

                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=identifier, job=harvest_job)
                obj.save()

                ids.append(obj.id)
                used_identifiers.append(identifier)
            except Exception as e:
                self._save_gather_error(
                    'Error for the identifier %s [%r]' % (identifier, e),
                    harvest_job)
                continue
    except Exception as e:
        log.error('Exception: %s' % text_traceback())
        self._save_gather_error(
            'Error gathering the identifiers from the CSW server [%s]'
            % six.text_type(e), harvest_job)
        return None

    if len(ids) == 0:
        self._save_gather_error('No records received from the CSW server',
                                harvest_job)
        return None

    return ids
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.ITagEnricher.gather')
    log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

    # Save a reference
    self.job = harvest_job

    self._set_source_config(self.job.source.config)

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    org_id = model.Package.get(harvest_job.source.id).owner_org
    organization = logic.get_action('organization_show')(context, {'id': org_id})

    # Exclude Sentinel-3 because it seems like iTag can't handle the curved
    # footprints.
    filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(organization['name'])  # noqa: E501

    ids = []

    # We'll limit this to 10 datasets per job so that results appear
    # faster
    start = 0
    rows = self.source_config.get('datasets_per_job', 10)
    untagged = logic.get_action('package_search')(context, {
        'fq': filter_query,
        'rows': rows,
        'start': start
    })
    results = untagged['results']

    for result in results:
        spatial = None
        for i in result['extras']:
            if i['key'] == 'spatial':
                spatial = i['value']
        if spatial:
            obj = HarvestObject(
                guid=result['id'],
                job=self.job,
                extras=[
                    HOExtra(key='status', value='change'),
                    HOExtra(key='spatial', value=spatial),
                    HOExtra(key='package', value=json.dumps(result))
                ])
            obj.save()
            ids.append(obj.id)

    return ids
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database, specifying
          the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that may
          occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.info("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        registry = self._create_metadata_registry()
        self._set_config(harvest_job.source.config)
        client = oaipmh.client.Client(harvest_job.source.url,
                                      registry,
                                      self.credentials,
                                      force_http_get=self.force_http_get)

        # Start looking from here
        client.identify()  # check if identify works
        for header in self._identifier_generator(client):
            harvest_obj = HarvestObject(guid=header.identifier(),
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
            log.info("Harvest obj %s created" % harvest_obj.id)
            # return harvest_obj_ids  # uncomment to gather only one record
    except urllib.error.HTTPError as e:
        log.exception('Gather stage failed on %s (%s): %s, %s'
                      % (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    except Exception as e:
        log.exception('Gather stage failed on %s: %s' % (
            harvest_job.source.url,
            str(e),
        ))
        self._save_gather_error(
            'Could not gather anything from %s: %s / %s'
            % (harvest_job.source.url, str(e), traceback.format_exc()),
            harvest_job)
        return None

    log.info("Gather stage successfully finished with %s harvest objects"
             % len(harvest_obj_ids))
    return harvest_obj_ids
def gather_stage(self, harvest_job):
    log.debug('In DataWienGvAt gather_stage')

    doc = etree.parse(self.CATALOGUE_FEED_URL)
    ids = []
    for link in doc.findall("//item/link"):
        link = link.text
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def delete_geocat_ids(self, harvest_job, harvest_obj_ids, packages_to_delete):
    delete_harvest_obj_ids = []
    for package_info in packages_to_delete:
        obj = HarvestObject(guid=package_info[1].name,
                            job=harvest_job,
                            extras=[
                                HarvestObjectExtra(key='import_action',
                                                   value='delete')
                            ])
        obj.save()
        delete_harvest_obj_ids.append(obj.id)
    return delete_harvest_obj_ids
def gather_stage(self, harvest_job):
    log.debug('In OpenDataCatHarvester gather_stage')

    # Get feed contents
    doc = etree.parse(self.INDEX_URL)
    ids = []
    for link_element in doc.findall('//item/link'):
        link = link_element.text.strip()
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job): ''' The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids ''' logger.debug("in gather stage: %s" % harvest_job.source.url) try: harvest_obj_ids = [] self._set_config(harvest_job.source.config) skip_licenses = { 'c12c3333-1ad7-4a3a-a629-ed51fcb636ac', 'a270745d-07d5-4e93-94fc-ba6e0afc97fb', } # TODO: switch # for record in json.loads(open('/tmp/data.json').read())['dataset']: for record in requests.get( urlparse.urljoin(harvest_job.source.url, 'data.json')).json()['dataset']: license_id = record.get('license', 'cc-by').strip('/').split('/')[-1] if license_id in skip_licenses: continue if 'hub.pacificdata' == record.get('isPartOf'): continue if 'Info' in record.get('theme', []): continue harvest_obj = HarvestObject(guid=record['identifier'], content=json.dumps(record), job=harvest_job) harvest_obj.save() harvest_obj_ids.append(harvest_obj.id) except urllib2.HTTPError, e: logger.exception( 'Gather stage failed on %s (%s): %s, %s' % (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs)) self._save_gather_error( 'Could not gather anything from %s' % harvest_job.source.url, harvest_job) return None
def gather_stage(self, harvest_job): ''' The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids ''' logger.debug("in gather stage: %s" % harvest_job.source.url) try: harvest_obj_ids = [] self._set_config(harvest_job.source.config) url = urljoin(harvest_job.source.url, '/v1/dataset/search') for record in self._fetch_record_outline(url): # if record['key'] != 'a38c7d49-5a5d-4aa6-a64e-421178bd06d7': # continue harvest_obj = HarvestObject(guid=record['key'], content=record['country'], job=harvest_job) harvest_obj.save() harvest_obj_ids.append(harvest_obj.id) # TODO: remove # break except (HTTPError) as e: logger.exception( 'Gather stage failed on %s (%s): %s, %s' % (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs)) self._save_gather_error( 'Could not gather anything from %s' % harvest_job.source.url, harvest_job) return None except (Exception) as e: logger.exception('Gather stage failed on %s: %s' % ( harvest_job.source.url, str(e), )) self._save_gather_error( 'Could not gather anything from %s' % harvest_job.source.url, harvest_job) return None return harvest_obj_ids
class JSONZipBaseHarvester(JSONDumpBaseCKANHarvester):

    def info(self):
        return {
            'name': 'zipbase',
            'title': 'Base Zip Harvester',
            'description': 'A harvester for portals which return JSON files in a zip file.'
        }

    def gather_stage(self, harvest_job):
        self._set_config(harvest_job.source.config)

        # Request all remote packages
        try:
            content = self._get_content(harvest_job.source.url)
        except Exception, e:
            self._save_gather_error(
                'Unable to get content for URL: %s: %s'
                % (harvest_job.source.url, str(e)), harvest_job)
            return None

        object_ids = []
        packages = []

        import zipfile
        import StringIO
        file_content = StringIO.StringIO(content)
        archive = zipfile.ZipFile(file_content, "r")
        for name in archive.namelist():
            print name
            if name.endswith(".json"):
                package = json.loads(archive.read(name))
                packages.append(package)

                obj = HarvestObject(guid=package['name'], job=harvest_job)
                obj.content = json.dumps(package)
                obj.save()
                object_ids.append(obj.id)

        '''
        context = self.build_context()
        remote_dataset_names = map(lambda d: d['name'], packages)
        self.delete_deprecated_datasets(context, remote_dataset_names)
        '''

        if object_ids:
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job)
            return None
def _save_harvest_object(self, metadata, harvest_job):
    '''
    Save the harvest object with the given metadata dict and harvest_job
    '''
    obj = HarvestObject(
        guid=metadata['datasetID'],
        job=harvest_job,
        content=json.dumps(metadata)
    )
    obj.save()
    log.debug('adding ' + metadata['datasetID'] + ' to the queue')
    return obj.id
def gather_stage(self, harvest_job):
    log.debug('In OpendataParisFr gather_stage')

    doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    ids = []
    for link in doc.findall(
            "//div[@class='animate download-portlet-element']/a"):
        link = link.get('href')
        if "#comments" not in link:
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
    return ids
def gather_stage(self, harvest_object):
    log.debug('In OdgovltHarvester gather_stage')
    sync = IvpkIrsSync(sa.create_engine(harvest_object.source.url))
    sync.sync_groups()
    ids = []
    for ivpk_dataset in sync.get_ivpk_datasets():
        content = json.dumps(dict(ivpk_dataset), cls=DatetimeEncoder)
        obj = HarvestObject(guid=ivpk_dataset.ID,
                            job=harvest_object,
                            content=content)
        obj.save()
        ids.append(obj.id)
    return ids
def _make_harvest_objs(datasets):
    '''Create HarvestObjects with Socrata dataset content.'''
    obj_ids = []
    guids = []
    for d in datasets:
        log.debug('Creating HarvestObject for {} {}'.format(
            d['resource']['name'], d['resource']['id']))
        obj = HarvestObject(
            guid=d['resource']['id'],
            job=harvest_job,
            content=json.dumps(d),
            extras=[HarvestObjectExtra(key='status', value='hi!')])
        obj.save()
        obj_ids.append(obj.id)
        guids.append(d['resource']['id'])
    return obj_ids, guids
def _mark_datasets_for_deletion(self, guids_in_source, harvest_job):
    '''
    Given a list of guids in the remote source, checks which in the DB
    need to be deleted

    To do so it queries all guids in the DB for this source and calculates
    the difference.

    For each of these creates a HarvestObject with the dataset id, marked
    for deletion.

    Returns a list with the ids of the Harvest Objects to delete.
    '''

    object_ids = []

    # Get all previous current guids and dataset ids for this source
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id)\
        .filter(
            HarvestObject.current == True  # noqa
        ).filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = list(guid_to_package_id.keys())

    # Get objects/datasets to delete (ie in the DB but not in the source)
    guids_to_delete = set(guids_in_db) - set(guids_in_source)

    # Create a harvest object for each of them, flagged for deletion
    for guid in guids_to_delete:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HarvestObjectExtra(key='status', value='delete')])

        # Mark the rest of objects for this guid as not current
        model.Session.query(HarvestObject) \
            .filter_by(guid=guid) \
            .update({'current': False}, False)
        obj.save()
        object_ids.append(obj.id)

    return object_ids
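# A tiny worked example (not from the original source) of the set difference
# used by _mark_datasets_for_deletion above; the guids are hypothetical.
def _example_guids_to_delete():
    guids_in_db = ['guid-a', 'guid-b', 'guid-c']   # previously harvested
    guids_in_source = ['guid-a', 'guid-c']         # still present remotely
    # 'guid-b' is gone from the source, so it would get a 'delete' HarvestObject
    return set(guids_in_db) - set(guids_in_source)  # -> {'guid-b'}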
def _create_object(self, ebv_type, dataset_info):
    extras = [HOExtra(key='status', value='new')]

    if ebv_type == 'tree_species':
        collectionID = 'TREE_SPECIES_DISTRIBUTION_HABITAT_SUITABILITY'
        collection_name = 'Tree Species Distribution Habitat Suitability'
        collection_description = 'European Distribution of the tree species for the years 2000 (Habitat Suitability baseline), 2020, 2050 and 2080 (Habitat Suitability future), based on different models such as ENS, CCCMA, CSIRO, HADCM3.'  # noqa: E501
    elif ebv_type == 'flood_hazards':
        collectionID = 'FLOOD_HAZARD_EU_GL'
        collection_name = 'Flood Hazard Europe/Global'
        collection_description = 'The maps depict flood prone areas at global/european scale for flood events. Resolution is 30 arcseconds (approx. 1km). Cell values indicate water depth (in m). The map can be used to assess flood exposure and risk of population and assets. NOTE: this dataset is based on JRC elaborations and is not an official flood hazard map.'  # noqa: E501

    title = dataset_info[0]
    description = dataset_info[1]
    start_date = dataset_info[2]
    end_date = dataset_info[3]
    spatial = dataset_info[4]
    filename = dataset_info[5]
    identifier = dataset_info[6]
    download_url = dataset_info[7]
    tags = dataset_info[8]

    content = json.dumps(
        {
            'collectionID': collectionID,
            'title': title,
            'description': description,
            'start_date': start_date,
            'end_date': end_date,
            'identifier': identifier,
            'downloadURL': download_url,
            'spatial': spatial,
            'filename': filename,
            'collection_name': collection_name,
            'collection_description': collection_description,
            'tags': tags
        },
        default=str)

    obj = HarvestObject(job=self.job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=content)
    obj.save()
    return obj.id