def _gather_ids(self, url=None, jar=None):
    log.debug('Page %s' % self.page)
    if jar is None:
        jar = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    url = url or self.INITIAL_INDEX
    fh = opener.open(url)
    doc = html.parse(fh)
    fh.close()
    new_ids = []
    for a in doc.findall(".//div[@class='main']//a"):
        href = a.get('href').split('?', 1)[0]
        id = href.split('/').pop()
        if id not in self.gathered_ids:
            log.debug('Got Id: %s' % id)
            obj = HarvestObject(guid=sha1(id).hexdigest(), job=self.job,
                                content=id)
            obj.save()
            self.object_ids.append(obj.id)
            new_ids.append(id)
    if len(new_ids) == 0:  # or self.page == 2:
        return self.gathered_ids
    else:
        self.gathered_ids.extend(new_ids)
        inputs = []
        for input in doc.findall(".//form[@id='main_form']//input"):
            inputs.append((input.get('name'), input.get('value')))
        inputs.append(('listbox_nextPage:method', ''))
        next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
        self.page = self.page + 1
        return self._gather_ids(url=next_url, jar=jar)
def gather_stage(self, harvest_job):
    log.debug('In ArrayExpressHarvester.gather_stage(%s)' %
              harvest_job.source.url)
    # Get feed contents
    self._set_config(harvest_job.source.config)

    #previous_job = Session.query(HarvestJob) \
    #    .filter(HarvestJob.source==harvest_job.source) \
    #    .filter(HarvestJob.gather_finished!=None) \
    #    .filter(HarvestJob.id!=harvest_job.id) \
    #    .order_by(HarvestJob.gather_finished.desc()) \
    #    .limit(1).first()

    baseURL = harvest_job.source.url + "/xml/v2/experiments"

    #if (previous_job and not previous_job.gather_errors
    #        and not len(previous_job.objects) == 0):
    #    if not self.config.get('force_all', False):
    #        last_time = harvest_job.gather_started.isoformat()
    #        today = format(datetime.date.today())
    #        self.params['date'] = '[' + last_time + ' ' + today + ']'

    url = baseURL + "?" + self.getParams()
    print "Fetching from " + url
    doc = etree.parse(url)
    ids = []
    for accessionElement in doc.findall('//experiment/accession'):
        accession = accessionElement.text.strip()
        obj = HarvestObject(guid=accession, job=harvest_job,
                            content=accession)
        print "ArrayExpress accession: " + accession
        obj.save()
        ids.append(obj.id)
    print ids
    return ids
def gather_stage(self, harvest_job):
    log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True
    package_ids = []

    data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    doc = html.parse(data)
    for td in doc.findall("//td[@class='left_p12_title']/a"):
        link = td.get('href')
        if re.match(r"/search/fsciitem", link):
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            package_ids.append(obj.id)

    self._set_config(harvest_job.source.config)

    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source==harvest_job.source) \
        .filter(HarvestJob.gather_finished!=None) \
        .filter(HarvestJob.id!=harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    return package_ids
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[
            HarvestObjectExtra(key=k, value=v)
            for k, v in data.get('extras', {}).items()
        ]
    )
    obj.save()
    return harvest_object_dictize(obj, context)
def gather_stage(self, harvest_job): """ The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids """ log.debug("in gather stage: %s" % harvest_job.source.url) try: harvest_obj_ids = [] registry = self._create_metadata_registry() self._set_config(harvest_job.source.config) client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials) client.identify() # check if identify works for header in self._identifier_generator(client): harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job) harvest_obj.save() harvest_obj_ids.append(harvest_obj.id) except: log.exception("Gather stage failed %s" % harvest_job.source.url) self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job) return None return harvest_obj_ids
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In RostockTestHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/api/rest/package' content = self._get_content(package_list_url) package_ids = json.loads(content) try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid = package_id, job = harvest_job) obj.save() object_ids.append(obj.id) log.info('Got ID from source: %s' %package_id) return object_ids else: self._save_gather_error('No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r'%e.message,harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester 2 gather_stage (%s)' %
              harvest_job.source.url)
    get_all_packages = True

    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)

    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: %s' %
                                    dcatUrl, harvest_job)
            return None
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In FSOHarvester gather_stage')
    http = urllib3.PoolManager()
    metadata_file = http.request('GET', self.METADATA_FILE_URL)

    ids = []
    parser = etree.XMLParser(encoding='utf-8')
    for package in etree.fromstring(metadata_file.data, parser=parser):
        # Get the german dataset if one is available,
        # otherwise get the first one
        base_datasets = package.xpath("dataset[@xml:lang='de']")
        if len(base_datasets) != 0:
            base_dataset = base_datasets[0]
        else:
            base_dataset = package.find('dataset')

        metadata = self._generate_metadata(base_dataset, package)
        if metadata:
            obj = HarvestObject(
                guid=self._create_uuid(base_dataset.get('datasetID')),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + base_dataset.get('datasetID') +
                      ' to the queue')
            ids.append(obj.id)
        else:
            log.debug('Skipping ' + base_dataset.get('datasetID') +
                      ' since no resources or groups are available')
    return ids
def gather_stage(self, harvest_job):
    log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)
    url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
    get_all_packages = True

    try:
        package_ids = []
        dataset_count = self._get_ntpc_dataset_count(url)
        msg_count = 0

        for x in range(dataset_count / 10 + 1):
            page_url = url + '?currentPage=%s' % (x + 1)
            data = urllib2.urlopen(page_url)
            doc = html.parse(data)
            for div in doc.findall("//a[@href]"):
                if '/NTPC/od/query;' in div.attrib['href']:
                    link = div.attrib['href']
                    id = sha1(link).hexdigest()
                    obj = HarvestObject(guid=id, job=harvest_job,
                                        content=link)
                    obj.save()
                    package_ids.append(obj.id)
                    msg_count = msg_count + 1

        if msg_count == 0:
            self._save_gather_error('No packages received for URL: %s' %
                                    url, harvest_job)
            return None
        # Return the ids of the created HarvestObjects
        return package_ids
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In ' + self.city + 'CKANHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/3/action/package_list' content = self._get_content(package_list_url) content_json = json.loads(content) package_ids = content_json['result'] try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error('No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In SNLHarvester gather_stage')

    metadata_path = self._fetch_metadata_file()
    ids = []

    try:
        parser = MetaDataParser(metadata_path)

        for dataset in parser.list_datasets():
            metadata = parser.parse_set(dataset)
            metadata['translations'].extend(
                self._metadata_term_translations()
            )
            log.debug(metadata)

            obj = HarvestObject(
                guid=metadata['id'],
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + metadata['id'] + ' to the queue')
            ids.append(obj.id)
    finally:
        temp_dir = os.path.dirname(metadata_path)
        log.debug('Deleting directory ' + temp_dir)
        shutil.rmtree(temp_dir)

    return ids
def gather_stage(self, harvest_job):
    log.debug('In ZhstatHarvester gather_stage')

    ids = []
    parser = etree.XMLParser(encoding='utf-8')
    for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):
        # Get the german data if one is available,
        # otherwise get the first one
        base_datas = dataset.xpath("data[@xml:lang='de']")
        if len(base_datas) != 0:
            base_data = base_datas[0]
        else:
            base_data = dataset.find('data')

        metadata = self._generate_metadata(base_data, dataset)
        if metadata:
            obj = HarvestObject(
                guid=dataset.get('id'),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + dataset.get('id') + ' to the queue')
            ids.append(obj.id)
        else:
            log.debug(
                'Skipping %s since no resources or groups are available'
                % dataset.get('id')
            )
    return ids
def gather_stage(self, harvest_job):
    try:
        config = json.loads(harvest_job.source.config)
        ckan_term_url = config['ckan_term_url']
    except Exception as e:
        log.exception(e)
        raise ConfigError(
            "In order to run the translation harvester "
            "you need to specify 'ckan_term_url' "
            "in your harvester config json"
        )

    log.debug('Gathering term from %s' % ckan_term_url)
    try:
        terms = self._get_terms(ckan_term_url)
        obj = HarvestObject(
            job=harvest_job,
            content=json.dumps(terms)
        )
        obj.save()
        return [obj.id]
    except Exception as e:
        log.exception(e)
        raise e
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.VariantStore.gather')
    log.debug('VariantStoreHarvester gather_stage for job: %r', harvest_job)

    self._set_config(harvest_job.source.config, log=log)

    obj = HarvestObject(guid=self.guid, job=harvest_job)
    obj.save()
    return [obj.id]
def gather_stage(self, harvest_job): log.debug('In ChangelogHarvester gather_stage') # Get the last harvested AuditId last_audit = model.Session.query(HarvestLastAudit) \ .order_by(HarvestLastAudit.created.desc()) \ .first() if last_audit: audit_id = last_audit.audit_id else: audit_id = '0' # Get all Audits audits = p.toolkit.get_action('changelog_show')( {'ignore_auth': True}, {'audit_id': audit_id, 'top': 1000}) # Check if there are any new audits to process if not len(audits) or ( len(audits) == 1 and audits[0]['AuditId'] == audit_id): log.debug( 'No new audits to process since last run ' + '(Last audit id {0})'.format(audit_id)) return [] # Ignore the first audit if an audit id was defined as start, # as this one will be included in the results audits = audits[1:] if audit_id != '0' and len(audits) > 1 else audits ids = [] update_audits = {} for audit in audits: # We only want to use the most recent update per object per run # Store the most recent audit against a hash of the id fields if 'update' in audit['Command'].lower(): m = hashlib.md5() m.update(json.dumps(audit['CustomProperties'])) ids_hash = m.hexdigest() update_audits[ids_hash] = audit else: obj = HarvestObject(guid=audit['AuditId'], job=harvest_job, content=json.dumps(audit)) obj.save() ids.append(obj.id) # Save the last AuditId to know where to start in the next run save_last_audit_id(audit['AuditId'], harvest_job.id) for key, audit in update_audits.iteritems(): obj = HarvestObject(guid=audit['AuditId'], job=harvest_job, content=json.dumps(audit)) obj.save() ids.append(obj.id) return ids
def gather_stage(self, harvest_job): ''' The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids ''' self._set_config(harvest_job.source.config) sets = [] harvest_objs = [] registry = MetadataRegistry() registry.registerReader('oai_dc', oai_dc_reader) client = oaipmh.client.Client(harvest_job.source.url, registry) try: identifier = client.identify() except urllib2.URLError: self._save_gather_error('Could not gather anything from %s!' % harvest_job.source.url, harvest_job) return None domain = identifier.repositoryName() group = Group.by_name(domain) if not group: group = Group(name=domain, description=domain) query = self.config['query'] if 'query' in self.config else '' try: for set in client.listSets(): identifier, name, _ = set if 'query' in self.config: if query in name: sets.append((identifier, name)) else: sets.append((identifier, name)) except NoSetHierarchyError: sets.append(('1', 'Default')) self._save_gather_error('Could not fetch sets!', harvest_job) for set_id, set_name in sets: harvest_obj = HarvestObject(job=harvest_job) harvest_obj.content = json.dumps( { 'set': set_id, \ 'set_name': set_name, \ 'domain': domain } ) harvest_obj.save() harvest_objs.append(harvest_obj.id) model.repo.commit() return harvest_objs
def gather_stage(self, harvest_job): log.debug('In SFAHarvester gather_stage') try: file_path = self._fetch_metadata_file() ids = [] de_rows = self._get_row_dict_array(0, file_path) for row in de_rows: # Construct the metadata dict for the dataset on CKAN metadata = { 'datasetID': row[u'id'], 'title': row[u'title'], 'url': row[u'url'], 'notes': row[u'notes'], 'author': row[u'author'], 'maintainer': row[u'maintainer'], 'maintainer_email': row[u'maintainer_email'], 'license_id': row[u'licence'], 'license_url': row[u'licence_url'], 'translations': [], 'tags': row[u'tags'].split(u', '), 'groups': [row[u'groups']] } metadata['resources'] = self._generate_resources_dict_array( row[u'id'] ) metadata['resources'][0]['version'] = row[u'version'] log.debug(metadata['resources']) # Adding term translations metadata['translations'].extend( self._generate_term_translations(1, file_path) # fr ) metadata['translations'].extend( self._generate_term_translations(2, file_path) # it ) metadata['translations'].extend( self._generate_term_translations(3, file_path) # en ) log.debug(metadata['translations']) obj = HarvestObject( guid=self._create_uuid(row[u'id']), job=harvest_job, content=json.dumps(metadata) ) obj.save() log.debug('adding ' + row[u'id'] + ' to the queue') ids.append(obj.id) log.debug(de_rows) except Exception: return False return ids
def _gen_harvest_obj_for_files(self, harvest_job): ids = [] for dataset_name, dataset in self.DATASETS.iteritems(): csw = ckan_csw.SwisstopoCkanMetadata() metadata = csw.get_ckan_metadata( dataset['csw_query'], 'de' ).copy() metadata_fr = csw.get_ckan_metadata( dataset['csw_query'], 'fr' ).copy() metadata_it = csw.get_ckan_metadata( dataset['csw_query'], 'it' ).copy() metadata_en = csw.get_ckan_metadata( dataset['csw_query'], 'en' ).copy() log.debug(metadata) metadata['translations'] = self._generate_term_translations() log.debug("Translations: %s" % metadata['translations']) metadata_trans = { u'de': metadata, u'fr': metadata_fr, u'it': metadata_it, u'en': metadata_en, } metadata['translations'].extend( self._generate_metadata_translations(metadata_trans) ) metadata['resources'] = self._generate_resources_dict_array( dataset_name ) metadata['resources'].extend( self._generate_api_resources(metadata, dataset_name) ) log.debug(metadata['resources']) metadata['license_id'], metadata['license_url'] = ( self._get_org_license(dataset_name) ) metadata['layer_name'] = dataset_name obj = HarvestObject( guid=metadata['id'], job=harvest_job, content=json.dumps(metadata) ) obj.save() log.debug('adding ' + dataset_name + ' to the queue') ids.append(obj.id) return ids
def gather_stage(self, harvest_job): log.debug('In FOPHHarvester gather_stage') try: file_path = self._fetch_metadata_file() ids = [] de_cols = self._get_col_dict_array(0, file_path) for col in de_cols: # Construct the metadata dict for the dataset on CKAN metadata = { 'datasetID': col[u'id'], 'title': col[u'title'], 'url': col[u'url'], 'notes': col[u'notes'], 'author': col[u'author'], 'author_email': col[u'author_email'], 'maintainer': col[u'maintainer'], 'maintainer_email': col[u'maintainer_email'], 'license_id': col[u'license_id'].lower(), 'version': col[u'version'], 'translations': [], 'tags': [] } tags = col[u'tags'].split(u', ') tags = [munge_tag(tag) for tag in tags] metadata['tags'] = tags metadata['resources'] = self._generate_resources_dict_array( col[u'id']) metadata['resources'][0]['version'] = col[u'version'] log.debug(metadata['resources']) # Adding term translations metadata['translations'].extend( self._generate_term_translations(1, file_path)) # fr metadata['translations'].extend( self._generate_term_translations(2, file_path)) # it metadata['translations'].extend( self._generate_term_translations(3, file_path)) # en log.debug(metadata['translations']) obj = HarvestObject( guid=self._create_uuid(col[u'id']), job=harvest_job, content=json.dumps(metadata) ) obj.save() log.debug('adding ' + col[u'id'] + ' to the queue') ids.append(obj.id) log.debug(de_cols) except Exception: return False return ids
def gather_stage(self, harvest_job, encoding=None): self._set_config(harvest_job.source.config) # Request all remote packages try: content = self._get_content(harvest_job.source.url) LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url) object_ids = [] packages = [] file_content = StringIO.StringIO(content) archive = zipfile.ZipFile(file_content, 'r') for name in archive.namelist(): if name.endswith('.json'): archive_content = archive.read(name) if encoding is not None: archive_content = archive_content.decode(encoding) else: archive_content = self.lstrip_bom(archive_content) package = json.loads(archive_content) normalize_api_dataset(package) packages.append(package) obj = HarvestObject(guid=package['name'], job=harvest_job) obj.content = json.dumps(package) obj.save() object_ids.append(obj.id) except zipfile.BadZipfile as err: self._save_gather_error(err.message, harvest_job) return None except ContentFetchError as err: self._save_gather_error(err.message, harvest_job) return None except Exception as err: error_template = 'Unable to get content for URL: %s: %s' error = error_template % (harvest_job.source.url, str(err)) self._save_gather_error(error, harvest_job) return None if object_ids: # delete obsolete packages super(JSONZipBaseHarvester, self).delete_deprecated_datasets( packages, harvest_job ) return object_ids else: self._save_gather_error( 'No packages received for URL: %s' % harvest_job.source.url, harvest_job ) return None
def populate_harvest_job(self, harvest_job, set_ids, config, client): # Check if this source has been harvested before previous_job = Session.query(HarvestJob) \ .filter(HarvestJob.source == harvest_job.source) \ .filter(HarvestJob.gather_finished != None) \ .filter(HarvestJob.id != harvest_job.id) \ .order_by(HarvestJob.gather_finished.desc()) \ .limit(1).first() last_time = None if previous_job and previous_job.finished and model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started: last_time = previous_job.gather_started.isoformat() # Collect package ids package_ids = list(self.get_package_ids(set_ids, config, last_time, client)) log.debug('Identifiers: %s', package_ids) if not self._recreate(harvest_job) and package_ids: converted_identifiers = {} for identifier in package_ids: converted_identifiers[datapid_to_name(identifier)] = identifier if identifier.endswith(u'm'): converted_identifiers[datapid_to_name(u"%ss" % identifier[0:-1])] = identifier for package in model.Session.query(model.Package).filter(model.Package.name.in_(converted_identifiers.keys())).all(): converted_name = package.name if converted_identifiers[converted_name] not in package_ids: converted_name = "%sm" % converted_name[0:-1] package_ids.remove(converted_identifiers[converted_name]) if previous_job: for previous_error in [error.guid for error in Session.query(HarvestObject). filter(HarvestObject.harvest_job_id == previous_job.id). filter(HarvestObject.state == 'ERROR').all()]: if previous_error not in package_ids: package_ids.append(previous_error) try: object_ids = [] if len(package_ids): for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) log.debug('Object ids: {i}'.format(i=object_ids)) return object_ids else: self._save_gather_error('No packages received for URL: {u}'.format( u=harvest_job.source.url), harvest_job) return None except Exception as e: self._save_gather_error('Gather: {e}'.format(e=e), harvest_job) raise
def test_import(self): source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi") source.save() job = HarvestJob(source=source) job.save() harvest_object = self._run_import("cmdi_1.xml", job) self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or []))) package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730180'}) self.assertEquals(package.get('id', None), 'http://urn.fi/urn:nbn:fi:lb-20140730180') self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180') self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}') self.assertEquals(package.get('version', None), '2012-09-07') self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}') self.assertEquals(package.get('license_id', None), 'undernegotiation') provider = config['ckan.site_url'] expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0', u'provider': provider, u'type': u'metadata'} self.assertTrue(expected_pid in package.get('pids')) model.Session.flush() harvest_object = self._run_import("cmdi_2.xml", job) self.assertEquals(len(harvest_object.errors), 0, u"\n".join(unicode(error.message) for error in (harvest_object.errors or []))) package = get_action('package_show')({'user': '******'}, {'id': 'urn-nbn-fi-lb-20140730186'}) self.assertEquals(package['temporal_coverage_begin'], '1880') self.assertEquals(package['temporal_coverage_end'], '1939') self.assertEquals(package.get('license_id', None), 'other') # Delete package harvest_object = HarvestObject() harvest_object.content = None harvest_object.id = "test-cmdi-delete" harvest_object.guid = "test-cmdi-delete" harvest_object.source = job.source harvest_object.harvest_source_id = None harvest_object.job = job harvest_object.package_id = package.get('id') harvest_object.report_status = "deleted" harvest_object.save() self.harvester.import_stage(harvest_object) model.Session.flush() self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def gather_stage(self,harvest_job): log = logging.getLogger(__name__ + '.WAF.gather') log.debug('z3950Harvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL source_url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # get current objects out of db query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id) guid_to_package_id = dict((res[0], res[1]) for res in query) current_guids = set(guid_to_package_id.keys()) current_guids_in_harvest = set() # Get contents try: conn = zoom.Connection(source_url, int(self.source_config.get('port', 210))) conn.databaseName = self.source_config.get('database', '') conn.preferredRecordSyntax = 'XML' conn.elementSetName = 'T' query = zoom.Query ('CCL', 'metadata') res = conn.search (query) ids = [] for num, result in enumerate(res): hash = hashlib.md5(result.data).hexdigest() if hash in current_guids: current_guids_in_harvest.add(hash) else: obj = HarvestObject(job=harvest_job, guid=hash, extras=[ HOExtra(key='status', value='new'), HOExtra(key='original_document', value=result.data.decode('latin-1')), HOExtra(key='original_format', value='fgdc') ]) obj.save() ids.append(obj.id) for guid in (current_guids - current_guids_in_harvest): obj = HarvestObject(job=harvest_job, guid=guid, package_id=guid_to_package_id[guid], extras=[HOExtra(key='status', value='delete')]) obj.save() ids.append(obj.id) return ids except Exception,e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (source_url, e),harvest_job) return None
def gather_stage(self, harvest_job):
    log.debug('In DataWienGvAt gather_stage')
    doc = etree.parse(self.CATALOGUE_FEED_URL)
    ids = []
    for link in doc.findall("//item/link"):
        link = link.text
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job):
    if harvest_job.source.url.startswith('basic_test'):
        obj = HarvestObject(guid='test1', job=harvest_job)
        obj.extras.append(HarvestObjectExtra(key='key', value='value'))
        obj2 = HarvestObject(guid='test2', job=harvest_job)
        obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
        obj.add()
        obj2.add()
        obj3.save()  # this will commit both
        return [obj.id, obj2.id, obj3.id]
    return []
def gather_stage(self, harvest_job):
    log.debug('In OpenDataCatHarvester gather_stage')
    # Get feed contents
    doc = etree.parse(self.INDEX_URL)
    ids = []
    for link_element in doc.findall('//item/link'):
        link = link_element.text.strip()
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job):
    log.debug('In OpendataParisFr gather_stage')
    doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    ids = []
    for link in doc.findall("//div[@class='animate download-portlet-element']/a"):
        link = link.get('href')
        if "#comments" not in link:
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job):
    log.debug('In OpenGovSeHarvester gather_stage')
    # Get feed contents
    doc = etree.parse(self.INDEX_URL)
    ids = []
    for id_element in doc.findall('//{%(ns)s}entry/{%(ns)s}id'
                                  % {'ns': self.ATOM_NS}):
        link = id_element.text.strip()
        log.debug('Got link: %s' % link)
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def _save_harvest_object(self, metadata, harvest_job):
    '''
    Save the harvest object with the given metadata dict and harvest_job
    '''
    obj = HarvestObject(
        guid=metadata['datasetID'],
        job=harvest_job,
        content=json.dumps(metadata)
    )
    obj.save()
    log.debug('adding ' + metadata['datasetID'] + ' to the queue')
    return obj.id
def doi_update(context, data_dict): model = context['model'] new_package = data_dict source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest() old_package = p.toolkit.get_action('package_show')( {'model': model, 'ignore_auth': True}, {"id":new_package['id']}) for extra in old_package['extras']: if extra['key'] == 'source_hash': old_source_hash = extra['value'] break else: old_source_hash = None if source_hash == old_source_hash and old_package.get('state') =='active': print str(datetime.datetime.now()) + ' No change for doi id ' + new_package['id'] return new_package["extras"].append({"key": "source_hash", "value": source_hash}) new_package["extras"].append({"key": "metadata-source", "value": "doi"}) new_package["extras"].append({"key": "source_doi_import_identifier", "value": True}) new_package.pop("name", None) owner_org = model.Group.get(ORG_MAPPING.get(new_package['organization']['name'])) if not owner_org: print str(datetime.datetime.now()) + ' Fail to update doi id ' + new_package['id'] + '. Organization ' + new_package['organization']['name'] + ' does not exist.' return new_package['owner_org'] = owner_org.name group_name = new_package.pop('owner_name', None) resources = [] for resource in new_package['resources']: resource.pop('resource_group_id', None) resource.pop('revision_id', None) resource.pop('id', None) resources.append(resource) new_package['resources'] = resources obj = HarvestObject( guid=uuid.uuid4().hex, job=context['harvest_job'], content=context['harvestobj']) obj.save() new_package["extras"].append({"key": "harvest_object_id", "value": obj.id}) context['return_id_only'] = True p.toolkit.get_action('package_update')(context, new_package) print str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id']
def setup_class(cls):
    # Create package and its harvest object
    CreateTestData.create()
    harvest_setup()
    source = HarvestSource(url=u'http://test-source.org', type='test')
    source.save()

    job = HarvestJob(source=source)
    job.save()

    ho = HarvestObject(package=model.Package.by_name(u'annakarenina'),
                       job=job,
                       guid=u'test-guid',
                       content=u'<xml>test content</xml>')
    ho.save()

    # Save a reference to the harvest object in the package
    rev = model.repo.new_revision()
    pkg = model.Package.by_name(u'annakarenina')
    pkg.extras['harvest_object_id'] = ho.id
    pkg.save()

    model.repo.commit_and_remove()
def _gather_object(self, job, product, resources, manifest_content,
                   last_harvest_date):
    name = parse_filename(product).lower()
    status, package = self._was_harvested(name, self.update_all)

    extras = [HOExtra(key='status', value=status)]
    content = json.dumps(
        {
            'name': name,
            'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
            'manifest_content': manifest_content,
            'resources': resources
        },
        default=str
    )

    obj = HarvestObject(job=job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=content)
    obj.package = package
    obj.save()
    return obj.id
def gather_stage(self, harvest_job): log.debug('In ZhGisHarvester gather_stage') ids = [] for dataset_id, dataset in self.DATASETS.iteritems(): csw = ckan_csw.ZhGisCkanMetadata() metadata = csw.get_ckan_metadata_by_id(dataset_id).copy() log.debug(metadata) # Fix metadata information metadata['name'] = munge_title_to_name(metadata['name']) metadata['service_type'] = (metadata['service_type'].replace( 'OGC:', '')) # Enrich metadata with hardcoded values metadata['url'] = dataset['geolion_url'] metadata['tags'].extend(dataset['tags']) metadata['translations'] = self._generate_term_translations() log.debug("Translations: %s" % metadata['translations']) metadata['resources'] = ( self._generate_resource_dict_array(metadata)) log.debug(metadata['resources']) metadata['license_id'] = self.LICENSE['name'] metadata['license_url'] = self.LICENSE['url'] obj = HarvestObject(guid=metadata['id'], job=harvest_job, content=json.dumps(metadata)) obj.save() log.debug('adding ' + metadata['name'] + ' to the queue') ids.append(obj.id) return ids
def _run_job_for_single_document(self, harvest_job, object_id):
    harvester = FisbrokerPlugin()

    # we circumvent gather_stage() and fetch_stage() and just load the
    # content with a known object_id and create the harvest object:
    url = harvest_job.source.url
    # _get_content() returns XML
    content = harvester._get_content(url)
    obj = HarvestObject(guid=object_id,
                        job=harvest_job,
                        content=content,
                        extras=[HarvestObjectExtra(key='status', value='new')])
    obj.save()

    assert obj, obj.content

    harvester.import_stage(obj)
    Session.refresh(obj)

    harvest_job.status = u'Finished'
    harvest_job.save()

    return obj
def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.
    """
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url,
                             auth=HTTPBasicAuth(username, password),
                             verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(
                    self.provider, timestamp, status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(
                    self.provider, timestamp, r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(log_message.format(
                self.provider, timestamp, r.status_code,
                r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')

        # Get the URL for the next loop, or None to break the loop
        harvest_url = self._get_next_url(soup)

        # Get the entries from the results
        entries = self._get_entries_from_results(soup)

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                # We need package_show to ensure that all the conversions
                # are carried out.
                context = {"user": "******", "ignore_auth": True,
                           "model": model, "session": Session}
                pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501

                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                # E.g., a Sentinel dataset exists,
                # but doesn't have a NOA resource yet.
                elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                    log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value=status),
                                            HOExtra(key='restart_date', value=entry_restart_date)])
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date', value=entry_restart_date)])
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(
            self.provider, timestamp, self.job.id,
            new_counter, update_counter))  # noqa: E128, E501

    return ids
def _mark_datasets_for_deletion(self, guids_in_source, harvest_job): # This is the same as the method in the base class, except that a different query is used. object_ids = [] portal = self._get_portal_from_config(harvest_job.source.config) starttime = time.time() # Get all previous current guids and dataset ids for this harvested portal independent of # the harvest objects. This allows cleaning the harvest data without loosing the # dataset mappings. # Build a subquery to get all active packages having a GUID first subquery = model.Session.query(model.PackageExtra.value, model.Package.id) \ .join(model.Package, model.Package.id == model.PackageExtra.package_id)\ .filter(model.Package.state == model.State.ACTIVE) \ .filter(model.PackageExtra.state == model.State.ACTIVE) \ .filter(model.PackageExtra.key == 'guid') \ .subquery() # then get all active packages of the current portal and join with their GUIDs if # available (outer join) query = model.Session.query(model.Package.id, subquery.c.value) \ .join(model.PackageExtra, model.PackageExtra.package_id == model.Package.id)\ .outerjoin(subquery, subquery.c.id == model.Package.id)\ .filter(model.Package.state == model.State.ACTIVE) \ .filter(model.PackageExtra.state == model.State.ACTIVE) \ .filter(model.PackageExtra.key == EXTRA_KEY_HARVESTED_PORTAL) \ .filter(model.PackageExtra.value == portal) checkpoint_start = time.time() guid_to_package_id = {} for package_id, guid in query: if guid: guid_to_package_id[guid] = package_id # Also remove all packages without a GUID, use ID as GUID to share logic below else: guid_to_package_id[package_id] = package_id checkpoint_end = time.time() LOGGER.debug('Time for query harvest source related datasets : %s', str(checkpoint_end - checkpoint_start)) guids_in_db = guid_to_package_id.keys() # Get objects/datasets to delete (ie in the DB but not in the source) guids_to_delete = set(guids_in_db) - set(guids_in_source) # Create a harvest object for each of them, flagged for deletion for guid in guids_to_delete: obj = HarvestObject(guid=guid, job=harvest_job, package_id=guid_to_package_id[guid], extras=[HarvestObjectExtra(key='status', value='delete')]) # Mark the rest of objects for this guid as not current model.Session.query(HarvestObject) \ .filter_by(guid=guid) \ .update({'current': False}, False) obj.save() object_ids.append(obj.id) endtime = time.time() LOGGER.debug('Found %s packages for deletion. Time total: %s', len(guids_to_delete), str(endtime - starttime)) return object_ids
def gather_stage(self, harvest_job): self.log = logging.getLogger(__file__) self.log.debug('SatcenBetter Harvester gather_stage for job: %r', harvest_job) self.job = harvest_job self.source_config = self._get_config(harvest_job) self.update_all = self.source_config.get('update_all', False) interface = INTERFACE(self.source_config, COLLECTION) last_product_index = (self._get_last_harvesting_index( harvest_job.source_id, interface)) interface.update_index(last_product_index) interface.build_url() log.debug('URL: {}'.format(interface.current_url)) # noqa: E501 ids = [] try: results = interface.get_results() except Timeout as e: self._save_gather_error('Request timed out: {}'.format(e), self.job) # noqa: E501 return ids if type(results) is not list: self._save_gather_error('{} error: {}'.format( results['status_code'], results['message']), self.job) # noqa: E501 return ids for entry in results: name_path = interface.get_name_path() name_url = get_field(entry, name_path['relative_location'].split(","), name_path['fixed_attributes']) entry_name = parse_name(name_url).lower() entry_guid = unicode(uuid.uuid4()) package_query = Session.query(Package) query_filtered = package_query.filter(Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug('{} will not be updated.'.format( entry_name)) # noqa: E501 status = 'unchanged' elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.' . # noqa: E501 format(entry_name)) # noqa: E501 status = 'new' obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key=interface.get_pagination_mechanism(), value=interface.get_index()) ]) obj.content = json.dumps(entry) obj.package = None if status == 'new' else package obj.save() interface.increment_index() ids.append(obj.id) return ids
def test_import(self): source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi") source.save() job = HarvestJob(source=source) job.save() harvest_object = self._run_import("cmdi_1.xml", job) package_id = json.loads(harvest_object.content)['unified']['id'] self.assertEquals( len(harvest_object.errors), 0, u"\n".join( unicode(error.message) for error in (harvest_object.errors or []))) package = get_action('package_show')({ 'user': '******' }, { 'id': package_id }) self.assertEquals(package.get('name', None), utils.pid_to_name(package.get('id', None))) self.assertEquals(utils.get_primary_pid(package), u'http://urn.fi/urn:nbn:fi:lb-20140730180') self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}') self.assertEquals(package.get('version', None), '2012-09-07') self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}') self.assertEquals(package.get('license_id', None), 'undernegotiation') provider = config['ckan.site_url'] expected_pid = { u'id': u'http://islrn.org/resources/248-895-085-557-0', u'provider': provider, u'type': u'relation', u'relation': u'generalRelation' } self.assertTrue(expected_pid not in package.get('pids')) model.Session.flush() harvest_object = self._run_import("cmdi_2.xml", job) package_id = json.loads(harvest_object.content)['unified']['id'] self.assertEquals( len(harvest_object.errors), 0, u"\n".join( unicode(error.message) for error in (harvest_object.errors or []))) package = get_action('package_show')({ 'user': '******' }, { 'id': package_id }) self.assertEquals(package['temporal_coverage_begin'], '1880') self.assertEquals(package['temporal_coverage_end'], '1939') self.assertEquals(package.get('license_id', None), 'other') # Delete package harvest_object = HarvestObject() harvest_object.content = None harvest_object.id = "test-cmdi-delete" harvest_object.guid = "test-cmdi-delete" harvest_object.source = job.source harvest_object.harvest_source_id = None harvest_object.job = job harvest_object.package_id = package.get('id') harvest_object.report_status = "deleted" harvest_object.save() self.harvester.import_stage(harvest_object) model.Session.flush() self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def gather_stage(self, harvest_job, collection_package_id=None):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('WafHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    source_url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get contents
    try:
        response = requests.get(source_url, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (source_url, e), harvest_job)
        return None

    content = response.content
    scraper = _get_scraper(response.headers.get('server'))

    ###### Get current harvest object out of db ######

    url_to_modified_db = {}  ## mapping of url to last_modified in db
    url_to_ids = {}  ## mapping of url to guid in db

    HOExtraAlias1 = aliased(HOExtra)
    HOExtraAlias2 = aliased(HOExtra)
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id,
                                HOExtraAlias1.value, HOExtraAlias2.value).\
        join(HOExtraAlias1, HarvestObject.extras).\
        join(HOExtraAlias2, HarvestObject.extras).\
        filter(HOExtraAlias1.key=='waf_modified_date').\
        filter(HOExtraAlias2.key=='waf_location').\
        filter(HarvestObject.current==True).\
        filter(HarvestObject.harvest_source_id==harvest_job.source.id)

    for guid, package_id, modified_date, url in query:
        url_to_modified_db[url] = modified_date
        url_to_ids[url] = (guid, package_id)

    ###### Get current list of records from source ######

    url_to_modified_harvest = {}  ## mapping of url to last_modified in harvest
    try:
        for url, modified_date in _extract_waf(content, source_url, scraper):
            url_to_modified_harvest[url] = modified_date
    except Exception as e:
        msg = 'Error extracting URLs from %s, error was %s' % (source_url, e)
        self._save_gather_error(msg, harvest_job)
        return None

    ###### Compare source and db ######

    harvest_locations = set(url_to_modified_harvest.keys())
    old_locations = set(url_to_modified_db.keys())

    new = harvest_locations - old_locations
    delete = old_locations - harvest_locations
    possible_changes = old_locations & harvest_locations
    change = []

    for item in possible_changes:
        if (not url_to_modified_harvest[item] or
                not url_to_modified_db[item]  # if there is no date assume change
                or url_to_modified_harvest[item] > url_to_modified_db[item]):
            change.append(item)

    def create_extras(url, date, status):
        extras = [
            HOExtra(key='waf_modified_date', value=date),
            HOExtra(key='waf_location', value=url),
            HOExtra(key='status', value=status)
        ]
        if collection_package_id:
            extras.append(
                HOExtra(key='collection_package_id',
                        value=collection_package_id))
        return extras

    ids = []
    for location in new:
        guid = hashlib.md5(location.encode('utf8', 'ignore')).hexdigest()
        obj = HarvestObject(job=harvest_job,
                            extras=create_extras(
                                location,
                                url_to_modified_harvest[location],
                                'new'),
                            guid=guid)
        obj.save()
        ids.append(obj.id)

    for location in change:
        obj = HarvestObject(
            job=harvest_job,
            extras=create_extras(location,
                                 url_to_modified_harvest[location],
                                 'change'),
            guid=url_to_ids[location][0],
            package_id=url_to_ids[location][1],
        )
        obj.save()
        ids.append(obj.id)

    for location in delete:
        obj = HarvestObject(
            job=harvest_job,
            extras=create_extras('', '', 'delete'),
            guid=url_to_ids[location][0],
            package_id=url_to_ids[location][1],
        )
        model.Session.query(HarvestObject).\
            filter_by(guid=url_to_ids[location][0]).\
            update({'current': False}, False)
        obj.save()
        ids.append(obj.id)

    if len(ids) > 0:
        log.debug('{0} objects sent to the next stage: {1} new, '
                  '{2} change, {3} delete'.format(len(ids), len(new),
                                                  len(change), len(delete)))
        return ids
    else:
        self._save_gather_error('No records to change', harvest_job)
        return []
def gather_stage(self, harvest_job): log = logging.getLogger(__name__ + '.individual.gather') log.debug('DocHarvester gather_stage for job: %r', harvest_job) self.harvest_job = harvest_job # Get source URL url = harvest_job.source.url self._set_source_config(harvest_job.source.config) # Get contents try: content = self._get_content_as_unicode(url) except Exception as e: self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\ filter(HarvestObject.current==True).\ filter(HarvestObject.harvest_source_id==harvest_job.source.id).\ first() def create_extras(url, status): return [ HOExtra(key='doc_location', value=url), HOExtra(key='status', value=status) ] if not existing_object: guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest() harvest_object = HarvestObject(job=harvest_job, extras=create_extras(url, 'new'), guid=guid) else: harvest_object = HarvestObject( job=harvest_job, extras=create_extras(url, 'change'), guid=existing_object.guid, package_id=existing_object.package_id) harvest_object.add() # Check if it is an ISO document document_format = guess_standard(content) if document_format == 'iso': harvest_object.content = content else: extra = HOExtra(object=harvest_object, key='original_document', value=content) extra.save() extra = HOExtra(object=harvest_object, key='original_format', value=document_format) extra.save() harvest_object.save() return [harvest_object.id]
def gather_stage(self, harvest_job): self.log = logging.getLogger(__file__) self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job) self.job = harvest_job self.source_config = self._get_config(harvest_job) max_dataset = self.source_config.get('max_dataset', 100) wfs_url = self.source_config.get('wfs_url') wfs_version = self.source_config.get('wfs_version') collection = self.source_config.get('collection') typename = COLLECTION[collection].get('collection_typename') tag_typename = COLLECTION[collection].get('tag_typename', None) self.update_all = self.source_config.get('update_all', False) last_product_index = ( self._get_last_harvesting_index(harvest_job.source_id) ) if last_product_index: last_product_index = last_product_index + 1 else: last_product_index = 0 wfs = WFS(url=wfs_url, version=wfs_version) wfs.set_collection(typename) sortby=['When'] result = wfs.make_request(max_dataset, sortby, last_product_index) entries = result['features'] name = '{}_{}'.format(collection.lower(), '{}') ids = [] for entry in entries: entry_guid = unicode(uuid.uuid4()) entry_name = name.format(convert_to_clean_snakecase(entry['id'])) log.debug('gathering %s', entry_name) content = {} content['collection_content'] = entry if tag_typename: wfs.set_collection(tag_typename) filterxml = wfs.set_filter_equal_to('image_id', entry['id']) result = wfs.make_request(constraint=filterxml) result = wfs.get_request(constraint=filterxml) content['tag_url'] = result package_query = Session.query(Package) query_filtered = package_query.filter(Package.name == entry_name) package = query_filtered.first() if package: # Meaning we've previously harvested this, # but we may want to reharvest it now. previous_obj = Session.query(HarvestObject) \ .filter(HarvestObject.guid == entry_guid) \ .filter(HarvestObject.current == True) \ .first() # noqa: E712 if previous_obj: previous_obj.current = False previous_obj.save() if self.update_all: log.debug('{} already exists and will be updated.'.format( entry_name)) # noqa: E501 status = 'change' else: log.debug( '{} will not be updated.'.format(entry_name)) # noqa: E501 status = 'unchanged' elif not package: # It's a product we haven't harvested before. log.debug( '{} has not been harvested before. Creating a new harvest object.'. # noqa: E501 format(entry_name)) # noqa: E501 status = 'new' obj = HarvestObject( guid=entry_guid, job=self.job, extras=[ HOExtra(key='status', value=status), HOExtra(key='index', value=last_product_index) ]) obj.content = json.dumps(content) obj.package = None if status == 'new' else package obj.save() last_product_index += 1 ids.append(obj.id) return ids
def gather_stage(self, harvest_job):
    ''' analyze the source, return a list of IDs
        and create one HarvestObject per dataset '''
    logger.info('Starts Gather SIU Transp')
    # load paths
    self.set_paths()
    self.siu_data_lib.get_query_files()

    # basic things you'll need
    self.source = harvest_job.source
    self.source_config = json.loads(self.source.config)

    # allow to get config from URL
    # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
    config_from_url = self.source_config.get('from_url', None)
    if config_from_url is not None:
        logger.info('Updating config from URL')
        response = requests.get(config_from_url)
        update_config = response.json()
        self.source_config.update(update_config)

    self.siu_data_lib.base_url = self.source.url
    self.siu_data_lib.username = self.source_config['username']
    self.siu_data_lib.password = self.source_config['password']

    # ####################################
    # get previous harvested packages
    pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
    prev_names = [pkg['name'] for pkg in pfr['results']]
    logger.info('Get previous harvested objects {}'.format(prev_names))
    # TODO
    # ####################################

    object_ids = []  # list of IDs to process; this is what this function returns
    self.source_dataset = get_harvest_source(self.source.id)
    owner_org = self.source_dataset.get('owner_org')
    logger.info('Gather SIU Transp to ORG {}'.format(owner_org))

    # Iterate over each query to obtain different data sets
    # Each file in siu_transp_data/queries will yield multiple datasets to publish
    report = []  # summary of all results
    logger.info('Iter files')
    # check whether the config asks to override metadata in the datasets of each file
    override = self.source_config.get('override', {})
    logger.info("General override {}".format(override))
    for qf in self.siu_data_lib.query_files:
        only_files = self.source_config.get('only_files', None)
        query_file_name = qf.split('/')[-1]
        if only_files is not None:
            if query_file_name not in only_files:
                logger.info('Skipping file by config {}'.format(query_file_name))
                continue
        logger.info('Gather SIU Transp FILE {}'.format(qf))
        stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
        # open to read query params
        stqf.open()
        # request all data
        stqf.request_all(results_folder_path=self.results_folder_path)
        for err in stqf.errors:
            hgerr = HarvestGatherError(message=err, job=harvest_job)
            hgerr.save()

        # ====== Prepare dict to override datasets metadata ============
        override_this = override.get(query_file_name, {})
        logger.info("To override {}: {}".format(query_file_name, override_this))
        # extras need to be {"key": "extra name", "value": "extra value"}
        extras = override_this.get('extras', {})
        new_extras = []
        for extra_key, extra_value in extras.iteritems():
            logger.info("Override extra found {}: {}".format(extra_key, extra_value))
            if not isinstance(extra_value, str):
                extra_value = str(extra_value)
            new_extras.append({"key": extra_key, "value": extra_value})
        if len(new_extras) > 0:
            override_this['extras'] = new_extras

        # tags need to be {"name": "tag name"}
        tags = override_this.get('tags', [])
        new_tags = []
        for tag in tags:
            logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
            new_tags.append({"name": tag})
        if len(new_tags) > 0:
            override_this['tags'] = new_tags

        # groups need to be {"name": "tag name"}
        groups = override_this.get('groups', [])
        new_groups = []
        for group in groups:
            logger.info("Override group found {}".format(group))
            # check if groups must be created
            context = {'model': model, 'session': model.Session,
                       'user': self._get_user_name()}
            try:
                p.toolkit.get_action('group_create')(context, {"name": group})
            except Exception as e:
                logger.error('Error creating group (skipped) {}: {}'.format(group, e))
            new_groups.append({"name": group})
        if len(new_groups) > 0:
            override_this['groups'] = new_groups
        # ================================

        report += stqf.requests
        for dataset in stqf.datasets:
            if dataset['name'] in prev_names:
                action = 'update'
                # leave this list just with packages to remove
                prev_names.remove(dataset['name'])
            else:
                action = 'create'
            logger.info('Dataset {} to {}'.format(dataset['name'], action))

            ho_dict = {
                'title': dataset['title'],
                'name': dataset['name'],
                'owner_org': owner_org,
                'notes': dataset['notes'],
                'tags': dataset['tags'],
                'resources': dataset['resources'],
                'action': action
            }

            # fix extras if they exist
            ho_dict.update(override_this)
            logger.info("Overrided ho_dict {}".format(ho_dict))

            # Each harvest object will be passed to other stages in the harvest process
            obj = HarvestObject(guid=dataset['name'],
                                job=harvest_job,
                                content=json.dumps(ho_dict))
            obj.save()
            logger.info('Objects ID appends {}'.format(obj.id))
            object_ids.append(obj.id)

    # TODO compare with previously harvested data to remove datasets no longer at the harvest source

    # final summary
    logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))
    return object_ids
def gather_stage(self, harvest_job):
    # The gather stage scans a remote resource (in our case, the /data.json file) for
    # a list of datasets to import.

    log.debug('In datajson harvester gather_stage (%s)' % harvest_job.source.url)

    source = json.load(urllib2.urlopen(harvest_job.source.url))
    if len(source) == 0:
        return None

    # Loop through the packages we've already imported from this source
    # and go into their extra fields to get their source_datajson_identifier,
    # which corresponds to the /data.json 'identifier' field. Make a mapping
    # so we know how to update existing records.
    existing_datasets = {}
    for hobj in model.Session.query(HarvestObject).filter_by(
            source=harvest_job.source, current=True):
        try:
            pkg = get_action('package_show')(self.context(), {"id": hobj.package_id})
        except:
            # reference is broken
            continue
        for extra in pkg["extras"]:
            if extra["key"] == "source_datajson_identifier":
                existing_datasets[extra["value"]] = hobj.package_id

    # If we've lost the association to the HarvestSource, scan all packages in the database.
    if False:
        for pkg in model.Session.query(Package):
            if pkg.extras.get("source_datajson_url") == harvest_job.source.url \
                    and pkg.extras.get("source_datajson_identifier"):
                existing_datasets[pkg.extras["source_datajson_identifier"]] = pkg.id

    # Create HarvestObjects for the records in the /data.json file.
    object_ids = []
    seen_datasets = set()
    for dataset in source:
        # Create a new HarvestObject for this identifier and save the
        # dataset metadata inside it for later.

        # Get the package_id of this resource if we've already imported
        # it into our system. Otherwise, assign a brand new GUID to the
        # HarvestObject.
        if dataset['identifier'] in existing_datasets:
            pkg_id = existing_datasets[dataset["identifier"]]
            seen_datasets.add(pkg_id)
        else:
            pkg_id = uuid.uuid4().hex

        # Store in the HarvestObject the GUID of the existing dataset (if it
        # exists here already) and the dataset's metadata from the /data.json file.
        obj = HarvestObject(guid=pkg_id, job=harvest_job, content=json.dumps(dataset))
        obj.save()
        object_ids.append(obj.id)

    # Remove packages that are no longer in the /data.json file.
    for id in existing_datasets.values():
        if id not in seen_datasets:
            log.warn('deleting package %s because it is no longer in %s'
                     % (id, harvest_job.source.url))
            # mark the orphaned package as deleted
            get_action('package_delete')(self.context(), {"id": id})

    return object_ids
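# The datajson gather stage above only depends on the 'identifier' field of each
# /data.json entry; everything else is stored verbatim for the import stage. An
# illustrative (made-up) record, loosely following the Project Open Data schema:

sample_datajson_entry = {
    "identifier": "example-agency-dataset-001",  # used to match previously imported packages
    "title": "Example Dataset",
    "description": "Illustrative record only; field values are invented.",
    "keyword": ["example"],
}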
def _gather_entry(self, entry, path, row, update_all=False):
    # Create a harvest object for each entry
    entry_guid = unicode(uuid.uuid4())
    entry_name = entry.lower()
    log.debug('gathering %s', entry)

    package_query = Session.query(Package)
    query_filtered = package_query.filter(Package.name == entry_name)
    package = query_filtered.first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))
            status = 'unchanged'
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. '
                  'Creating a new harvest object.'.format(entry_name))
        status = 'new'

    obj = HarvestObject(
        guid=entry_guid,
        job=self.job,
        extras=[
            HOExtra(key='status', value=status),
            HOExtra(key='path', value=path),
            HOExtra(key='row', value=row)
        ])
    obj.content = entry
    obj.package = package if package else None
    obj.save()
    return obj.id
def _parse_products(self, products):
    """
    Iterate through the results, create harvest objects,
    and return their ids.
    """
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        entry_guid = entry['imgtif'].split('/')[1].lower() \
            + "_" + entry['type'] + "_" + str(entry['intid'])
        entry_name = entry_guid
        entry_restart_date = entry['master']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package
            obj.save()
            ids.append(obj.id)
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value='new'),
                                    HOExtra(key='restart_date', value=entry_restart_date)
                                ])
            new_counter += 1
            obj.content = json.dumps(entry)
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp,
                                 self.job.id, new_counter, 0))

    return ids
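# The summary line at the end of _parse_products() is only written when the harvester
# instance has a harvester_logger attribute. A minimal sketch of wiring one up with the
# standard logging module (logger name and file path are assumptions, not from the original):

import logging


def _setup_harvester_logger(self, log_path='harvester_gather.log'):
    # Hypothetical helper: dedicated file logger for the one-line gather summaries.
    logger = logging.getLogger('harvester_gather')
    if not logger.handlers:
        handler = logging.FileHandler(log_path)
        handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    self.harvester_logger = logger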
def gather_stage(self, harvest_job):
    requests_cache.install_cache()
    requests_cache.clear()
    session = requests_cache.CachedSession()

    self.log = logging.getLogger(__file__)
    self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)
    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    base_url = self.source_config.get('oai_pmh_url')
    metadata_prefix = self.source_config.get('metadata_prefix')
    start_date = self.source_config.get('start_date', None)
    self.update_all = self.source_config.get('update_all', False)

    last_token = self._get_last_harvesting_index(self.job.source_id, 'last_token')
    next_token = self._get_last_harvesting_index(self.job.source_id, 'next_token')
    next_station = self._get_last_harvesting_index(self.job.source_id, 'next_station')
    restart_date = self._get_last_harvesting_index(self.job.source_id, 'restart_date')
    restart_date = restart_date if last_token else None

    ids = []
    first_query = True
    while (ids == [] and next_token) or first_query:
        first_query = False
        current_token = last_token if next_station else next_token

        if current_token:
            query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                base_url, current_token)
        elif restart_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, restart_date)
        elif start_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, start_date)
        else:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                base_url, metadata_prefix)

        self.log.debug('Querying: {}.'.format(query_url))
        raw_list_ids = self.get_list_identifiers(session, query_url)
        list_stations, largest_datastamp = self.get_station_ids(raw_list_ids)
        next_token = self.get_resumption_token(raw_list_ids)
        last_token = current_token

        restart_date = restart_date if restart_date else ''
        restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

        if list_stations == []:
            next_station = None
        else:
            valid_deployment = None
            station_index = 0
            while not valid_deployment and station_index <= len(list_stations) - 1:
                station = list_stations[station_index]
                next_station = None if (next_station == station) else next_station
                if not next_station:
                    station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                        base_url, metadata_prefix, station)
                    print('Querying station: {}.'.format(station))
                    record = self.get_record(session, station_query)
                    if record:
                        station_info = StationInfo(record)
                        if station_info.isValid():
                            station_info.id = station
                            observation_list = station_info.get_observations()
                            station_dict = station_info.get_dict()
                            station_info = None
                            for observation in observation_list:
                                observation_info = ObservationInfo(session, observation)
                                deployments_list = observation_info.get_deployments()
                                observation_dict = observation_info.get_dict()
                                observation_info = None
                                for deployment in deployments_list:
                                    deployment_info = DeploymentInfo(session, deployment)
                                    if deployment_info.isValid():
                                        deployment_dict = deployment_info.get_dict()
                                        deployment_info = None
                                        valid_deployment = True
                                        if station_index + 1 <= len(list_stations) - 1:
                                            next_station = list_stations[station_index + 1]
                                        else:
                                            next_station = None

                                        entry_guid = unicode(uuid.uuid4())
                                        entry_id = '{}_{}'.format(
                                            station_dict['id'], deployment_dict['id'])
                                        entry_name = clean_snakecase(entry_id)
                                        self.log.debug('Gathering %s', entry_name)

                                        content = {}
                                        content['station'] = station_dict
                                        content['observation'] = observation_dict
                                        content['deployment'] = deployment_dict

                                        package_query = Session.query(Package)
                                        query_filtered = package_query.filter(
                                            Package.name == entry_name)
                                        package = query_filtered.first()

                                        if package:
                                            # Meaning we've previously harvested this,
                                            # but we may want to reharvest it now.
                                            previous_obj = Session.query(HarvestObject) \
                                                .filter(HarvestObject.guid == entry_guid) \
                                                .filter(HarvestObject.current == True) \
                                                .first()  # noqa: E712
                                            if previous_obj:
                                                previous_obj.current = False
                                                previous_obj.save()
                                            if self.update_all:
                                                self.log.debug(
                                                    '{} already exists and will be updated.'
                                                    .format(entry_name))
                                                status = 'change'
                                            else:
                                                self.log.debug(
                                                    '{} will not be updated.'
                                                    .format(entry_name))
                                                status = 'unchanged'
                                        else:
                                            # It's a product we haven't harvested before.
                                            self.log.debug(
                                                '{} has not been harvested before. '
                                                'Creating a new harvest object.'
                                                .format(entry_name))
                                            status = 'new'

                                        obj = HarvestObject(
                                            guid=entry_guid,
                                            job=self.job,
                                            extras=[
                                                HOExtra(key='status', value=status),
                                                HOExtra(key='last_token', value=last_token),
                                                HOExtra(key='next_token', value=next_token),
                                                HOExtra(key='next_station', value=next_station),
                                                HOExtra(key='restart_date', value=restart_date)
                                            ])
                                        obj.content = json.dumps(content)
                                        obj.package = None if status == 'new' else package
                                        obj.save()
                                        ids.append(obj.id)
                            if not valid_deployment:
                                self.log.debug(
                                    'Station {} does not have valid deployments.'.format(station))
                        else:
                            self.log.debug('Station {} is not valid.'.format(station))
                station_index += 1

    return ids
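# The OSCAR gather stage resumes between jobs through _get_last_harvesting_index().
# A minimal sketch of that helper, assuming the resumption values are stored as
# HarvestObject extras (as done above) and that ckanext-harvest's HarvestObject model
# exposes harvest_source_id, gathered and an extras relationship (assumptions):

def _get_last_harvesting_index(self, source_id, key):
    # Hypothetical helper: read a resumption value ('last_token', 'next_token',
    # 'next_station' or 'restart_date') from the most recently gathered object
    # of this source; return None if nothing was harvested yet.
    last_obj = Session.query(HarvestObject) \
        .filter(HarvestObject.harvest_source_id == source_id) \
        .order_by(HarvestObject.gathered.desc()) \
        .first()
    if not last_obj:
        return None
    for extra in last_obj.extras:
        if extra.key == key:
            return extra.value
    return None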
def reimport_batch(self, package_ids, context):
    '''Batch-reimport all packages in `package_ids` from their original
       harvest source.'''

    ckan_fb_mapping = {}

    # first, do the checks that can be done without connecting to FIS-Broker
    for package_id in package_ids:
        package = Package.get(package_id)
        if not package:
            raise PackageIdDoesNotExistError(package_id)
        if not dataset_was_harvested(package):
            raise PackageNotHarvestedError(package_id)
        harvester = harvester_for_package(package)
        harvester_url = harvester.url
        harvester_type = harvester.type
        if harvester_type != HARVESTER_ID:
            raise PackageNotHarvestedInFisbrokerError(package_id)
        fb_guid = fisbroker_guid(package)
        if not fb_guid:
            raise NoFisbrokerIdError(package_id)

        ckan_fb_mapping[package.id] = fb_guid

    # get the harvest source for FIS-Broker datasets
    fb_source = get_fisbroker_source()
    if not fb_source:
        raise NoFBHarvesterDefined()
    source_id = fb_source.get('id', None)

    # create and start a new harvest job
    job_dict = toolkit.get_action('harvest_job_create')(context, {'source_id': source_id})
    harvest_job = HarvestJob.get(job_dict['id'])
    harvest_job.gather_started = datetime.datetime.utcnow()
    assert harvest_job

    # instantiate the CSW connector (on the reasonable assumption that harvester_url is
    # the same for all package_ids)
    package_id = None
    reimported_packages = []
    try:
        csw = CatalogueServiceWeb(harvester_url)
        for package_id, fb_guid in ckan_fb_mapping.items():
            # query the connector to get the resource document
            csw.getrecordbyid([fb_guid], outputschema=namespaces['gmd'])

            # show resource document
            record = csw.records.get(fb_guid, None)
            if record:
                obj = HarvestObject(guid=fb_guid,
                                    job=harvest_job,
                                    content=record.xml,
                                    package_id=package_id,
                                    extras=[
                                        HarvestObjectExtra(key='status', value='change'),
                                        HarvestObjectExtra(key='type', value='reimport'),
                                    ])
                obj.save()
                assert obj, obj.content

                harvester = FisbrokerPlugin()
                harvester.force_import = True
                harvester.import_stage(obj)
                rejection_reason = self._dataset_rejected(obj)
                if rejection_reason:
                    raise FBImportError(package_id, rejection_reason)

                harvester.force_import = False
                Session.refresh(obj)

                reimported_packages.append(record)
            else:
                raise NotFoundInFisbrokerError(package_id, fb_guid)

    except RequestException as error:
        raise NoConnectionError(package_id, harvester_url, str(error.__class__.__name__))

    # successfully finish the harvest job
    harvest_job.status = u'Finished'
    harvest_job.finished = datetime.datetime.utcnow()
    harvest_job.save()

    return reimported_packages
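# Hypothetical invocation of reimport_batch(), e.g. from a controller action or a
# CLI command that has an instance of the class above at hand; the user name and
# package id below are placeholders, not values from the original code:

def reimport_example(controller):
    context = {'model': model, 'session': model.Session, 'user': 'harvest-admin'}
    return controller.reimport_batch(['example-package-id'], context)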