def gather_stage(self, harvest_job):
    """Gather stage: list Socrata dataset ids from the catalogue's DCAT feed.

    Creates one HarvestObject per plain dataset id and returns the list of
    HarvestObject ids, or None when nothing was gathered.
    """
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)
    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)
    # Replaced raw debug print()s with proper logging.
    log.debug('Gathered %d candidate dataset ids: %s' % (len(package_ids), package_ids))
    try:
        object_ids = []
        if package_ids:
            for package_id in package_ids:
                # Some DCAT entries carry full URLs instead of plain ids; skip those.
                if "http" not in package_id:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
            return object_ids
        else:
            # BUG FIX: original interpolated an undefined name `url` here
            # (NameError); report the DCAT url that was actually queried.
            self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                                    harvest_job)
            return None
    except Exception as e:
        # BUG FIX: `except Exception, e` is Python-2-only syntax; `e.message`
        # is unreliable — format the exception itself instead.
        self._save_gather_error('%r' % e, harvest_job)
def import_stage(self, harvest_object):
    '''
    Imports each dataset from Socrata, into the CKAN server

    Mirrors the dataset's metadata into MongoDB (insert on first sighting,
    replace + mark updated afterwards), then creates/updates the CKAN
    package. Returns True on success, False on failure.
    '''
    log.debug('In SocrataHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id,
                                harvest_object, 'Import')
        return False
    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)
    try:
        d = socrataAdaptor()
        log.debug("Converting View")
        stripped_source = harvest_object.source.url.rstrip('/')
        # Convert the harvested Socrata view XML into a CKAN package dict.
        package_dict = d.convertViewXml(harvest_object.id, stripped_source,
                                        harvest_object.content)
        package_dict.update({"catalogue_url": str(harvest_object.source.url.rstrip('/'))})
        package_dict.update({"platform": "socrata"})
        # Socrata 'category' is stored as a CKAN extra, not a top-level key.
        if 'category' in package_dict:
            package_dict['extras'].update({'category': package_dict['category']})
            del package_dict['category']
        log.debug(package_dict)
        # NOTE(review): `ids` is not defined in this method — presumably a
        # module-level collection of already-harvested ids; verify.
        if package_dict['id'] not in ids:
            # First sighting of this dataset: stamp creation time and insert.
            metadata_created = datetime.datetime.now()
            package_dict.update({"metadata_created": str(metadata_created)})
            socrata_db.save(package_dict)
            # BUG FIX: corrected 'succesfully' typo in log message.
            log.info('Metadata saved successfully to MongoDb.')
        else:
            # Known dataset: preserve original creation time, mark as updated,
            # and replace the stored document.
            document = socrata_db.find_one({"id": package_dict['id']})
            met_created = document['metadata_created']
            package_dict.update({'metadata_created': met_created})
            package_dict.update({'metadata_updated': str(datetime.datetime.now())})
            package_dict.update({'updated_dataset': True})
            socrata_db.remove({"id": package_dict['id']})
            socrata_db.save(package_dict)
            # BUG FIX: corrected 'succesfully' typo in log message.
            log.info('Metadata updated successfully to MongoDb.')
        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])
        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend(
                [g for g in default_groups if g not in package_dict['groups']])
        log.debug(package_dict)
        result = self._create_or_update_package(package_dict, harvest_object)
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])
            # Clear default permissions
            model.clear_user_roles(package)
            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            pkg_role = model.PackageRole(package=package, user=user,
                                         role=model.Role.ADMIN)
            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user,
                                             role=model.Role.READER)
        return True
    except ValidationError as e:
        self._save_object_error('Invalid package with GUID %s: %r' %
                                (harvest_object.guid, e.error_dict),
                                harvest_object, 'Import')
        log.debug("Validation Error: %s", harvest_object.guid)
        # BUG FIX: signal failure explicitly instead of falling through
        # and implicitly returning None.
        return False
def gather_stage(self, harvest_job):
    """Gather stage: list dataset ids from the catalogue's DCAT feed, flag
    datasets missing from the feed as deleted in MongoDB, and create one
    HarvestObject per remaining id.

    Returns the list of HarvestObject ids, or None when nothing was gathered.
    """
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)
    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)

    # Load existing dataset ids and names for this catalogue from MongoDB.
    # (Index-based while loops replaced with idiomatic iteration.)
    catalogue_url = harvest_job.source.url.rstrip('/')
    datasets = list(socrata_db.find({'catalogue_url': catalogue_url}))
    datasets_ids = [doc['id'] for doc in datasets]
    datasets_names = [doc['name'] for doc in datasets]

    # Drop every id still present at the source; whatever remains was deleted.
    # NOTE(review): the removal from datasets_names compares *names* against
    # package ids; this looks suspect but is preserved as-is — verify intent.
    for temp_pckg_id in package_ids:
        if temp_pckg_id in datasets_ids:
            datasets_ids.remove(temp_pckg_id)
        if temp_pckg_id in datasets_names:
            datasets_names.remove(temp_pckg_id)

    # Mark leftover (deleted) datasets, matching on whichever list shrank more.
    if len(datasets_names) < len(datasets_ids):
        for name in datasets_names:
            for document in datasets:
                if name in document['name']:
                    document.update({"deleted_dataset": True})
                    socrata_db.save(document)
    else:
        for dataset_id in datasets_ids:
            for document in datasets:
                if dataset_id in document['id']:
                    document.update({"deleted_dataset": True})
                    socrata_db.save(document)

    try:
        object_ids = []
        if package_ids:
            for package_id in package_ids:
                # Some DCAT entries carry full URLs instead of plain ids; skip those.
                if "http" not in package_id:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
            return object_ids
        else:
            # BUG FIX: original interpolated an undefined name `url` here
            # (NameError); report the DCAT url that was actually queried.
            self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                                    harvest_job)
            return None
    except Exception as e:
        # BUG FIX: `except Exception, e` is Python-2-only syntax; `e.message`
        # is unreliable — format the exception itself instead.
        self._save_gather_error('%r' % e, harvest_job)
def import_stage(self, harvest_object):
    '''
    Imports each dataset from Socrata, into the CKAN server

    Mirrors the dataset's metadata into MongoDB (insert on first sighting,
    in-place replace afterwards), maintains per-catalogue new/updated
    counters via _bump_fetch_counter, then creates/updates the CKAN
    package. Returns True on success, False on failure.
    '''
    log.debug('In SocrataHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id,
                                harvest_object, 'Import')
        return False
    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)
    try:
        mainurl = str(harvest_object.source.url.rstrip('/'))
        # Look up the catalogue language registered for this harvest source.
        # BUG FIX: the original read an undefined `base_url` inside a bare
        # `try/except: pass`, so the lookup always raised NameError and
        # `language` silently stayed "". Use the catalogue url instead.
        language = ""
        doc = db_jobs.find_one({"cat_url": mainurl})
        if doc and 'language' in doc:
            language = doc['language']
        d = socrataAdaptor()
        log.debug("Converting View")
        package_dict = d.convertViewXml(harvest_object.id,
                                        harvest_object.source.url.rstrip('/'),
                                        harvest_object.content)
        package_dict.update({"catalogue_url": mainurl})
        package_dict.update({"platform": "socrata"})
        package_dict.update({"language": language})
        # Prefer the rendered notes when present.
        if 'notes_rendered' in package_dict:
            package_dict.update({"notes": package_dict['notes_rendered']})
            del package_dict['notes_rendered']
        # Socrata 'category' is stored as a CKAN extra, not a top-level key.
        if 'category' in package_dict:
            package_dict['extras'].update({'category': package_dict['category']})
            del package_dict['category']
        log.debug(package_dict)
        document = socrata_db.find_one({"catalogue_url": harvest_object.source.url.rstrip('/'),
                                        'id': package_dict['id']})
        if document is None:
            # First sighting of this dataset: stamp creation time and insert.
            package_dict.update({"metadata_created": str(datetime.datetime.now())})
            socrata_db.save(package_dict)
            # BUG FIX: corrected 'succesfully' typo in log message.
            log.info('Metadata saved successfully to MongoDb.')
            self._bump_fetch_counter(mainurl, 'new')
        else:
            # Known dataset: preserve creation time (and 'copied' flag) and
            # overwrite the stored document in place via its Mongo _id.
            package_dict.update({'metadata_created': document['metadata_created']})
            if 'copied' in document:
                package_dict.update({'copied': document['copied']})
            package_dict.update({'metadata_updated': str(datetime.datetime.now())})
            package_dict.update({'updated_dataset': True})
            package_dict.update({'_id': document['_id']})
            socrata_db.save(package_dict)
            # BUG FIX: corrected 'succesfully' typo in log message.
            log.info('Metadata updated successfully to MongoDb.')
            self._bump_fetch_counter(mainurl, 'updated')
        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])
        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend(
                [g for g in default_groups if g not in package_dict['groups']])
        log.debug(package_dict)
        result = self._create_or_update_package(package_dict, harvest_object)
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])
            # Clear default permissions
            model.clear_user_roles(package)
            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            pkg_role = model.PackageRole(package=package, user=user,
                                         role=model.Role.ADMIN)
            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user,
                                             role=model.Role.READER)
        return True
    except ValidationError as e:
        self._save_object_error('Invalid package with GUID %s: %r' %
                                (harvest_object.guid, e.error_dict),
                                harvest_object, 'Import')
        # BUG FIX: replaced typo'd debug print('ValidationErrorr') with the
        # logging used by the sibling import_stage, and signal failure
        # explicitly instead of implicitly returning None.
        log.debug("Validation Error: %s", harvest_object.guid)
        return False

def _bump_fetch_counter(self, mainurl, counter):
    """Increment the running per-catalogue dataset counter in db_fetch_temp.

    `counter` is 'new' or 'updated'. When the catalogue being harvested
    changes, the previous catalogue's counters are first flushed into its
    db_jobs document (prior values preserved as last_new / last_updated),
    then the temp document is reset for the new catalogue.

    Extracted from import_stage, where this bookkeeping was duplicated
    nearly verbatim for the 'new' and 'updated' branches.
    """
    other = 'updated' if counter == 'new' else 'new'
    fetch_document = db_fetch_temp.find_one()
    if fetch_document is None:
        # No stats document yet: start counting for this catalogue.
        db_fetch_temp.save({"cat_url": mainurl, counter: 1, other: 0})
        return
    if mainurl == fetch_document['cat_url']:
        # Same catalogue as the previous object: just bump the counter.
        fetch_document.update({counter: fetch_document[counter] + 1})
        db_fetch_temp.save(fetch_document)
        return
    # Catalogue changed: archive the previous counters into its job document.
    doc = db_jobs.find_one({'cat_url': fetch_document['cat_url']})
    for key in ('new', 'updated'):
        if key in fetch_document:
            if key in doc:
                doc.update({'last_' + key: doc[key]})
            doc.update({key: fetch_document[key]})
            db_jobs.save(doc)
    # Reset the temp document for the new catalogue.
    fetch_document.update({"cat_url": mainurl, counter: 1, other: 0})
    db_fetch_temp.save(fetch_document)
def import_stage(self, harvest_object):
    '''
    Imports each dataset from Socrata, into the CKAN server

    Converts the harvested view XML into a package dict, applies configured
    default tags/groups, and creates/updates the CKAN package. Returns True
    on success, False on failure.
    '''
    log.debug('In SocrataHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id,
                                harvest_object, 'Import')
        return False
    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)
    try:
        d = socrataAdaptor()
        log.debug("Converting View")
        package_dict = d.convertViewXml(harvest_object.id,
                                        harvest_object.source.url.rstrip('/'),
                                        harvest_object.content)
        log.debug(package_dict)
        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])
        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend(
                [g for g in default_groups if g not in package_dict['groups']])
        log.debug(package_dict)
        result = self._create_or_update_package(package_dict, harvest_object)
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])
            # Clear default permissions
            model.clear_user_roles(package)
            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            pkg_role = model.PackageRole(package=package, user=user,
                                         role=model.Role.ADMIN)
            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user,
                                             role=model.Role.READER)
        # BUG FIX: signal success explicitly — the other import_stage
        # variants in this file return True; this one fell through
        # returning None, which harvest callers treat as failure.
        return True
    except ValidationError as e:
        self._save_object_error('Invalid package with GUID %s: %r' %
                                (harvest_object.guid, e.error_dict),
                                harvest_object, 'Import')
        # BUG FIX: signal failure explicitly, consistent with the guard
        # clauses above.
        return False