def validation_report(package_id=None):
    '''
    Build a table that sets, for each current harvest object, the validation
    errors recorded against it beside the errors reported by the current
    validators.

    Useful as a preview when the validators are about to be updated.

    :param package_id: when truthy, only harvest objects with this package
        id are examined; otherwise every current harvest object is.
    :returns: a ReportTable with one row per harvest object examined.
    '''
    validator = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validator.profiles)

    harvest_objects = (
        model.Session.query(HarvestObject)
        .filter_by(current=True)
        .order_by(HarvestObject.fetch_finished.desc())
    )
    if package_id:
        harvest_objects = harvest_objects.filter(
            HarvestObject.package_id == package_id)

    report = ReportTable([
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors',
    ])

    # Stored error messages containing either fragment are treated as
    # validation errors from the last import.
    validation_markers = ('not a valid Gemini', 'Validating against')

    for obj in harvest_objects:
        old_errors = [
            error.message
            for error in obj.errors
            if any(marker in error.message for marker in validation_markers)
        ]

        package_groups = obj.package.get_groups()
        if package_groups:
            publisher = package_groups[0].title
        else:
            publisher = '(none)'

        # Re-run the current validators over the stored document.
        tree = etree.fromstring(obj.content.encode("utf-8"))
        valid, new_errors = validator.is_valid(tree)

        row = {
            'Harvest Object id': obj.id,
            'GEMINI2 id': obj.guid,
            'Date fetched': obj.fetch_finished,
            'Dataset name': obj.package.name,
            'Publisher': publisher,
            'Source URL': obj.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        }
        report.add_row_dict(row)

    log.debug('%i results', harvest_objects.count())
    return report
def validate_file(self):
    '''
    Check the command arguments and the encoding of a metadata file, then
    load the validators and print their profiles.

    ``self.args`` must hold exactly two items; the second is the path of
    the metadata file.

    Prints a message and exits with status 1 when the number of arguments
    is wrong, the file does not exist, or its content is not valid UTF-8.
    '''
    arg_count = len(self.args)
    if arg_count > 2:
        print(u'Too many parameters %i' % arg_count)
        sys.exit(1)
    if arg_count < 2:
        print(u'Not enough parameters %i' % arg_count)
        sys.exit(1)

    metadata_filepath = self.args[1]
    if not os.path.exists(metadata_filepath):
        print(u'Filepath %s not found' % metadata_filepath)
        sys.exit(1)

    with open(metadata_filepath, u'rb') as f:
        metadata_xml = f.read()

    # The file was read as bytes, so it has to be *decoded* to find out
    # whether it is UTF-8. Calling .encode() on it rejected valid non-ASCII
    # UTF-8 on Python 2 and fails outright on Python 3. The check runs
    # before the validators are loaded so that a bad file fails fast.
    try:
        metadata_xml.decode(u'utf-8')
    except UnicodeDecodeError as e:
        print(u'ERROR: Unicode Error reading file \'%s\': %s' %
              (metadata_filepath, e))
        sys.exit(1)

    # Imported only once the cheap input checks have passed.
    from ckanext.spatial.harvesters import SpatialHarvester

    validators = SpatialHarvester()._get_validator()
    print(u'Validators: %r' % validators.profiles)
def validate_file(metadata_filepath):
    '''
    Validate a metadata XML file and print a summary of the outcome.

    The file is validated with the validators supplied by SpatialHarvester
    and, when that succeeds, its values are read through ISODocument as a
    further check.

    :param metadata_filepath: path of the XML metadata file to validate.

    Prints a message and exits with status 1 when the file does not exist
    or its content is not valid UTF-8.
    '''
    if not os.path.exists(metadata_filepath):
        print('Filepath %s not found' % metadata_filepath)
        sys.exit(1)

    with open(metadata_filepath, 'rb') as f:
        metadata_xml = f.read()

    # The content is bytes (file opened 'rb'); bytes have no .encode() on
    # Python 3, so decode to confirm the file really is UTF-8. The bytes
    # themselves are what gets parsed below.
    try:
        metadata_xml.decode('utf-8')
    except UnicodeDecodeError as e:
        print('ERROR: Unicode Error reading file \'%s\': %s' %
              (metadata_filepath, e))
        sys.exit(1)

    # Imported only once the cheap input checks have passed.
    from ckanext.spatial.harvesters import SpatialHarvester
    from ckanext.spatial.model import ISODocument

    validators = SpatialHarvester()._get_validator()
    print('Validators: %r' % validators.profiles)

    xml = etree.fromstring(metadata_xml)

    # XML validation
    valid, errors = validators.is_valid(xml)

    # CKAN read of values
    if valid:
        try:
            ISODocument(metadata_xml).read_values()
        except Exception as e:
            # Any failure while reading is reported as a validation error
            # rather than aborting the command.
            valid = False
            errors.append(
                'CKAN exception reading values from ISODocument: %s' % e)

    print('***************')
    print('Summary')
    print('***************')
    print('File: \'%s\'' % metadata_filepath)
    print('Valid: %s' % valid)
    if not valid:
        print('Errors:')
        # pprint() writes to stdout itself and returns None, so it must not
        # be wrapped in print().
        pprint(errors)
    print('***************')
def validate_file(self):
    '''
    Validate the metadata file named on the command line and print the
    result.

    ``self.args`` must hold exactly two items; the second is the path of
    the metadata file.

    Prints a message and exits with status 1 when the number of arguments
    is wrong or the file does not exist.
    '''
    arg_count = len(self.args)
    if arg_count > 2:
        print('Too many parameters %i' % arg_count)
        sys.exit(1)
    if arg_count < 2:
        print('Not enough parameters %i' % arg_count)
        sys.exit(1)

    metadata_filepath = self.args[1]
    if not os.path.exists(metadata_filepath):
        print('Filepath %s not found' % metadata_filepath)
        sys.exit(1)

    with open(metadata_filepath, 'rb') as f:
        metadata_xml = f.read()

    # Imported only once the cheap input checks have passed.
    from ckanext.spatial.harvesters import SpatialHarvester

    validators = SpatialHarvester()._get_validator()
    print('Validators: %r' % validators.profiles)

    # The file content is already bytes, so it is parsed as-is. Calling
    # .encode() on it broke on non-ASCII content under Python 2 and does
    # not exist for bytes on Python 3.
    xml = etree.fromstring(metadata_xml)

    valid, errors = validators.is_valid(xml)
    print('Valid: %s' % valid)
    if not valid:
        print('Errors:')
        # pprint() writes to stdout itself and returns None, so it must not
        # be wrapped in print().
        pprint(errors)
def validation_report(package_id=None):
    '''
    Build a table that sets, for each current harvest object, the validation
    errors recorded against it beside the errors reported by the current
    validators, and log how many objects fail under each.

    Useful as a preview when the validators are about to be updated.

    :param package_id: when truthy, only harvest objects with this package
        id are examined; otherwise every current harvest object is.
    :returns: a ReportTable with one row per harvest object examined.
    '''
    logger = logging.getLogger(__name__ + '.validation_report')

    validator = SpatialHarvester()._get_validator()
    logger.debug('Validators: %r', validator.profiles)

    harvest_objects = (
        model.Session.query(HarvestObject)
        .filter_by(current=True)
        .order_by(HarvestObject.fetch_finished.desc())
    )
    if package_id:
        harvest_objects = harvest_objects.filter(
            HarvestObject.package_id == package_id)

    report = ReportTable([
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors',
    ])

    # Stored error messages containing either fragment are treated as
    # validation errors from the last import.
    validation_markers = ('not a valid Gemini', 'Validating against')

    failed_before = 0
    failed_now = 0
    for obj in harvest_objects:
        old_errors = [
            error.message
            for error in obj.errors
            if any(marker in error.message for marker in validation_markers)
        ]
        if old_errors:
            failed_before += 1

        package_groups = obj.package.get_groups()
        if package_groups:
            publisher = package_groups[0].title
        else:
            publisher = '(none)'

        # Re-run the current validators over the stored document.
        tree = etree.fromstring(obj.content.encode("utf-8"))
        valid, new_errors = validator.is_valid(tree)
        if not valid:
            failed_now += 1

        row = {
            'Harvest Object id': obj.id,
            'GEMINI2 id': obj.guid,
            'Date fetched': obj.fetch_finished,
            'Dataset name': obj.package.name,
            'Publisher': publisher,
            'Source URL': obj.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
        }
        report.add_row_dict(row)

    logger.debug('%i results', harvest_objects.count())
    logger.debug('%i failed old validation', failed_before)
    logger.debug('%i failed new validation', failed_now)
    return report
def wms_check(self):
    '''
    Print the result of ``SpatialHarvester._is_wms`` for the URL given on
    the command line.

    ``self.args`` must hold exactly two items; the second is the URL to
    check.

    :raises AssertionError: when ``self.args`` does not hold exactly two
        items.
    '''
    arg_count = len(self.args)
    if arg_count != 2:
        # Raised explicitly rather than through an ``assert`` statement so
        # the check still runs under ``python -O``; the exception type and
        # message are the same as before.
        raise AssertionError(
            'Wrong number of args. Got %s rather than 2' % arg_count)
    wms_url = self.args[1]

    from ckanext.spatial.harvesters import SpatialHarvester
    print(SpatialHarvester._is_wms(wms_url))
class Validation(CkanCommand): '''Validation commands Usage: validation report [package-name] Performs validation on the harvested metadata, either for all packages or the one specified. validation report-csv <filename>.csv Performs validation on all the harvested metadata in the db and writes a report in CSV format to the given filepath. validation file <filename>.xml Performs validation on the given metadata file. ''' summary = __doc__.split('\n')[0] usage = __doc__ max_args = 3 min_args = 0 def command(self): if not self.args or self.args[0] in ['--help', '-h', 'help']: print self.usage sys.exit(1) self._load_config() cmd = self.args[0] if cmd == 'report': self.report() elif cmd == 'report-csv': self.report_csv() elif cmd == 'file': self.validate_file() else: print 'Command %s not recognized' % cmd def report(self): from ckan import model from ckanext.harvest.model import HarvestObject from ckanext.spatial.lib.reports import validation_report if len(self.args) >= 2: package_ref = unicode(self.args[1]) pkg = model.Package.get(package_ref) if not pkg: print 'Package ref "%s" not recognised' % package_ref sys.exit(1) else: pkg = None report = validation_report(package_id=pkg.id) for row in report.get_rows_html_formatted(): print for i, col_name in enumerate(report.column_names): print ' %s: %s' % (col_name, row[i]) def validate_file(self): from ckanext.spatial.harvesters import SpatialHarvester from ckanext.spatial.model import GeminiDocument if len(self.args) > 2: print 'Too many parameters %i' % len(self.args) sys.exit(1) if len(self.args) < 2: print 'Not enough parameters %i' % len(self.args) sys.exit(1) metadata_filepath = self.args[1] if not os.path.exists(metadata_filepath): print 'Filepath %s not found' % metadata_filepath sys.exit(1) with open(metadata_filepath, 'rb') as f: metadata_xml = f.read() # this is still encoded - hopefully as UTF8. If not, then it needs # decoding and recoding as UTF8. # Check it is UTF8, as that's what etree expects. 
try: decoded = metadata_xml.decode("utf-8") reencoded = decoded.encode("utf-8") except UnicodeDecodeError, e: print 'ERROR: File was not UTF8 \'%s\': %s' % \ (metadata_filepath, e) sys.exit(1) # etree.fromstring accepts either a unicode string or the encoding is # expressed in the <xml> tag. NB 'UTF-8' is correct, 'UTF8' is wrong. xml = etree.fromstring(metadata_xml) # XML validation validators = SpatialHarvester()._get_validator() print 'Validators: %r' % validators.profiles valid, errors = validators.is_valid(xml) # CKAN read of values if valid: try: gemini_document = GeminiDocument(metadata_xml) gemini_values = gemini_document.read_values() except Exception, e: valid = False errors.append( 'CKAN exception reading values from GeminiDocument: %s' % e)