def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Looks inside a data file's contents to determine its format and score.
    It adds strings to score_reasons list about how it came to the
    conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    # Without a cached copy of the file there is nothing to sniff.
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)
    else:
        if filepath:
            sniffed_format = sniff_file_format(filepath, log)
            # Map the sniffed format to its openness score; stays None when
            # the format could not be recognised.
            score = lib.resource_format_scores().get(sniffed_format['format']) \
                if sniffed_format else None
            if sniffed_format:
                score_reasons.append(
                    _('Content of file appeared to be format "%s" which receives openness score: %s.'
                    ) % (sniffed_format['format'], score))
                return score, sniffed_format['format']
            else:
                score_reasons.append(
                    _('The format of the file was not recognized from its contents.'
                    ))
                return (None, None)
        else:
            # No cache_url
            # NOTE(review): this branch appears unreachable -- the first
            # guard already returned when cache_filepath was falsy, so
            # filepath is always truthy here. Confirm against the archiver's
            # data model before removing.
            if archival.status_id == Status.by_text('Chose not to download'):
                score_reasons.append(
                    _('File was not downloaded deliberately') + '. ' +
                    _('Reason') + ': %s. ' % archival.reason +
                    _('Using other methods to determine file openness.'))
                return (None, None)
            elif archival.is_broken is None and archival.status_id:
                # i.e. 'Download failure' or 'System error during archival'
                score_reasons.append(
                    _('A system error occurred during downloading this file') +
                    '. ' + _('Reason') + ': %s. ' % archival.reason +
                    _('Using other methods to determine file openness.'))
                return (None, None)
            else:
                score_reasons.append(
                    _('This file had not been downloaded at the time of scoring it.'
                    ))
                return (None, None)
def sniff(args):
    """Sniff and report the data format of every file path in *args*.

    Exits with status 1 when no paths were supplied; otherwise prints one
    line per file: the detected format, or an error when unrecognised.
    """
    from ckanext.qa.sniff_format import sniff_file_format

    if not args:
        print('Not enough arguments', args)
        sys.exit(1)
    for path in args:
        detected = sniff_file_format(path)
        if not detected:
            print('ERROR: Could not recognise format of: %s' % path)
        else:
            print('Detected as: %s - %s' % (detected['format'], path))
def sniff(self):
    """CLI command: sniff and report the format of each file in self.args[1:].

    self.args[0] is the command name itself; exits with status 1 when no
    file paths follow it. Prints the sniffed display name per file, or an
    error line when the format is not recognised.
    """
    from ckanext.qa.sniff_format import sniff_file_format

    # Fix: Python 2 `print` statements converted to print() calls, matching
    # the Python 3 syntax already used elsewhere in this file.
    if len(self.args) < 2:
        print('Not enough arguments', self.args)
        sys.exit(1)
    for filepath in self.args[1:]:
        format_ = sniff_file_format(filepath,
                                    logging.getLogger('ckanext.qa.sniffer'))
        if format_:
            print('Detected as: %s - %s' % (format_['display_name'], filepath))
        else:
            print('ERROR: Could not recognise format of: %s' % filepath)
def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
    '''Check that sniffing *filepath* yields the format named by
    *format_extension* (e.g. "csv" or "csv.zip").'''
    expected = format_extension
    detected = sniff_file_format(filepath, log)
    assert detected, expected
    assert_equal(detected['format'].lower(), expected.replace('.zip', ''))
    # .zip and .gzip files are both reported under the 'ZIP' container
    # (gzip lumped together with zip for simplicity now).
    if expected.endswith(('.zip', '.gzip')):
        container = 'ZIP'
    else:
        container = None
    assert_equal(detected.get('container'), container)
def sniff(self):
    """CLI command: sniff and report the format of each file in self.args[1:].

    self.args[0] is the command name itself; exits with status 1 when no
    file paths follow it. Prints the sniffed display name per file, or an
    error line when the format is not recognised.
    """
    from ckanext.qa.sniff_format import sniff_file_format

    # Fix: Python 2 `print` statements converted to print() calls, matching
    # the Python 3 syntax already used elsewhere in this file.
    if len(self.args) < 2:
        print('Not enough arguments', self.args)
        sys.exit(1)
    for filepath in self.args[1:]:
        format_ = sniff_file_format(
            filepath, logging.getLogger('ckanext.qa.sniffer'))
        if format_:
            print('Detected as: %s - %s' % (format_['display_name'], filepath))
        else:
            print('ERROR: Could not recognise format of: %s' % filepath)
def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
    """Verify the sniffer detects *filepath* as the *format_extension*
    format, including the expected archive container (if any)."""
    detected = sniff_file_format(filepath, log)
    assert detected, format_extension
    stripped = format_extension.replace(".zip", "")
    assert_equal(detected["format"].lower(), stripped)
    # Both .zip and .gzip are lumped together under the 'ZIP' container.
    container = "ZIP" if format_extension.endswith((".zip", ".gzip")) else None
    assert_equal(detected.get("container"), container)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Inspect the cached data file's contents to determine its format and
    openness score, appending human-readable reasoning to *score_reasons*.

    Returns a (score, format_string) tuple; either element is None when it
    could not be determined.
    """
    # No cached download means there is nothing to sniff.
    if not archival or not archival.cache_filepath:
        score_reasons.append("This file had not been downloaded at the time of scoring it.")
        return (None, None)

    # Analyse the cached file
    cached_path = archival.cache_filepath
    if not os.path.exists(cached_path):
        score_reasons.append('Cache filepath does not exist: "%s".' % cached_path)
        return (None, None)

    if cached_path:
        detected = sniff_file_format(cached_path, log)
        if not detected:
            score_reasons.append("The format of the file was not recognized from its contents.")
            return (None, None)
        score = lib.resource_format_scores().get(detected["format"])
        score_reasons.append(
            'Content of file appeared to be format "%s" which receives openness score: %s.'
            % (detected["format"], score)
        )
        return score, detected["format"]

    # No cache_url
    if archival.status_id == Status.by_text("Chose not to download"):
        score_reasons.append(
            "File was not downloaded deliberately. Reason: %s. Using other methods to determine file openness."
            % archival.reason
        )
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            "A system error occurred during downloading this file. Reason: %s. Using other methods to determine file openness."
            % archival.reason
        )
        return (None, None)
    score_reasons.append("This file had not been downloaded at the time of scoring it.")
    return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Determine the cached data file's format and openness score from its
    contents, appending (Croatian) reasoning strings to score_reasons.

    Returns a (score, format_display_name) tuple; either element is None
    when it could not be determined.
    '''
    # No cached download -> nothing to inspect.
    if not archival or not archival.cache_filepath:
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return (None, None)

    cached = archival.cache_filepath
    if not os.path.exists(cached):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % cached)
        return (None, None)

    if cached:
        detected = sniff_file_format(cached, log)
        if not detected:
            score_reasons.append('Format je nepoznat.')
            return (None, None)
        score_reasons.append(
            'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.'
            % (detected['display_name'], detected['openness']))
        return detected['openness'], detected['display_name']

    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            'Datoteka nije preuzeta namjerno. Razlog: %s.' % archival.reason)
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            'Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.'
            % archival.reason)
        return (None, None)
    score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
    return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Sniff the cached data file to decide its format and openness score,
    recording the reasoning (in Croatian) in score_reasons.

    Returns (score, format_display_name); both are None when they cannot
    be determined.
    '''
    no_download_msg = 'Datoteka nije preuzeta u vrijeme ocijenjivanja.'
    if not archival or not archival.cache_filepath:
        score_reasons.append(no_download_msg)
        return (None, None)
    # Analyse the cached file
    path = archival.cache_filepath
    if not os.path.exists(path):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % path)
        return (None, None)
    if path:
        info = sniff_file_format(path, log)
        if info:
            score_reasons.append(
                'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.'
                % (info['display_name'], info['openness']))
            return info['openness'], info['display_name']
        score_reasons.append('Format je nepoznat.')
        return (None, None)
    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            'Datoteka nije preuzeta namjerno. Razlog: %s.' % archival.reason)
    elif archival.is_broken is None and archival.status_id:
        # 'Download failure' or 'System error during archival'
        score_reasons.append(
            'Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.'
            % archival.reason)
    else:
        score_reasons.append(no_download_msg)
    return (None, None)
def validate_resource(self, id):
    """Controller action for validating and creating a resource on dataset *id*.

    Three POST modes are distinguished by form fields:
      * check_schema -- validate the temp file against a user-entered schema
        with Goodtables and compute its openness score
      * upload_data  -- create the resource in CKAN from the temp file and
        the metadata previously stashed in Redis
      * otherwise    -- accept a file upload, store it locally, extract its
        headers and cache its metadata in Redis
    A non-POST request just renders the empty form.
    """
    if toolkit.request.method == 'POST':
        data = dict_fns.unflatten(
            tuplize_dict(parse_params(toolkit.request.POST)))
        check_schema = toolkit.request.params.get('check_schema')
        upload_data = toolkit.request.params.get('upload_data')
        file_path = data.get('file_path')

        # Logic for validating a resource against a specified schema
        if check_schema:
            schema = {'fields': []}
            fields = data.get('field_name')
            field_type = data.get('field_type')

            # Schema is populated from data entered by the user; each
            # field name is paired positionally with its chosen type.
            for i, field in enumerate(fields):
                schema['fields'].append({
                    'name': field,
                    'type': field_type[i]
                })

            # File is validated with Goodtables
            report = validate(file_path, schema=schema)
            log = logging.getLogger('ckanext.tayside')

            # Score is calculated based on Sir Tim Berners-Lee's five star
            # of openness
            sniffed_format = sniff_file_format(file_path, log)
            score = resource_format_scores().get(sniffed_format['format'])

            vars = {
                'report': report,
                'pkg_name': id,
                'stars': score,
                'file_path': file_path
            }

            return toolkit.render('tayside/package/validate_resource.html',
                                  extra_vars=vars)
        elif upload_data:
            # Handles creating a resource in CKAN.
            # Metadata for the resource is stored in Redis, keyed by the
            # temp file path (stashed by the upload branch below).
            r = redis.StrictRedis()
            data = json.loads(r.get(file_path))
            data['package_id'] = id

            # Dataset's state is changed from 'draft' to 'active'
            toolkit.get_action('package_patch')({}, {
                'id': id,
                'state': 'active'
            })

            # FieldStorage instance is created which is needed to upload
            # the file to Filestore and Datastore
            fs = cgi.FieldStorage()
            fs.file = fs.make_file()
            fs.filename = data.get('url')

            # NOTE(review): file is read/written in text mode here and
            # below; presumably safe for csv/tsv but confirm for the binary
            # formats (xls/xlsx/ods) this view also accepts.
            f = open(file_path, 'r')
            fs.file.write(f.read())
            fs.file.seek(0)
            f.close()

            data['upload'] = fs

            try:
                toolkit.get_action('resource_create')({}, data)
            except Exception as e:
                # NOTE(review): `e` is unused and the failure is not logged;
                # the user only sees a generic message.
                vars = {
                    'upload_error': 'An error occured while creating the '
                                    'resource.',
                    'pkg_name': id
                }
                return toolkit.render(
                    'tayside/package/validate_resource.html',
                    extra_vars=vars)

            # File is uploaded on Filestore, and now it is safe to be
            # removed from the temporary location
            os.remove(file_path)

            toolkit.redirect_to(controller='package', action='read', id=id)
        else:
            is_upload = isinstance(data.get('upload'), cgi.FieldStorage)
            supported_formats = ['csv', 'tsv', 'xls', 'xlsx', 'ods']
            # Format is taken from the uploaded file name's last extension.
            current_format = data.get('url').split('.')[-1]

            if is_upload:
                if current_format in supported_formats:
                    # Logic for storing the file locally and extracting
                    # it's headers (fields)
                    storage_path = config.get('ckan.storage_path')
                    file_path = storage_path + '/' + data.get('url')

                    # Read the file
                    buffer = data.get('upload').file
                    buffer.seek(0)

                    # Write the file locally
                    f = open(file_path, 'w')
                    f.write(buffer.read())
                    f.close()

                    # Inspect the headers (fields) of the file
                    with Stream(file_path, headers=1) as stream:
                        fields = stream.headers

                    vars = {
                        'fields': fields,
                        'pkg_name': id,
                        'file_path': file_path
                    }

                    if is_redis_available():
                        # Store the metadata of the resource in Redis for
                        # later usage (read back by the upload_data branch).
                        r = redis.StrictRedis()
                        resource_data = {
                            'name': data.get('name'),
                            'description': data.get('description'),
                            'format': data.get('format'),
                            'url': data.get('url'),
                        }
                        r.set(file_path, json.dumps(resource_data))

                        # Store it for 1 day
                        r.expire(file_path, 86400)
                    else:
                        return toolkit.render(
                            'tayside/package/validate_resource.html',
                            {'redis_error': 'Redis not available'})

                    return toolkit.render(
                        'tayside/package/validate_resource.html',
                        extra_vars=vars)
                else:
                    vars = {
                        'format_error': 'Format not supported.',
                        'pkg_name': id
                    }
                    return toolkit.render(
                        'tayside/package/validate_resource.html',
                        extra_vars=vars)

            # Reached when the POST carried no actual file upload.
            vars = {
                'format_error': 'No file provided for validation.',
                'pkg_name': id
            }
            return toolkit.render('tayside/package/validate_resource.html',
                                  extra_vars=vars)
    else:
        return toolkit.render('tayside/package/validate_resource.html',
                              {'pkg_name': id})