def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details.

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content))

    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid})

    # License - map the inventory "rights" URL onto a CKAN license id
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for l in register.values():
                if l.url == rights:
                    pkg['license_id'] = l.id
                    break
            else:
                # No registered license matched - just save the rights URL
                # as it is. (BUG FIX: previously this assigned the whole
                # license register object instead of the rights value.)
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources - only active ones; keep the old resource ids for URLs that
    # already exist on the dataset so package_update doesn't recreate them.
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Fall back to the raw mimetype when CKAN doesn't know it
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(
            self.munge_title_to_name('%s %s' % (pkg['title'],
                                                publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def test_by_mime_type(self):
    # JSON's non-standard mime type should resolve to the JSON format entry.
    json_format = Formats.by_mime_type()['text/x-json']
    assert_equal(json_format['display_name'], 'JSON')
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns Format dict with a key to say if it is contained in a zip
    or something.

    e.g. {'display_name': 'CSV', 'container': 'zip', ...}
    or None if it can\'t tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # python-magic needs a byte string path on Python 2 for unicode filepaths
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # First, handle mime types where magic alone is unreliable and a
        # deeper content check is needed.
        if mime_type == 'application/xml':
            # Look at the first bit of the file to work out the XML dialect
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # last resort: check if the start of the file looks like HTML
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        if format_:
            return format_

        # Fall back to a direct mime-type -> Format lookup
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # Unknown text mime type - sniff the content directly.
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Refine generic formats by inspecting the content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']
            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']
    else:
        # Magic detected nothing at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns Format dict with a key to say if it is contained in a zip
    or something.

    e.g. {'display_name': 'CSV', 'container': 'zip', ...}
    or None if it can\'t tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # python-magic needs a byte string path on Python 2 for unicode filepaths
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # First, handle mime types where magic alone is unreliable and a
        # deeper content check is needed.
        if mime_type == 'application/xml':
            # Look at the first bit of the file to work out the XML dialect
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # last resort: check if the start of the file looks like HTML
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        if format_:
            return format_

        # Fall back to a direct mime-type -> Format lookup
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # Unknown text mime type - sniff the content directly.
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Refine generic formats by inspecting the content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']
            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']
    else:
        # Magic detected nothing at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
def get_package_dict(self, harvest_object, package_dict_defaults,
                     source_config, existing_dataset):
    '''
    Constructs a package_dict suitable to be passed to package_create or
    package_update. See documentation on
    ckan.logic.action.create.package_create for more details.

    * name - a new package must have a unique name; if it had a name in the
      previous harvest, that will be in the package_dict_defaults.
    * resource.id - should be the same as the old object if updating a
      package
    * errors - call self._save_object_error() and return False
    * default values for name, owner_org, tags etc can be merged in using:
        package_dict = package_dict_defaults.merge(package_dict_harvested)
    '''
    inv_dataset = InventoryDocument.dataset_to_dict(
        InventoryDocument.parse_xml_string(harvest_object.content)
    )

    pkg = dict(
        title=inv_dataset['title'],
        notes=inv_dataset['description'],
        state='active' if inv_dataset['active'] else 'deleted',
        resources=[],
        extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                'harvest_source_reference': harvest_object.guid
                }
    )

    # License - map the inventory "rights" URL onto a CKAN license id
    rights = inv_dataset.get('rights')
    if rights:
        register = model.Package.get_license_register()
        if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
            pkg['license_id'] = 'uk-ogl'
        else:
            for l in register.values():
                if l.url == rights:
                    pkg['license_id'] = l.id
                    break
            else:
                # No registered license matched - just save the rights URL
                # as it is. (BUG FIX: previously this assigned the whole
                # license register object instead of the rights value.)
                pkg['license_id'] = rights
                log.info('Did not recognize license %r', rights)
    else:
        pkg['license_id'] = None

    # Resources - only active ones; keep the old resource ids for URLs that
    # already exist on the dataset so package_update doesn't recreate them.
    inv_resources = [r for r in inv_dataset['resources'] if r['active']]
    existing_resource_urls = dict((r.url, r.id)
                                  for r in existing_dataset.resources) \
        if existing_dataset else {}
    pkg['resources'] = []
    for inv_resource in inv_resources:
        format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
        if format_:
            format_ = format_['display_name']
        else:
            # Fall back to the raw mimetype when CKAN doesn't know it
            format_ = inv_resource['mimetype']
        description = inv_resource['title']
        if inv_resource['availability']:
            description += ' - %s' % inv_resource['availability']
        # if it is temporal, it should be a timeseries,
        # if it is not data, it should be an additional resource
        resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
            else 'documentation'
        # Schema
        if inv_resource['conforms_to']:
            schema_url = inv_resource['conforms_to']
            schema_type = SCHEMA_TYPE_MAP.get(format_)
        else:
            schema_url = schema_type = ''
        res = {'url': inv_resource['url'],
               'format': format_,
               'description': description,
               'resource_type': resource_type,
               'schema-url': schema_url,
               'schema-type': schema_type,
               }
        if res['url'] in existing_resource_urls:
            res['id'] = existing_resource_urls[res['url']]
        pkg['resources'].append(res)

    # Local Authority Services and Functions
    if inv_dataset['services']:
        log.info('Local Authority Services: %r', inv_dataset['services'])
        # e.g. {http://id.esd.org.uk/service/190}
        pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
    else:
        pkg['extras']['la_service'] = ''
    if inv_dataset['functions']:
        log.info('Local Authority Functions %r', inv_dataset['functions'])
        pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
    else:
        pkg['extras']['la_function'] = ''

    pkg = package_dict_defaults.merge(pkg)
    if not pkg.get('name'):
        # append the publisher name to differentiate similar titles better
        # than just a numbers suffix
        publisher = model.Group.get(harvest_object.job.source.publisher_id)
        publisher_abbrev = self._get_publisher_abbreviation(publisher)
        pkg['name'] = self.check_name(self.munge_title_to_name(
            '%s %s' % (pkg['title'], publisher_abbrev)))

    # Themes based on services/functions
    if 'tags' not in pkg:
        pkg['tags'] = []
    themes = dgutheme.categorize_package(pkg)
    log.debug('%s given themes: %r', pkg['name'], themes)
    if themes:
        pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
        if len(themes) == 2:
            pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

    pkg['extras'] = self.extras_from_dict(pkg['extras'])
    return pkg
def test_by_mime_type(self):
    # The mime-type index should map text/x-json to the JSON format.
    fmt = Formats.by_mime_type()["text/x-json"]
    assert_equal(fmt["display_name"], "JSON")