def _extract_file_format(self, url, headers):
    """
    Make a best guess at the file format of the resource at `url`.

    /path/to/a_file.csv has format "CSV"
    /path/to/a_file.csv.zip has format "CSV / ZIP"

    The file extensions in the url path are tried first: the dotted,
    lower-cased chain of extensions (e.g. "csv.zip") is looked up in CKAN's
    resource formats, and if that is not known the upper-cased extensions
    are joined with " / ".

    Only when the path has no extension at all is the mimetype from the
    headers used: it is looked up in the resource formats and, failing
    that, passed to `mimetypes.guess_extension()`.

    Returns None if none of the above yields anything.
    """
    url_path = urlparse.urlparse(url).path

    # splitext peels one extension at a time off the right-hand end, so the
    # extensions are collected in reverse order.
    reversed_extensions = []
    remainder, suffix = posixpath.splitext(url_path)
    while suffix:
        reversed_extensions.append(suffix[1:].upper())  # drop leading '.'
        remainder, suffix = posixpath.splitext(remainder)

    if reversed_extensions:
        ordered_extensions = reversed_extensions[::-1]
        known = ckan_helpers.resource_formats().get(
            ".".join(ordered_extensions).lower())
        if known:
            return known[1]
        return " / ".join(ordered_extensions)

    # No file extension found, so fall back on the mimetype (which has had
    # its charset stripped).
    mimetype = self._extract_mimetype(headers)
    known = ckan_helpers.resource_formats().get(mimetype)
    if known:
        return known[1]

    guessed = mimetypes.guess_extension(mimetype)
    if guessed:
        return guessed[1:].upper()
    return None
def score_by_format_field(resource, score_reasons):
    '''
    Works out a resource's format and score from its format field alone.

    Strings explaining how the conclusion was reached are appended to the
    score_reasons list.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Try the field as written (lower-cased) first and only then its
    # canonical munged form.
    known = ckan_helpers.resource_formats().get(declared.lower())
    if not known:
        known = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))
    if not known:
        score_reasons.append(
            _('Format field "%s" does not correspond to a known format.')
            % declared)
        return (None, None)

    format_string = known[1]
    score = lib.resource_format_scores().get(format_string)
    score_reasons.append(
        _('Format field "%s" receives score: %s.') % (declared, score))
    return (score, format_string)
def _extract_file_format(url, headers):
    """
    Guess the file format from a url and its response headers.

    /path/to/a_file.csv has format "CSV"
    /path/to/a_file.csv.zip has format "CSV / ZIP"

    File extensions in the url path take priority. Their lower-cased,
    dot-joined chain is looked up in CKAN's resource formats; if it is not
    known, the upper-cased extensions joined by " / " are returned.

    If the path carries no extension, the mimetype from the headers is
    looked up in the resource formats and then tried with
    `mimetypes.guess_extension()`. None is returned if that fails too.
    """
    stem = urlparse.urlparse(url).path
    collected = []
    while True:
        stem, ext = posixpath.splitext(stem)
        if not ext:
            break
        # Extensions come off right-to-left; the leading '.' is dropped.
        collected.append(ext[1:].upper())

    if collected:
        collected.reverse()
        lookup_key = '.'.join(collected).lower()
        entry = ckan_helpers.resource_formats().get(lookup_key)
        return entry[1] if entry else ' / '.join(collected)

    # No file extension found, attempt to extract the format using the
    # mimetype, which comes stripped of its charset.
    stripped_mimetype = _extract_mimetype(headers)
    entry = ckan_helpers.resource_formats().get(stripped_mimetype)
    if entry:
        return entry[1]

    guessed_extension = mimetypes.guess_extension(stripped_mimetype)
    if not guessed_extension:
        return None
    return guessed_extension[1:].upper()
def set_sniffed_format(format_name):
    '''Sets the module-level `sniffed_format`.

    A falsy format_name resets it to None. Otherwise the lower-cased name is
    looked up in CKAN's resource formats and `sniffed_format` becomes
    {'format': <second element of the matching entry>}.

    NOTE(review): a name that is not in the resource formats makes the
    lookup return None, so the subscript raises TypeError.
    '''
    global sniffed_format
    if not format_name:
        sniffed_format = None
        return
    entry = ckan_helpers.resource_formats().get(format_name.lower())
    sniffed_format = {'format': entry[1]}
def run_bsd_file(filepath, log):
    '''Run the BSD command-line tool "file" to determine file type.

    Two things are looked for in the tool's output: the name of the
    Microsoft Office application that created the file, and the ESRI
    Shapefile marker.

    Returns a format dict, or None if the output is not recognised.'''
    output = check_output(['file', filepath])

    creator = re.search('Name of Creating Application: ([^,]*),', output)
    if creator:
        extension = {
            'Microsoft Office PowerPoint': 'ppt',
            'Microsoft PowerPoint': 'ppt',
            'Microsoft Excel': 'xls',
            'Microsoft Office Word': 'doc',
            'Microsoft Word 10.0': 'doc',
            'Microsoft Macintosh Word': 'doc',
        }.get(creator.group(1))
        if extension is not None:
            known = ckan_helpers.resource_formats()[extension]
            log.info('"file" detected file format: %s', known[2])
            return {'format': known[1]}

    # An unlisted creating application still falls through to this check.
    if re.search(': ESRI Shapefile', output):
        shapefile = {'format': 'SHP'}
        log.info('"file" detected file format: %s', shapefile['format'])
        return shapefile

    log.info('"file" could not determine file format of "%s": %s',
             filepath, output)
    return None
def run_bsd_file(filepath, log):
    '''Ask the BSD command-line tool "file" what type a file is.

    Returns a format dict when the tool reports a listed Microsoft Office
    creating application or an ESRI Shapefile; returns None (after logging
    the raw output) otherwise.'''
    file_report = check_output(['file', filepath])

    extension_by_application = {
        'Microsoft Office PowerPoint': 'ppt',
        'Microsoft PowerPoint': 'ppt',
        'Microsoft Excel': 'xls',
        'Microsoft Office Word': 'doc',
        'Microsoft Word 10.0': 'doc',
        'Microsoft Macintosh Word': 'doc',
    }
    application = re.search('Name of Creating Application: ([^,]*),',
                            file_report)
    application_name = application.groups()[0] if application else None
    if application and application_name in extension_by_application:
        office_extension = extension_by_application[application_name]
        office_format = ckan_helpers.resource_formats()[office_extension]
        log.info('"file" detected file format: %s', office_format[2])
        return {'format': office_format[1]}

    is_shapefile = re.search(': ESRI Shapefile', file_report)
    if not is_shapefile:
        log.info('"file" could not determine file format of "%s": %s',
                 filepath, file_report)
        return None

    detected = {'format': 'SHP'}
    log.info('"file" detected file format: %s', detected['format'])
    return detected
def _clean_format(cls, format_string):
    '''Normalise a format value.

    Non-string values are returned untouched. A string is lower-cased,
    stripped of surrounding spaces and dots, and looked up in the resource
    formats; on a hit the second element of the matching entry is returned.
    Otherwise the original string is returned with everything matching
    `cls._disallowed_characters` removed and surrounding whitespace
    stripped.
    '''
    if not isinstance(format_string, basestring):
        return format_string

    lookup_key = format_string.lower().strip(' .')
    known = helpers.resource_formats().get(lookup_key)
    if known:
        return known[1]

    scrubbed = re.sub(cls._disallowed_characters, '', format_string)
    return scrubbed.strip()
def _clean_format(cls, format_string):
    '''Return a cleaned-up version of a format value.

    Strings that match a known resource format (after lower-casing and
    stripping surrounding spaces and dots) are replaced by the second
    element of the matching entry. Other strings have
    `cls._disallowed_characters` removed and are stripped of surrounding
    whitespace. Anything that is not a string is handed back as is.
    '''
    if isinstance(format_string, basestring):
        registry = helpers.resource_formats()
        entry = registry.get(format_string.lower().strip(' .'))
        if not entry:
            return re.sub(
                cls._disallowed_characters, '', format_string).strip()
        return entry[1]
    return format_string
def format_get(key):
    '''Returns a resource format, as defined in ckan.

    :param key: format extension / mimetype / title
                e.g. 'CSV', 'application/msword', 'Word document'
    :type key: string
    :returns: the format's short name, or None if the key is not known
    '''
    entry = ckan_helpers.resource_formats().get(key.lower())
    return entry[1] if entry else None
def score_by_format_field(resource, score_reasons):
    '''
    Determines format and score of a resource by reading its format field.

    The reasoning behind the result is recorded by appending strings to the
    score_reasons list.

    Return values:
      * A tuple is returned: (score, format_string)
      * format_string is None when the format cannot be worked out
      * score is None when it cannot be scored
    '''
    format_text = resource.format or ''
    if not format_text:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # The canonical munged form is only consulted when the lower-cased field
    # itself is not a known format.
    get_formats = ckan_helpers.resource_formats
    entry = get_formats().get(format_text.lower()) or \
        get_formats().get(lib.munge_format_to_be_canonical(format_text))

    if entry:
        score = lib.resource_format_scores().get(entry[1])
        reason = _('Format field "%s" receives score: %s.') % (
            format_text, score)
        score_reasons.append(reason)
        return (score, entry[1])

    reason = _('Format field "%s" does not correspond to a known format.') \
        % format_text
    score_reasons.append(reason)
    return (None, None)
def get_xml_variant_without_xml_declaration(buf):
    '''If this buffer is in a format based on XML, without any XML
    declaration or other boilerplate, return the format type.

    :param buf: the start of the content to inspect
    :returns: a format dict such as {'format': 'RDF'}. It is
        {'format': 'XML'} when the buffer cannot be parsed, when it holds no
        element at all, or when the top level tag does not map to a format
        known to CKAN.
    '''
    # Parse the XML to find the first tag name.
    # Using expat directly, rather than going through xml.sax, since I
    # couldn't see how to give xml.sax a string, so used StringIO, which
    # curiously failed for some files.
    import xml.parsers.expat

    class GotFirstTag(Exception):
        pass

    def start_element(name, attrs):
        # Abort the parse as soon as the first element is seen; the tag name
        # travels out in the exception.
        raise GotFirstTag(name)

    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    top_level_tag_name = None
    try:
        p.Parse(buf)
    except GotFirstTag as e:
        top_level_tag_name = six.text_type(e).lower()
    except xml.parsers.expat.ExpatError as e:
        # expat reports malformed input with ExpatError, not with
        # xml.sax.SAXException.
        log.info('Sax parse error: %s %s', e, buf)
        return {'format': 'XML'}

    if top_level_tag_name is None:
        # The parser consumed the buffer without meeting an element (e.g.
        # it is empty or holds only whitespace/comments).
        log.info('No top level tag found: %s', buf)
        return {'format': 'XML'}

    log.info('Top level tag detected as: %s', top_level_tag_name)
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace(
        'wms_capabilities', 'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace(
        'wmt_ms_capabilities', 'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub(
        'wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace(
        'wfs_capabilities', 'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')

    # A bare "capabilities" root is ambiguous, so the namespace declared in
    # the buffer decides between WMTS and WCS. The tag name is already
    # lower case at this point.
    if top_level_tag_name == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'

    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML
    declaration or other boilerplate, return the format type.

    :param buf: the start of the content to inspect
    :param log: logger that receives the progress messages
    :returns: a format dict for the recognised XML variant,
        {'format': 'XML'} if the first tag is not a format known to CKAN,
        or None if no tag is found at the start of the buffer.
    '''
    # Up to three arbitrary characters (presumably a byte-order mark - TODO
    # confirm) and any whitespace may precede the first tag. Written as a raw
    # string so that "\s" reaches the regex engine rather than being an
    # invalid string escape.
    xml_re = r'.{0,3}\s*<([^>\s]*)'
    match = re.match(xml_re, buf)
    if not match:
        log.debug('XML tags not found: %s', buf)
        return None

    top_level_tag_name = match.group(1).lower()
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace(
        'wms_capabilities', 'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace(
        'wmt_ms_capabilities', 'wms')  # WMS 1.1.1

    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML
    declaration or other boilerplate, return the format type.

    The name of the first tag in the buffer is lower-cased, a few well-known
    root tags (RDF, WMS capabilities) are mapped to their format keys, and
    the result is looked up in CKAN's resource formats.

    :param buf: the start of the content to inspect
    :param log: logger that receives the progress messages
    :returns: {'format': <name>} for a known variant, {'format': 'XML'} for
        an unknown first tag, None when the buffer does not start with a tag.
    '''
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # The pattern tolerates up to three leading characters plus whitespace
    # before the opening "<".
    first_tag = re.match(r'.{0,3}\s*<([^>\s]*)', buf)
    if first_tag is None:
        log.debug('XML tags not found: %s', buf)
        return None

    tag_name = first_tag.group(1).lower()
    for root_tag, format_key in (
            ('rdf:rdf', 'rdf'),
            ('wms_capabilities', 'wms'),  # WMS 1.3
            ('wmt_ms_capabilities', 'wms'),  # WMS 1.1.1
            ):
        tag_name = tag_name.replace(root_tag, format_key)

    format_tuple = ckan_helpers.resource_formats().get(tag_name)
    if not format_tuple:
        log.warning('Did not recognise XML format: %s', tag_name)
        return {'format': 'XML'}

    log.info('XML variant detected: %s', format_tuple[2])
    return {'format': format_tuple[1]}
def hdx_unified_resource_format(format):
    '''
    This function is based on the unified_resource_format() function from
    ckan.lib.helpers. Like the one from core ckan, it checks the resource
    formats configuration to translate the format string to a standard
    format. The difference is that when nothing is found in
    'resource_formats.json' the format is turned to lowercase.

    :param format: resource format as written by the user
    :type format: string
    :return: the standard format from the resource formats configuration,
        or the lower-cased input when it is not listed there
    :rtype: string
    '''
    known_formats = h.resource_formats()
    lowered = format.lower()
    return known_formats[lowered][1] if lowered in known_formats else lowered
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns a dict with format as a string, which is the format's canonical
    shortname (as defined by ckan's resource_formats.json) and a key that says
    if it is contained in a zip or something.

    e.g. {'format': 'CSV',
          'container': 'zip',
          }
    or None if it cannot tell what it is.

    The mime type reported by Magic drives the detection. A handful of mime
    types get dedicated checks whose result is returned straight away; any
    other mime type is looked up in CKAN's resource formats, and a result of
    TXT or HTML is then refined by inspecting the start of the content. When
    Magic reports no mime type at all, the Excel check and the BSD "file"
    tool are tried instead.

    :param filepath: path of the file to inspect
    :param log: logger that receives the progress messages; it is also passed
        on to the helper checks
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # Magic is handed a utf8-encoded byte string when the path is unicode.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime types that have a dedicated check.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type in ('application/msword', 'application/vnd.ms-office'):
            # In the past Magic gives the msword mime-type for Word and other
            # MS Office files too, so use BSD File to be sure which it is.
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = {'format': 'XLS'}
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = {'format': 'XLS'}
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # Last resort for octet-streams: whatever is_html() makes of
                # the first 500 characters becomes the format.
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = {'format': 'IATI'}

        # A result from a dedicated check is final.
        if format_:
            return format_

        # Stage 2: translate the mime type using CKAN's resource formats.
        format_tuple = ckan_helpers.resource_formats().get(mime_type)
        if format_tuple:
            format_ = {'format': format_tuple[1]}

        if not format_:
            # Text mime types that CKAN does not list are sniffed from the
            # first 10000 characters.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['format'])

            # Stage 3: TXT and HTML are broad, so look at the content for
            # something more specific. If no check matches, the format is
            # left as it is.
            if format_['format'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = {'format': 'TTL'}

            elif format_['format'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = {'format': 'RDFa'}

    else:
        # Magic gave no mime type at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = {'format': 'XLS'}
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf') top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms') # WMS 1.3 top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities', 'wms') # WMS 1.1.1 top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name) # WFS 2.0 top_level_tag_name = top_level_tag_name.replace('wfs_capabilities', 'wfs') # WFS 1.0/1.1 top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed') if top_level_tag_name.lower() == 'capabilities' and \ 'xmlns="http://www.opengis.net/wmts/' in buf: top_level_tag_name = 'wmts' if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \ 'xmlns="http://www.opengis.net/wcs/' in buf: top_level_tag_name = 'wcs' format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name) if format_tuple: format_ = {'format': format_tuple[1]} log.info('XML variant detected: %s', format_tuple[2]) return format_ log.warning('Did not recognise XML format: %s', top_level_tag_name) return {'format': 'XML'} def has_rdfa(buf, log): '''If the buffer HTML contains RDFa then this returns True''' # quick check for the key words if 'about=' not in buf or 'property=' not in buf: log.debug('Not RDFA') return False
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns a dict with format as a string, which is the format's canonical
    shortname (as defined by ckan's resource_formats.json) and a key that says
    if it is contained in a zip or something.

    e.g. {'format': 'CSV',
          'container': 'zip',
          }
    or None if it cannot tell what it is.

    Note, log is a logger, either a Celery one or a standard Python logging
    one.

    Detection order: Magic's mime type is obtained first. Certain mime types
    (XML, zip, MS Office, octet-stream, HTML) have their own checks, and a
    result from those is returned immediately. Otherwise the mime type is
    translated through CKAN's resource formats, with content sniffing for
    unlisted text types and for TXT/HTML results. With no mime type, the
    Excel check and then the BSD "file" tool are used.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # A unicode path is encoded to utf8 before it is given to Magic.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Mime types with a dedicated check come first.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type in ('application/msword', 'application/vnd.ms-office'):
            # In the past Magic gives the msword mime-type for Word and other
            # MS Office files too, so use BSD File to be sure which it is.
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = {'format': 'XLS'}
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = {'format': 'XLS'}
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                # Neither check matched: the result of is_html() on the
                # first 500 characters is used as the format.
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = {'format': 'IATI'}

        # Anything found by a dedicated check is returned without the
        # refinement steps below.
        if format_:
            return format_

        # Otherwise ask CKAN's resource formats what the mime type means.
        format_tuple = ckan_helpers.resource_formats().get(mime_type)
        if format_tuple:
            format_ = {'format': format_tuple[1]}

        if not format_:
            # An unlisted text mime type: sniff the first 10000 characters.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['format'])

            # TXT and HTML results are refined from the content; when no
            # check matches, the format stays as it was.
            if format_['format'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = {'format': 'TTL'}

            elif format_['format'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = {'format': 'RDFa'}

    else:
        # No mime type came back from Magic.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = {'format': 'XLS'}
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1])
def _distribution_format(self, distribution, normalize_ckan_format=True):
    '''
    Returns the Internet Media Type and format label for a distribution

    Given a reference (URIRef or BNode) to a dcat:Distribution, it will
    try to extract the media type (previously known as MIME type), eg
    `text/csv`, and the format label, eg `CSV`

    Values for the media type will be checked in the following order:

    1. literal value of dcat:mediaType
    2. literal value of dct:format if it contains a '/' character
    3. value of dct:format if it is an instance of dct:IMT, eg:

        <dct:format>
            <dct:IMT rdf:value="text/html" rdfs:label="HTML"/>
        </dct:format>

    Values for the label will be checked in the following order:

    1. literal value of dct:format if it does not contain a '/' character
    2. label of dct:format if it is an instance of dct:IMT (see above)

    If `normalize_ckan_format` is True and using CKAN>=2.3, the label will
    be tried to match against the standard list of formats that is included
    with CKAN core
    (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json)
    This allows for instance to populate the CKAN resource format field
    with a format that view plugins, etc will understand (`csv`, `xml`,
    etc.)

    Return a tuple with the media type and the label, both set to None if
    they couldn't be found.
    '''
    label = None
    imt = self._object_value(distribution, DCAT.mediaType)

    _format = self._object(distribution, DCT['format'])
    if isinstance(_format, Literal):
        if not imt and '/' in _format:
            imt = unicode(_format)
        else:
            label = unicode(_format)
    elif isinstance(_format, (BNode, URIRef)):
        if self._object(_format, RDF.type) == DCT.IMT:
            # Convert to text only when something was found: unicode(None)
            # would produce the truthy string u'None' instead of None.
            if not imt:
                imt_value = self.g.value(_format, default=None)
                if imt_value is not None:
                    imt = unicode(imt_value)
            imt_label = self.g.label(_format, default=None)
            if imt_label is not None:
                label = unicode(imt_label)

    if ((imt or label) and normalize_ckan_format and
            toolkit.check_ckan_version(min_version='2.3')):
        # Imported here because they are only wanted on CKAN >= 2.3.
        # NOTE(review): ckan.config is not referenced below - confirm whether
        # the import is needed for its side effects.
        import ckan.config
        from ckan.lib import helpers

        format_registry = helpers.resource_formats()

        # The media type takes precedence over the label when both match.
        if imt in format_registry:
            label = format_registry[imt][1]
        elif label in format_registry:
            label = format_registry[label][1]

    return imt, label
return {'format': 'XML'} log.info('Top level tag detected as: %s', top_level_tag_name) top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf') top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms') # WMS 1.3 top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities', 'wms') # WMS 1.1.1 top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name) # WFS 2.0 top_level_tag_name = top_level_tag_name.replace('wfs_capabilities', 'wfs') # WFS 1.0/1.1 top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed') if top_level_tag_name.lower() == 'capabilities' and \ 'xmlns="http://www.opengis.net/wmts/' in buf: top_level_tag_name = 'wmts' if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \ 'xmlns="http://www.opengis.net/wcs/' in buf: top_level_tag_name = 'wcs' format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name) if format_tuple: format_ = {'format': format_tuple[1]} log.info('XML variant detected: %s', format_tuple[2]) return format_ log.warning('Did not recognise XML format: %s', top_level_tag_name) return {'format': 'XML'} def has_rdfa(buf, log): '''If the buffer HTML contains RDFa then this returns True''' # quick check for the key words if 'about=' not in buf or 'property=' not in buf: log.debug('Not RDFA') return False # more rigorous check for them as tag attributes
def get_package_dict(self, harvest_object, package_dict_defaults, source_config, existing_dataset): ''' Constructs a package_dict suitable to be passed to package_create or package_update. See documentation on ckan.logic.action.create.package_create for more details * name - a new package must have a unique name; if it had a name in the previous harvest, that will be in the package_dict_defaults. * resource.id - should be the same as the old object if updating a package * errors - call self._save_object_error() and return False * default values for name, owner_org, tags etc can be merged in using: package_dict = package_dict_defaults.merge(package_dict_harvested) ''' import ckanext.dgu.lib.theme as dgutheme from ckan.lib.helpers import resource_formats from ckan import model from ckanext.harvest.model import (HarvestObjectExtra as HOExtra, HarvestGatherError) res_formats = resource_formats() inv_dataset = InventoryDocument.dataset_to_dict( InventoryDocument.parse_xml_string(harvest_object.content)) pkg = dict(title=inv_dataset['title'], notes=inv_dataset['description'], state='active' if inv_dataset['active'] else 'deleted', resources=[], extras={ self.IDENTIFIER_KEY: inv_dataset['identifier'], 'harvest_source_reference': harvest_object.guid }) # License rights = inv_dataset.get('rights') if rights: license_id, licence = \ dgu_helpers.get_licence_fields_from_free_text(rights) pkg['license_id'] = license_id if licence: pkg['extras']['licence'] = licence log.info('Custom licence %r', rights) else: pkg['license_id'] = '' # Resources inv_resources = [r for r in inv_dataset['resources'] if r['active']] existing_resource_urls = dict((r.url, r.id) for r in existing_dataset.resources) \ if existing_dataset else {} pkg['resources'] = [] for inv_resource in inv_resources: format_ = res_formats.get(inv_resource['mimetype'].lower().strip()) if format_: format_ = format_[1] else: format_ = inv_resource['mimetype'] description = inv_resource['title'] if 
inv_resource['availability']: description += ' - %s' % inv_resource['availability'] # if it is temporal, it should be a timeseries, # if it is not data, it should be an additional resource resource_type = 'file' if inv_resource['resource_type'] == 'Data' \ else 'documentation' # Schema if inv_resource['conforms_to']: schema_url = inv_resource['conforms_to'] schema_type = SCHEMA_TYPE_MAP.get(format_) else: schema_url = schema_type = '' res = { 'url': inv_resource['url'], 'format': format_, 'description': description, 'resource_type': resource_type, 'schema-url': schema_url, 'schema-type': schema_type, } if res['url'] in existing_resource_urls: res['id'] = existing_resource_urls[res['url']] pkg['resources'].append(res) # Local Authority Services and Functions if inv_dataset['services']: log.info('Local Authority Services: %r', inv_dataset['services']) # e.g. {http://id.esd.org.uk/service/190} pkg['extras']['la_service'] = ' '.join(inv_dataset['services']) else: pkg['extras']['la_service'] = '' if inv_dataset['functions']: log.info('Local Authority Functions %r', inv_dataset['functions']) pkg['extras']['la_function'] = ' '.join(inv_dataset['functions']) else: pkg['extras']['la_function'] = '' pkg = package_dict_defaults.merge(pkg) if not pkg.get('name'): # append the publisher name to differentiate similar titles better # than just a numbers suffix publisher = model.Group.get(harvest_object.job.source.publisher_id) publisher_abbrev = self._get_publisher_abbreviation(publisher) pkg['name'] = self._gen_new_name('%s %s' % (pkg['title'], publisher_abbrev)) # Themes based on services/functions if 'tags' not in pkg: pkg['tags'] = [] try: themes = dgutheme.categorize_package(pkg) log.debug('%s given themes: %r', pkg['name'], themes) except ImportError, e: log.debug('Theme cannot be given: %s', e) themes = []
def get_zipped_format(filepath):
    '''For a given zip file, return the format of file inside.
    For multiple files, choose by the most open, and then by the most
    popular extension.

    :param filepath: path of the zip file
    :returns: None if the zip cannot be opened; {'format': 'SHP'} or
        {'format': 'GTFS'} when the contents match those layouts;
        {'format': 'ZIP'} when no contained file has a known extension;
        otherwise {'format': <name>, 'container': 'ZIP'}.
    '''
    # just check filename extension of each file inside
    try:
        # note: Cannot use "with" with a zipfile before python 2.7
        #       so we have to close it manually.
        archive = zipfile.ZipFile(filepath, 'r')
        try:
            member_paths = archive.namelist()
        finally:
            archive.close()
    except zipfile.BadZipfile as e:
        log.info('Zip file open raised error %s: %s',
                 e, e.args)
        return
    except Exception as e:
        log.warning('Zip file open raised exception %s: %s',
                    e, e.args)
        return

    # Shapefile check - a Shapefile is a zip containing specific files:
    # .shp, .dbf and .shx amongst others
    extensions = set(path.split('.')[-1].lower() for path in member_paths)
    if set(('shp', 'dbf', 'shx')).issubset(extensions):
        log.info('Shapefile detected')
        return {'format': 'SHP'}

    # GTFS check - a GTFS is a zip which contains specific filenames
    filenames = set(os.path.basename(path) for path in member_paths)
    gtfs_filenames = set(('agency.txt', 'stops.txt', 'routes.txt',
                          'trips.txt', 'stop_times.txt', 'calendar.txt'))
    if gtfs_filenames.issubset(filenames):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    # The lookups do not depend on the member, so fetch them once.
    known_formats = ckan_helpers.resource_formats()
    format_scores = lib.resource_format_scores()

    top_score = 0
    top_scoring_extension_counts = defaultdict(int)  # extension: number_of_files
    # The loop variable must not be called "filepath": that would overwrite
    # the zip's own path, which the log messages below report.
    for member_path in member_paths:
        extension = os.path.splitext(member_path)[-1][1:].lower()
        format_tuple = known_formats.get(extension)
        if format_tuple:
            score = format_scores.get(format_tuple[1])
            if score is not None and score > top_score:
                # A better score resets the tally.
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)',
                     extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    # Ascending by count, so the most popular extension is last.
    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
             top_extension, top_scoring_extension_counts)
    format_tuple = known_formats[top_extension]
    format_ = {'format': format_tuple[1],
               'container': 'ZIP'}
    log.info('Zipped file format detected: %s', format_tuple[2])
    return format_
try: filenames = zip.namelist() finally: zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict( int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
def format_mapping(self):
    '''Sysadmin page for remapping resource formats.

    Access is checked with the 'sysadmin' auth function; users who are not
    authorized get a 403.

    On POST with both 'from' and 'to' given, every active resource whose
    format equals 'from' is changed to 'to', the search index of each
    affected package is cleared and rebuilt, and a flash message reports how
    many packages were touched. A POST is answered with a redirect back to
    this page.

    Otherwise the page is rendered with:
      * defined - the format names defined in the resource formats
      * format_types - for each format used by an active resource, whether
        its resources are 'Local', 'External' or 'Partially external'
      * undefined - the used formats that are not defined
    '''
    try:
        # The context key is the string 'model', not the module object.
        tk.check_access('sysadmin', {'user': g.user, 'model': model})
    except tk.NotAuthorized:
        return tk.abort(403)

    if request.method == 'POST':
        old = request.POST.get('from')
        new = request.POST.get('to')
        if old and new:
            res_query = model.Session.query(model.Resource).filter_by(
                format=old, state='active'
            )
            # Collect the affected packages before the bulk update, because
            # afterwards the filter on the old format no longer matches.
            package_ids = set(res.package_id for res in res_query)

            res_query.update({'format': new})
            model.Session.commit()
            for package_id in package_ids:
                clear(package_id)
                rebuild(package_id, defer_commit=True)
            commit()
            tk.h.flash_success(
                'Updated. Records changed: {}'.format(len(package_ids))
            )
        return tk.redirect_to('format_mapping')

    # Each resource format entry is a 3-tuple; the middle item is the name.
    defined = set(
        fmt for (_1, fmt, _3) in h.resource_formats().values()
    )

    # Per format: the number of resource rows and the number of those with a
    # joined 'harvest_portal' package extra value.
    db_formats = model.Session.query(
        model.Resource.format, func.count(model.Resource.id),
        func.count(model.PackageExtra.value)
    ).outerjoin(
        model.PackageExtra,
        (model.Resource.package_id == model.PackageExtra.package_id)
        & ((model.PackageExtra.key == 'harvest_portal')
           | (model.PackageExtra.key.is_(None)))
    ).group_by(model.Resource.format).filter(
        model.Resource.format != '',
        model.Resource.state == 'active'
    )
    db_formats = db_formats.all()

    format_types = {}
    for fmt, total, external in db_formats:
        # 'External' is tested first so that it wins when both conditions
        # hold.
        if total - external == 0:
            format_types[fmt] = 'External'
        elif external == 0:
            format_types[fmt] = 'Local'
        else:
            format_types[fmt] = 'Partially external'

    used = set(format_types)
    undefined = used - defined

    extra_vars = {
        'undefined': undefined,
        'defined': defined,
        'format_types': format_types
    }
    return tk.render('admin/format_mapping.html', extra_vars)