# Example #1
    def _extract_file_format(self, url, headers):
        """
        Makes a best guess at the file format.

        /path/to/a_file.csv has format "CSV"
        /path/to/a_file.csv.zip has format "CSV / Zip"

        First this function tries to extract the file-extensions from the url,
        and deduce the format from there.  If no file-extension is found, then
        the mimetype from the headers is passed to `mimetypes.guess_extension()`.

        :param url: URL of the resource; only its path component is inspected.
        :param headers: response headers, handed unchanged to
            ``self._extract_mimetype``.
        :returns: the format name, or None when neither the URL extensions
            nor the mimetype yield one.
        """
        formats = []
        parsed_url = urlparse.urlparse(url)
        path = parsed_url.path
        # Peel extensions off right-to-left, so "a.csv.zip" gives
        # ['ZIP', 'CSV']; the list is reversed again wherever it is used.
        base, extension = posixpath.splitext(path)
        while extension:
            formats.append(extension[1:].upper())  # strip leading '.' from extension
            base, extension = posixpath.splitext(base)
        if formats:
            # Look the whole compound extension up first (e.g. "csv.zip").
            extension = ".".join(formats[::-1]).lower()
            format_tuple = ckan_helpers.resource_formats().get(extension)
            if format_tuple:
                return format_tuple[1]
            # Not in the registry: fall back to the upper-cased extensions.
            return " / ".join(formats[::-1])

        # No file extension found, attempt to extract format using the mimetype
        stripped_mimetype = self._extract_mimetype(headers)  # stripped of charset
        if not stripped_mimetype:
            # mimetypes.guess_extension() cannot handle None, so stop here
            # instead of crashing when no mimetype is available.
            return None
        format_tuple = ckan_helpers.resource_formats().get(stripped_mimetype)
        if format_tuple:
            return format_tuple[1]

        extension = mimetypes.guess_extension(stripped_mimetype)
        if extension:
            return extension[1:].upper()
        return None
def score_by_format_field(resource, score_reasons):
    '''
    Looks at the format field of a resource to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    format_field = resource.format or ''
    if not format_field:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)
    # NOTE(review): this statement originally ended with a dangling "or \",
    # i.e. a second, fallback registry lookup followed it. That lookup is not
    # recoverable from this file; restore it from the upstream source.
    format_tuple = ckan_helpers.resource_formats().get(format_field.lower())
    if not format_tuple:
        score_reasons.append(
            _('Format field "%s" does not correspond to a known format.') %
            format_field)
        return (None, None)
    # The registry tuple's second item is the name the scores are keyed by.
    score = lib.resource_format_scores().get(format_tuple[1])
    score_reasons.append(
        _('Format field "%s" receives score: %s.') % (format_field, score))
    return (score, format_tuple[1])
# Example #6
def run_bsd_file(filepath, log):
    '''Run the BSD command-line tool "file" to determine file type. Returns
    a format dict or None if it fails.

    :param filepath: path handed to the "file" command.
    :param log: logger-like object; only ``info`` is called on it.
    :returns: ``{'format': <name>}`` when the output names a known Microsoft
        Office application or an ESRI Shapefile, otherwise None.
    '''
    result = check_output(['file', filepath])
    if isinstance(result, bytes):
        # check_output() returns bytes on Python 3, which cannot be searched
        # with the str patterns below.
        result = result.decode('utf-8', 'replace')
    match = re.search('Name of Creating Application: ([^,]*),', result)
    if match:
        app_name = match.groups()[0]
        format_map = {'Microsoft Office PowerPoint': 'ppt',
                      'Microsoft PowerPoint': 'ppt',
                      'Microsoft Excel': 'xls',
                      'Microsoft Office Word': 'doc',
                      'Microsoft Word 10.0': 'doc',
                      'Microsoft Macintosh Word': 'doc',
                      }
        if app_name in format_map:
            extension = format_map[app_name]
            format_tuple = ckan_helpers.resource_formats()[extension]
            # NOTE(review): the logged argument was cut off in this file;
            # format_tuple[2] matches the sibling "detected" log calls.
            log.info('"file" detected file format: %s',
                     format_tuple[2])
            return {'format': format_tuple[1]}
    match = re.search(': ESRI Shapefile', result)
    if match:
        format_ = {'format': 'SHP'}
        log.info('"file" detected file format: %s',
                 format_['format'])
        return format_
    log.info('"file" could not determine file format of "%s": %s',
             filepath, result)
    return None
# Example #10
def format_get(key):
    '''Returns a resource format, as defined in ckan.

    :param key: format extension / mimetype / title e.g. 'CSV',
                'application/msword', 'Word document'
    :type key: string
    :returns: format string, or None if the key is not a known format
    '''
    # Registry keys are looked up lower-cased.
    format_tuple = ckan_helpers.resource_formats().get(key.lower())
    if not format_tuple:
        # Unknown key: return None rather than index into a missing tuple.
        return None
    return format_tuple[1]  # short name
# Example #13
def get_xml_variant_without_xml_declaration(buf, log=None):
    '''If this buffer is in a format based on XML, without any XML declaration
    or other boilerplate, return the format type.

    :param buf: the start of the document's text.
    :param log: optional logger-like object (``info`` and ``warning`` are
        called). When omitted, the module-level ``log`` is used if there is
        one, otherwise a logger named after this module.
    :returns: ``{'format': <name>}``; the name is ``'XML'`` whenever the
        buffer cannot be parsed, contains no element, or its top level tag
        is not in the resource format registry.
    '''
    # Parse the XML to find the first tag name.
    # Using expat directly, rather than go through xml.sax, since using I
    # couldn't see how to give it a string, so used StringIO which failed
    # for some files curiously.
    import logging
    import xml.parsers.expat
    import xml.sax

    if log is None:
        # Keeps the original one-argument call style working.
        log = globals().get('log') or logging.getLogger(__name__)

    class GotFirstTag(Exception):
        pass

    def start_element(name, attrs):
        # Abort the parse as soon as the first element is seen; the tag name
        # travels back to the caller inside the exception.
        raise GotFirstTag(name)

    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    top_level_tag_name = None
    try:
        p.Parse(buf)
    except GotFirstTag as e:
        top_level_tag_name = six.text_type(e).lower()
    except (xml.sax.SAXException, xml.parsers.expat.ExpatError) as e:
        # expat reports malformed input as ExpatError, so catching only
        # SAXException let parse errors escape.
        log.info('Sax parse error: %s %s', e, buf)
        return {'format': 'XML'}

    if top_level_tag_name is None:
        # The buffer parsed without reaching any element (e.g. it is empty
        # or holds only comments), so there is no tag to classify.
        log.info('No top level tag found in buffer')
        return {'format': 'XML'}

    log.info('Top level tag detected as: %s', top_level_tag_name)
    # Map well-known root element names onto resource format registry keys.
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities',
                                                    'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities',
                                                    'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace('wfs_capabilities',
                                                    'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')
    # A bare <Capabilities> root is ambiguous; the namespace decides.
    if top_level_tag_name.lower() == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
# Example #16
def hdx_unified_resource_format(format):
    '''
    This function is based on the unified_resource_format() function from ckan.lib.helpers.
    As the one from core ckan it checks the resource formats configuration to translate the
    format string to a standard format.
    The difference is that in case nothing is found in 'resource_formats.json' then it's
    turned to lowercase.

    :param format: resource format as written by the user
    :type format: string
    :returns: the registry's name for the format, or the lower-cased input
        when the registry has no entry for it
    '''
    formats = h.resource_formats()
    format_clean = format.lower()
    if format_clean in formats:
        format_new = formats[format_clean][1]
    else:
        # Unknown format: keep the user's value, normalised to lowercase.
        format_new = format_clean
    return format_new
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns a dict with format as a string, which is the format's canonical
    shortname (as defined by ckan's resource_formats.json) and a key that says
    if it is contained in a zip or something.

    e.g. {'format': 'CSV',
          'container': 'zip',
          }
    or None if it can't tell what it is.

    :param filepath: path of the file to inspect.
    :param log: logger-like object; ``info`` and ``warning`` are called on
        it and it is passed on to the sniffing helpers.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: mime types that need a closer look at the content.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type in ('application/msword', 'application/vnd.ms-office'):
            # In the past Magic gives the msword mime-type for Word and other
            # MS Office files too, so use BSD File to be sure which it is.
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = {'format': 'XLS'}
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = {'format': 'XLS'}
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = {'format': 'IATI'}

        if format_:
            return format_

        # Stage 2: translate the mime type through the format registry.
        format_tuple = ckan_helpers.resource_formats().get(mime_type)
        if format_tuple:
            format_ = {'format': format_tuple[1]}

        if not format_:
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        # Stage 3: refine the generic TXT and HTML results by content.
        if format_:
            log.info('Mimetype translates to filetype: %s', format_['format'])

            if format_['format'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = {'format': 'TTL'}

            elif format_['format'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = {'format': 'RDFa'}

    else:
        # NOTE(review): this "else" was missing from the file as found; it is
        # restored on the reading that the comment below describes the case
        # where magic returned no mime type. Confirm against upstream.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = {'format': 'XLS'}
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities',
                                                    'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities',
                                                    'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace('wfs_capabilities',
                                                    'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')
    if top_level_tag_name.lower() == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}

def has_rdfa(buf, log):
    '''Cheap keyword screen for RDFa markup in a buffer of HTML.

    Returns False, after logging 'Not RDFA' at debug level, unless both
    'about=' and 'property=' occur somewhere in buf.

    NOTE(review): when both keywords are present the body below ends without
    a positive result and so returns None rather than True. Any stricter
    check seems to be missing from this copy - confirm against upstream.
    '''
    keywords_present = 'about=' in buf and 'property=' in buf
    if not keywords_present:
        log.debug('Not RDFA')
        return False
    # Falls off the end here, exactly as the body as found does.
    return None
# Example #24
def get_zipped_format(filepath, log=None):
    '''For a given zip file, return the format of file inside.
    For multiple files, choose by the most open, and then by the most
    popular extension.

    :param filepath: path of the zip file.
    :param log: optional logger-like object (``info`` and ``warning`` are
        called). When omitted, the module-level ``log`` is used if there is
        one, otherwise a logger named after this module.
    :returns: a format dict, e.g. ``{'format': 'CSV', 'container': 'ZIP'}``;
        ``{'format': 'ZIP'}`` when no member has a known extension; None
        when the archive cannot be opened.
    '''
    import logging

    if log is None:
        # Keeps the original one-argument call style working.
        log = globals().get('log') or logging.getLogger(__name__)

    # just check filename extension of each file inside
    try:
        with zipfile.ZipFile(filepath, 'r') as zip_file:
            filepaths = zip_file.namelist()
    except zipfile.BadZipfile as e:
        log.info('Zip file open raised error %s: %s', e, e.args)
        return None
    except Exception as e:
        log.warning('Zip file open raised exception %s: %s', e, e.args)
        return None

    # Shapefile check - a Shapefile is a zip containing specific files:
    # .shp, .dbf and .shx amongst others
    extensions = set(f.split('.')[-1].lower() for f in filepaths)
    if len(extensions & set(('shp', 'dbf', 'shx'))) == 3:
        log.info('Shapefile detected')
        return {'format': 'SHP'}

    # GTFS check - a GTFS is a zip which containing specific filenames
    filenames = set(os.path.basename(f) for f in filepaths)
    if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt',
                 'stop_times.txt', 'calendar.txt')) - filenames):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    # Keep a per-extension file count for the best score seen so far; the
    # counts start afresh whenever a higher score turns up.
    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    # The loop variable has its own name so that "filepath" keeps meaning
    # the zip itself in the log call after the loop.
    for member_path in filepaths:
        extension = os.path.splitext(member_path)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            # NOTE(review): the second logged argument was cut off in this
            # file; the member path is assumed.
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     member_path)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    # Sorted ascending by count, so the most popular extension comes last.
    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
             top_extension, top_scoring_extension_counts)
    format_tuple = ckan_helpers.resource_formats()[top_extension]
    format_ = {'format': format_tuple[1], 'container': 'ZIP'}
    log.info('Zipped file format detected: %s', format_tuple[2])
    return format_
# Example #26
    def _distribution_format(self, distribution, normalize_ckan_format=True):
        '''
        Returns the Internet Media Type and format label for a distribution

        Given a reference (URIRef or BNode) to a dcat:Distribution, it will
        try to extract the media type (previously known as MIME type), eg
        `text/csv`, and the format label, eg `CSV`

        Values for the media type will be checked in the following order:

        1. literal value of dcat:mediaType
        2. literal value of dct:format if it contains a '/' character
        3. value of dct:format if it is an instance of dct:IMT, eg:

                <dct:IMT rdf:value="text/html" rdfs:label="HTML"/>

        Values for the label will be checked in the following order:

        1. literal value of dct:format if it does not contain a '/' character
        2. label of dct:format if it is an instance of dct:IMT (see above)

        If `normalize_ckan_format` is True and using CKAN>=2.3, the label will
        be tried to match against the standard list of formats that is included
        with CKAN core
        This allows for instance to populate the CKAN resource format field
        with a format that view plugins, etc will understand (`csv`, `xml`,
        etc.)

        Return a tuple with the media type and the label, both set to None if
        they couldn't be found.
        '''
        label = None

        imt = self._object_value(distribution, DCAT.mediaType)

        _format = self._object(distribution, DCT['format'])
        if isinstance(_format, Literal):
            if not imt and '/' in _format:
                imt = unicode(_format)
            else:
                label = unicode(_format)
        elif isinstance(_format, (BNode, URIRef)):
            if self._object(_format, RDF.type) == DCT.IMT:
                # Convert only real values: unicode(None) would give the
                # string 'None' where the contract promises None.
                if not imt:
                    imt_value = self.g.value(_format, default=None)
                    if imt_value is not None:
                        imt = unicode(imt_value)
                label_value = self.g.label(_format, default=None)
                if label_value is not None:
                    label = unicode(label_value)

        if (imt or label) and normalize_ckan_format:
            import ckan.config
            from ckan.lib import helpers

            # NOTE(review): the end of this condition was cut off in this
            # file. The docstring ties normalisation to CKAN>=2.3, so the
            # helper's availability is used as the gate; confirm against
            # upstream.
            resource_formats = getattr(helpers, 'resource_formats', None)
            if resource_formats is not None:
                format_registry = resource_formats()

                # The media type takes precedence over the label as the key.
                if imt in format_registry:
                    label = format_registry[imt][1]
                elif label in format_registry:
                    label = format_registry[label][1]

        return imt, label
