Exemplo n.º 1
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(title=inv_dataset['title'],
                   notes=inv_dataset['description'],
                   state='active' if inv_dataset['active'] else 'deleted',
                   resources=[],
                   extras={
                       self.IDENTIFIER_KEY: inv_dataset['identifier'],
                       'harvest_source_reference': harvest_object.guid
                   })
        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                for l in register.values():
                    if l.url == rights:
                        pkg['license_id'] = l.id
                        break
                else:
                    # just save it as it is
                    pkg['license_id'] = register
                    log.info('Did not recognize license %r', register)
        else:
            pkg['license_id'] = None

        # Resources
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        existing_resource_urls = dict((r.url, r.id)
                                      for r in existing_dataset.resources) \
                                 if existing_dataset else {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {
                'url': inv_resource['url'],
                'format': format_,
                'description': description,
                'resource_type': resource_type,
                'schema-url': schema_url,
                'schema-type': schema_type,
            }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(
                self.munge_title_to_name('%s %s' %
                                         (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Exemplo n.º 2
0
 def test_by_mime_type(self):
     assert_equal(Formats.by_mime_type()['text/x-json']['display_name'],
                  'JSON')
Exemplo n.º 3
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.
    Returns Format dict with a key to say if it is contained
    in a zip or something.
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it can\'t tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        if format_:
            return format_

        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Exemplo n.º 4
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.
    Returns Format dict with a key to say if it is contained
    in a zip or something.
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it can\'t tell what it is.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']
                
        if format_:
            return format_
                
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s', mime_type)
            
        if format_:
            log.info('Mimetype translates to filetype: %s', format_['display_name'])

            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)
                
    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Exemplo n.º 5
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
                       InventoryDocument.parse_xml_string(harvest_object.content)
                       )

        pkg = dict(
            title=inv_dataset['title'],
            notes=inv_dataset['description'],
            state='active' if inv_dataset['active'] else 'deleted',
            resources=[],
            extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                    'harvest_source_reference': harvest_object.guid
                    }
            )
        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                for l in register.values():
                    if l.url == rights:
                        pkg['license_id'] = l.id
                        break
                else:
                    # just save it as it is
                    pkg['license_id'] = register
                    log.info('Did not recognize license %r', register)
        else:
            pkg['license_id'] = None

        # Resources
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        existing_resource_urls = dict((r.url, r.id)
                                      for r in existing_dataset.resources) \
                                 if existing_dataset else {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {'url': inv_resource['url'],
                   'format': format_,
                   'description': description,
                   'resource_type': resource_type,
                   'schema-url': schema_url,
                   'schema-type': schema_type,
                   }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(self.munge_title_to_name(
                '%s %s' % (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Exemplo n.º 6
0
 def test_by_mime_type(self):
     assert_equal(Formats.by_mime_type()["text/x-json"]["display_name"], "JSON")